In [1]:
# Imports
import os

# Packages
import pandas

# LexNLP imports
from lexnlp.nlp.en.segments.sentences import get_sentence_list
from lexnlp.nlp.en.tokens import get_token_list, get_stem_list

# Imports
import gensim.models.word2vec
import gensim.models.doc2vec

In [2]:
# Setup constants
HTML_INPUT_PATH = "../data/text/"

In [3]:
# Build list of paths to review
court_path_list = []
for country in os.listdir(HTML_INPUT_PATH):
    for level_a in os.listdir(os.path.join(HTML_INPUT_PATH, country, "cases")):
        for level_b in os.listdir(os.path.join(HTML_INPUT_PATH, country, "cases", level_a)):
            if level_b.isdigit():
                court_path_list.append({"court_name": level_a,
                                        "court_division": None,
                                        "country": country,
                                        "year": int(level_b),
                                        "path": os.path.join(HTML_INPUT_PATH, country, "cases", level_a, level_b)})
                continue

            for level_c in os.listdir(os.path.join(HTML_INPUT_PATH, country, "cases", level_a, level_b)):
                court_path_list.append({"court_name": level_a,
                                        "court_division": level_b,
                                        "country": country,
                                        "year": int(level_c),
                                        "path": os.path.join(HTML_INPUT_PATH, country, "cases", level_a, level_b, level_c)})

print("Court-years detected: {0}".format(len(court_path_list)))

Court-years detected: 292


In [None]:
# setup key storage
sentences = []
documents = []

# Iterate through court-year paths
for court_path in court_path_list:
    # Get file list
    court_year_file_list = os.listdir(court_path["path"])
    print((court_path["court_name"],
           court_path["court_division"],
           court_path["year"],
           len(court_year_file_list)          
          ))
    
    for case_file_name in court_year_file_list:
        case_file_path = os.path.join(court_path["path"], case_file_name)
        with open(case_file_path, "r") as input_file:
            text_content = input_file.read()
        
        doc_stems = []
        for sentence in get_sentence_list(text_content):
            sentence_stems = [s for s in get_stem_list(sentence, stopword=True, lowercase=True) if s.isalpha()]
            doc_stems.extend(sentence_stems)
            sentences.append(sentence_stems)
        documents.append(gensim.models.doc2vec.TaggedDocument(doc_stems, ["{0}".format(case_file_path)]))

('EWHC', 'Admlty', 2015, 6)
('EWHC', 'Admlty', 2003, 4)
('EWHC', 'Admlty', 2017, 1)
('EWHC', 'Admlty', 2006, 1)
('EWHC', 'Admlty', 2002, 7)
('EWHC', 'Admlty', 2004, 4)
('EWHC', 'Admlty', 2014, 4)
('EWHC', 'Admlty', 2001, 7)
('EWHC', 'Admlty', 2005, 3)
('EWHC', 'Admlty', 2009, 8)
('EWHC', 'Admlty', 1999, 1)
('EWHC', 'Admlty', 2016, 1)
('EWHC', 'Admlty', 2011, 5)
('EWHC', 'Admlty', 2013, 3)
('EWHC', 'Admlty', 2012, 3)
('EWHC', 'Admlty', 2008, 6)
('EWHC', 'Admlty', 2000, 2)
('EWHC', 'Admlty', 2018, 2)
('EWHC', 'Admlty', 2007, 2)
('EWHC', 'Admlty', 2010, 3)
('EWHC', 'Admin', 2015, 714)
('EWHC', 'Admin', 2003, 626)


In [None]:
# word2vec models
min_count = 10
w2v_size_list = [100, 200]
w2v_window_list = [5, 10, 20]
for size in w2v_size_list:
    for window in w2v_window_list:
        w2v_model_cbow = gensim.models.word2vec.Word2Vec(sentences, size=size, window=window, min_count=min_count, workers=1)
        w2v_model_cbow.save("../data/models/w2v_cbow_all_size{0}_window{1}".format(size, window))

# doc2vec models
min_count = 10
d2v_size_list = [100, 200]
d2v_window_list = [5, 10, 20]
for size in d2v_size_list:
    for window in d2v_window_list:
        d2v_model = gensim.models.doc2vec.Doc2Vec(documents, vector_size=size, window=window, min_count=min_count, workers=1)
        d2v_model.save("../data/models/d2v_all_size{0}_window{1}".format(size, window))