In [1]:
from translating_the_law.utils.get_from_disk import open_from_disk

In [2]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import common_texts
import numpy as np
import pandas as pd


In [3]:
import string
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
import re
from nltk.stem import WordNetLemmatizer

In [4]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [5]:
basic_data = open_from_disk()

Basic data is a list and each item inside the list is a dictionary with these keys

In [6]:
basic_data[0].keys()

dict_keys(['judgement', 'press summary', 'details'])

#### Copy dataset


In [7]:
judg_data = [basic_data[i]["judgement"]['body'] for i in range(len(basic_data))]
dfc = pd.DataFrame(judg_data).copy()

# Preprocessing

In [8]:
dfc.head()

Unnamed: 0,0
0,Hilary Term [2022] UKSC 6 On appeal from: [202...
1,The reporting restrictions made by the High Co...
2,Hilary Term [2022] UKSC 2 On appeal from: [202...
3,Hilary Term [2022] UKSC 2 On appeal from: [202...
4,THE COURT ORDERED that no one shall publish or...


#### Clean text

In [9]:
dfc = dfc.rename(columns={0 : 'text'})
dfc.head()

Unnamed: 0,text
0,Hilary Term [2022] UKSC 6 On appeal from: [202...
1,The reporting restrictions made by the High Co...
2,Hilary Term [2022] UKSC 2 On appeal from: [202...
3,Hilary Term [2022] UKSC 2 On appeal from: [202...
4,THE COURT ORDERED that no one shall publish or...


In [10]:
def remove_punctuation(text):
    for punctuation in string.punctuation: 
        text = text.replace(punctuation, ' ') 
    return text

dfc['clean_text'] = dfc.text.apply(remove_punctuation)

In [11]:
def cleaner(text, **kwargs):
  """params is a list of things to remove: codec, acronyms, numbers, brackets"""
  if 'codec' in kwargs['params']:
    text_encoded = text.encode('ascii', errors = 'ignore')
    text_decode = text_encoded.decode()
    clean_text = " ".join([word for word in text_decode.split()])
    text = clean_text
  if 'numbers' in kwargs['params']:
    pattern = r'[0-9]'
    text = re.sub(pattern, '', text)
  if 'brackets' in kwargs['params']: 
    text = re.sub('\(.*?\)', '', text)
    text = re.sub('\[.*?\]', '', text)
  if 'acronyms' in kwargs['params']:
    text = text.split()
    clean_text = []
    for word in text: 
      if any(l.islower() for l in word):
        clean_text.append(word)
    text = ' '.join(clean_text)
  return text


In [12]:
params = ['codec', 'numbers', 'acronyms', 'brackets']
for i in range(len(dfc['clean_text'])):
    strg = dfc['clean_text'][i]
    dfc['clean_text'][i] = cleaner(strg, params=params)

In [13]:
def lowercase (text): 
    lowercased = text.lower() 
    return lowercased

dfc['clean_text'] = dfc.clean_text.apply(lowercase)

dfc

Unnamed: 0,text,clean_text
0,Hilary Term [2022] UKSC 6 On appeal from: [202...,hilary term on appeal from craig appellant v h...
1,The reporting restrictions made by the High Co...,the reporting restrictions made by the high co...
2,Hilary Term [2022] UKSC 2 On appeal from: [202...,hilary term on appeal from admin pwr appellant...
3,Hilary Term [2022] UKSC 2 On appeal from: [202...,hilary term on appeal from admin pwr appellant...
4,THE COURT ORDERED that no one shall publish or...,that no one shall publish or reveal the names ...
...,...,...
941,Michaelmas Term [2009] UKSC 15 On appeal from...,michaelmas term on appeal from civ on the appl...
942,Michaelmas Term [2009] UKSC 16 On appeal from:...,michaelmas term on appeal from civ civ ahmed m...
943,Michaelmas Term [2009] UKSC 16 On appeal from:...,michaelmas term on appeal from civ civ ahmed m...
944,Michaelmas Term [2009] UKSC 16 On appeal from:...,michaelmas term on appeal from civ civ ahmed m...


In [14]:
text = nlp(dfc['clean_text'][0])
for w in text.ents:
    print(w.text,w.label_)
    

the united states of GPE
america GPE
lloyd jones ORG
kitchin PERSON
november DATE
dunne defence st PERSON
the united states GPE
america GPE
kenny PERSON
scotland GPE
kitchin PERSON
scottish NORP
scotland GPE
the european convention ORG
the united kingdom GPE
a period of years DATE
october DATE
scott baker PERSON
the house of commons home affairs committee ORG
some months later DATE
scott PERSON
october DATE
february DATE
one CARDINAL
the united kingdom GPE
the united kingdom GPE
the united kingdom GPE
the united kingdom GPE
the united kingdom GPE
one CARDINAL
one CARDINAL
the united kingdom GPE
the united kingdom GPE
the united kingdom GPE
the united states GPE
america GPE
one CARDINAL
two CARDINAL
secondly ORDINAL
scotland GPE
such day DATE
days DATE
northern ireland GPE
october DATE
scottish NORP
scottish NORP
scottish NORP
the day DATE
the house of commons ORG
scottish NORP
scotland GPE
scottish NORP
december DATE
scottish NORP
scotland GPE
scottish NORP
daily DATE
december DATE
sco

In [15]:
lst_ent = nlp.pipe_labels['ner']
lst_rem_ent = ['PERSON', 'GPE', 'LOC', 'DATE', 'CARDINAL']

In [25]:
def filtering(text):
    text = nlp(text)
    new_str = ''
    for w in text.ents:
        if w.label_ not in lst_rem_ent:
            new_str += str(w.text) + " "
    return new_str

In [16]:
def filtering_2(text):
    text = nlp(text)
    new_lst = []
    for w in text.ents:
        if w.label_ in lst_rem_ent:
            new_lst.append(w.text)
    return new_lst


In [None]:
first_try = filtering_2(dfc['clean_text'][0])
first_try

In [28]:
stop_words = set(stopwords.words('english')) 


def filter_final(text):
    delete_words = filtering_2(text)
    tokenized = word_tokenize(text)
    without_stopwords = [word for word in tokenized if not word in stop_words]
    with_filter = [word for word in without_stopwords if not word in delete_words]
    return with_filter

In [29]:
dfc['clean_text'] = dfc.clean_text.apply(filter_final)

In [26]:
dfc['clean_text'][0]

'hilary'

#### get a list of lists of words and lemmatize

In [32]:
"""
stop_words = set(stopwords.words('english')) 


def remove_stopwords (text):
    tokenized = word_tokenize(text)
    without_stopwords = [word for word in tokenized if not word in stop_words]
    return without_stopwords

dfc['clean_text'] = dfc.clean_text.apply(remove_stopwords)

dfc.head()
"""

Unnamed: 0,text,clean_text
0,Hilary Term [2022] UKSC 6 On appeal from: [202...,"[lloyd, jones, scottish, european, convention,..."
1,The reporting restrictions made by the High Co...,"[lloyd, jones, bloomberg, news, one, bloomberg..."
2,Hilary Term [2022] UKSC 2 On appeal from: [202...,"[lloyd, jones, birnberg, peirce, ltd, european..."
3,Hilary Term [2022] UKSC 2 On appeal from: [202...,"[lloyd, jones, birnberg, peirce, ltd, european..."
4,THE COURT ORDERED that no one shall publish or...,"[british, british, government, legal, departme..."


In [34]:
def lemma(text):
    lemmatizer = WordNetLemmatizer() # Initiate lemmatizer
    lemmatized = [lemmatizer.lemmatize(word) for word in text] # Lemmatize
    return lemmatized

dfc['clean_text'] = dfc.clean_text.apply(lemma)

dfc['clean_text'][1]

['lloyd',
 'jones',
 'bloomberg',
 'news',
 'one',
 'bloomberg',
 'supreme',
 'court',
 'bloomberg',
 'one',
 'united',
 'nation',
 'convention',
 'third',
 'three',
 'four',
 'ltd',
 'board',
 'bloomberg',
 'european',
 'two',
 'first',
 'secondly',
 'two',
 'two',
 'one',
 'two',
 'bloomberg',
 'bloomberg',
 'one',
 'one',
 'two',
 'two',
 'campbell',
 'v',
 'ltd',
 'douglas',
 'v',
 'hello',
 'ltd',
 'two',
 'one',
 'two',
 'two',
 'two',
 'european',
 'court',
 'human',
 'right',
 'ecthr',
 'supreme',
 'court',
 'newspaper',
 'ltd',
 'civ',
 'para',
 'one',
 'one',
 'axel',
 'axel',
 'first',
 'one',
 'first',
 'second',
 'third',
 'fourth',
 'two',
 'fifth',
 'one',
 'first',
 'second',
 'bloomberg',
 'bloomberg',
 'bloomberg',
 'bloomberg',
 'one',
 'bloomberg',
 'first',
 'one',
 'two',
 'second',
 'third',
 'fourth',
 'first',
 'second',
 'third',
 'fourth',
 'first',
 'one',
 'general',
 'v',
 'ltd',
 'admin',
 'second',
 'house',
 'common',
 'home',
 'affair',
 'committee',
 

# Preprocessing Part 2

Get a list of all the words that we know won't be used to extract context and will mislead our feature extraction

In [37]:
word_to_remove = [
'hilary', 'term', 'appeal', 'page', 'lord', 'lady', 'pwr',
 'michaelmas', 'uksc', 'civ', 'ewca', 'president', 'appealant', 'court', 'judge', 
]

In [45]:
def remove_final(lst):
    res = []
    for word in lst:
        if word in word_to_remove:
            lst.remove(word)
        if word not in res:
            res.append(word)
    return res
    

In [49]:
dfc['clean_text'] = dfc.clean_text.apply(remove_final)
dfc

Unnamed: 0,text,clean_text
0,Hilary Term [2022] UKSC 6 On appeal from: [202...,"[lloyd, jones, scottish, european, convention,..."
1,The reporting restrictions made by the High Co...,"[lloyd, jones, bloomberg, news, one, supreme, ..."
2,Hilary Term [2022] UKSC 2 On appeal from: [202...,"[lloyd, jones, birnberg, peirce, ltd, european..."
3,Hilary Term [2022] UKSC 2 On appeal from: [202...,"[lloyd, jones, birnberg, peirce, ltd, european..."
4,THE COURT ORDERED that no one shall publish or...,"[british, government, legal, department, amnes..."
...,...,...
941,Michaelmas Term [2009] UKSC 15 On appeal from...,"[stone, king, sewell, british, aileen, first, ..."
942,Michaelmas Term [2009] UKSC 16 On appeal from:...,"[civ, mahad, treasury, five, third, three, two..."
943,Michaelmas Term [2009] UKSC 16 On appeal from:...,"[civ, mahad, treasury, five, third, three, two..."
944,Michaelmas Term [2009] UKSC 16 On appeal from:...,"[civ, mahad, treasury, five, third, three, two..."


# LDA : extract topic

In [65]:
dfc['clean_text'] = dfc['clean_text'].astype('str')


In [51]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

data_vectorized = vectorizer.fit_transform(dfc['clean_text'])


In [None]:
#tf_idf_vectorizer = TfidfVectorizer(ngram_range = (2,2))

# X = tf_idf_vectorizer.fit_transform(texts)

# X.toarray()

# pd.DataFrame(X.toarray(),columns = tf_idf_vectorizer.get_feature_names())

In [56]:

lda_model = LatentDirichletAllocation(n_components=20, learning_method='online')

lda_vectors = lda_model.fit_transform(data_vectorized)

In [57]:
def print_topics(model, vectorizer):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-10 - 1:-1]])
        

print_topics(lda_model, vectorizer)

Topic 0:
[('leighton', 5.973926771015396), ('berwin', 5.629273738554441), ('woman', 2.7029599381506446), ('paisner', 2.6804645732361694), ('ec', 2.576908432795442), ('assessment', 2.401710237908965), ('euro', 1.985676416580449), ('leverton', 1.8185016250826938), ('holiday', 1.8182187755953116), ('dollar', 1.8003586501840936)]
Topic 1:
[('bond', 2.4311675479918224), ('bail', 1.6413229945292496), ('initio', 0.8935053823005197), ('arrest', 0.8424878149121915), ('moorland', 0.8390583133085939), ('void', 0.83886284849497), ('narrower', 0.8386979966653643), ('glasbrook', 0.8383726874030322), ('loucas', 0.8383510076364973), ('aylmerton', 0.8382864047043291)]
Topic 2:
[('respondent', 0.05467791222553416), ('two', 0.05452411010758935), ('conditional', 0.054466453331343236), ('christian', 0.05445756457859115), ('convention', 0.054451084023236986), ('lloyd', 0.054427947722018734), ('parliament', 0.05442577617031691), ('british', 0.0543679058666801), ('ii', 0.054357392471956784), ('scottish', 0.05

vectorizer = CountVectorizer(ngram_range=(2,2))

X_ngram = vectorizer.fit_transform(data.clean_reviews)

cv_nb = cross_validate( MultinomialNB(), X_ngram, data.target, scoring = "accuracy")

cv_nb['test_score'].mean()

In [None]:
# Create Pipeline

# Set parameters to search (model and vectorizer)

# Perform grid search on pipeline
#from sklearn.model_selection import GridSearchCV
#from sklearn.pipeline import Pipeline
#from sklearn.feature_extraction.text import TfidfVectorizer
#from sklearn.naive_bayes import MultinomialNB

# Create Pipeline
#pipeline = Pipeline([
  #  ('tfidf', TfidfVectorizer()),
 #   ('nb', MultinomialNB()),
])

# Set parameters to search
# parameters = {
  #  'tfidf__ngram_range': ((1,1), (2,2)),
  #  'tfidf__min_df': (0.05,0.1),
  #  'tfidf__max_df': (0.75,1),
  #  'nb__alpha': (0.01,0.1,1,10),}

# Perform grid search on pipeline
# grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, 
    #                       verbose=1, scoring = "accuracy", 
       #                    refit=True, cv=5)

# grid_search.fit(data.clean_reviews,data.target)

####

# Initialize Model

In [None]:
import gensim
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [None]:
def tagged_document(list_of_list_of_words):
   for i, list_of_words in enumerate(list_of_list_of_words):
      yield TaggedDocument(list_of_words, [i])
data_for_training = list(tagged_document(dfc['clean_text']))


#### Once trained we now need to initialise the model.

In [None]:
model = Doc2Vec(documents = data_for_training)

#### Build Vocabulary

In [None]:
model.build_vocab(data_for_training)

#### Train the Doc2Vec 

In [None]:
model.train(data_for_training, total_examples=model.corpus_count, epochs=model.epochs)

## Analysing the Output

In [None]:
vec1 = model.infer_vector(test_1)

In [None]:
len(vec1)

In [None]:
model