In [1]:
from translating_the_law.utils.get_from_disk import open_from_disk

In [2]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import common_texts
import numpy as np
import pandas as pd


In [3]:
import string
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
import re
from nltk.stem import WordNetLemmatizer

In [4]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [5]:
basic_data = open_from_disk()

Basic data is a list and each item inside the list is a dictionary with these keys

In [6]:
basic_data[0].keys()

dict_keys(['judgement', 'press summary', 'details'])

#### Copy dataset


In [7]:
judg_data = [basic_data[i]["judgement"]['body'] for i in range(len(basic_data))]
dfc = pd.DataFrame(judg_data).copy()

# Preprocessing

In [8]:
dfc.head()

Unnamed: 0,0
0,Hilary Term [2022] UKSC 6 On appeal from: [202...
1,The reporting restrictions made by the High Co...
2,Hilary Term [2022] UKSC 2 On appeal from: [202...
3,Hilary Term [2022] UKSC 2 On appeal from: [202...
4,THE COURT ORDERED that no one shall publish or...


#### Clean text

In [9]:
dfc = dfc.rename(columns={0 : 'text'})
dfc.head()

Unnamed: 0,text
0,Hilary Term [2022] UKSC 6 On appeal from: [202...
1,The reporting restrictions made by the High Co...
2,Hilary Term [2022] UKSC 2 On appeal from: [202...
3,Hilary Term [2022] UKSC 2 On appeal from: [202...
4,THE COURT ORDERED that no one shall publish or...


In [10]:
def remove_punctuation(text):
    for punctuation in string.punctuation: 
        text = text.replace(punctuation, ' ') 
    return text

dfc['clean_text'] = dfc.text.apply(remove_punctuation)

In [11]:
def cleaner(text, **kwargs):
  """params is a list of things to remove: codec, acronyms, numbers, brackets"""
  if 'codec' in kwargs['params']:
    text_encoded = text.encode('ascii', errors = 'ignore')
    text_decode = text_encoded.decode()
    clean_text = " ".join([word for word in text_decode.split()])
    text = clean_text
  if 'numbers' in kwargs['params']:
    pattern = r'[0-9]'
    text = re.sub(pattern, '', text)
  if 'brackets' in kwargs['params']: 
    text = re.sub('\(.*?\)', '', text)
    text = re.sub('\[.*?\]', '', text)
  if 'acronyms' in kwargs['params']:
    text = text.split()
    clean_text = []
    for word in text: 
      if any(l.islower() for l in word):
        clean_text.append(word)
    text = ' '.join(clean_text)
  return text


In [12]:
params = ['codec', 'numbers', 'acronyms', 'brackets']
for i in range(len(dfc['clean_text'])):
    strg = dfc['clean_text'][i]
    dfc['clean_text'][i] = cleaner(strg, params=params)

In [13]:
def lowercase (text): 
    lowercased = text.lower() 
    return lowercased

dfc['clean_text'] = dfc.clean_text.apply(lowercase)

dfc

Unnamed: 0,text,clean_text
0,Hilary Term [2022] UKSC 6 On appeal from: [202...,hilary term on appeal from craig appellant v h...
1,The reporting restrictions made by the High Co...,the reporting restrictions made by the high co...
2,Hilary Term [2022] UKSC 2 On appeal from: [202...,hilary term on appeal from admin pwr appellant...
3,Hilary Term [2022] UKSC 2 On appeal from: [202...,hilary term on appeal from admin pwr appellant...
4,THE COURT ORDERED that no one shall publish or...,that no one shall publish or reveal the names ...
...,...,...
941,Michaelmas Term [2009] UKSC 15 On appeal from...,michaelmas term on appeal from civ on the appl...
942,Michaelmas Term [2009] UKSC 16 On appeal from:...,michaelmas term on appeal from civ civ ahmed m...
943,Michaelmas Term [2009] UKSC 16 On appeal from:...,michaelmas term on appeal from civ civ ahmed m...
944,Michaelmas Term [2009] UKSC 16 On appeal from:...,michaelmas term on appeal from civ civ ahmed m...


In [14]:
lst_rem_ent = ['PERSON', 'GPE', 'LOC', 'DATE', 'CARDINAL']
def filter_ner(text):
    text = str(text)
    text_2 = nlp(text)
    for w in text_2.ents:
        if w.label_ in lst_rem_ent:
            text = text.replace(w.text, "")
    return text

In [15]:
dfc['clean_text'] = dfc.clean_text.apply(filter_ner)

In [16]:
dfc['clean_text'][0]

'hilary term on appeal from craig appellant v her majestys advocate for the government of   and another respondents  before lord reed president lord lloyd js lord  lord burrows lord stephens  heard on  appellant aidan ill fred mackintosh instructed by  respondent her majestys advocate for the government of    mcbrearty lesley irvine instructed by international co operation unit crown office nd respondent her majestys advocate general for  andrew webster instructed by office of the advocate general with whom lord lloyd js lord  lord burrows and lord stephens agree this appeal concerns the powers of the ish ministers they exercise functions in relation to extradition proceedings in  but their powers are limited under the  act by a requirement not to act incompatibly with the rights guaranteed by the european convention on human rights the convention the appeal also raises issues under the constitutional law of  concerning the obligations of the government in relation to the commencement 

In [17]:
stop_words = set(stopwords.words('english')) 


def remove_stopwords (text):
    tokenized = word_tokenize(text)
    without_stopwords = [word for word in tokenized if not word in stop_words]
    return without_stopwords

dfc['clean_text'] = dfc.clean_text.apply(remove_stopwords)

dfc.head()

Unnamed: 0,text,clean_text
0,Hilary Term [2022] UKSC 6 On appeal from: [202...,"[hilary, term, appeal, craig, appellant, v, ma..."
1,The reporting restrictions made by the High Co...,"[reporting, restrictions, made, high, court, c..."
2,Hilary Term [2022] UKSC 2 On appeal from: [202...,"[hilary, term, appeal, admin, pwr, appellant, ..."
3,Hilary Term [2022] UKSC 2 On appeal from: [202...,"[hilary, term, appeal, admin, pwr, appellant, ..."
4,THE COURT ORDERED that no one shall publish or...,"[shall, publish, reveal, names, addresses, app..."


In [18]:
"""
text = nlp(dfc['clean_text'][0])
for w in text.ents:
    print(w.text,w.label_)
"""

"\ntext = nlp(dfc['clean_text'][0])\nfor w in text.ents:\n    print(w.text,w.label_)\n"

#### get a list of lists of words and lemmatize

In [19]:
def lemma(text):
    lemmatizer = WordNetLemmatizer() # Initiate lemmatizer
    lemmatized = [lemmatizer.lemmatize(word) for word in text] # Lemmatize
    return lemmatized

dfc['clean_text'] = dfc.clean_text.apply(lemma)

# Preprocessing Part 2

Get a list of all the words that we know won't be used to extract context and will mislead our feature extraction

In [20]:
word_to_remove = [
'hilary', 'term', 'appeal', 'page', 'lord', 'lady', 'pwr',
 'michaelmas', 'uksc', 'civ', 'ewca', 'president', 'appealant', 'court', 'judge', 
]

In [21]:
def remove_final(lst):
    res = []
    for word in lst:
        if word in word_to_remove:
            lst.remove(word)
        elif word not in res:
            res.append(word)
    return res

In [22]:
dfc['clean_text'] = dfc.clean_text.apply(remove_final)
dfc['clean_text']

0      [appellant, v, majesty, advocate, government, ...
1      [reporting, restriction, made, high, force, v,...
2      [v, director, public, prosecution, respondent,...
3      [v, director, public, prosecution, respondent,...
4      [shall, publish, reveal, name, address, appell...
                             ...                        
941    [respondent, v, governing, body, admission, ap...
942    [ahmed, mahad, previously, referred, appellant...
943    [ahmed, mahad, previously, referred, appellant...
944    [ahmed, mahad, previously, referred, appellant...
945    [deputy, rodger, heard, appellant, case, instr...
Name: clean_text, Length: 946, dtype: object

# LDA : extract topic

In [23]:
dfc['clean_text'] = dfc['clean_text'].astype('str')


In [24]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# vectorizer = CountVectorizer()

# data_vectorized = vectorizer.fit_transform(dfc['clean_text'])


In [25]:
tf_idf_vectorizer = TfidfVectorizer(min_df=0.1)

X = tf_idf_vectorizer.fit_transform(dfc['clean_text'])

X.toarray()

pd.DataFrame(X.toarray(),columns = tf_idf_vectorizer.get_feature_names())



Unnamed: 0,abandd,ability,able,abolished,abroad,absence,absent,absolute,absolutely,abstract,...,written,wrong,wrongful,wrongly,wrote,year,yes,yet,yield,young
0,0.000000,0.000000,0.023951,0.0,0.000000,0.025623,0.044714,0.042863,0.000000,0.0,...,0.030743,0.025344,0.0,0.000000,0.000000,0.036288,0.000000,0.028729,0.00000,0.000000
1,0.000000,0.026324,0.000000,0.0,0.000000,0.018588,0.000000,0.000000,0.000000,0.0,...,0.022302,0.018385,0.0,0.030514,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000
2,0.000000,0.000000,0.019513,0.0,0.042771,0.000000,0.000000,0.034920,0.000000,0.0,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000
3,0.000000,0.000000,0.019513,0.0,0.042771,0.000000,0.000000,0.034920,0.000000,0.0,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000
4,0.000000,0.000000,0.019329,0.0,0.042369,0.020679,0.036086,0.000000,0.000000,0.0,...,0.024811,0.000000,0.0,0.000000,0.035966,0.029286,0.000000,0.000000,0.04855,0.036576
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
941,0.029887,0.019678,0.012988,0.0,0.000000,0.013895,0.000000,0.023244,0.031128,0.0,...,0.000000,0.013744,0.0,0.000000,0.024167,0.000000,0.029887,0.015580,0.00000,0.024577
942,0.000000,0.032333,0.021341,0.0,0.046779,0.000000,0.000000,0.000000,0.000000,0.0,...,0.027393,0.022582,0.0,0.037480,0.000000,0.000000,0.000000,0.025599,0.00000,0.000000
943,0.000000,0.032333,0.021341,0.0,0.046779,0.000000,0.000000,0.000000,0.000000,0.0,...,0.027393,0.022582,0.0,0.037480,0.000000,0.000000,0.000000,0.025599,0.00000,0.000000
944,0.000000,0.032333,0.021341,0.0,0.046779,0.000000,0.000000,0.000000,0.000000,0.0,...,0.027393,0.022582,0.0,0.037480,0.000000,0.000000,0.000000,0.025599,0.00000,0.000000


In [26]:

lda_model = LatentDirichletAllocation(n_components=20, learning_method='online')

lda_vectors = lda_model.fit_transform(X)

In [27]:
def print_topics(model, tf_idf_vectorizer):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(tf_idf_vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-10 - 1:-1]])
        

print_topics(lda_model, tf_idf_vectorizer)

Topic 0:
[('requisite', 0.05240385352163568), ('balancing', 0.052314843207528994), ('line', 0.0522711601701398), ('possible', 0.05223455967171398), ('earlier', 0.052234154846940016), ('inherent', 0.052219244601542356), ('practicable', 0.05221810884668959), ('play', 0.05221604859698206), ('showed', 0.05221040068731092), ('settled', 0.05220564942353627)]
Topic 1:
[('foreign', 0.05236309514618423), ('sphere', 0.05227164520490249), ('leaving', 0.052270032120496523), ('conclud', 0.052248888266671525), ('contact', 0.05223383244336926), ('higher', 0.052229213997579974), ('dependent', 0.05220097719662375), ('crown', 0.052191903768205095), ('pleaded', 0.05219006509502056), ('refusing', 0.052162070585472994)]
Topic 2:
[('depended', 0.05237751065012326), ('institution', 0.05234616204209677), ('extent', 0.0523450637915102), ('security', 0.05234157149251969), ('submitted', 0.05231677357616037), ('period', 0.05229173553422748), ('executive', 0.05228262498747113), ('considering', 0.05225644449558371)



[('precludes', 0.052233204184190016), ('intends', 0.05219538783077494), ('significantly', 0.05217166934349457), ('discovered', 0.05215827577000676), ('insurer', 0.052148495884073005), ('original', 0.052130497845479434), ('wrong', 0.05211838924853605), ('road', 0.05210726493818346), ('attack', 0.05210713468847153), ('count', 0.05209830481837035)]


# Initialize Model

In [28]:
import gensim
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument