In [10]:
from zipfile import ZipFile

# Unpack the archived list pickles into the intermediate folder so that the
# cells below can read them from disk.
with ZipFile('../intermediate/lists_to_use.zip', mode='r') as archive:
    archive.extractall(path='../intermediate/')

In [11]:
import pickle
# NOTE: pickle.load can execute arbitrary code, so only load pickles that were
# produced by this project.

# Sentences of every judgement.
with open('../intermediate/jgslist_latest.pickle', 'rb') as jgs_file:
    jgslist = pickle.load(jgs_file)

# Sentences of every judgement's summary.
with open('../intermediate/sumlist_new.pickle', 'rb') as sum_file:
    sumlist = pickle.load(sum_file)

1. jgslist and sumlist are each a list of lists of preprocessed sentences
2. jgslist[i] is a list that contains sentences of ith judgement
3. sumlist[i] is a list that contains sentences of ith judgement's summary
4. len(jgslist) = len(sumlist) = 7028 (we have 7028 judgements in our dataset)
5. jgslist and sumlist were created and stored by the preprocessing.ipynb notebook present in the root directory

We found that lemmatizing the words may reduce the chance of running into the out-of-vocabulary (OOV) word problem.

Each word in the dataset is therefore lemmatized using NLTK's WordNetLemmatizer.

In [5]:
# Let's lemmatize all the words of the dataset
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
lemmatizer = WordNetLemmatizer()


def _lemmatize_documents(documents):
    """Lemmatize every word of every sentence in ``documents``.

    Parameters
    ----------
    documents : list[list[str]]
        One inner list of sentence strings per document.

    Returns
    -------
    list[list[str]]
        Same nesting as the input. Each sentence is tokenized with
        ``word_tokenize``, every token is passed through
        ``lemmatizer.lemmatize`` (no POS tag is supplied, so the lemmatizer's
        default is used), and the tokens are re-joined with single spaces.
    """
    return [
        [
            " ".join(lemmatizer.lemmatize(word) for word in word_tokenize(sent))
            for sent in doc
        ]
        for doc in documents
    ]


# Judgements and summaries go through the same helper so they cannot drift apart.
# NOTE: both names are rebound in place, so re-running this cell lemmatizes
# the already-lemmatized text again.
jgslist = _lemmatize_documents(jgslist)
sumlist = _lemmatize_documents(sumlist)

In [None]:
import numpy as np
def cosine(u, v):
    """Return the cosine similarity between two vectors.

    Parameters
    ----------
    u, v : array_like
        One-dimensional vectors of equal length.

    Returns
    -------
    float
        ``dot(u, v) / (||u|| * ||v||)``. When either vector has zero norm the
        similarity is undefined; ``0.0`` is returned in that case instead of
        NaN, so that a single degenerate vector cannot turn an accumulated
        score into NaN.
    """
    denominator = np.linalg.norm(u) * np.linalg.norm(v)
    if denominator == 0:
        # Avoid 0/0 -> nan (and the accompanying RuntimeWarning).
        return 0.0
    return np.dot(u, v) / denominator

In [None]:
import torch  
from transformers import BertTokenizer,BertModel
# The tokenizer and the encoder are loaded from the same pretrained checkpoint;
# naming it once keeps the two from getting out of sync.
checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertModel.from_pretrained(checkpoint)

In [11]:
# Takes a list of strings (sentences) as a parameter.
# Returns a list where each element is the embedding (a list of floats) of the corresponding sentence.

def get_sentence_embeddings(sentences, batch_size=32):
    """Embed each sentence as the mean of its BERT token vectors.

    Parameters
    ----------
    sentences : list[str]
        Sentences to embed. An empty list yields an empty result.
    batch_size : int, optional
        Number of sentences sent through the model per forward pass. This
        bounds memory use for long documents; it does not affect the result
        beyond floating-point noise because padding is masked out below.

    Returns
    -------
    list[list[float]]
        One embedding per input sentence, in input order.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.

    Notes
    -----
    Uses the module-level ``tokenizer`` and ``model``.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    sentences = list(sentences)
    if not sentences:
        # The tokenizer cannot build a batch out of zero sentences.
        return []

    embeddings = []
    for start in range(0, len(sentences), batch_size):
        batch = sentences[start:start + batch_size]

        # Tokenize the sentences
        encoded_inputs = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')

        # Forward pass through the BERT model
        with torch.no_grad():
            outputs = model(**encoded_inputs)

        token_vectors = outputs.last_hidden_state
        # padding=True pads every sentence to the longest one in the batch.
        # A plain mean over dim=1 would average those [PAD] positions in and
        # make a sentence's embedding depend on its batch neighbours, so
        # weight each position by the attention mask instead.
        mask = encoded_inputs['attention_mask'].unsqueeze(-1).to(token_vectors.dtype)
        summed = (token_vectors * mask).sum(dim=1)
        token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
        embeddings.extend((summed / token_counts).tolist())

    return embeddings


In [21]:
# Sanity check: confirm that each sentence embedding comes back as a plain
# Python list rather than some other datatype (e.g. a tensor).

temp = [
    ["paper publish", "of course"],
    ["NITK Hale Lab", "IT cross CSE"],
]
tvec = list(map(get_sentence_embeddings, temp))
type(tvec[0][0])

list

Compute the embedding vector of each sentence in jgslist and sumlist and store the vectors in lists

In [11]:
# Compute a sentence embedding for every sentence in the dataset.

# jgsvector[i] holds the embeddings of the sentences of the ith judgement.
jgsvector = list(map(get_sentence_embeddings, jgslist))

# sumvector[i] holds the embeddings of the sentences of the ith judgement's summary.
sumvector = list(map(get_sentence_embeddings, sumlist))

In [None]:
# Every vector in jgsvector gets one similarity score: each vector in
# jgsvector[i] is compared with every vector in sumvector[i] and the cosine
# similarities are added up. scores[i][j] is therefore the total similarity of
# the jth sentence of judgement i to the sentences of that judgement's summary.

scores = []
for i in range(len(jgsvector)):
    # Convert this summary's embeddings to arrays once per judgement rather
    # than once per (judgement sentence, summary sentence) pair.
    summary_vectors = [np.array(vec) for vec in sumvector[i]]

    temp = []
    for sentence_embedding in jgsvector[i]:
        vect1 = np.array(sentence_embedding)
        # Compare this sentence with all sentences of the summary and add the scores.
        score = 0
        for vect2 in summary_vectors:
            score += cosine(vect1, vect2)
        temp.append(score)
    scores.append(temp)

In [None]:
# Persist the scores and the sentence embeddings as pickle files in the
# intermediate folder. Files are written in the order listed.
outputs_to_save = (
    ('../intermediate/scores_bert.pickle', scores),
    ('../intermediate/jgsvector_bert.pickle', jgsvector),
    ('../intermediate/sumvector_bert.pickle', sumvector),
)

for out_path, payload in outputs_to_save:
    with open(out_path, 'wb') as out_file:
        pickle.dump(payload, out_file)