In [38]:
import os
import pandas as pd
import numpy as np
from nltk import wordpunct_tokenize
import torch

#import preprocessing file
import pp

In [39]:
# Label vocabulary: maps each annotation tag to its integer id (1-based, in the
# order listed). Id 0 is deliberately absent; the loading cell uses it for
# sentences that carry no recognised tag.
asp = {
    tag: tag_id
    for tag_id, tag in enumerate(
        (
            "NAME",
            "CITATION",
            "COUNSEL",
            "JUDGE",
            "FACTS",
            "RLC",
            "REASONING",
            "ARG",
            "STATUTE",
            "PRECEDENT",
            "RPC",
            "ISSUE",
        ),
        start=1,
    )
}

In [40]:
# x[k] will hold the preprocessed sentences of labelled file k+1 and y[k] the
# matching list of label-id lists; both are filled by the loading cell.
x, y = [], []
# Number of labelled files to process (files are numbered 1..no_of_files).
no_of_files = 10

In [41]:
# Build the per-file inputs: x receives the preprocessed sentences of each
# labelled CSV and y the matching label-id lists (one inner list per CSV row).
# Empty the containers first so re-running this cell does not append every
# file a second time.
x.clear()
y.clear()

for i in range(1, no_of_files + 1):
    annpath = f'../labelled_data/{i}.csv'
    df = pd.read_csv(annpath)
    # Rows without a sentence cannot be labelled; drop them up front.
    df = df.dropna(subset=['Sentence'], how='all')

    # Read the column directly instead of building a Series per row.
    # NOTE(review): presumably pp.preprocess returns one entry per input
    # sentence -- the length check in the next cell relies on that.
    x.append(pp.preprocess(df['Sentence'].tolist()))

    file_labels = []
    for raw_labels in df['Labels']:
        if pd.isna(raw_labels):
            # No annotation at all for this sentence.
            label_ids = [0]
        else:
            # Keep only tokens that are known tags; separators and unknown
            # words produced by the tokenizer are ignored.
            label_ids = [asp[token] for token in wordpunct_tokenize(raw_labels) if token in asp]
            if not label_ids:
                # Annotation present but none of its tokens is a known tag.
                label_ids = [0]
        file_labels.append(label_ids)
    y.append(file_labels)

In [42]:
# Sanity check: every file must have exactly one label list per preprocessed
# sentence. Prints the 0-based index of each file whose counts disagree; no
# output means everything is aligned. The bound follows no_of_files so the
# check stays correct when the number of files changes.
for i in range(no_of_files):
    if len(x[i]) != len(y[i]):
        print(i)

In [43]:
from transformers import AutoModel, AutoTokenizer

# Load tokenizer and encoder from the same checkpoint so the token ids produced
# by one match the embedding table of the other.
checkpoint = "law-ai/InLegalBERT"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

Some weights of the model checkpoint at law-ai/InLegalBERT were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [44]:
# Some sentences in x should not be part of the summary, so their labels in y
# are later reset to 0. To decide which ones, score every judgement sentence by
# its summed cosine similarity to the sentences of the matching summary file.


def _cls_embedding(sentence):
    """Return the encoder's hidden state at the first token position for `sentence`.

    Uses the notebook-level `tokenizer` and `model` with gradients disabled.
    The leading batch axis is kept (the output is sliced with `[:, 0, :]`).
    """
    # Truncate to the encoder's position limit so an over-long sentence cannot
    # index past the position embeddings; shorter inputs are unaffected.
    encoded_input = tokenizer(
        sentence,
        return_tensors='pt',
        truncation=True,
        max_length=model.config.max_position_embeddings,
    )
    with torch.no_grad():
        output = model(**encoded_input)
    return output.last_hidden_state[:, 0, :]


similarity = []
for i in range(0, no_of_files):
    # x[i] is the list of preprocessed sentences of labelled file i+1.
    if not x[i]:
        # Nothing to score for this file, so the summary is not needed.
        similarity.append([])
        continue

    # The summary depends only on the file, not on the judgement sentence:
    # read, preprocess and embed it once here rather than once per sentence.
    sumpath = f'../dataset/train-data/summary/{i+1}.txt'
    with open(sumpath, 'r') as summary_file:
        summary_lines = [line.strip() for line in summary_file]
    summary_vectors = [_cls_embedding(line) for line in pp.preprocess(summary_lines)]

    sim = []
    for jgs in x[i]:
        jgsvect = _cls_embedding(jgs)

        # Accumulate the cosine similarity against every summary sentence.
        # Stays the integer 0 when the summary has no sentences.
        simi = 0
        for sumvect in summary_vectors:
            simi += torch.nn.functional.cosine_similarity(sumvect, jgsvect, dim=1)
        sim.append(simi)
    similarity.append(sim)

In [52]:
import pickle
# Persist the intermediate results so the later cells can be resumed from disk
# without recomputing the sentence embeddings.
for _path, _payload in (
    ('../intermediate/x.pickle', x),
    ('../intermediate/y.pickle', y),
    ('../intermediate/similarity.pickle', similarity),
):
    with open(_path, 'wb') as _fh:
        pickle.dump(_payload, _fh)


In [61]:
# Reload the cached sentences, labels and similarity scores from disk.
# CAUTION: unpickling can execute arbitrary code -- only load files that this
# notebook wrote itself.
with open('../intermediate/x.pickle', 'rb') as x_fh:
    x = pickle.load(x_fh)

with open('../intermediate/y.pickle', 'rb') as y_fh:
    y = pickle.load(y_fh)

with open('../intermediate/similarity.pickle', 'rb') as sim_fh:
    similarity = pickle.load(sim_fh)

In [62]:
# For every file and every label, keep the label only on the better-scoring
# half of the sentences that carry it; on the remaining sentences the label is
# removed, and a sentence left without any label gets the placeholder 0.
# NOTE: this edits y in place, so running the cell twice prunes twice --
# reload y from disk before re-running.
for i in range(0, no_of_files):
    # label id -> list of [similarity score, sentence index] for the sentences
    # of this file that carry the label. Deliberately not named `dict`, which
    # would shadow the builtin for the rest of the session.
    label_hits = {}

    for j in range(len(x[i])):
        # x[i][j] is the j-th sentence of file i; y[i][j] is its list of label ids.
        for label in y[i][j]:
            if label != 0:
                # Store the score as a plain float so sorting does not depend
                # on tensor comparison semantics.
                label_hits.setdefault(label, []).append([float(similarity[i][j]), j])

    for label, hits in label_hits.items():
        # A label seen on a single sentence is always kept.
        if len(hits) <= 1:
            continue
        ranked = sorted(hits, key=lambda pair: pair[0], reverse=True)
        # Positions 0 .. len//2 - 1 keep the label; the rest lose it.
        for _, sentence_idx in ranked[len(ranked) // 2:]:
            sentence_labels = y[i][sentence_idx]
            sentence_labels.remove(label)
            if not sentence_labels:
                sentence_labels.append(0)

In [64]:
# Persist the label lists in y as they stand after the pruning cell.
# NOTE(review): this path differs from '../intermediate/y.pickle' only by case;
# on a case-insensitive filesystem both names refer to the same file, so this
# dump would replace the labels saved earlier -- confirm this is intended.
with open('../intermediate/Y.pickle', 'wb') as pruned_fh:
    pickle.dump(y, pruned_fh)