# Import necessary libraries

In [8]:
#Install libraries
# !pip install transformers
# !pip install sentencepiece
# !pip install textattack


#Import libraries
import pandas as pd
import pickle
from transformers import MarianMTModel, MarianTokenizer
from textattack.augmentation import WordNetAugmenter, EmbeddingAugmenter, EasyDataAugmenter, CharSwapAugmenter, CheckListAugmenter, CLAREAugmenter

  from .autonotebook import tqdm as notebook_tqdm


Data Augmentation - NOTE: Do not use the augmented data for validation

Back translation
EDA (Easy Data Augmentation).
NLP Albumentation.
NLP Aug.

## NOTE: ##
If back.pkl already exists in this directory, there is no need to run the back-translation cells below.
You can load back.pkl directly and continue with the EDA section.

## BACK-TRANSLATION

In [3]:
# Checkpoint names for the two translation directions
en2french = 'Helsinki-NLP/opus-mt-en-fr'
french2en = 'Helsinki-NLP/opus-mt-fr-en'

# English -> French tokenizer and model
en2french_tkn, en2french_model = (
    MarianTokenizer.from_pretrained(en2french),
    MarianMTModel.from_pretrained(en2french),
)

# French -> English tokenizer and model
french2en_tkn, french2en_model = (
    MarianTokenizer.from_pretrained(french2en),
    MarianMTModel.from_pretrained(french2en),
)



In [4]:
# Example sentences for trying out the translation helpers defined in this notebook
original_texts = ["THE NEW PIECEGOODS BAZAR CO., LTD.,BOMBAY vs THE COMMISSIONER OF INCOME-TAX,BOMBAY on 26 May, 1950 ",
          "Equivalent citations: 1950 AIR 165, 1950 SCR 553 ",
          "The first model translates from English to French, which is a temporary process", 
          "The second model finally translates back all the temporary french text into English"]

In [6]:
# Bring the texts into the input format used for the translation model
def format_batch_texts(language_code, batch_texts):
    """Prefix each text with a ``>>language_code<<`` marker.

    Args:
        language_code: Code placed between ``>>`` and ``<<``.
        batch_texts: Iterable of texts to tag.

    Returns:
        A new list with one tagged string per input text, in input order.
    """
    template = ">>{}<< {}"
    tagged_texts = []
    for sentence in batch_texts:
        tagged_texts.append(template.format(language_code, sentence))
    return tagged_texts

# Performs translation
def perform_translation(batch_texts, model, tokenizer, language="fr"):
    """Translate a batch of texts with the given model and tokenizer.

    Args:
        batch_texts: Iterable of source texts.
        model: Object exposing ``generate(**encoded, max_new_tokens=...)``.
        tokenizer: Callable that encodes a list of strings and exposes ``decode``.
        language: Code inserted as a ``>>code<<`` prefix by ``format_batch_texts``.

    Returns:
        A list with one decoded translation per input text; ``[]`` for an
        empty batch.
    """
    # Materialise once so emptiness can be checked even for non-list iterables.
    batch_texts = list(batch_texts)
    if not batch_texts:
        # Avoid calling the tokenizer/model with nothing to encode.
        return []

    # Prepare the text data in the format used for the model.
    # NOTE(review): the >>code<< prefix is presumably only meaningful for
    # multilingual checkpoints - confirm it is wanted for the en-fr / fr-en models.
    formated_batch_texts = format_batch_texts(language, batch_texts)

    # truncation=True caps over-long inputs at the tokenizer's configured maximum
    # length instead of passing them to the model unbounded.
    encoded = tokenizer(formated_batch_texts, return_tensors="pt", padding=True, truncation=True)

    # Generate the translation token ids
    translated = model.generate(**encoded, max_new_tokens=200)

    # Convert the generated token indices back into text
    return [tokenizer.decode(t, skip_special_tokens=True) for t in translated]

# translated_texts = perform_translation(original_texts, en2french_model, en2french_tkn)

def perform_back_translation(batch_texts, original_language="en", temporary_language="fr"):
    """Round-trip a batch of texts through the temporary language and back.

    The en->fr and fr->en models loaded at module level are always used; the
    two language arguments are only forwarded to ``perform_translation``.

    Args:
        batch_texts: Texts in the original language.
        original_language: Code passed to the second (back) translation.
        temporary_language: Code passed to the first (forward) translation.

    Returns:
        The list returned by the second ``perform_translation`` call.
    """
    # Forward pass: original -> temporary language
    pivot_batch = perform_translation(
        batch_texts, en2french_model, en2french_tkn, temporary_language
    )

    # Backward pass: temporary -> original language
    return perform_translation(
        pivot_batch, french2en_model, french2en_tkn, original_language
    )

def combine_texts(original_texts, back_translated_batch):
    """Concatenate both batches and return their distinct texts as a set."""
    merged = original_texts + back_translated_batch
    unique_texts = set(merged)
    return unique_texts

def perform_back_translation_with_augmentation(batch_texts, original_language="en", temporary_language="fr"):
    """Back-translate a batch and merge the result with the input texts.

    Args:
        batch_texts: Texts in the original language.
        original_language: Code passed to the back translation.
        temporary_language: Code passed to the forward translation.

    Returns:
        The set produced by ``combine_texts`` from the input texts and their
        back-translations.
    """
    # Translate from original to temporary language
    tmp_translated_batch = perform_translation(batch_texts, en2french_model, en2french_tkn, temporary_language)

    # Translate back to the original language
    back_translated_batch = perform_translation(tmp_translated_batch, french2en_model, french2en_tkn, original_language)

    # Merge with the texts passed to this call. The module-level
    # `original_texts` sample list was used here before, which ignored the argument.
    return combine_texts(list(batch_texts), back_translated_batch)

# Work on dataset

In [60]:
# Read the labelled sentences; `names=` makes the first CSV row a data row,
# which is removed right after (presumably the header row).
df = pd.read_csv('../../deprecated/final.csv',sep=',',names=['label','sentence'])
df = df.drop(0)

label_counts = df['label'].value_counts()
print(label_counts)

# Remove the labels that occur more than 240 times in final.csv
for frequent_label in ('RPC', 'PRECEDENT', 'REASONING', 'FACTS'):
    df = df.drop(df[df['label'] == frequent_label].index)

label
REASONING    3291
FACTS        1458
PRECEDENT     271
RPC           242
STATUTE       176
RLC           171
COUNSEL       128
ARG           121
JUDGE          64
CITATION       60
NAME           59
Name: count, dtype: int64


In [61]:
# Frequency of each label that remains in df after the filtering above
label_counts = df['label'].value_counts()
print(label_counts)

label
STATUTE     176
RLC         171
COUNSEL     128
ARG         121
JUDGE        64
CITATION     60
NAME         59
Name: count, dtype: int64


In [62]:
# Collect the sentence column (second column) of df in the list tmp
tmp = [row[1] for row in df.values.tolist()]

# Back-translated sentences are accumulated in the list back
back = []
sz = len(tmp)

# One sentence per call (noted by the author as the faster option)
for sent in tmp:
    back += perform_back_translation([sent])
    sz -= 1
    print(f"{sz} sentences left")

In [21]:
# Persist the list of back-translated sentences to back.pkl
with open('./back.pkl', 'wb') as pkl_out:
    pickle.dump(back, pkl_out)

# Performing EDA

In [None]:
# Run this cell only if back.pkl already exists.
# pickle.load can execute code stored in the file - only load pickles you created yourself.
with open('back.pkl','rb') as pkl_in:
    back = pickle.load(pkl_in)

In [12]:
# Example sentence used to try out each augmenter
text = "start each day with positive thoughts and make your day"

# Create one instance of each augmenter
wordnet_aug = WordNetAugmenter()
embed_aug = EmbeddingAugmenter()

# Name -> list index used later to pick one entry from eda_aug.augment(...).
# RI = random insertion, RS = random swapping, RD = random deletion, SR = synonym replacement.
# NOTE(review): the recorded output of this cell lists a deletion at index 0, so
# these indices may not line up with the augmentation types - confirm against textattack.
aug_type = {"RI":0,"RS":1,"RD":2,"SR":3}
eda_aug = EasyDataAugmenter()

charswap_aug = CharSwapAugmenter()
checklist_aug = CheckListAugmenter()
# clare_aug = CLAREAugmenter()

# Print the augmentation result of every augmenter for the example sentence
print(wordnet_aug.augment(text))
print(embed_aug.augment(text))
print(eda_aug.augment(text))
print(charswap_aug.augment(text))
print(checklist_aug.augment(text))
# print(clare_aug.augment(text))

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/meetbanthia/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/meetbanthia/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


['start each day with positive thoughts and pee your day']
['start each day with positive thinks and make your day']
['start each day with positive and make your day', 'thoughts each day with positive start and make your day', 'start each day apiece with positive thoughts and make your day', 'start each day with positive thoughts and give your day']
['start each day with positive thoughts and make your dny']
2024-04-01 00:05:15,901 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>
['start each day with positive thoughts and make your day']


In [18]:
# Same demo again, but only the entry at index aug_type["SR"] of the EDA result is printed.
# NOTE(review): the recorded output below shows "<class 'str'>" for the third line,
# which this code would not print - the saved output looks stale; re-run to confirm.
print(wordnet_aug.augment(text))
print(embed_aug.augment(text))
print(eda_aug.augment(text)[aug_type["SR"]])
print(charswap_aug.augment(text))
print(checklist_aug.augment(text))

['start each day with positive thoughts and establish your day']
['initiation each day with positive thoughts and make your day']
<class 'str'>
['start each day with positive thoughts and make your ady']
['start each day with positive thoughts and make your day']


In [33]:
from IPython.display import clear_output

# Augmented sentence for every entry of back, in the same order
aug = []
# (source sentence, step name, candidates) for every EDA step that was skipped.
# These were printed before, but clear_output() erased them on the next iteration.
skipped_steps = []
it = len(back)
for text in back:

    # Synonym replacement
    newtxt = wordnet_aug.augment(text)[0]
    newtxt = embed_aug.augment(newtxt)[0]

    # Random Insertion, Random Swapping, Random Deletion - applied in this order.
    # NOTE(review): the demo output earlier in this notebook shows a deletion at
    # index 0 of eda_aug.augment(...), so the aug_type index may not select the
    # named augmentation - confirm against the textattack EasyDataAugmenter docs.
    for step in ("RI", "RS", "RD"):
        candidates = eda_aug.augment(newtxt)
        position = aug_type[step]
        if position >= len(candidates):
            # Too few candidates were produced: keep the current text and record
            # the case, instead of hiding every error behind a bare `except:`.
            skipped_steps.append((text, step, candidates))
            continue
        newtxt = candidates[position]

    # Push newtxt in new list
    aug.append(newtxt)
    it -= 1
    clear_output(wait=True)
    print(f"Iterations left : {it}")

if skipped_steps:
    print(f"{len(skipped_steps)} augmentation steps were skipped; inspect skipped_steps")

Iterations left : 0


In [None]:
# Persist the list of augmented sentences to aug.pkl
with open('./aug.pkl', 'wb') as pkl_out:
    pickle.dump(aug, pkl_out)

# Balance the dataset(Classification)

In [72]:
# Inspect `compare` to see how much the new sentences differ from the originals
sentence_pairs = list(zip(tmp, aug))
compare = pd.DataFrame(sentence_pairs, columns=['Orig Sentences','New Sentences'])

In [37]:
# Run this cell only if aug.pkl already exists.
# pickle.load can execute code stored in the file - only load pickles you created yourself.
with open('aug.pkl','rb') as pkl_in:
    aug = pickle.load(pkl_in)

In [40]:
# Pair every label of df (first column) with the augmented sentence at the same position
labels = [row[0] for row in df.values.tolist()]
merged_list = [pair for pair in zip(labels, aug)]
extradf = pd.DataFrame(data=merged_list, columns=['label', 'sentence'])

In [47]:
# Label frequencies of the augmented frame
label_counts = extradf['label'].value_counts()
print(label_counts)

# Label counts of the filtered data before augmentation, kept here for comparison.
# The bare string below is the last expression of the cell, so it is also displayed.
'''
STATUTE     176
RLC         171
COUNSEL     128
ARG         121
JUDGE        64
CITATION     60
NAME         59
'''

label
STATUTE     176
RLC         171
COUNSEL     128
ARG         121
JUDGE        64
CITATION     60
NAME         59
Name: count, dtype: int64


'\nSTATUTE     176\nRLC         171\nCOUNSEL     128\nARG         121\nJUDGE        64\nCITATION     60\nNAME         59\n'

In [51]:
# Top up the under-represented labels towards 240 sentences each.
# extradf holds exactly one augmented sentence per original sentence, so a label
# with `count` originals needs (240 - count) augmented rows, capped at `count`.
# For the recorded counts this gives the 64 / 69 / 112 / 119 rows that used to
# be hard-coded, plus every JUDGE, CITATION and NAME row.
TARGET_PER_LABEL = 240

# Index labels of the augmented rows to keep
selected = []
for label, count in extradf['label'].value_counts().items():
    missing = TARGET_PER_LABEL - int(count)
    if missing <= 0:
        # Label already has enough original sentences
        continue
    label_rows = extradf[extradf['label'] == label]
    if missing < len(label_rows):
        # Fixed seed keeps the sampled rows reproducible
        label_rows = label_rows.sample(n=missing, random_state=42)
    selected += label_rows.index.to_list()

# `selected` holds index labels, so look them up with .loc rather than .iloc
selected_df = extradf.loc[selected].reset_index(drop=True)

In [52]:
# Load the DataFrame from CSV
# NOTE(review): an earlier cell reads '../../deprecated/final.csv' - confirm which
# of the two relative paths is the correct one.
newdf = pd.read_csv('../deprecated/final.csv', sep=',', names=['label', 'sentence'])
newdf = newdf.drop(0)

# Bring labels with more than 240 sentences down to 240.
# The number of rows to remove is derived from the data instead of being hard-coded.
# For the recorded counts it is 2 (RPC), 31 (PRECEDENT), 3051 (REASONING), 1218 (FACTS).
TARGET_PER_LABEL = 240

for label, count in newdf['label'].value_counts().items():
    surplus = int(count) - TARGET_PER_LABEL
    if surplus <= 0:
        # Labels at or below the target are left untouched
        continue
    # Fixed seed keeps the removed rows reproducible
    surplus_rows = newdf[newdf['label'] == label].sample(n=surplus, random_state=42)
    newdf = newdf.drop(surplus_rows.index)

newdf = newdf.reset_index(drop=True)

In [55]:
# Stack the down-sampled originals and the selected augmented rows into one frame
frames_to_stack = [newdf, selected_df]
final_df = pd.concat(frames_to_stack, ignore_index=True)

In [56]:
# Label frequencies of the combined dataset
label_counts = final_df['label'].value_counts()
print(label_counts)

label
STATUTE      240
COUNSEL      240
RLC          240
FACTS        240
REASONING    240
RPC          240
PRECEDENT    240
ARG          240
JUDGE        128
CITATION     120
NAME         118
Name: count, dtype: int64


In [70]:
# Persist the augmented (not yet preprocessed) dataset - a DataFrame - to final_df.pkl
with open('./final_df.pkl', 'wb') as pkl_out:
    pickle.dump(final_df, pkl_out)

# Preprocessing

In [1]:
from pp import preprocess

In [3]:
import pickle

# Reload the balanced (not yet preprocessed) dataset and discard rows with missing values.
# pickle.load can execute code stored in the file - only load pickles you created yourself.
with open('final_df.pkl','rb') as pkl_in:
    loaded_df = pickle.load(pkl_in)

df = loaded_df.dropna()

In [4]:
# Number of rows that remain after dropna()
len(df)

2284

In [5]:
# Abbreviation mapping dictionary obtained from ./analysis/analysis
with open('../../intermediate/mappings.pickle','rb') as mappings_in:
    mappings = pickle.load(mappings_in)

# Legal stopwords, also obtained by analysis
with open('../../intermediate/legal_stopwords.pickle','rb') as stopwords_in:
    legal_stopwords = pickle.load(stopwords_in)

# Preprocess every sentence with the stopwords and the abbreviation mappings
texts = preprocess(df['sentence'].to_list(), legal_stopwords, mappings)

In [9]:
# Re-attach the labels to the preprocessed sentences, position by position
labels = df['label'].to_list()
merged_list = [pair for pair in zip(labels, texts)]
pp_df = pd.DataFrame(data=merged_list, columns=['label', 'sentence'])

In [10]:
# Persist the augmented, preprocessed dataset to final_pp_df.pkl
with open('./final_pp_df.pkl', 'wb') as pkl_out:
    pickle.dump(pp_df, pkl_out)

# Sentence Embedding

In [13]:
# Run this cell only if final_pp_df.pkl already exists.
# pickle.load can execute code stored in the file - only load pickles you created yourself.
with open('final_pp_df.pkl','rb') as file:
    # The embedding cell below reads `pp_df`, so the reloaded frame must be bound
    # to that name. Binding only `df` left `pp_df` undefined on a fresh kernel.
    pp_df = pickle.load(file)

# Keep the previous name bound as well for any code that still reads `df`
df = pp_df

In [11]:
import torch

# NOTE(review): `os` is not used by any cell visible in this notebook - confirm before removing.
import os

from transformers import AutoTokenizer, AutoModel
# Load the tokenizer and the encoder for the law-ai/InLegalBERT checkpoint
tokenizer = AutoTokenizer.from_pretrained("law-ai/InLegalBERT")
model = AutoModel.from_pretrained("law-ai/InLegalBERT")

Some weights of the model checkpoint at law-ai/InLegalBERT were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
def get_sentence_embeddings(sentences, batch_size=None):
    """Embed sentences with the module-level ``tokenizer`` and ``model``.

    Each sentence is represented by the last hidden state at token position 0
    (the first token of the encoded sequence). This is not average pooling.

    Args:
        sentences: List of sentences to embed.
        batch_size: Optional number of sentences per forward pass. ``None``
            (the default) encodes all sentences in a single pass, as before;
            a positive value limits how many sentences are encoded at once.

    Returns:
        A tensor with one row per sentence, in input order.

    Raises:
        ValueError: If ``batch_size`` is given but is not positive.
    """
    if batch_size is None:
        batches = [sentences]
    else:
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")
        # Fall back to one (possibly empty) batch so empty input behaves as before
        batches = [
            sentences[start:start + batch_size]
            for start in range(0, len(sentences), batch_size)
        ] or [sentences]

    chunks = []
    # Inference only: no gradients are needed
    with torch.no_grad():
        for batch in batches:
            # Tokenize the sentences of this batch (padded to the longest one)
            encoded_inputs = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')

            # Forward pass through the model
            outputs = model(**encoded_inputs)

            # First-token pooling: keep the hidden state at position 0
            chunks.append(outputs.last_hidden_state[:, 0, :])

    if len(chunks) == 1:
        return chunks[0]
    return torch.cat(chunks, dim=0)

In [15]:
# Embed every preprocessed sentence
pp_sentences = pp_df['sentence'].to_list()
textvect = get_sentence_embeddings(pp_sentences)

In [18]:
# Persist the sentence embeddings.
# NOTE(review): other cells read from '../../intermediate/' - confirm that
# './intermediate/' is the intended target directory.
with open('./intermediate/textvect.pickle', 'wb') as pkl_out:
    pickle.dump(textvect, pkl_out)