# Import necessary libraries

In [1]:
#Install libraries
!pip install transformers
!pip install sentencepiece
!pip install textattack


#Import libraries
import pandas as pd
import pickle
from transformers import MarianMTModel, MarianTokenizer
from textattack.augmentation import WordNetAugmenter, EmbeddingAugmenter, EasyDataAugmenter, CharSwapAugmenter, CheckListAugmenter, CLAREAugmenter


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Collecting textattack
  Obtaining dependency information for textattack from https://files.pythonhosted.org/packages/69/85/f7878f69021c4f6583e07e285380d88f0bf2fafcef32c91dddd4db573692/textattack-0.3.9-py3-none-any.whl.metadata
  Downloading textattack-0.3.9-py3-none-any.whl.metadata (37 kB)
Collecting bert-score>=0.3.5 (from textattack)
  Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m804.2 kB/s[0

textattack: Updating TextAttack package dependencies.
textattack: Downloading NLTK required packages.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/meetbanthia/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/meetbanthia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw to /Users/meetbanthia/nltk_data...
[nltk_data]   Package omw is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /Users/meetbanthia/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/meetbanthia/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/meetbanthia/nltk_data...
[nltk_data]   Package punkt is alr

In [2]:
# English -> French checkpoint: tokenizer is loaded first, then the model.
en2french = 'Helsinki-NLP/opus-mt-en-fr'
en2french_tkn, en2french_model = (
    MarianTokenizer.from_pretrained(en2french),
    MarianMTModel.from_pretrained(en2french),
)

# French -> English checkpoint: same loading order as above.
french2en = 'Helsinki-NLP/opus-mt-fr-en'
french2en_tkn, french2en_model = (
    MarianTokenizer.from_pretrained(french2en),
    MarianMTModel.from_pretrained(french2en),
)



In [3]:
# Small hand-written batch of four English strings used to try out the
# translation helpers defined further down.
original_texts = ["THE NEW PIECEGOODS BAZAR CO., LTD.,BOMBAY vs THE COMMISSIONER OF INCOME-TAX,BOMBAY on 26 May, 1950 ",
          "Equivalent citations: 1950 AIR 165, 1950 SCR 553 ",
          "The first model translates from English to French, which is a temporary process", 
          "The second model finally translates back all the temporary french text into English"]

# Bare expression: the notebook echoes the list as the cell output.
original_texts

['THE NEW PIECEGOODS BAZAR CO., LTD.,BOMBAY vs THE COMMISSIONER OF INCOME-TAX,BOMBAY on 26 May, 1950 ',
 'Equivalent citations: 1950 AIR 165, 1950 SCR 553 ',
 'The first model translates from English to French, which is a temporary process',
 'The second model finally translates back all the temporary french text into English']

In [4]:
# Tag every text with its language code before it is handed to the model
def format_batch_texts(language_code, batch_texts):
    """Prefix each text with a ``>>language_code<<`` tag.

    Args:
        language_code: value placed between ``>>`` and ``<<``.
        batch_texts: iterable of texts to tag.

    Returns:
        A new list with one ``">>code<< text"`` string per input text, in
        input order.
    """
    template = ">>{}<< {}"
    tagged_texts = []
    for text in batch_texts:
        tagged_texts.append(template.format(language_code, text))
    return tagged_texts

#performs translation
def perform_translation(batch_texts, model, tokenizer, language="fr"):
    """Translate a batch of texts with the given model/tokenizer pair.

    Args:
        batch_texts: iterable of source texts.
        model: object whose ``generate(**encoded, max_new_tokens=...)`` yields
            one generated token sequence per input.
        tokenizer: callable that encodes a list of strings (called with
            ``return_tensors="pt"``, ``padding=True``, ``truncation=True``)
            and offers ``decode(tokens, skip_special_tokens=True)``.
        language: code used for the ``>>code<<`` prefix added by
            ``format_batch_texts``.

    Returns:
        A list with one decoded string per generated sequence; ``[]`` when
        ``batch_texts`` is empty.
    """
    # Materialise once so any iterable (not only a list) can be passed in.
    batch_texts = list(batch_texts)

    # Nothing to translate: return early rather than handing an empty batch
    # to the tokenizer.
    if not batch_texts:
        return []

    # Prepare the text data into appropriate format for the model
    formated_batch_texts = format_batch_texts(language, batch_texts)

    # truncation=True clips inputs to the tokenizer's maximum length, so an
    # over-long sentence is shortened instead of being passed on at full length.
    # NOTE(review): clipping is silent - confirm that is acceptable for the
    # longest sentences in the dataset.
    encoded = tokenizer(
        formated_batch_texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
    )

    # Generate translation using model
    translated = model.generate(**encoded, max_new_tokens=200)

    # Convert the generated tokens indices back into text
    return [tokenizer.decode(t, skip_special_tokens=True) for t in translated]

# translated_texts = perform_translation(original_texts, en2french_model, en2french_tkn)

# Backtranslation

In [5]:
def perform_back_translation(batch_texts, original_language="en", temporary_language="fr"):
  """Translate a batch to the temporary language and back again.

  The English->French and French->English models are fixed; the two
  language arguments are only forwarded to ``perform_translation`` as its
  ``language`` value.

  Args:
    batch_texts: texts to back-translate.
    original_language: language code forwarded on the return leg.
    temporary_language: language code forwarded on the outbound leg.

  Returns:
    Whatever ``perform_translation`` returns for the return leg.
  """
  # Outbound leg: original -> temporary language
  pivot_batch = perform_translation(batch_texts, en2french_model, en2french_tkn, temporary_language)

  # Return leg: temporary language -> original, handed straight back
  return perform_translation(pivot_batch, french2en_model, french2en_tkn, original_language)

# Backtranslation with augmentation

In [6]:
def combine_texts(original_texts, back_translated_batch):
  """Return the distinct texts found in either list.

  The two lists are concatenated first and the result is turned into a set,
  so duplicates collapse and ordering is not kept.
  """
  merged = original_texts + back_translated_batch
  return set(merged)

def perform_back_translation_with_augmentation(batch_texts, original_language="en", temporary_language="fr"):
  """Back-translate a batch and merge the result with the input texts.

  Args:
    batch_texts: iterable of texts to augment.
    original_language: language code forwarded to ``perform_back_translation``.
    temporary_language: language code forwarded to ``perform_back_translation``.

  Returns:
    The value of ``combine_texts(batch_texts, back_translated_batch)``, i.e.
    the input texts together with their back-translations.
  """
  # Materialise once: the batch is used for translation and again for the merge.
  batch_texts = list(batch_texts)

  # Reuse the shared round-trip helper instead of repeating both legs here.
  back_translated_batch = perform_back_translation(batch_texts, original_language, temporary_language)

  # Merge with the batch that was passed in. The earlier version read the
  # notebook-level `original_texts` here, so the result contained the sample
  # texts rather than the caller's input.
  return combine_texts(batch_texts, back_translated_batch)

# Work on dataset

In [63]:
# Read final.csv with explicit column names, then discard the row labelled 0
# (presumably the file's own header line - TODO confirm).
df = pd.read_csv('final.csv', sep=',', names=['label', 'sentence']).drop(0)

# Remove the labels that occur more than 240 times in final.csv
df = df[~df['label'].isin(['RPC', 'PRECEDENT', 'REASONING', 'FACTS'])]

In [8]:
# rows_to_delete = df[df['label'] == 'REASONING']
# rows_to_keep = 3051
# selected_rows = rows_to_delete.sample(n=rows_to_keep, random_state=42)
# df = df.drop(selected_rows.index)

In [64]:
# Count how many rows df holds for each label and print the tally.

label_counts = df['label'].value_counts()
print(label_counts)

label
STATUTE     176
RLC         171
COUNSEL     128
ARG         121
JUDGE        64
CITATION     60
NAME         59
Name: count, dtype: int64


In [65]:
# Collect the second column of df (the sentences) as a plain Python list
tmp = df.iloc[:, 1].tolist()

# Perform back translation on dataframe list tmp

In [11]:
back = []
sz = len(tmp)
# range(sz) is built once, before the loop rebinds sz to the countdown value,
# so sz runs len(tmp)-1, ..., 0 alongside the sentences.
for sz, sent in zip(reversed(range(sz)), tmp):
    # One sentence per call; its back-translation is appended to `back`.
    dum = perform_back_translation([sent])
    back += dum
    print(f"{sz} sentences left")

778 sentences left
777 sentences left
776 sentences left
775 sentences left
774 sentences left
773 sentences left
772 sentences left
771 sentences left
770 sentences left
769 sentences left
768 sentences left
767 sentences left
766 sentences left
765 sentences left
764 sentences left
763 sentences left
762 sentences left
761 sentences left
760 sentences left
759 sentences left
758 sentences left
757 sentences left
756 sentences left
755 sentences left
754 sentences left
753 sentences left
752 sentences left
751 sentences left
750 sentences left
749 sentences left
748 sentences left
747 sentences left
746 sentences left
745 sentences left
744 sentences left
743 sentences left
742 sentences left
741 sentences left
740 sentences left
739 sentences left
738 sentences left
737 sentences left
736 sentences left
735 sentences left
734 sentences left
733 sentences left
732 sentences left
731 sentences left
730 sentences left
729 sentences left
728 sentences left
727 sentences left
726 sentence

In [12]:
# Persist the back-translated sentences to ./back.pkl so they can be reloaded
# without repeating the translation loop.
with open('./back.pkl', 'wb') as file:
    pickle.dump(back, file) 

In [118]:
# Reload the back-translated sentences from ./back.pkl.
# NOTE: pickle.load executes whatever the file encodes - only load a
# back.pkl that this notebook wrote itself.
with open('./back.pkl','rb') as file:
    back = pickle.load(file)

# Performing EDA

In [13]:
text = "start each day with positive thoughts and make your day"

# One instance of each augmenter being compared
wordnet_aug = WordNetAugmenter()
embed_aug = EmbeddingAugmenter()
eda_aug = EasyDataAugmenter()
charswap_aug = CharSwapAugmenter()
checklist_aug = CheckListAugmenter()
# clare_aug = CLAREAugmenter()   (left disabled)

# Print what every augmenter produces for the same sentence, in creation order.
# NOTE(review): the recorded output of this cell ends with KeyError: 'upos';
# which augmenter raises it is not clear from the output alone - confirm.
for augmenter in (wordnet_aug, embed_aug, eda_aug, charswap_aug, checklist_aug):
    print(augmenter.augment(text))

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/meetbanthia/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
textattack: Downloading https://textattack.s3.amazonaws.com/word_embeddings/paragramcf.
100%|██████████| 481M/481M [24:04<00:00, 333kB/s]     
textattack: Unzipping file /Users/meetbanthia/.cache/textattack/tmp6o1oer9n.zip to /Users/meetbanthia/.cache/textattack/word_embeddings/paragramcf.
textattack: Successfully saved word_embeddings/paragramcf to cache.
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/meetbanthia/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

['starting each day with positive thoughts and make your day']
['launched each day with positive thoughts and make your day']
['start each day with positive thoughts and make day your', 'start day with positive thoughts and make your day', 'start each day with positive view and make your day', 'start each day with mentation positive thoughts and make your day']
['start each day with psitive thoughts and make your day']


pytorch_model.bin:   0%|          | 0.00/432M [00:00<?, ?B/s]

2024-01-25 23:39:10,288 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>
['start each day with positive thoughts and make your day']


KeyError: 'upos'

# Perform synonym replacement on back

In [130]:
# Run the WordNet augmenter on every back-translated text and keep only the
# first variant it returns for each one.
aug = [wordnet_aug.augment(text)[0] for text in back]

In [136]:
# Labels come from the first column of df; they are paired by position with
# the augmented sentences to build the extra frame.
labels = df.iloc[:, 0].tolist()
merged_list = [(label, sentence) for label, sentence in zip(labels, aug)]
extradf = pd.DataFrame(merged_list, columns=['label', 'sentence'])

In [139]:
# Label tally of the augmented frame.
label_counts = extradf['label'].value_counts()
print(label_counts)

#Initial data
# The bare string below is the cell's last expression, so the notebook echoes
# it as the cell output; it is kept as a reference copy of the tally.
'''
STATUTE     176
RLC         171
COUNSEL     128
ARG         121
JUDGE        64
CITATION     60
NAME         59
'''

label
STATUTE     176
RLC         171
COUNSEL     128
ARG         121
JUDGE        64
CITATION     60
NAME         59
Name: count, dtype: int64


'\nSTATUTE     176\nRLC         171\nCOUNSEL     128\nARG         121\nJUDGE        64\nCITATION     60\nNAME         59\n'

In [140]:
# Number of augmented rows to sample per label. Each quota equals 240 minus
# the label's row count in the tally (240-176, 240-171, 240-128, 240-121).
augment_quota = {'STATUTE': 64, 'RLC': 69, 'COUNSEL': 112, 'ARG': 119}

# Labels whose augmented rows are all kept, in this order.
keep_all_labels = ['JUDGE', 'CITATION', 'NAME']

selected = []
for label, rows_to_keep in augment_quota.items():
    candidates = extradf[extradf['label'] == label]
    # Fixed seed: the same rows are drawn on every run.
    selected += candidates.sample(n=rows_to_keep, random_state=42).index.to_list()
for label in keep_all_labels:
    selected += extradf[extradf['label'] == label].index.to_list()

# `selected` holds index labels, so look rows up with .loc; .iloc is positional
# and only matches while extradf keeps its default 0..n-1 index.
selected_df = extradf.loc[selected].reset_index(drop=True)

In [173]:
selected_df

Unnamed: 0,label,sentence
0,STATUTE,(2) Where a farmer has paid a creditor twice t...
1,STATUTE,Article 5 of the Ordinance preface several ame...
2,STATUTE,It was urged that section 3(b) of the Jaipur A...
3,STATUTE,"Section 21 of the Act render that, if the Reve..."
4,STATUTE,"Article 13, paragraph 1, with which we are int..."
...,...,...
542,NAME,Mahant Pragdasji Guru... fivesome Patel Ishwar...
543,NAME,Amjad Khan v. The commonwealth on 20 March 1952
544,NAME,Raja Bhupendra Narain Singha... quintuplet Mah...
545,NAME,Gnanambal Ammal vs tonne. Raju Ayyar and other...


In [170]:
# Load the DataFrame from CSV and discard the row labelled 0
# (presumably the file's own header line - TODO confirm).
newdf = pd.read_csv('final.csv', sep=',', names=['label', 'sentence']).drop(0)

# For each over-represented label, remove a fixed-size random sample of its
# rows. The seed is fixed, so the same rows are removed on every run.
for label, n_to_drop in (('RPC', 2), ('PRECEDENT', 31), ('REASONING', 3051), ('FACTS', 1218)):
    surplus_rows = newdf[newdf['label'] == label].sample(n=n_to_drop, random_state=42)
    newdf = newdf.drop(surplus_rows.index)

# Remove every row of the remaining labels in one pass.
newdf = newdf[~newdf['label'].isin(
    ['STATUTE', 'RLC', 'COUNSEL', 'ARG', 'JUDGE', 'CITATION', 'NAME']
)]

newdf = newdf.reset_index(drop=True)

In [175]:
# Stack newdf (the down-sampled frame) and selected_df (the sampled augmented
# rows) row-wise into result_df, renumbering the index from 0.
result_df = pd.concat([newdf, selected_df], axis=0, ignore_index=True)

In [176]:
# Label tally of result_df.
label_counts = result_df['label'].value_counts()
print(label_counts)

label
FACTS        240
REASONING    240
RPC          240
PRECEDENT    240
ARG          119
COUNSEL      112
RLC           69
STATUTE       64
JUDGE         64
CITATION      60
NAME          59
Name: count, dtype: int64


In [178]:
# Stack df and result_df row-wise into final_df, renumbering the index from 0.
final_df = pd.concat([df, result_df], axis=0, ignore_index=True)

In [180]:
# Label tally of final_df.
label_counts = final_df['label'].value_counts()
print(label_counts)

label
STATUTE      240
COUNSEL      240
RLC          240
ARG          240
FACTS        240
REASONING    240
RPC          240
PRECEDENT    240
JUDGE        128
CITATION     120
NAME         118
Name: count, dtype: int64


In [91]:
# Write final_df to final_df.csv, leaving the index out of the file.
final_df.to_csv('final_df.csv', index=False)