In [14]:
import json
import csv

import pandas as pd
import numpy as np
import torch

from torch.utils.data import DataLoader
from torch.optim import SGD
from transformers import BertTokenizerFast

In [15]:
# Kaggle input directory holding the raw annotation exports. The trailing slash
# matters: the loader builds paths with plain string concatenation.
DATASET_DIR = "/kaggle/input/indian-legal-ner-dataset/"
# File name of the judgement-split JSON export inside DATASET_DIR.
JUDGEMENT_FILE = "indian_legal_NER_train_judgement.json"
# File name of the preamble-split JSON export inside DATASET_DIR.
PREAMBLE_FILE = "indian_legal_NER_train_preamble.json"

In [16]:
# Parsed contents of each annotation file, in the order of the list below:
# fileData[0] is the judgement split, fileData[1] the preamble split.
fileData = []

for fileName in [JUDGEMENT_FILE, PREAMBLE_FILE]:
    # Pin UTF-8 so the load does not depend on the platform's locale encoding.
    with open(DATASET_DIR + fileName, 'r', encoding='utf-8') as file:
        fileData.append(json.load(file))

In [17]:
# Report how many annotated records each loaded file contains.
for record_count in map(len, fileData):
    print(f"len(singleFileData): {record_count}")

len(singleFileData): 9435
len(singleFileData): 1560


In [18]:
# Work with the first loaded file (the judgement split) as the sample and
# show its first record to reveal the annotation schema.
sampleFile = fileData[0]
first_record = sampleFile[0]
print(first_record)

{'id': '90d9a97c7b7749ec8a4f460fda6f937e', 'annotations': [{'result': [{'value': {'start': 90, 'end': 103, 'text': 'Hongkong Bank', 'labels': ['ORG']}, 'id': 'C8HPTIM1', 'from_name': 'label', 'to_name': 'text', 'type': 'labels'}, {'value': {'start': 267, 'end': 278, 'text': 'Rahul & Co.', 'labels': ['ORG']}, 'id': 'KOWE3RAM', 'from_name': 'label', 'to_name': 'text', 'type': 'labels'}]}], 'data': {'text': "\n\n(7) On specific query by the Bench about an entry of Rs. 1,31,37,500 on deposit side of Hongkong Bank account of which a photo copy is appearing at p. 40 of assessee's paper book, learned authorised representative submitted that it was related to loan from broker, Rahul & Co. on the basis of his submission a necessary mark is put by us on that photo copy."}, 'meta': {'source': 'tax_districtcourts judgement https://indiankanoon.org/doc/1556717/'}}


In [19]:
# Show the two fields the conversion relies on: the entity spans and the raw text.
first_record = sampleFile[0]
print(first_record["annotations"][0]["result"])
print(first_record["data"]["text"])

[{'value': {'start': 90, 'end': 103, 'text': 'Hongkong Bank', 'labels': ['ORG']}, 'id': 'C8HPTIM1', 'from_name': 'label', 'to_name': 'text', 'type': 'labels'}, {'value': {'start': 267, 'end': 278, 'text': 'Rahul & Co.', 'labels': ['ORG']}, 'id': 'KOWE3RAM', 'from_name': 'label', 'to_name': 'text', 'type': 'labels'}]


(7) On specific query by the Bench about an entry of Rs. 1,31,37,500 on deposit side of Hongkong Bank account of which a photo copy is appearing at p. 40 of assessee's paper book, learned authorised representative submitted that it was related to loan from broker, Rahul & Co. on the basis of his submission a necessary mark is put by us on that photo copy.


In [20]:
# Keep the first record's entity spans and raw text for the prototype cells below.
sample_record = sampleFile[0]
annotations = sample_record["annotations"][0]["result"]
text = sample_record["data"]["text"]

In [21]:
# Flatten the entity spans into (word, BIO-tag) pairs: the first word of each
# span gets "B-<label>", every following word "I-<label>".
annotated_texts = []
for span in annotations:
    entity = span["value"]["labels"][0]
    words = span["value"]["text"].split()
    annotated_texts.extend(
        (word, ("I-" if position else "B-") + entity)
        for position, word in enumerate(words)
    )

print(annotated_texts)

[('Hongkong', 'B-ORG'), ('Bank', 'I-ORG'), ('Rahul', 'B-ORG'), ('&', 'I-ORG'), ('Co.', 'I-ORG')]


In [22]:
# Whitespace-tokenise the sentence and start every token with the outside tag.
split_text = text.split()
whole_text_labeled = ['O' for _ in split_text]

In [23]:
# Greedy left-to-right alignment: walk the sentence tokens (i) and the entity
# words (j) in parallel, and copy the tag whenever the two are exactly equal.
# NOTE: matching is by whole-token equality, so an entity word that carries
# extra punctuation in the sentence never matches and later entities are skipped.
i, j = 0, 0
while i < len(split_text) and j < len(annotated_texts):
    if split_text[i] == annotated_texts[j][0]:
        whole_text_labeled[i] = annotated_texts[j][1]
        j += 1
    i += 1

# Use dedicated loop names so the notebook-level `text` (the full sentence) is
# not overwritten; that keeps the tokenising cell above safe to re-run.
for token, tag in zip(split_text, whole_text_labeled):
    print(token, tag)

(7) O
On O
specific O
query O
by O
the O
Bench O
about O
an O
entry O
of O
Rs. O
1,31,37,500 O
on O
deposit O
side O
of O
Hongkong B-ORG
Bank I-ORG
account O
of O
which O
a O
photo O
copy O
is O
appearing O
at O
p. O
40 O
of O
assessee's O
paper O
book, O
learned O
authorised O
representative O
submitted O
that O
it O
was O
related O
to O
loan O
from O
broker, O
Rahul B-ORG
& I-ORG
Co. I-ORG
on O
the O
basis O
of O
his O
submission O
a O
necessary O
mark O
is O
put O
by O
us O
on O
that O
photo O
copy. O


In [24]:
def _token_spans(text: str) -> list:
    """Return the (start, end) character offsets of every token in ``text.split()``."""
    spans = []
    cursor = 0
    for token in text.split():
        # Only whitespace lies between `cursor` and the next token, so the first
        # hit at or after `cursor` is that token's true position.
        start = text.index(token, cursor)
        cursor = start + len(token)
        spans.append((start, cursor))
    return spans


def _bio_tags(text: str, annotations: list) -> list:
    """Return one BIO tag per whitespace token of ``text``.

    A token is tagged when its character range overlaps an annotation's
    ``[start, end)`` range; the first overlapping token gets ``B-<label>`` and
    the following ones ``I-<label>``. All other tokens stay ``'O'``.
    Assumes ``start``/``end`` are character offsets into ``text``.
    """
    spans = _token_spans(text)
    tags = ['O'] * len(spans)
    for annotation in annotations:
        value = annotation["value"]
        label = value["labels"][0]
        ent_start, ent_end = value["start"], value["end"]
        if ent_end <= ent_start:
            # Empty span: nothing to tag.
            continue
        inside = False
        for idx, (tok_start, tok_end) in enumerate(spans):
            if tok_start < ent_end and tok_end > ent_start:
                tags[idx] = ("I-" if inside else "B-") + label
                inside = True
    return tags


for singleFileData, segmentName in zip(fileData, ["judgement", "preamble"]):
    # Start a fresh table for every segment; a single shared list would carry
    # the judgement rows over into the preamble CSV.
    data = [["text", "labels"]]

    for entry in singleFileData:
        annotations = entry["annotations"][0]["result"]
        text = entry["data"]["text"]
        # The label string has exactly one tag per token of text.split().
        data.append([text, " ".join(_bio_tags(text, annotations))])

    # newline='' is what the csv module expects; UTF-8 matches pandas' default reader.
    with open(f'/kaggle/working/{segmentName}_data.csv', 'w', newline='', encoding='utf-8') as csv_file:
        csv.writer(csv_file).writerows(data)


In [25]:
# Sanity-check the export: load the judgement CSV written by the previous cell
# and preview its first rows. pandas is already imported as `pd` in the
# notebook's import cell, so it is not re-imported here.
df = pd.read_csv('/kaggle/working/judgement_data.csv')

df.head()

Unnamed: 0,text,labels
0,\n\n(7) On specific query by the Bench about a...,O O O O O O O O O O O O O O O O O B-ORG I-ORG ...
1,"He was also asked whether Agya <span class=""hi...",O O O O O B-OTHER_PERSON O O O O O O O O O O O...
2,"\n5.2 CW3 Mr Vijay Mishra , Deputy Manager, H...",O O O B-WITNESS I-WITNESS O O O B-ORG I-ORG O ...
3,You are hereby asked not to carry out any cons...,O O O O O O O O O O O O O O O O O O O O O O
4,The pillion rider T.V. Satyanarayana Murthy al...,O O O B-OTHER_PERSON I-OTHER_PERSON I-OTHER_PE...


In [26]:
# Load the prepared judgement and preamble tables.
# NOTE(review): these come from a separate Kaggle input dataset rather than
# /kaggle/working; presumably an upload of the CSVs written earlier. Confirm
# they are in sync with the export cell.
df1, df2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/indian-judgementpreamble-ner-dataset/judgement_data.csv',
        '/kaggle/input/indian-judgementpreamble-ner-dataset/preamble_data.csv',
    )
)

# Stack the two tables with a fresh 0..n-1 index; `df` is the working alias
# used by the cells that follow.
df = combined_df = pd.concat([df1, df2], ignore_index=True)

# Persist the merged table (without the index column) to the working directory.
combined_df.to_csv('indian_ner_data.csv', index=False)

combined_df.head()

Unnamed: 0,text,labels
0,\n\n(7) On specific query by the Bench about a...,O O O O O O O O O O O O O O O O O B-ORG I-ORG ...
1,"He was also asked whether Agya <span class=""hi...",O O O O O B-OTHER_PERSON O O O O O O O O O O O...
2,"\n5.2 CW3 Mr Vijay Mishra , Deputy Manager, H...",O O O B-WITNESS I-WITNESS O O O B-ORG I-ORG O ...
3,You are hereby asked not to carry out any cons...,O O O O O O O O O O O O O O O O O O O O O O
4,The pillion rider T.V. Satyanarayana Murthy al...,O O O B-OTHER_PERSON I-OTHER_PERSON I-OTHER_PE...


In [27]:
# Split each row's space-separated tag string into a list of tags (one list per row).
labels = [tag_string.split() for tag_string in df['labels'].values.tolist()]

# Collect the distinct tags that occur anywhere in the dataset.
unique_labels = set()
for lb in labels:
    unique_labels.update(lb)

print(unique_labels)

# Sort once so both lookup tables share the same deterministic id assignment.
sorted_labels = sorted(unique_labels)
labels_to_ids = {label: idx for idx, label in enumerate(sorted_labels)}
ids_to_labels = dict(enumerate(sorted_labels))

print(labels_to_ids)

{'B-OTHER_PERSON', 'I-OTHER_PERSON', 'I-LAWYER', 'I-PROVISION', 'I-ORG', 'B-GPE', 'I-PETITIONER', 'B-CASE_NUMBER', 'I-COURT', 'I-DATE', 'B-DATE', 'I-RESPONDENT', 'B-LAWYER', 'I-WITNESS', 'B-PRECEDENT', 'I-PRECEDENT', 'I-STATUTE', 'B-RESPONDENT', 'I-JUDGE', 'B-COURT', 'B-STATUTE', 'B-PROVISION', 'I-CASE_NUMBER', 'I-GPE', 'B-PETITIONER', 'O', 'B-JUDGE', 'B-ORG', 'B-WITNESS'}
{'B-CASE_NUMBER': 0, 'B-COURT': 1, 'B-DATE': 2, 'B-GPE': 3, 'B-JUDGE': 4, 'B-LAWYER': 5, 'B-ORG': 6, 'B-OTHER_PERSON': 7, 'B-PETITIONER': 8, 'B-PRECEDENT': 9, 'B-PROVISION': 10, 'B-RESPONDENT': 11, 'B-STATUTE': 12, 'B-WITNESS': 13, 'I-CASE_NUMBER': 14, 'I-COURT': 15, 'I-DATE': 16, 'I-GPE': 17, 'I-JUDGE': 18, 'I-LAWYER': 19, 'I-ORG': 20, 'I-OTHER_PERSON': 21, 'I-PETITIONER': 22, 'I-PRECEDENT': 23, 'I-PROVISION': 24, 'I-RESPONDENT': 25, 'I-STATUTE': 26, 'I-WITNESS': 27, 'O': 28}
