In [2]:
pip install spacy[transformers]

Collecting spacy-transformers<1.2.0,>=1.1.2
  Downloading spacy_transformers-1.1.8-py2.py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.4/53.4 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Collecting spacy-alignments<1.0.0,>=0.7.2
  Downloading spacy_alignments-0.8.6-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hCollecting spacy-transformers<1.2.0,>=1.1.2
  Downloading spacy_transformers-1.1.7-py2.py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.5/53.5 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: spacy-alignments, spacy-transformers
Successfully installed spacy-alignments-0.8.6 spacy-transformers-1.1.7
[0mNote: you may need to restart the kernel to use updated packages.


In [3]:
import json

def training_function(input_dataset_path):
    
    #Opening the training files
    with open(input_dataset_path,'r') as f:
        data = json.load(f)
    
    training_data = {'classes' : [], 'annotations' : []}
    for example in data:        
        if len(example['annotations']) >= 1 and len(example['annotations'][0]['result']) >= 1:
            train_temp = {}
            train_temp['text'] = example['data']['text']
            train_temp['entities'] = []
            for train_data in example['annotations'][0]['result']:
                start = train_data['value']['start']
                end = train_data['value']['end']
                label = train_data['value']['labels'][0].upper()
                train_temp['entities'].append((start, end, label))
        training_data['annotations'].append(train_temp)
    print(training_data['annotations'][0])
    return training_data
    

In [4]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm
from spacy.util import filter_spans


def spacy_function(training_data):
    #load a new spacy model
    nlp = spacy.blank("en") 
    
    #create a DocBin object
    doc_bin = DocBin() 
    
    #Looping through all the entitites in the data text file
    for training_example  in tqdm(training_data['annotations']): 
        text = training_example['text']
        labels = training_example['entities']
        doc = nlp.make_doc(text) 
        ents = []
        for start, end, label in labels:
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if span is None:
                print("Skipping entity")
            else:
                ents.append(span)
        filtered_ents = filter_spans(ents)
        doc.ents = filtered_ents 
        doc_bin.add(doc)
    doc_bin.to_disk("trn_data.spacy") # save the docbin object

In [5]:
import json

def test_function(test_data_path):
    
    #Opening the DEV files
    with open(test_data_path, 'r') as f:
        data1 = json.load(f)
        
    #Loading the best models in the computation done by the training model    
    nlp_ner = spacy.load("model-best")
    
    #Defining colors for each entity
    colors = {"COURT": "#F67DE3", 
              "JUDGE": "#7DF6D9", 
              "PETITIONER": "#FFFFFF",
                  'RESPONDENT': '#FF0000',
                  'LAWYER': '#F08080',
                  'JUDGE': '#F9E79F',
                  'DATE' :'#CA2E0D',
                  'ORG' : '#AEB815',
                  'GPE' :'#77AA12',
                  'STATUTE' :'#219872',
                  'PROVISION' :'#154E49',
                  'PRECEDENT' :'#134563',
                  'CASE_NUMBER' :'#8A2BA9',
                  'WITNESS' :'#BC1E8A',
                  'OTHER_PERSON' :'#BC1E38', 
                 }
    options = {"colors": colors}
    
    #Displaying 10 results
    for i in range(10):
        doc = nlp_ner(data1[i]['data']['text'])
        #Displaying the text with the named entities
        spacy.displacy.render(doc, style="ent", options= options, jupyter=True)
    
 


In [12]:
#Defining the accuracy
def accuracy(test_data_path):
    with open(test_data_path, 'r') as f:
        test_data = json.load(f)
    correct_pred_cnt = 0
    total_cnt = 0
    
    #Loading the best model
    nlp_ner = spacy.load("model-best")

    for data in test_data:
        
        #Count the total texts
        total_cnt += 1
        
        #Using the model to predict text given in DEV files
        model_input_text = data['data']['text']
        model_pred = nlp_ner(model_input_text)
        
        #Appending the text and label to a prediction list
        pred_label_list = []
        for ent in model_pred.ents:
            pred_label_list.append(ent.text + ent.label_)
            
        #Appending the text and label to a actual label list
        label_list = []
        for label in data['annotations'][0]['result']:
            label_list.append(label['value']['text'] + label['value']['labels'][0])
        
        #Incrementing the correct prediction count only if the whole text is correctly predicted
        if pred_label_list == label_list:
            correct_pred_cnt += 1

    accuracy = (correct_pred_cnt / total_cnt) * 100
    print(accuracy)


In [6]:
judgement_data_path = '/kaggle/input/ner-dataset/NER_TRAIN_JUDGEMENT.json'
training_data = training_function(judgement_data_path)
spacy_function(training_data)

{'text': "\n\n(7) On specific query by the Bench about an entry of Rs. 1,31,37,500 on deposit side of Hongkong Bank account of which a photo copy is appearing at p. 40 of assessee's paper book, learned authorised representative submitted that it was related to loan from broker, Rahul & Co. on the basis of his submission a necessary mark is put by us on that photo copy.", 'entities': [(90, 103, 'ORG'), (267, 278, 'ORG')]}


100%|██████████| 9435/9435 [00:04<00:00, 2347.82it/s]


In [7]:
!python -m spacy init fill-config /kaggle/input/d/nikhilpanda13/config-files/base_config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [8]:
!python -m spacy train config.cfg --output ./ --paths.train ./trn_data.spacy --paths.dev ./trn_data.spacy --gpu-id 0

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[2022-12-13 07:18:57,902] [INFO] Set up nlp object from config
[2022-12-13 07:18:57,914] [INFO] Pipeline: ['transformer', 'ner']
[2022-12-13 07:18:57,918] [INFO] Created vocabulary
[2022-12-13 07:18:57,922] [INFO] Finished initializing nlp object
Downloading: 100%|██████████████████████████████| 480/480 [00:00<00:00, 296kB/s]
Downloading: 100%|███████████████████████████| 878k/878k [00:00<00:00, 6.57MB/s]
Downloading: 100%|███████████████████████████| 446k/446k [00:00<00:00, 3.36MB/s]
Downloading: 100%|█████████████████████████| 1.29M/1.29M [00:00<00:00, 9.31MB/s]
Downloading: 100%|███████████████████████████| 316M/316M [00:06<00:00, 51.5MB/s]
Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected i

In [9]:
judgement_data_path = '/kaggle/input/ner-dataset/NER_DEV_JUDGEMENT.json'
test_function(judgement_data_path)

In [10]:
accuracy(judgement_data_path)

74.18335089567967


In [7]:
preamble_data_path = '/kaggle/input/ner-dataset/NER_TRAIN_PREAMBLE.json'
training_data = training_function(preamble_data_path)
spacy_function(training_data)

{'text': "In The High Court Of Kerala At Ernakulam\n\nCrl Mc No. 1622 of 2006()\n\n\n1. T.R.Ajayan, S/O. O.Raman,\n                      ...  Petitioner\n\n                        Vs\n\n\n\n1. M.Ravindran,\n                       ...       Respondent\n\n2. Mrs. Nirmala Dinesh, W/O. Dinesh,\n\n                For Petitioner  :Sri.A.Kumar\n\n                For Respondent  :Smt.M.K.Pushpalatha\n\nThe Hon'ble Mr. Justice P.R.Raman\nThe Hon'ble Mr. Justice V.K.Mohanan\n\n Dated :07/01/2008\n\n O R D E R\n", 'entities': [(7, 40, 'COURT'), (73, 83, 'PETITIONER'), (171, 182, 'RESPONDENT'), (237, 251, 'RESPONDENT'), (304, 311, 'LAWYER'), (350, 365, 'LAWYER'), (391, 400, 'JUDGE'), (425, 436, 'JUDGE')]}


100%|██████████| 1560/1560 [00:03<00:00, 466.28it/s]


In [8]:
!python -m spacy init fill-config /kaggle/input/d/nikhilpanda13/config-files/base_config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [9]:
!python -m spacy train config.cfg --output ./ --paths.train ./trn_data.spacy --paths.dev ./trn_data.spacy --gpu-id 0

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[2022-12-13 18:25:44,895] [INFO] Set up nlp object from config
[2022-12-13 18:25:44,906] [INFO] Pipeline: ['transformer', 'ner']
[2022-12-13 18:25:44,911] [INFO] Created vocabulary
[2022-12-13 18:25:44,914] [INFO] Finished initializing nlp object
Downloading: 100%|██████████████████████████████| 480/480 [00:00<00:00, 590kB/s]
Downloading: 100%|███████████████████████████| 878k/878k [00:00<00:00, 5.97MB/s]
Downloading: 100%|███████████████████████████| 446k/446k [00:00<00:00, 3.39MB/s]
Downloading: 100%|█████████████████████████| 1.29M/1.29M [00:00<00:00, 8.86MB/s]
Downloading: 100%|███████████████████████████| 316M/316M [00:07<00:00, 45.7MB/s]
Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']
- This IS expected i

In [10]:
preamble_data_path = '/kaggle/input/ner-dataset/NER_DEV_PREAMBLE.json'
test_function(preamble_data_path)

Token indices sequence length is longer than the specified maximum sequence length for this model (878 > 512). Running this sequence through the model will result in indexing errors


In [13]:
accuracy(preamble_data_path)

Token indices sequence length is longer than the specified maximum sequence length for this model (878 > 512). Running this sequence through the model will result in indexing errors


55.2
