In [1]:
pip install spacy[transformers]

Collecting spacy-transformers<1.2.0,>=1.1.2
  Downloading spacy_transformers-1.1.8-py2.py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.4/53.4 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting spacy-alignments<1.0.0,>=0.7.2
  Downloading spacy_alignments-0.8.6-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hCollecting spacy-transformers<1.2.0,>=1.1.2
  Downloading spacy_transformers-1.1.7-py2.py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.5/53.5 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: spacy-alignments, spacy-transformers
Successfully installed spacy-alignments-0.8.6 spacy-transformers-1.1.7
[0mNote: you may need to restart the kernel to use updated packages.


In [8]:
import json

def training_function(input_dataset_path):
    """Read an annotated JSON export and collect character-offset NER examples.

    The file is expected to hold a list of records shaped like
    ``{'data': {'text': ...}, 'annotations': [{'result': [{'value':
    {'start': int, 'end': int, 'labels': [str, ...]}}, ...]}, ...]}``.
    Only the first annotation set of each record is used, and only the first
    label of each result (upper-cased).

    Records with no annotation set, or whose first annotation set has no
    results, are skipped. (Previously such a record re-appended the previous
    record's entry, or raised ``NameError`` when it came first.)

    Args:
        input_dataset_path: Path to the JSON file.

    Returns:
        ``{'classes': [], 'annotations': [{'text': str,
        'entities': [(start, end, LABEL), ...]}, ...]}``. ``'classes'`` is
        always left empty by this function.
    """
    # JSON interchange files are UTF-8; do not depend on the platform default.
    with open(input_dataset_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    training_data = {'classes': [], 'annotations': []}
    for example in data:
        annotation_sets = example['annotations']
        if not annotation_sets or not annotation_sets[0]['result']:
            # Nothing labelled for this text: skip it instead of emitting a
            # stale or undefined entry.
            continue

        entities = []
        for result in annotation_sets[0]['result']:
            value = result['value']
            entities.append((value['start'], value['end'], value['labels'][0].upper()))

        training_data['annotations'].append(
            {'text': example['data']['text'], 'entities': entities}
        )

    # Show one converted example as a sanity check, if there is one.
    if training_data['annotations']:
        print(training_data['annotations'][0])
    return training_data
    

In [9]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm
from spacy.util import filter_spans


def spacy_function(training_data, output_path="trn_data.spacy"):
    """Serialise character-offset NER annotations to a spaCy ``DocBin`` file.

    Args:
        training_data: Dict with an ``'annotations'`` list; each item has a
            ``'text'`` string and an ``'entities'`` list of
            ``(start_char, end_char, label)`` tuples.
        output_path: Where the ``DocBin`` is written. Defaults to
            ``"trn_data.spacy"`` (the previously hard-coded name), so existing
            calls behave as before; pass another path to build e.g. a
            separate dev set.

    Returns:
        None. The result is the file written to ``output_path``.
    """
    nlp = spacy.blank("en")  # blank English pipeline, used here only via make_doc
    doc_bin = DocBin()
    for training_example in tqdm(training_data['annotations']):
        doc = nlp.make_doc(training_example['text'])
        ents = []
        for start, end, label in training_example['entities']:
            # char_span yields None when the offsets cannot be mapped onto a
            # token span; such entities are reported and dropped.
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if span is None:
                print("Skipping entity")
            else:
                ents.append(span)
        # doc.ents cannot hold overlapping spans, so filter them first.
        doc.ents = filter_spans(ents)
        doc_bin.add(doc)
    doc_bin.to_disk(output_path)

In [10]:
# Locations of the two annotated training exports in the Kaggle input dataset.
preamble_data_path = '/kaggle/input/ner-dataset/NER_TRAIN_PREAMBLE.json'
# NOTE(review): judgement_data_path is defined but never used in the cells shown;
# only the preamble file is converted below. Confirm whether that is intended.
judgement_data_path = '/kaggle/input/ner-dataset/NER_TRAIN_JUDGEMENT.json'
# Convert the preamble export to (text, entities) records, then write them to
# trn_data.spacy for `spacy train`.
training_data = training_function(preamble_data_path)
spacy_function(training_data)

{'text': "In The High Court Of Kerala At Ernakulam\n\nCrl Mc No. 1622 of 2006()\n\n\n1. T.R.Ajayan, S/O. O.Raman,\n                      ...  Petitioner\n\n                        Vs\n\n\n\n1. M.Ravindran,\n                       ...       Respondent\n\n2. Mrs. Nirmala Dinesh, W/O. Dinesh,\n\n                For Petitioner  :Sri.A.Kumar\n\n                For Respondent  :Smt.M.K.Pushpalatha\n\nThe Hon'ble Mr. Justice P.R.Raman\nThe Hon'ble Mr. Justice V.K.Mohanan\n\n Dated :07/01/2008\n\n O R D E R\n", 'entities': [(7, 40, 'COURT'), (73, 83, 'PETITIONER'), (171, 182, 'RESPONDENT'), (237, 251, 'RESPONDENT'), (304, 311, 'LAWYER'), (350, 365, 'LAWYER'), (391, 400, 'JUDGE'), (425, 436, 'JUDGE')]}


100%|██████████| 1560/1560 [00:03<00:00, 450.35it/s]


In [11]:
# Expand the partial base_config.cfg into a complete training config, written to
# ./config.cfg in the working directory.
!python -m spacy init fill-config /kaggle/input/d/nikhilpanda13/config-files/base_config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [12]:
# Train the pipeline described by config.cfg on GPU 0, saving into the current directory.
# NOTE(review): --paths.dev points at the same file as --paths.train, so the scores
# reported during training are measured on the training data itself. A held-out dev
# .spacy file should presumably be used here instead; confirm.
!python -m spacy train config.cfg --output ./ --paths.train ./trn_data.spacy --paths.dev ./trn_data.spacy --gpu-id 0

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[2022-12-12 11:01:35,445] [INFO] Set up nlp object from config
[2022-12-12 11:01:35,457] [INFO] Pipeline: ['transformer', 'ner']
[2022-12-12 11:01:35,462] [INFO] Created vocabulary
[2022-12-12 11:01:35,466] [INFO] Finished initializing nlp object
Downloading: 100%|██████████████████████████████| 480/480 [00:00<00:00, 623kB/s]
Downloading: 100%|███████████████████████████| 878k/878k [00:00<00:00, 5.76MB/s]
Downloading: 100%|███████████████████████████| 446k/446k [00:00<00:00, 3.22MB/s]
Downloading: 100%|█████████████████████████| 1.29M/1.29M [00:00<00:00, 9.21MB/s]
Downloading: 100%|███████████████████████████| 316M/316M [00:06<00:00, 50.9MB/s]
Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected i

In [13]:
import json
 
# Load the dev preamble export; data1 is read by the inference cell below.
# The encoding is given explicitly so decoding does not depend on the platform's
# default locale (JSON interchange files are UTF-8).
with open('/kaggle/input/ner-dataset/NER_DEV_PREAMBLE.json', 'r', encoding='utf-8') as f:
    data1 = json.load(f)

In [14]:
# Load the pipeline saved under ./model-best (the training cell uses --output ./).
nlp_ner = spacy.load("model-best")

# Run the trained pipeline on one dev document (index 10 is an arbitrary sample).
doc = nlp_ner(data1[10]['data']['text'])

# Highlight colour per entity label. The original literal listed "JUDGE" twice;
# in a dict literal the last value wins, so only that effective value is kept.
colors = {
    "COURT": "#F67DE3",
    "JUDGE": "#F9E79F",
    "PETITIONER": "#FFFFFF",
    "RESPONDENT": "#FF0000",
    "LAWYER": "#F08080",
}
options = {"colors": colors}

spacy.displacy.render(doc, style="ent", options=options, jupyter=True)