In [1]:
pip install spacy[transformers]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting spacy-transformers<1.2.0,>=1.1.2
  Downloading spacy_transformers-1.1.8-py2.py3-none-any.whl (53 kB)
[K     |████████████████████████████████| 53 kB 2.0 MB/s 
Collecting spacy-alignments<1.0.0,>=0.7.2
  Downloading spacy_alignments-0.8.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 318 kB/s 
[?25hCollecting transformers<4.22.0,>=3.4.0
  Downloading transformers-4.21.3-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 4.9 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 17.2 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 17.1

In [2]:
import json
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm
from spacy.util import filter_spans

In [4]:
## Loading the JSON files used to train and evaluate the models.

def load_json(path):
    """Read a UTF-8 encoded JSON file and return the parsed object.

    Args:
        path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's contents.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Explicit encoding so the result does not depend on the platform's default locale.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


# Training datasets
data_A = load_json('NER_TRAIN_PREAMBLE.json')
data_B = load_json('NER_TRAIN_JUDGEMENT.json')

# Dev / test datasets
data_C = load_json('NER_DEV_PREAMBLE.json')
data_D = load_json('NER_DEV_JUDGEMENT.json')

In [5]:
def collect_annotations(records):
    """Convert raw annotation records into ``{'text', 'entities'}`` dictionaries.

    A record is kept only when it has at least one annotation whose ``result``
    list is non-empty. Every entry of that ``result`` list becomes one
    ``(start, end, LABEL)`` tuple, so all labelled spans of a text are retained
    rather than just the first one.

    Args:
        records: Iterable of dicts with ``record['data']['text']`` and
            ``record['annotations'][0]['result']``, where each result entry
            carries ``value['start']``, ``value['end']`` and ``value['labels']``.

    Returns:
        A list of ``{'text': str, 'entities': [(start, end, label), ...]}``
        dicts, in the same order as the kept input records.
    """
    collected = []
    for record in records:
        annotations = record['annotations']
        # Skip texts that carry no labelled spans at all.
        if len(annotations) < 1 or len(annotations[0]['result']) < 1:
            continue
        entities = []
        for item in annotations[0]['result']:
            value = item['value']
            # Only the first label of a span is used; it is upper-cased.
            entities.append((value['start'], value['end'], value['labels'][0].upper()))
        collected.append({'text': record['data']['text'], 'entities': entities})
    return collected


# 'classes' is left empty; only 'annotations' is populated in this cell.
preamble_data = {'classes': [], 'annotations': collect_annotations(data_A)}
judgement_data = {'classes': [], 'annotations': collect_annotations(data_B)}

In [6]:
# Blank English pipeline: it is used here only to turn raw text into Doc objects.
preamble_model = spacy.blank("en")

# Collect the annotated Docs in a DocBin.
# The variable is called doc_bin so that the builtin bin() is not shadowed.
doc_bin = DocBin()
for annotation in tqdm(preamble_data['annotations']):
    text = annotation['text']
    labels = annotation['entities']
    doc = preamble_model.make_doc(text)
    entities = []
    for start, end, label in labels:
        # "contract" snaps the character offsets inward to token boundaries;
        # char_span returns None when no token lies completely inside the range.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            entities.append(span)
    # doc.ents cannot hold overlapping spans, so drop overlaps before assigning.
    doc.ents = filter_spans(entities)
    doc_bin.add(doc)

# Serialized corpus that the `spacy train` cell reads via --paths.train.
doc_bin.to_disk("preamble_training_data.spacy")

100%|██████████| 1558/1558 [00:03<00:00, 497.81it/s]


In [7]:
# Blank English pipeline: it is used here only to turn raw text into Doc objects.
judgement_model = spacy.blank("en")

# Collect the annotated Docs in a DocBin.
# The variable is called doc_bin so that the builtin bin() is not shadowed.
doc_bin = DocBin()
for annotation in tqdm(judgement_data['annotations']):
    text = annotation['text']
    labels = annotation['entities']
    doc = judgement_model.make_doc(text)
    entities = []
    for start, end, label in labels:
        # "contract" snaps the character offsets inward to token boundaries;
        # char_span returns None when no token lies completely inside the range.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            entities.append(span)
    # doc.ents cannot hold overlapping spans, so drop overlaps before assigning.
    doc.ents = filter_spans(entities)
    doc_bin.add(doc)

# Serialized corpus that the `spacy train` cell reads via --paths.train.
doc_bin.to_disk("judgement_training_data.spacy")

100%|██████████| 7258/7258 [00:04<00:00, 1620.42it/s]


In [8]:
# Expand base_config.cfg into a complete training config, written to config.cfg,
# by auto-filling every setting the base file leaves out.
# NOTE(review): base_config.cfg is not created by any cell visible here - it must
# already exist in the working directory before this cell runs.
!python -m spacy init fill-config base_config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [9]:
# Train the pipeline described by config.cfg on the preamble corpus, using GPU 0.
# NOTE(review): --paths.dev points at the same file as --paths.train, so the scores
# reported during training are measured on the training data itself.
# NOTE(review): --output ./ is also used by the judgement training command, so both
# runs write into the same directory; presumably the later run replaces the model
# saved by this one - confirm, and consider a separate output directory per model.
!python -m spacy train config.cfg --output ./ --paths.train ./preamble_training_data.spacy --paths.dev ./preamble_training_data.spacy --gpu-id 0

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[2022-12-13 19:06:51,735] [INFO] Set up nlp object from config
INFO:spacy:Set up nlp object from config
[2022-12-13 19:06:51,745] [INFO] Pipeline: ['transformer', 'ner']
INFO:spacy:Pipeline: ['transformer', 'ner']
[2022-12-13 19:06:51,749] [INFO] Created vocabulary
INFO:spacy:Created vocabulary
[2022-12-13 19:06:51,750] [INFO] Finished initializing nlp object
INFO:spacy:Finished initializing nlp object
Downloading tokenizer_config.json: 100% 28.0/28.0 [00:00<00:00, 38.8kB/s]
Downloading config.json: 100% 483/483 [00:00<00:00, 579kB/s]
Downloading vocab.txt: 100% 226k/226k [00:00<00:00, 1.25MB/s]
Downloading tokenizer.json: 100% 455k/455k [00:00<00:00, 1.50MB/s]
Downloading pytorch_model.bin: 100% 256M/256M [00:04<00:00, 59.4MB/s]
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.we

In [10]:
# Train the pipeline described by config.cfg on the judgement corpus, using GPU 0.
# NOTE(review): --paths.dev points at the same file as --paths.train, so the scores
# reported during training are measured on the training data itself.
# NOTE(review): --output ./ is the same directory the preamble training command
# writes to; presumably this run replaces the model saved by that one - confirm,
# and consider a separate output directory per model.
!python -m spacy train config.cfg --output ./ --paths.train ./judgement_training_data.spacy --paths.dev ./judgement_training_data.spacy --gpu-id 0

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[2022-12-13 19:34:57,928] [INFO] Set up nlp object from config
INFO:spacy:Set up nlp object from config
[2022-12-13 19:34:57,939] [INFO] Pipeline: ['transformer', 'ner']
INFO:spacy:Pipeline: ['transformer', 'ner']
[2022-12-13 19:34:57,942] [INFO] Created vocabulary
INFO:spacy:Created vocabulary
[2022-12-13 19:34:57,943] [INFO] Finished initializing nlp object
INFO:spacy:Finished initializing nlp object
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT ex

In [11]:
import spacy

# Load the "model-best" pipeline from the working directory.
# NOTE(review): both training commands in this notebook use --output ./, so this is
# presumably the model from whichever run finished last, yet it is applied to
# preamble and judgement text alike - confirm this is intended.
nlp_ner = spacy.load("model-best")

# Run the model on one sample from each dev set.
preamble_doc = nlp_ner(data_C[1]['data']['text'])
judgement_doc = nlp_ner(data_D[10]['data']['text'])

# displaCy looks colours up by exact entity label, so the keys must match the
# (upper-cased) labels the model was trained on. "STATUTE" and "CASE_NUMBER"
# replace the former "STATUE" and "CASE NUMBER" keys, which could never match.
# NOTE(review): label spellings are assumed from the dataset's label set - verify
# against nlp_ner.get_pipe("ner").labels.
colors = {
    "COURT": "#FFFFFF",
    "JUDGE": "#1FEA47",
    "PETITIONER": "#F67DE3",
    "RESPONDENT": "#7DF6D9",
    "LAWYER": "#AD685E",
    "DATE": "#AD685E",
    "GPE": "#AD685E",
    "ORG": "#AD685E",
    "STATUTE": "#AD685E",
    "PROVISION": "#1FEA47",
    "PRECEDENT": "#1FEA47",
    "CASE_NUMBER": "#1FEA47",
}
options = {"colors": colors}

# Render the predicted entities inline in the notebook.
spacy.displacy.render(preamble_doc, style="ent", options=options, jupyter=True)
spacy.displacy.render(judgement_doc, style="ent", options=options, jupyter=True)