In [5]:
# Load data from spaCy format
import spacy
from spacy.tokens import DocBin
from datasets import Dataset

def load_data(spacy_file='training/data/train.spacy'):
    print("Loading data...")
    doc_bin = DocBin().from_disk(spacy_file)
    nlp = spacy.load('en_core_web_trf')
    docs = doc_bin.get_docs(nlp.vocab)

    all_sents = []
    all_labels = set()
    for doc in docs:
        new_sent = {'tokens': [token.text for token in doc],
                    'tags': [token.ent_iob_ + ("-" + token.ent_type_ if token.ent_type_ else '') for token in doc]}
        all_sents.append(new_sent)
        [all_labels.add(tag) for tag in new_sent['tags']]
    return Dataset.from_list(all_sents), sorted(list(all_labels))

train, labels = load_data()
print(train[0])
print(labels)

Loading data...
{'tokens': ['\n\n', '(', '7', ')', 'On', 'specific', 'query', 'by', 'the', 'Bench', 'about', 'an', 'entry', 'of', 'Rs', '.', '1,31,37,500', 'on', 'deposit', 'side', 'of', 'Hongkong', 'Bank', 'account', 'of', 'which', 'a', 'photo', 'copy', 'is', 'appearing', 'at', 'p.', '40', 'of', 'assessee', "'s", 'paper', 'book', ',', 'learned', 'authorised', 'representative', 'submitted', 'that', 'it', 'was', 'related', 'to', 'loan', 'from', 'broker', ',', 'Rahul', '&', 'Co.', 'on', 'the', 'basis', 'of', 'his', 'submission', 'a', 'necessary', 'mark', 'is', 'put', 'by', 'us', 'on', 'that', 'photo', 'copy', '.'], 'tags': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 

In [6]:
from transformers import AutoTokenizer
    
tokenizer = AutoTokenizer.from_pretrained('roberta-base', add_prefix_space=True)

def tokenize(row):
    tokenized = tokenizer(row['tokens'], truncation=True, is_split_into_words=True)
    aligned_labels = [-100 if i is None else labels.index(row['tags'][i]) for i in tokenized.word_ids()]
    tokenized['labels'] = aligned_labels
    return tokenized

tokenize(train[15])

{'input_ids': [0, 20, 97, 2810, 963, 11, 209, 7688, 16, 7162, 1812, 24998, 61, 1639, 13, 18349, 11, 2098, 9, 4632, 8, 3077, 31, 1402, 2683, 26438, 32711, 97, 87, 2112, 709, 26438, 32711, 479, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, 28, 28, 28, 28, 28, 28, 28, 28, 10, 24, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, -100]}

In [216]:
train = train.map(tokenize)


dev, _ = load_data('training/data/dev.spacy')
dev = dev.map(tokenize)
dev

  0%|          | 0/10995 [00:00<?, ?ex/s]

loading configuration file /var/folders/1y/v6m6qm2d3p52jn2sv6smp5cr0000gn/T/tmp1y2xisax/config.json
Model config RobertaConfig {
  "_name_or_path": "/var/folders/1y/v6m6qm2d3p52jn2sv6smp5cr0000gn/T/tmp1y2xisax/config.json",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.21.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

Didn't find file /var/folders/1y/v6m6qm2d3p52jn2sv6smp5cr0000gn/T/tmp1y2xisax/added_tokens.json. We won't load it.
loading file /var/folders/1y/v6m6qm2d3p52jn2sv6smp5cr

  0%|          | 0/1074 [00:00<?, ?ex/s]

ValueError: '' is not in list

In [68]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained('nlpaueb/legal-bert-base-uncased', num_labels=len(labels))

Some weights of the model checkpoint at nlpaueb/legal-bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initia

In [70]:
batch_size=16
args = TrainingArguments(
    f"output",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=3,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    save_total_limit=3,
)

In [71]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

In [78]:
from datasets import load_metric
import numpy as np

metric = load_metric("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [80]:
trainer = Trainer(
    model,
    args,
    train_dataset=train,
    eval_dataset=dev,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [97]:
# trainer.train()

True

In [138]:
print(dev[15]['tokens'])

['High', 'Court', 'Of', 'Judicature', 'At', 'Allahabad', '\n \n \n\n                                                                             ', 'A.F.R.', '\n \n                                                 \t          ', 'Reserved', 'on', '07.10.2021', '\n \n\t\t\t\t\t\t          ', 'Delivered', 'on', '13.12.2021', '\n \n\n \n\n \n', 'Case', ':', '-', 'Writ', '-', 'C', 'No', '.', '-', '59863', 'of', '2015', '\n \n', 'Petitioner', ':', '-', 'Sun', 'Tower', 'Residents', 'Welfare', 'Association', '\n \n', 'Respondent', ':', '-', 'Ghaziabad', 'Development', 'Authority', 'through', 'its', 'Vice', 'Chairman', 'and', '2', 'Others', '\n \n', 'Counsel', 'for', 'Petitioner', ':', '-', 'Prashant', ',', 'Abhijeet', 'Mukherji', ',', 'Prashant', ',', 'S.K.', 'Pal', '\n \n', 'Counsel', 'for', 'Respondent', ':', '-', 'Ram', 'Bilas', 'Yadav', ',', 'Anoop', 'Tivedi', ',', 'Anoop', 'Trivedi', '(', 'Senior', 'Adv.),Himanshu', 'Tyagi', ',', 'Kartikeya', 'Saran', ',', 'Rahul', 'Agarwal', ',', 'Rakesh

In [139]:
print(len(dev[15]['tokens']) * ['O'])

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [206]:
missing_row = ['B-COURT', 'I-COURT', 'I-COURT', 'I-COURT', 'I-COURT', 'I-COURT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PETITIONER', 'I-PETITIONER', 'I-PETITIONER', 'I-PETITIONER', 'I-PETITIONER', 'O', 'O', 'O', 'O', 'B-RESPONDENT', 'I-RESPONDENT', 'I-RESPONDENT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LAWYER', 'O', 'B-LAWYER', 'I-LAWYER', 'O', 'B-LAWYER', 'O', 'B-LAWYER', 'I-LAWYER', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LAWYER', 'I-LAWYER', 'I-LAWYER', 'O', 'B-LAWYER', 'I-LAWYER', 'O', 'B-LAWYER', 'I-LAWYER', 'O', 'O', 'B-LAWYER', 'I-LAWYER', 'O', 'B-LAWYER', 'I-LAWYER', 'O', 'B-LAWYER', 'I-LAWYER', 'O', 'B-LAWYER', 'I-LAWYER', 'I-LAWYER', 'O', 'B-LAWYER', 'O', 'B-LAWYER', 'I-LAWYER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PETITIONER', 'I-PETITIONER', 'I-PETITIONER', 'I-PETITIONER', 'I-PETITIONER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-RESPONDENT', 'I-RESPONDENT', 'I-RESPONDENT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LAWYER', 'I-LAWYER', 'O', 'B-LAWYER', 'I-LAWYER', 'O', 'B-LAWYER', 'I-LAWYER', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LAWYER', 'I-LAWYER', 'I-LAWYER', 'O', 'B-LAWYER', 'I-LAWYER', 'O', 'O', 'B-JUDGE', 'I-JUDGE', 'I-JUDGE', 'O', 'O', 'O', 'O', 'O', 'O', 'B-JUDGE', 'I-JUDGE', 'O', 'O', 'O', 'O', 'O', 'B-JUDGE', 'I-JUDGE', 'I-JUDGE', 'O', 'O', 'O', 'O', 'O', 'O']

print(list(zip(dev[15]['tokens'], missing_row)))

[('High', 'B-COURT'), ('Court', 'I-COURT'), ('Of', 'I-COURT'), ('Judicature', 'I-COURT'), ('At', 'I-COURT'), ('Allahabad', 'I-COURT'), ('\n \n \n\n                                                                             ', 'O'), ('A.F.R.', 'O'), ('\n \n                                                 \t          ', 'O'), ('Reserved', 'O'), ('on', 'O'), ('07.10.2021', 'O'), ('\n \n\t\t\t\t\t\t          ', 'O'), ('Delivered', 'O'), ('on', 'O'), ('13.12.2021', 'O'), ('\n \n\n \n\n \n', 'O'), ('Case', 'O'), (':', 'O'), ('-', 'O'), ('Writ', 'O'), ('-', 'O'), ('C', 'O'), ('No', 'O'), ('.', 'O'), ('-', 'O'), ('59863', 'O'), ('of', 'O'), ('2015', 'O'), ('\n \n', 'O'), ('Petitioner', 'O'), (':', 'O'), ('-', 'O'), ('Sun', 'B-PETITIONER'), ('Tower', 'I-PETITIONER'), ('Residents', 'I-PETITIONER'), ('Welfare', 'I-PETITIONER'), ('Association', 'I-PETITIONER'), ('\n \n', 'O'), ('Respondent', 'O'), (':', 'O'), ('-', 'O'), ('Ghaziabad', 'B-RESPONDENT'), ('Development', 'I-RESPONDENT'), ('Authority'

In [208]:
len(missing_row)

196

In [222]:
for row in dev:
    if row['tags'][0] == '':
        print(1)

1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1


In [4]:
a, b = (1, 2, 3)

ValueError: too many values to unpack (expected 2)

In [20]:
import train_sentence_classifier
classifier_tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
tokenized = classifier_tokenizer(
                train[9999]['tokens'],
                padding='max_length',
                truncation=True,
                is_split_into_words=True,
                return_token_type_ids=False,
                return_tensors='pt'
            )
tokenized

{'input_ids': tensor([[  101,  1818,   119,   165,   117,   160,   117,   264,  1746,  1201,
           469,   212,  1285,   166,   189,   117,  1992,   119,  2639,   182,
          6993, 15786,  9339,   109,  1285,   254,   210,   371,   102,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,  

In [11]:
import torch
classifier_model = train_sentence_classifier.SentenceBinaryClassifier(hidden_size=128)
classifier_model.load_state_dict(torch.load('../sentence-classification-model.pth', map_location=torch.device('cpu')))

Some weights of the model checkpoint at nlpaueb/legal-bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>

In [22]:
classifier_model(tokenized).item()

0.9989068508148193

In [19]:
train[9999]

{'tokens': ['Petitioner',
  ':',
  '\n',
  'U.P.',
  'State',
  'Road',
  'Transport',
  'Corporation',
  'And',
  'Others',
  '\n\n\t',
  'Vs',
  '.',
  '\n\n',
  'Respondent',
  ':',
  '\n',
  'Trilok',
  'Chandra',
  '&',
  'Others',
  '\n\n',
  'Date',
  'Of',
  'Judgment'],
 'tags': ['O',
  'O',
  'O',
  'B-PETITIONER',
  'I-PETITIONER',
  'I-PETITIONER',
  'I-PETITIONER',
  'I-PETITIONER',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-RESPONDENT',
  'I-RESPONDENT',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O']}

In [16]:
len(train)

10995

In [26]:
tokenizer.add_special_tokens({'additional_special_tokens': ['<PREAMBLE>', '<JUDGEMENT>']})
tokenizer

PreTrainedTokenizerFast(name_or_path='roberta-base', vocab_size=50265, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False), 'additional_special_tokens': ['<PREAMBLE>', '<JUDGEMENT>']})

In [32]:
tokenizer.convert_tokens_to_ids(['<PREAMBLE>', '<JUDGEMENT>'])

[50265, 50266]

In [30]:
tokenizer.convert_ids_to_tokens(50265)

'<PREAMBLE>'