# Document Classifier Model
We are building a document classifier for legal texts so that we can tell whether a sentence comes from the preamble or from the judgement.

In [92]:
import pandas as pd
import torch

# Pick the GPU when one is available, otherwise fall back to the CPU.
# `torch` must be imported in this cell: `device` is created here, before any
# later cell imports torch, so a fresh-kernel "Run All" would otherwise fail.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [68]:
# Read the preamble portion of the training data and keep only the raw text
# stored under each record's 'text' key.
preamble = pd.read_json("NER_TRAIN/NER_TRAIN_PREAMBLE.json")
preamble_texts = [
    record['text']
    for record in preamble['data']
]
# Display the first preamble as a quick sanity check.
preamble_texts[0]

"In The High Court Of Kerala At Ernakulam\n\nCrl Mc No. 1622 of 2006()\n\n\n1. T.R.Ajayan, S/O. O.Raman,\n                      ...  Petitioner\n\n                        Vs\n\n\n\n1. M.Ravindran,\n                       ...       Respondent\n\n2. Mrs. Nirmala Dinesh, W/O. Dinesh,\n\n                For Petitioner  :Sri.A.Kumar\n\n                For Respondent  :Smt.M.K.Pushpalatha\n\nThe Hon'ble Mr. Justice P.R.Raman\nThe Hon'ble Mr. Justice V.K.Mohanan\n\n Dated :07/01/2008\n\n O R D E R\n"

In [69]:
# Read the judgement portion of the training data and keep only the raw text
# stored under each record's 'text' key.
judgement = pd.read_json("NER_TRAIN/NER_TRAIN_JUDGEMENT.json")
judgement_texts = [
    record['text']
    for record in judgement['data']
]
# Display the first judgement sentence as a quick sanity check.
judgement_texts[0]

"\n\n(7) On specific query by the Bench about an entry of Rs. 1,31,37,500 on deposit side of Hongkong Bank account of which a photo copy is appearing at p. 40 of assessee's paper book, learned authorised representative submitted that it was related to loan from broker, Rahul & Co. on the basis of his submission a necessary mark is put by us on that photo copy."

In [94]:
import torch
from transformers import AutoTokenizer, AutoModel
from typing import List

class BatchTokenizer:
    """Callable wrapper that tokenizes batches of strings with the
    "nlpaueb/legal-bert-base-uncased" tokenizer."""

    def __init__(self):
        # Load the tokenizer that belongs to the checkpoint named below.
        self.tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")

    def __call__(self, sentence: List[str]):
        """Tokenize ``sentence`` (a list of strings) and move the result to ``device``.

        Inputs are truncated and padded with ``padding='max_length'``, token
        type ids are left out, and PyTorch tensors are requested.
        """
        encoding = self.tokenizer(
            sentence,
            padding='max_length',
            truncation=True,
            return_token_type_ids=False,
            return_tensors='pt',
        )
        return encoding.to(device)
    
# Build the tokenizer once; later cells reuse this `tokenizer` object.
tokenizer = BatchTokenizer()
# Smoke test: tokenize a one-element batch holding the first judgement text.
tokenizer([judgement_texts[0]])

{'input_ids': tensor([[  101,   111,   204,   112,   222,   599, 18380,   218,   207,  7396,
           802,   237,  1003,   210,   162,   189,   117,   198,   115,   415,
           115,   984,   115,  1272,   222,  1101,  2336,   210,  4413, 11904,
           177,   429,   366,   210,   229,   145,  5732,  1081,   223,  4973,
           236,   160,   117,   673,   210,  2351,   175,   175,   110,   163,
          1513,  2301,   115,  6532,  1837,   900,   642,   216,   233,   246,
           497,   211,   413,   238,  2757,   115,  3679,  6416,   182,   109,
           535,   117,   222,   207,   421,   210,   275,  1927,   145,   369,
          1233,   223,  1206,   218,  1143,   222,   216,  5732,  1081,   117,
           102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,  

In [74]:
import random 

# Create labels for each of the sentences (1 = preamble, 0 = judgement)
all_texts = preamble_texts + judgement_texts
all_labels = [1] * len(preamble_texts) + [0] * len(judgement_texts)

# Randomize order. A dedicated, seeded generator makes the shuffle reproducible
# across kernel restarts and leaves the global `random` state untouched.
SHUFFLE_SEED = 42
combined = list(zip(all_texts, all_labels))
random.Random(SHUFFLE_SEED).shuffle(combined)
# Unzip back into two parallel lists so texts and labels stay aligned.
all_texts, all_labels = (list(column) for column in zip(*combined))

print(all_labels[:100])

[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [75]:
def chunk(lst, n):
    """Yield consecutive slices of ``lst`` holding at most ``n`` items each.

    Args:
        lst: Sequence supporting ``len()`` and slicing (list, tuple, str, ...).
        n: Maximum chunk size; must be a positive integer.

    Yields:
        ``lst[i:i+n]`` for ``i = 0, n, 2n, ...``; the last chunk may be shorter.

    Raises:
        ValueError: If ``n`` is not positive. Because this is a generator, the
            error surfaces when iteration starts, not when ``chunk`` is called.
    """
    # A negative step would make range() empty and silently drop all data,
    # and zero gives an unrelated-looking range() error, so reject both here.
    if n <= 0:
        raise ValueError(f"chunk size must be a positive integer, got {n!r}")
    for i in range(0, len(lst), n):
        yield lst[i:i+n]
        
# Number of sentences per batch.
batch_size = 64

# Split the texts into groups of `batch_size` and tokenize each group in one call.
train_input_batches = [
    tokenizer(text_group)
    for text_group in chunk(all_texts, batch_size)
]

train_input_batches[0]

{'input_ids': tensor([[  101,   300,   261,  ...,     0,     0,     0],
        [  101,   207, 15959,  ...,     0,     0,     0],
        [  101,   373,   207,  ...,     0,     0,     0],
        ...,
        [  101,   435,   207,  ...,     0,     0,     0],
        [  101,   226,   968,  ...,     0,     0,     0],
        [  101,   213,   207,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [93]:
# Batch labels as well
def encode_labels(labels: List[int]) -> torch.FloatTensor:
    """Turn a list of labels into a float tensor and move it to ``device``.

    Each label is passed through ``int()`` before the tensor is built.
    """
    as_ints = list(map(int, labels))
    label_tensor = torch.FloatTensor(as_ints)
    return label_tensor.to(device)

# One label tensor per group of `batch_size` labels, in the same order as `all_labels`.
train_label_batches = list(map(encode_labels, chunk(all_labels, batch_size)))
train_label_batches[0]

tensor([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [90]:
class SentenceBinaryClassifier(torch.nn.Module):
    """Binary classifier: a frozen pretrained encoder followed by a small trainable head.

    The head is ``Linear(bert_hidden_dim, hidden_size) -> ReLU -> Linear(hidden_size, 1)``
    and ``forward`` applies a sigmoid to its output.
    """

    def __init__(self, hidden_size: int):
        """Create the encoder and the classification head.

        Args:
            hidden_size: Width of the hidden layer between the encoder output
                and the single output unit.
        """
        super().__init__()
        self.hidden_size = hidden_size

        # Pretrained encoder; all of its parameters are excluded from gradient updates.
        self.legal_bert = AutoModel.from_pretrained("nlpaueb/legal-bert-base-uncased")
        self.bert_hidden_dim = self.legal_bert.config.hidden_size
        self.legal_bert.requires_grad_(False)

        # Trainable head (layers are created in the same order as they are applied).
        self.hidden_layer = torch.nn.Linear(self.bert_hidden_dim, self.hidden_size)
        self.relu = torch.nn.ReLU()

        self.classifier = torch.nn.Linear(self.hidden_size, 1)

    def forward(self, sentences) -> torch.Tensor:
        """Return the sigmoid of the head's output for a tokenized batch.

        Args:
            sentences: Mapping unpacked as keyword arguments into the encoder,
                e.g. a batch produced by ``BatchTokenizer``.
        """
        pooled = self.legal_bert(**sentences).pooler_output
        hidden = self.relu(self.hidden_layer(pooled))
        logits = self.classifier(hidden)
        return torch.sigmoid(logits)
    
def predict(model, sents):
    """Return hard decisions for a batch by thresholding the model output at 0.5.

    Args:
        model: Callable returning probabilities (e.g. ``SentenceBinaryClassifier``).
        sents: Batch passed to ``model`` unchanged.

    Returns:
        Boolean tensor with the same shape as ``model(sents)``; ``True`` where
        the output is strictly greater than 0.5.
    """
    # Inference only: disable autograd so no computation graph is built or kept.
    # The model's train/eval mode is deliberately left untouched.
    with torch.no_grad():
        return model(sents) > 0.5

In [78]:
from tqdm.notebook import tqdm

def training_loop(num_epochs, train_sentences, train_labels, dev_sentences, dev_labels, optimizer, model):
    """Train ``model`` with binary cross-entropy and report dev accuracy after every epoch.

    Args:
        num_epochs: Number of passes over the training batches.
        train_sentences: List of tokenized training batches (inputs to ``model``).
        train_labels: List of float label tensors, one per training batch.
        dev_sentences: List of tokenized dev batches.
        dev_labels: List of float label tensors, one per dev batch.
        optimizer: Optimizer already bound to the parameters of ``model``.
        model: Module whose output has shape ``(batch, 1)`` with values usable
            by ``torch.nn.BCELoss`` (i.e. probabilities).

    Returns:
        The same ``model`` object, after training.
    """
    print("Training...")
    loss_func = torch.nn.BCELoss()
    batches = list(zip(train_sentences, train_labels))
    # Batch order is randomized once and then reused for every epoch.
    random.shuffle(batches)
    for i in range(num_epochs):
        losses = []
        for sents, labels in tqdm(batches):
            optimizer.zero_grad()
            # Drop the trailing singleton dim so predictions match the label shape.
            preds = model(sents).squeeze(1)
            loss = loss_func(preds, labels)
            loss.backward()
            optimizer.step()
            losses.append(loss.item())
        # Guard against an empty training set instead of dividing by zero.
        mean_loss = sum(losses) / len(losses) if losses else float("nan")
        print(f"epoch {i}, loss: {mean_loss}")
        print("Evaluating dev")
        # Count on tensors rather than collecting Python lists: this works on
        # CUDA tensors and compares element-wise (list == list is a single bool).
        # Local counters are used so the `dev_labels` argument is not shadowed.
        num_correct = 0
        num_examples = 0
        with torch.no_grad():
            for sents, labels in tqdm(zip(dev_sentences, dev_labels), total=len(dev_sentences)):
                pred = predict(model, sents).reshape(-1)
                gold = labels.reshape(-1) > 0.5
                num_correct += (pred == gold).sum().item()
                num_examples += gold.numel()
        accuracy = num_correct / num_examples if num_examples else float("nan")
        print(f"Dev Acc: {accuracy}")
    return model

In [88]:
# Dev split: load the raw texts for both classes.
preamble_texts_dev = [
    record['text']
    for record in pd.read_json("NER_DEV/NER_DEV_PREAMBLE.json")['data']
]
judgement_texts_dev = [
    record['text']
    for record in pd.read_json("NER_DEV/NER_DEV_JUDGEMENT.json")['data']
]

# Label preamble texts 1 and judgement texts 0; the dev set is not shuffled.
all_texts_dev = preamble_texts_dev + judgement_texts_dev
all_labels_dev = [1] * len(preamble_texts_dev) + [0] * len(judgement_texts_dev)

# Group into batches of `batch_size`, then tokenize texts and encode labels.
dev_sents_batches = list(map(tokenizer, chunk(all_texts_dev, batch_size)))
dev_labels_batches = list(map(encode_labels, chunk(all_labels_dev, batch_size)))

In [None]:
# Instantiate the classifier with a 128-unit hidden layer and move it to `device`.
model = SentenceBinaryClassifier(hidden_size=128).to(device)

# Run 10 epochs with Adam (lr=0.001) over the pre-tokenized training batches,
# passing the dev batches for evaluation. The call is the cell's last
# expression, so the model it returns is displayed as the cell output.
training_loop(
    num_epochs=10,
    train_sentences=train_input_batches,
    train_labels=train_label_batches,
    dev_sentences=dev_sents_batches,
    dev_labels=dev_labels_batches,
    optimizer=torch.optim.Adam(model.parameters(), lr=0.001),
    model=model
)

Some weights of the model checkpoint at nlpaueb/legal-bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training...


  0%|          | 0/172 [00:00<?, ?it/s]