## Installing necessary packages

In [None]:
!pip install transformers --q

## Importing necessary packages

In [None]:
import pandas as pd
import numpy as np
from numpy import zeros

from tqdm import tqdm

import json
import torch

from transformers import AutoModel, AutoTokenizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score


import torch as th
import torch.utils.data as Data
from torch.optim import lr_scheduler
import torch.nn.functional as F

import matplotlib.pyplot as plt
%matplotlib inline

## Parameters

In [None]:
# Encoder checkpoint handed to BertClassifier as `pretrained_model`.
model_name = 'nlpaueb/legal-bert-base-uncased'
# Pooling strategy; BertClassifier.forward defaults to 'CLS' when it is not passed.
# NOTE(review): the model(...) calls in this notebook do not forward this value.
output_type = 'mean_pooled'

# Tokenizer max length and size of each encoder's projection layer.
sequence_length, embedding_dim = 256, 256

# DataLoader batch size and number of passes over the training set.
batch_size, epochs = 16, 20

## Dataset Loader

#### Training dataset

In [None]:
# Read the training split, report its (rows, columns) and preview the first rows.
train_df = pd.read_csv(filepath_or_buffer='../../data/model_data/train_df.csv')
print("# Size: ", train_df.shape)
train_df.head(n=5)

In [None]:
# Count missing values in the two columns the rest of the notebook reads.
print("#Nan in sents: ", train_df['sentence'].isna().to_numpy().sum())
print("#Nan in labels: ", train_df['label'].isna().to_numpy().sum())

In [None]:
# Class distribution of the training split.
train_df['label'].value_counts()

#### Validation dataset

In [None]:
# Read the validation split, report its (rows, columns) and preview the first rows.
dev_df = pd.read_csv(filepath_or_buffer='../../data/model_data/dev_df.csv')
print("# Size: ", dev_df.shape)
dev_df.head(n=5)

In [None]:
# Count missing values in the two columns the rest of the notebook reads.
print("#Nan in sents: ", dev_df['sentence'].isna().to_numpy().sum())
print("#Nan in labels: ", dev_df['label'].isna().to_numpy().sum())

In [None]:
# Class distribution of the validation split.
dev_df['label'].value_counts()

#### Reading labels

In [None]:
# Fit on the union of train and dev labels so that a label present in only
# one split still receives an id.
all_labels = [*train_df.label.values, *dev_df.label.values]
le_encoder = LabelEncoder().fit(all_labels)

label_names = le_encoder.classes_
print(label_names)

In [None]:
# Both maps start empty and are then replaced by the JSON file contents.
ID_Label_Map, Label_ID_Map = {}, {}

with open('../../data/model_data/ID_Label_Map.json', 'r') as id_to_label_file:
    ID_Label_Map = json.load(id_to_label_file)

with open('../../data/model_data/Label_ID_Map.json', 'r') as label_to_id_file:
    Label_ID_Map = json.load(label_to_id_file)

print(list(Label_ID_Map))

In [None]:
# Integer class ids for the training split, as assigned by le_encoder.
# NOTE(review): predictions are decoded later through ID_Label_Map; confirm
# that mapping agrees with le_encoder's ordering.
train_labels = le_encoder.transform(train_df['label'].tolist())
print("# Labels: ", train_labels.shape[0])

In [None]:
# Integer class ids for the validation split, as assigned by le_encoder.
dev_labels = le_encoder.transform(dev_df['label'].tolist())
print("# Labels: ", dev_labels.shape[0])

In [None]:
# Plain-list copies of the dev columns, written out next to the predictions
# when the evaluation CSV is produced.
dev_sentences = dev_df['sentence'].tolist()
dev_actual_labels = dev_df['label'].tolist()

## Bert Model

In [None]:
class BertClassifier(th.nn.Module):
    """Sentence classifier built on two transformer encoders.

    Each encoder gets its own tokenizer and a linear projection to
    ``embedding_dim``. The two projected vectors are concatenated and fed to
    ``nb_classes`` independent one-unit linear heads, whose sigmoid outputs are
    concatenated into the returned score matrix.

    Args:
        base_model: checkpoint name for the second encoder/tokenizer.
        pretrained_model: checkpoint name for the first encoder/tokenizer.
        nb_classes: number of output scores per sample.
        embedding_dim: output size of each encoder's projection layer.
    """

    def __init__(self, base_model = 'roberta-base', pretrained_model='roberta-base', nb_classes = 13, embedding_dim = 512):
        super(BertClassifier, self).__init__()
        self.nb_classes = nb_classes
        self.embedding_dim = embedding_dim

        self.tokenizer_1 = AutoTokenizer.from_pretrained(pretrained_model)
        self.tokenizer_2 = AutoTokenizer.from_pretrained(base_model)

        self.bert_model_1 = AutoModel.from_pretrained(pretrained_model)
        self.bert_model_2 = AutoModel.from_pretrained(base_model)

        # Read the encoder width from its config instead of relying on the
        # position of the pooler layer in the module list.
        self.feat_dim_1 = self.bert_model_1.config.hidden_size
        self.classifier_1 = th.nn.Linear(self.feat_dim_1, self.embedding_dim)

        self.feat_dim_2 = self.bert_model_2.config.hidden_size
        self.classifier_2 = th.nn.Linear(self.feat_dim_2, self.embedding_dim)

        self.fcs = th.nn.ModuleList([th.nn.Linear(2 * self.embedding_dim, 1) for _ in range(self.nb_classes)])

    def _embed(self, encoder, projection, ids, mask, output_type):
        """Encode one tokenization and project it to ``embedding_dim``."""
        hidden = encoder(ids, attention_mask=mask).last_hidden_state
        if output_type == 'mean_pooled':
            # Masked mean: padding positions contribute nothing to the sum and
            # the divisor is the number of real tokens of this sequence.
            weights = mask.unsqueeze(-1).to(hidden.dtype)
            pooled = (hidden * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1.0)
        else:
            # Hidden state of the first token of every sequence.
            pooled = hidden[:, 0]
        # No squeeze here: the batch dimension must survive a batch of one.
        return projection(pooled)

    def forward(self, input_ids = None, attention_mask = None, output_type = 'CLS'):
        """Score a batch of sentences.

        Args:
            input_ids: pair ``[ids_for_encoder_1, ids_for_encoder_2]``.
            attention_mask: pair ``[mask_for_encoder_1, mask_for_encoder_2]``.
            output_type: ``'mean_pooled'`` for a masked mean over tokens; any
                other value uses the first token's hidden state.

        Returns:
            Tensor whose last dimension has ``nb_classes`` sigmoid scores and
            whose leading dimension is the batch.
        """
        output_1 = self._embed(self.bert_model_1, self.classifier_1, input_ids[0], attention_mask[0], output_type)
        output_2 = self._embed(self.bert_model_2, self.classifier_2, input_ids[1], attention_mask[1], output_type)

        last_hidden_states = th.cat((output_1.float(), output_2.float()), -1)
        # NOTE(review): the scores are already squashed by a sigmoid, yet the
        # training cell feeds them to F.cross_entropy, which expects raw logits.
        logits = [th.sigmoid(fc(last_hidden_states)) for fc in self.fcs]

        return th.cat(logits, dim=-1)

### Instantiate the model

In [None]:
# Build the two-encoder classifier with one output score per entry of ID_Label_Map.
model = BertClassifier(pretrained_model = model_name, nb_classes=len(ID_Label_Map), embedding_dim = embedding_dim)
# Move to the GPU only when one is available; an unconditional .cuda() raises
# on a CPU-only machine.
model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

#### Preparing the data loader for bert classifier

In [None]:
# Prefer the GPU when PyTorch can see one, otherwise run on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
def encode_input(text, tokenizer, max_length=None):
    """Tokenize ``text`` into fixed-length PyTorch tensors.

    Args:
        text: the sentence(s) to tokenize, passed straight to ``tokenizer``.
        tokenizer: callable tokenizer whose result exposes ``input_ids`` and
            ``attention_mask``.
        max_length: length every sequence is padded/truncated to. Defaults to
            the notebook-level ``sequence_length``.

    Returns:
        Tuple ``(input_ids, attention_mask)`` taken from the tokenizer output.
    """
    if max_length is None:
        max_length = sequence_length
    encoded = tokenizer(
        text,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )
    return encoded.input_ids, encoded.attention_mask

In [None]:
# One slot per tokenizer (0 and 1) for each split; filled by the cells below.
input_ids = {split_name: {0: [], 1: []} for split_name in ('train', 'val')}
attention_mask = {split_name: {0: [], 1: []} for split_name in ('train', 'val')}

In [None]:
# Tokenize the training sentences once per tokenizer (slot 0 -> tokenizer_1,
# slot 1 -> tokenizer_2).
train_sentences = list(train_df.sentence.values)
for slot, tok in enumerate((model.tokenizer_1, model.tokenizer_2)):
    input_ids['train'][slot], attention_mask['train'][slot] = encode_input(train_sentences, tok)

# Shapes/types: both id tensors first, then both masks.
for encoded in (input_ids, attention_mask):
    for slot in (0, 1):
        print(encoded['train'][slot].shape, type(encoded['train'][slot]))

In [None]:
# Tokenize the validation sentences once per tokenizer (slot 0 -> tokenizer_1,
# slot 1 -> tokenizer_2).
val_sentences = list(dev_df.sentence.values)
for slot, tok in enumerate((model.tokenizer_1, model.tokenizer_2)):
    input_ids['val'][slot], attention_mask['val'][slot] = encode_input(val_sentences, tok)

# Shapes/types: both id tensors first, then both masks.
for encoded in (input_ids, attention_mask):
    for slot in (0, 1):
        print(encoded['val'][slot].shape, type(encoded['val'][slot]))

In [None]:
# Integer class ids as LongTensors, keyed by split.
label_encode = {
    'train': th.LongTensor(train_labels),
    'val': th.LongTensor(dev_labels),
}

for split_name in ('train', 'val'):
    print(label_encode[split_name].shape, type(label_encode[split_name]))

In [None]:
datasets, loader = {}, {}
for split in ('train', 'val'):
    # Tensor order per sample: ids_0, mask_0, ids_1, mask_1, label.
    datasets[split] = Data.TensorDataset(
        input_ids[split][0],
        attention_mask[split][0],
        input_ids[split][1],
        attention_mask[split][1],
        label_encode[split],
    )

    # Shuffle only the training split; validation keeps the dataframe order.
    sampler_cls = Data.RandomSampler if split == 'train' else Data.SequentialSampler
    split_sampler = sampler_cls(datasets[split])

    loader[split] = Data.DataLoader(datasets[split], sampler=split_sampler, batch_size=batch_size)

In [None]:
# Samples, labels and batches per split ('dev' is stored under the 'val' key).
for tag, split_name in (('train', 'train'), ('dev', 'val')):
    print("#" + tag + " Dataset: ", len(datasets[split_name]))
    print("#" + tag + " Labels: ", len(label_encode[split_name]))
    print("#" + tag + " Loader: ", len(loader[split_name]))

#### Optimizer and scheduler

In [None]:
# Adam over every parameter of the model (both encoders and all heads).
optimizer = th.optim.Adam(params=model.parameters(), lr=1e-4)
# Multiply the learning rate by 0.1 once the epoch counter reaches 30.
# NOTE(review): the scheduler is stepped once per epoch and `epochs` is 20 in
# this notebook, so this milestone is not reached with the current settings.
scheduler = lr_scheduler.MultiStepLR(optimizer=optimizer, milestones=[30], gamma=0.1)

#### Training

In [None]:
import os

# Per-epoch history, consumed by the plotting cell.
train_loss = []
val_loss = []

train_acc = []
val_acc = []

# Lowest mean validation loss seen so far (None until the first epoch ends).
best_val_loss = None

checkpoint_path = '../../checkpoints/checkpoint.pth'
predictions_path = '../../model_results/eval_predictions.csv'


def _run_epoch(split, training):
    """Run one full pass over ``loader[split]``.

    Args:
        split: key into ``loader`` ('train' or 'val').
        training: when True, gradients are computed and the optimizer is
            stepped after every batch; when False the model is put in eval
            mode and gradients are disabled.

    Returns:
        Tuple ``(mean_loss, mean_batch_accuracy, true_ids, predicted_ids)``;
        the two id lists hold plain Python ints in loader order.
    """
    n_batches = len(loader[split])
    total_loss, total_acc = 0.0, 0.0
    true_ids, predicted_ids = [], []

    model.train(training)
    with th.set_grad_enabled(training):
        for step, batch in enumerate(loader[split]):
            # Progress is refreshed on every batch.
            print('\rBatch {:>5,}  of  {:>5,}.'.format(step, n_batches), end='')

            # Batch layout follows the TensorDataset: ids_0, mask_0, ids_1, mask_1, label.
            ids_1, mask_1, ids_2, mask_2, labels = [x.to(device) for x in batch]
            y_true = labels.type(th.long)

            if training:
                optimizer.zero_grad()

            # forward() expects [ids of both tokenizers], [masks of both tokenizers].
            y_pred = model([ids_1, ids_2], [mask_1, mask_2])
            # Keep a (batch, classes) shape even if the model hands back a
            # squeezed 1-D output for a single-sample batch.
            y_pred = y_pred.reshape(y_true.shape[0], -1)

            loss = F.cross_entropy(y_pred, y_true)
            if training:
                loss.backward()
                optimizer.step()
            total_loss += loss.item()

            batch_true = y_true.detach().cpu().tolist()
            batch_pred = y_pred.argmax(dim=1).detach().cpu().tolist()
            total_acc += accuracy_score(batch_true, batch_pred)
            true_ids.extend(batch_true)
            predicted_ids.extend(batch_pred)

    return total_loss / n_batches, total_acc / n_batches, true_ids, predicted_ids


# Moving the model once is enough; it does not change device between batches.
model = model.to(device)

for epoch in range(epochs):
    print("Epoch: {}/{}".format(epoch + 1 , epochs))

    #### Training pass
    t_loss, t_acc, actual_labels, predictions = _run_epoch('train', training=True)
    train_loss.append(t_loss)
    train_acc.append(t_acc)
    f1_measure = f1_score(actual_labels, predictions, average='micro')

    print("Training Loss: ", t_loss)
    print("Train Accuracy: ", t_acc)
    print("F1 measure: ", f1_measure)

    #### Validating the validation samples
    v_loss, v_acc, actual_labels, predictions = _run_epoch('val', training=False)
    val_loss.append(v_loss)
    val_acc.append(v_acc)
    f1_measure = f1_score(actual_labels, predictions, average='micro')

    print("Validation Loss: ", v_loss)
    print("Validation Accuracy: ", v_acc)
    print("Validation F1 measure: ", f1_measure)

    scheduler.step()

    # Keep the checkpoint with the lowest validation loss.
    if best_val_loss is None or v_loss < best_val_loss:
        best_val_loss = v_loss

        os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
        th.save(
            {
                'bert_model_1': model.bert_model_1.state_dict(),
                'classifier_1': model.classifier_1.state_dict(),
                'bert_model_2': model.bert_model_2.state_dict(),
                'classifier_2': model.classifier_2.state_dict(),
                # The per-class output heads live in `model.fcs`.
                'classifier': model.fcs.state_dict(),
                'optimizer': optimizer.state_dict(),
                # Number of epochs completed when this checkpoint was written.
                'epoch': epoch + 1,
            },
            checkpoint_path
        )

        # NOTE(review): ids come from le_encoder but are decoded through
        # ID_Label_Map; confirm the two mappings agree.
        pred_labels = [ID_Label_Map.get(str(class_id), '') for class_id in predictions]
        df = pd.DataFrame(data = {'sentence': dev_sentences, 'actual': dev_actual_labels, 'predict': pred_labels}, columns = ['sentence', 'actual', 'predict'])
        os.makedirs(os.path.dirname(predictions_path), exist_ok=True)
        df.to_csv(predictions_path)

### Accuracy and Loss plots

In [None]:
fig, (ax1, ax2) = plt.subplots(figsize = (8, 4), nrows = 1, ncols=2)

# One panel per metric. Drawing both panels from the same loop keeps the line
# and the scatter markers on the same series (the accuracy panel previously
# scattered the training *loss* values).
epoch_axis = range(epochs)
panels = (
    (ax1, train_loss, val_loss, 'Loss'),
    (ax2, train_acc, val_acc, 'Accuracy'),
)
for ax, train_curve, dev_curve, metric in panels:
    ax.plot(epoch_axis, train_curve, label='train', color='red')
    ax.scatter(epoch_axis, train_curve, color='red')
    ax.plot(epoch_axis, dev_curve, label='dev', color='blue')
    ax.scatter(epoch_axis, dev_curve, color='blue')
    ax.set(xlabel = 'Epochs', ylabel = metric)
    ax.set_title('Train vs Dev ' + metric)
    ax.legend()

fig.savefig('train_vs_test_loss_and_accuracy.jpg')
fig.show()

## Test Data Inference and submission file preparation

### Load the labels list

In [None]:
print(ID_Label_Map)
# Label names ordered by class id; ID_Label_Map is keyed by the id as a string.
labels_list = [ID_Label_Map[str(class_id)] for class_id in range(len(ID_Label_Map))]

print(labels_list)

### Read testset

In [None]:
# Starts empty and is replaced by the parsed sample-submission JSON.
rr_test_data = {}
with open('../../data/model_data/SAMPLE_SUBMISSION_RR.json', 'r') as sample_file:
    rr_test_data = json.load(sample_file)
print("# Documents: ", len(rr_test_data))

In [None]:
# Shallow copy: the per-document dicts are shared with rr_test_data, so the
# labels written below are visible through both names.
final_results = rr_test_data.copy()

# Switch to inference mode once, not once per sentence.
model.to(device)
model.eval()

# Wrapping the data (not the enumerate object) lets tqdm display a total.
for doc_index, entry in enumerate(tqdm(rr_test_data)):
    results = entry['annotations'][0]['result']
    for sent_index, sent in enumerate(results):

        #### Extracting the text from the test data
        # Collapse every whitespace run to one space and trim the ends.
        # (str.replace treats r'\s+' as literal text, so it never matched.)
        sentence = ' '.join(sent['value']['text'].split())

        #### Getting input ids and attention mask from the model for the given sentence
        input_ids_1, attention_mask_1 = encode_input([sentence], model.tokenizer_1)
        input_ids_2, attention_mask_2 = encode_input([sentence], model.tokenizer_2)

        with torch.no_grad():
            y_pred = model(
                [input_ids_1.to(device), input_ids_2.to(device)],
                [attention_mask_1.to(device), attention_mask_2.to(device)],
            ).cpu()

        # Only one sentence is scored, so the flattened argmax is its class id.
        label = labels_list[int(y_pred.argmax())]
        final_results[doc_index]['annotations'][0]['result'][sent_index]['value']['labels'] = [label]

print("# Documents: ", len(final_results))

In [None]:
# Serialize the labelled documents with 4-space indentation.
with open('../../output/RR_SUBMISSION.json', 'w') as submission_file:
    submission_file.write(json.dumps(final_results, indent=4))