In [2]:
import os
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer
from transformers import AdamW
import math
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import json
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
import warnings
warnings.filterwarnings('always')

In [3]:
def get_data(path):
    data=pd.read_csv(path)

    # train=data.iloc[:16]
    # test=data.iloc[16:32]
    # val=data.iloc[32:48]

    train=data.iloc[:800]
    test=data.iloc[800:900]
    val=data.iloc[900:1000]
    train['ranked-sentences']=train['ranked-sentences'].apply(eval)
    test['ranked-sentences']= test['ranked-sentences'].apply(eval)
    val['ranked-sentences']=val['ranked-sentences'].apply(eval)
    train['sentences']=train['sentences'].apply(eval)
    test['sentences']=test['sentences'].apply(eval)
    val['sentences']=val['sentences'].apply(eval)
    return train, val, test


class Dataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.df = df
        self.decisions = self.df.decision.map({'dismissed': 0, 'granted': 1})
        self.ranked_sentences = self.df['ranked-sentences']
        self.sentence_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        sample = {}
        lines = self.df.iloc[idx]['sentences']
        embeddings = self.sentence_model.encode(
            lines
        )

        labels = [0] * len(lines)
        indices = [lines.index(i) for i in self.ranked_sentences.iloc[idx]]
        for i in indices[:len(labels)//2]:
            labels[i] = 1


        sample['embeddings'] = torch.from_numpy(embeddings)
        sample['bail'] = torch.Tensor([self.decisions.iloc[idx]])
        sample['salience_labels'] = torch.LongTensor(labels)

        return sample 
    
def custom_collate(batch):
    bails, labels, embs = [], [], []
    for item in batch:
        bails.append(item['bail'])
        labels.append(item['salience_labels'])
        embs.append(item['embeddings'])

    bails = pad_sequence(bails, batch_first=True)
    embs = pad_sequence(embs, batch_first=True)
    labels = pad_sequence(labels, padding_value=-100, batch_first=True)
    return embs, bails.long(), labels.long()


In [4]:

class MultiTaskModel(nn.Module):
    def __init__(self,
                 nhead=1,
                 nlayers=1,
                 use_cls=True,
                 #  cls_bail_embed=None,
                 d_model=768):
        super(MultiTaskModel, self).__init__()
        self.saliency_classifier = nn.Linear(d_model, 2)
        self.bail_classifier = nn.Linear(d_model, 2)

        # Use [cls] token or pooling output for bail prediction
        self.use_cls = use_cls
        self.d_model = d_model
        if use_cls:
            self.cls_bail_embed = nn.Embedding(1, self.d_model)

        self.encoder_layer = nn.TransformerEncoder(nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            batch_first=True),
            nlayers,
            norm=None)

    def forward(self, x):
        batch_size = x.size()[0]
        if self.use_cls:
            x = torch.cat([self.cls_bail_embed.weight[0].unsqueeze(
                0).repeat(batch_size, 1, 1), x], dim=1)

        x = self.encoder_layer(x)

        if self.use_cls:
            bail_x = x[:, 0, :]
            saliency_x = x[:, 1:, :]
        else:
            bail_x = torch.sum(x, dim=1)
            saliency_x = x
        bail_logits = self.bail_classifier(bail_x)  
        saliency_logits = self.saliency_classifier(
            saliency_x)  
        return bail_logits, saliency_logits


In [5]:
def train_step( model, dataloader, device, optimizer):
    model.train()
    total_loss = 0
    for batch in dataloader:
        embeddings, bail_label, saliency_label = batch 
        embeddings = embeddings.to(device)
        bail_label = bail_label.to(device)
        saliency_label = saliency_label.to(device)
        optimizer.zero_grad()
        bail_logits, saliency_logits = model(embeddings) ## mask

        saliency_logits = saliency_logits.contiguous().view(-1, saliency_logits.size(-1))
        saliency_label = saliency_label.contiguous().view(-1)

        loss_bail = F.cross_entropy(bail_logits, bail_label.squeeze(1))
        loss_saliency = F.cross_entropy(saliency_logits, saliency_label)

        loss = loss_bail + loss_saliency 
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    return total_loss / len(dataloader)

In [6]:
def eval_step(model, dataloader, device):
    model.eval()
    total_loss = 0
    targets = []
    predictions = []
    for batch in dataloader:
            embeddings, bail_label, saliency_label = batch 
            embeddings = embeddings.to(device)
            bail_label = bail_label.to(device)
            saliency_label = saliency_label.to(device)

            model.zero_grad()
            with torch.no_grad():
                bail_logits, saliency_logits = model(embeddings) ## mask

            saliency_logits = saliency_logits.contiguous().view(-1, saliency_logits.size(-1))
            saliency_label = saliency_label.contiguous().view(-1)

            loss_bail = F.cross_entropy(bail_logits, bail_label.squeeze(1))
            loss_saliency = F.cross_entropy(saliency_logits, saliency_label)

            loss = loss_bail + loss_saliency 
            total_loss += loss.item()
            
            pred_bail = torch.argmax(bail_logits, dim=1).flatten().cpu().numpy()
            predictions.append(pred_bail)
            targets.append(bail_label.squeeze(1).cpu().numpy())

    targets = np.concatenate(targets, axis=0)
    predictions = np.concatenate(predictions, axis=0)
    
    epoch_loss = total_loss/len(dataloader)
    return epoch_loss, targets, predictions

In [7]:
input_path = "./data/summary/data_ranked.csv"
batch_size = 16
 

device =torch.device("cpu")
if torch.cuda.is_available():
    device = torch.device("cuda:0")

train, val, test = get_data(input_path)
train_dataset = Dataset(train)
val_dataset = Dataset(val)
test_dataset = Dataset(test)

train_dataloader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=custom_collate
)
val_dataloader = torch.utils.data.DataLoader(
    dataset=val_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=custom_collate
)
test_dataloader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=custom_collate
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['ranked-sentences']=train['ranked-sentences'].apply(eval)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['ranked-sentences']= test['ranked-sentences'].apply(eval)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val['ranked-sentences']=val['ranked-sentences'].apply(eval)
A value is trying 

In [8]:
epochs=100
save_model = True 
model_path = "./bail_model.pt"
d_model = 768

model = MultiTaskModel(d_model=d_model)

model.to(device)
optimizer = AdamW(model.parameters(), lr=5*1e-5)
best_loss = np.inf
best_epoch = 0
for epoch in range(epochs):
    train_loss = train_step(model, train_dataloader, device, optimizer)

    val_loss,_,_ = eval_step(model, val_dataloader, device)

    print(f"\nEpoch: {epoch+1} | Training loss: {train_loss} | Validation Loss: {val_loss}")
    if (val_loss < best_loss) and (save_model == True):
        torch.save(model.state_dict(), model_path)
        best_loss = val_loss
        best_epoch = epoch+1




Epoch: 1 | Training loss: 1.4851136207580566 | Validation Loss: 1.318014315196446


KeyboardInterrupt: 

In [1]:
loaded_state_dict = torch.load(model_path,  map_location=device)
model.load_state_dict(loaded_state_dict)

test_loss,targets,predictions = eval_step(model, test_dataloader, device)
accuracy = np.sum(targets == predictions)/len(targets)
print(f"Accuracy: {accuracy}")
ConfusionMatrixDisplay.from_predictions(targets, predictions)
plt.savefig(f"./confusion_matrix.png", dpi=300)

NameError: name 'torch' is not defined