In [1]:
#%pip install torch --user
import json
import os
import numpy as np 
import pandas as pd
import torch
import torch.functional as F
import torch.nn as nn

%pip install transformers --user
%pip install openpyxl --user

from transformers import BertTokenizerFast,  BatchEncoding, BertModel, \
                            BertForTokenClassification
from tokenizers import Encoding

# %pip install ipywidgets --upgrade
# %pip install jupyter --upgrade
# %pip install seaborn --user
# %pip install sentencepiece

import seaborn as sns 
import matplotlib.pyplot as plt

import tqdm


torch.manual_seed(42)

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Note: you may need to restart the kernel to use updated packages.
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Note: you may need to restart the kernel to use updated packages.


<torch._C.Generator at 0x1550cd626e90>

In [2]:
import pandas as pd
from utils import preproc
df = pd.read_excel("data/appliedAI_Use Case Library - Risk Class_TUM.ai.xlsx", index_col=0, header=2)

df = preproc(df)


In [3]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df, df['risk'],
                                                    stratify=df['risk'], 
                                                    test_size=0.25,
                                                   random_state=42)

In [4]:
y_test

Use Case ID
86      low-risk
57      low-risk
8       low-risk
30      low-risk
114     low-risk
36     high-risk
56     high-risk
87      low-risk
62      low-risk
92      low-risk
91      low-risk
80      low-risk
108     low-risk
89      low-risk
7       low-risk
5      high-risk
68      low-risk
85      low-risk
32      low-risk
42     high-risk
21      low-risk
76      low-risk
49      low-risk
34      low-risk
117     low-risk
Name: risk, dtype: object

In [5]:
from torch.utils.data import Dataset


# create a dataset - currently all files loaded beforehand (woeks for smaller datasets)
class RDataset(Dataset):
    # A pytorch dataset class for holding data for a text labeling task.
    # gets a parent directory of several datasets' directories
    def __init__(self, model_name, X, y):
        '''
        Takes as input the name of a file containing sentences with a classification label (comma separated) in each line.
        Stores the text data in a member variable X and labels in y
        '''
        
        # Load a pre-trained tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(model_name) 

        self.encodings = self.tokenizer(X["desc"].to_list(), return_tensors="pt", padding=True)
        self.y = y.replace({"low-risk": 0.0, "high-risk": 1.0}).to_list()
        
        

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.y[idx])
        return item

    def __len__(self):
        return len(self.y)


In [6]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

model_name = "MoritzLaurer/DeBERTa-v3-xsmall-mnli-fever-anli-ling-binary"
dataset_train = RDataset(model_name,  X_train, y_train)
dataset_test = RDataset(model_name, X_test, y_test)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1, ignore_mismatched_sizes=True)
# model.config.num_labels = 1


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at MoritzLaurer/DeBERTa-v3-xsmall-mnli-fever-anli-ling-binary and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([2, 384]) in the checkpoint and torch.Size([1, 384]) in the model instantiated
- classifier.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([1]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
from torch.utils.data import DataLoader



In [8]:
# Check PyTorch GPU capabilities:

print("\nPyTorch:")
if torch.cuda.is_available():
    device = torch.device("cuda")
    print('%d GPU(s) available.' % torch.cuda.device_count())
    print('GPU-Name:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

# device = torch.device("cpu")

model = model.to(device)


PyTorch:
1 GPU(s) available.
GPU-Name: NVIDIA A40


In [9]:
#
## other training parameters
clip = 0.25            #gradient clipping
lr = 0.00001#0.00003           #initial learning rate
wdecay=1.2e-6          #weight decay applied to all weights
epochs = 2             #maximum number of epochs
#save = 'models/model.pt'      #path to save the final model

train_max_number_batches = -1 # only for the sake of debugging. Set to -1 to be ignored
inference_max_number_batches = -1 # only for the sake of debugging. Set to -1 to be ignored

## log parameters
log_interval = 100     #log interval during training
log_interval_val = 100 #log interval during validation

In [10]:

class Learner():
    def __init__(self,train_loader, val_loader, model, optimizer, device, loss_fn=None):
        self.train_loader= train_loader
        self.val_loader = val_loader
        self.model = model 
        self.optimizer = optimizer
        self.loss_fn = loss_fn
        self.device = device
        
    def evaluate(self, loader):
        eval_loss = 0
        self.model.eval()
        for b, inp in tqdm.tqdm(enumerate(loader), total=len(loader), position=0, leave=True):
            with torch.no_grad():
                inp = {k:v.clone().detach().to(device) for (k,v) in inp.items()}
                
                output = self.model(**inp)
                #loss = self.loss_fn(log_pred, y)
                logits = output.logits
                
                loss = self.loss_fn(logits, inp["labels"], pos_weight=torch.tensor(9))
                
                eval_loss += loss.item()
            
        eval_loss /= len(loader)
        
        return eval_loss
    
    def train(self,n_epochs, train_losses, val_losses, early_stopping=True, es_limit=3):

        if early_stopping:
            min_val_loss = np.inf
            no_improve = 0
        
        
        for e in range(n_epochs):
            # train
            train_loss = 0
            self.model.train()
            for b, inp in tqdm.tqdm(enumerate(self.train_loader), total=len(self.train_loader), position=0, leave=True):
                inp = {k:v.clone().detach().to(device) for (k,v) in inp.items()}

                self.optimizer.zero_grad()
                output = self.model(**inp)

                logits = output.logits

                print(inp["labels"])
                loss = self.loss_fn(logits, inp["labels"], pos_weight=torch.tensor(10))

                loss.backward()

                self.optimizer.step()

            # evaluate (train, validation)
            train_loss = self.evaluate(self.train_loader)
            val_loss = self.evaluate(self.val_loader)

            # early stopping
            if early_stopping:
                if val_loss < min_val_loss:
                    min_val_loss = val_loss
                    torch.save(self.model, "models/deberta_risk")
                    no_improve = 0
                else:
                    no_improve += 1
            if no_improve > es_limit:
                print("Early stopped")
                self.model = torch.load("models/deberta_risk")
                break
            
            print(f"After {e+1} epochs: ")
            print(f"Train loss: {train_loss:.3}")
            print(f"Val loss: {val_loss:.3}\n")
                
            train_losses.append(train_loss)
            val_losses.append(val_loss)
        return train_losses, val_losses
    
    

train_loader = DataLoader(dataset_train, batch_size=8, shuffle=True, num_workers=0)
val_loader = DataLoader(dataset_test, batch_size=16, shuffle=True, num_workers=0)

optim = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wdecay)
loss_fn = torch.nn.functional.binary_cross_entropy_with_logits

train_losses, val_losses = [], []

In [11]:
learner = Learner(train_loader, val_loader, model, optim, device, loss_fn=loss_fn)
train_losses, val_losses = learner.train(50, train_losses, val_losses)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  attention_scores = torch.bmm(query_layer, key_layer.transpose(-1, -2)) / torch.tensor(
  score += c2p_att / torch.tensor(scale, dtype=c2p_att.dtype)
  score += p2c_att / torch.tensor(scale, dtype=p2c_att.dtype)
 30%|███       | 3/10 [00:00<00:01,  3.75it/s]

tensor([0., 0., 0., 0., 0., 0., 0., 1.], device='cuda:0')
tensor([1., 1., 0., 0., 0., 0., 0., 0.], device='cuda:0')
tensor([0., 0., 0., 0., 0., 0., 0., 1.], device='cuda:0')


 50%|█████     | 5/10 [00:01<00:00,  6.13it/s]

tensor([1., 0., 1., 0., 0., 0., 0., 0.], device='cuda:0')
tensor([0., 0., 0., 0., 0., 0., 0., 0.], device='cuda:0')
tensor([0., 0., 0., 0., 0., 0., 0., 0.], device='cuda:0')
tensor([0., 0., 0., 0., 0., 0., 0., 1.], device='cuda:0')


100%|██████████| 10/10 [00:01<00:00,  6.97it/s]
  0%|          | 0/10 [00:00<?, ?it/s]

tensor([0., 1., 0., 0., 1., 0., 0., 0.], device='cuda:0')
tensor([0., 0., 0., 0., 0., 1., 0., 0.], device='cuda:0')
tensor([0.], device='cuda:0')


100%|██████████| 10/10 [00:00<00:00, 56.48it/s]
100%|██████████| 2/2 [00:00<00:00, 32.08it/s]
 20%|██        | 2/10 [00:00<00:00, 14.00it/s]

After 1 epochs: 
Train loss: 1.96
Val loss: 1.65

tensor([0., 0., 1., 0., 1., 0., 0., 0.], device='cuda:0')
tensor([0., 0., 0., 0., 0., 0., 0., 1.], device='cuda:0')
tensor([0., 0., 0., 1., 0., 0., 0., 0.], device='cuda:0')


 60%|██████    | 6/10 [00:00<00:00, 14.80it/s]

tensor([0., 1., 0., 0., 0., 0., 0., 0.], device='cuda:0')
tensor([0., 0., 0., 1., 0., 0., 0., 0.], device='cuda:0')
tensor([1., 0., 0., 0., 0., 1., 0., 0.], device='cuda:0')
tensor([0., 0., 1., 1., 0., 0., 0., 0.], device='cuda:0')


100%|██████████| 10/10 [00:00<00:00, 15.11it/s]
  0%|          | 0/10 [00:00<?, ?it/s]

tensor([0., 0., 0., 0., 0., 0., 0., 0.], device='cuda:0')
tensor([0., 0., 0., 0., 0., 0., 0., 0.], device='cuda:0')
tensor([0.], device='cuda:0')


100%|██████████| 10/10 [00:00<00:00, 52.12it/s]
100%|██████████| 2/2 [00:00<00:00, 32.23it/s]
 20%|██        | 2/10 [00:00<00:00, 14.14it/s]

After 2 epochs: 
Train loss: 1.35
Val loss: 1.35

tensor([0., 0., 0., 0., 0., 0., 0., 0.], device='cuda:0')
tensor([0., 0., 1., 0., 0., 0., 0., 0.], device='cuda:0')
tensor([0., 0., 0., 0., 0., 0., 1., 0.], device='cuda:0')


 60%|██████    | 6/10 [00:00<00:00, 14.91it/s]

tensor([0., 0., 0., 0., 1., 0., 0., 0.], device='cuda:0')
tensor([0., 0., 0., 0., 0., 0., 0., 0.], device='cuda:0')
tensor([0., 0., 1., 0., 0., 1., 0., 0.], device='cuda:0')


100%|██████████| 10/10 [00:00<00:00, 15.14it/s]
  0%|          | 0/10 [00:00<?, ?it/s]

tensor([0., 0., 0., 0., 0., 1., 0., 1.], device='cuda:0')
tensor([0., 0., 0., 0., 0., 0., 1., 0.], device='cuda:0')
tensor([0., 0., 0., 1., 0., 0., 0., 0.], device='cuda:0')
tensor([1.], device='cuda:0')


100%|██████████| 10/10 [00:00<00:00, 53.77it/s]
100%|██████████| 2/2 [00:00<00:00, 32.18it/s]
 20%|██        | 2/10 [00:00<00:00, 15.40it/s]

After 3 epochs: 
Train loss: 1.32
Val loss: 1.58

tensor([0., 0., 1., 0., 0., 0., 0., 0.], device='cuda:0')
tensor([0., 1., 0., 1., 0., 0., 0., 0.], device='cuda:0')
tensor([0., 0., 0., 0., 0., 0., 0., 1.], device='cuda:0')


 60%|██████    | 6/10 [00:00<00:00, 15.33it/s]

tensor([0., 0., 0., 0., 0., 0., 0., 0.], device='cuda:0')
tensor([0., 0., 0., 0., 0., 0., 0., 0.], device='cuda:0')
tensor([1., 0., 0., 0., 0., 0., 0., 1.], device='cuda:0')
tensor([0., 0., 0., 0., 0., 0., 1., 1.], device='cuda:0')


100%|██████████| 10/10 [00:00<00:00, 15.33it/s]
  0%|          | 0/10 [00:00<?, ?it/s]

tensor([0., 0., 0., 1., 0., 0., 0., 0.], device='cuda:0')
tensor([0., 0., 0., 0., 0., 0., 0., 0.], device='cuda:0')
tensor([1.], device='cuda:0')


100%|██████████| 10/10 [00:00<00:00, 55.44it/s]
100%|██████████| 2/2 [00:00<00:00, 32.01it/s]
 20%|██        | 2/10 [00:00<00:00, 14.70it/s]

After 4 epochs: 
Train loss: 1.27
Val loss: 1.5

tensor([0., 0., 0., 0., 0., 0., 0., 1.], device='cuda:0')
tensor([0., 0., 0., 0., 0., 0., 1., 1.], device='cuda:0')
tensor([0., 0., 0., 0., 1., 0., 0., 1.], device='cuda:0')


 60%|██████    | 6/10 [00:00<00:00, 14.81it/s]

tensor([0., 0., 0., 0., 0., 0., 0., 0.], device='cuda:0')
tensor([0., 1., 0., 0., 0., 0., 0., 1.], device='cuda:0')
tensor([0., 0., 0., 1., 0., 0., 0., 0.], device='cuda:0')
tensor([0., 1., 0., 0., 0., 0., 0., 0.], device='cuda:0')


100%|██████████| 10/10 [00:00<00:00, 15.37it/s]
  0%|          | 0/10 [00:00<?, ?it/s]

tensor([0., 0., 0., 1., 0., 0., 0., 0.], device='cuda:0')
tensor([0., 0., 0., 0., 0., 0., 0., 0.], device='cuda:0')
tensor([0.], device='cuda:0')


100%|██████████| 10/10 [00:00<00:00, 55.48it/s]
100%|██████████| 2/2 [00:00<00:00, 32.89it/s]
 20%|██        | 2/10 [00:00<00:00, 15.46it/s]

After 5 epochs: 
Train loss: 1.24
Val loss: 1.48

tensor([0., 0., 0., 0., 0., 1., 0., 0.], device='cuda:0')
tensor([0., 0., 0., 0., 0., 0., 1., 0.], device='cuda:0')
tensor([0., 0., 1., 0., 0., 0., 0., 0.], device='cuda:0')


 60%|██████    | 6/10 [00:00<00:00, 15.63it/s]

tensor([1., 0., 1., 0., 0., 0., 0., 0.], device='cuda:0')
tensor([0., 1., 0., 0., 0., 0., 0., 0.], device='cuda:0')
tensor([0., 0., 0., 1., 0., 0., 0., 0.], device='cuda:0')
tensor([0., 0., 1., 1., 0., 0., 0., 0.], device='cuda:0')


100%|██████████| 10/10 [00:00<00:00, 15.75it/s]
  0%|          | 0/10 [00:00<?, ?it/s]

tensor([0., 0., 0., 0., 0., 0., 0., 0.], device='cuda:0')
tensor([0., 1., 0., 0., 0., 0., 0., 0.], device='cuda:0')
tensor([0.], device='cuda:0')


100%|██████████| 10/10 [00:00<00:00, 54.26it/s]
100%|██████████| 2/2 [00:00<00:00, 32.30it/s]


Early stopped


In [12]:
def predict(model, loader, device="cpu", verbose=False):
    model.eval()
    model.to(device)
    
    y_pred = []
    y_true = []
    
    for b, inp in tqdm.tqdm(enumerate(loader), total=len(loader), position=0, leave=True):
        with torch.no_grad():
            inp = {k:v.to(device) for (k,v) in inp.items()}    
            output = model(**inp)
            
        y_true.append(inp["labels"])
        print(inp["labels"])
        #print(amask)
        print(output.logits)
        y_pred.append((output.logits > 0).float())
    #print(len(y_pred))
    #print(y_pred[0].shape)
    return torch.cat(y_pred, dim=0).to("cpu"), torch.cat(y_true, dim=0).to("cpu") 

pred_train, y_train = predict(model, train_loader, device=device, verbose=False)
pred_val, y_val = predict(model, val_loader, device=device, verbose=False)
# pred_test, y_test = predict(model, test_loader, device=device, verbose=False)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
100%|██████████| 10/10 [00:00<00:00, 49.83it/s]
  0%|          | 0/2 [00:00<?, ?it/s]

tensor([0., 1., 0., 0., 0., 0., 0., 0.], device='cuda:0')
tensor([ 0.1931,  0.2950, -0.0117,  0.1630,  0.6543,  0.1556, -0.1576,  0.0365],
       device='cuda:0')
tensor([0., 0., 0., 0., 0., 1., 0., 1.], device='cuda:0')
tensor([ 0.3262, -0.0452, -0.1705, -0.2292,  0.0963,  0.7271,  0.2569,  0.5412],
       device='cuda:0')
tensor([1., 0., 0., 0., 1., 0., 0., 0.], device='cuda:0')
tensor([ 0.5149, -0.0600, -0.4120,  0.3254,  0.7488, -0.4367, -0.1553, -0.0601],
       device='cuda:0')
tensor([0., 0., 0., 0., 0., 0., 0., 0.], device='cuda:0')
tensor([ 0.2424, -0.0601, -0.4120, -0.0114, -0.4218,  0.2817, -0.0954,  0.1764],
       device='cuda:0')
tensor([0., 0., 0., 0., 0., 0., 0., 0.], device='cuda:0')
tensor([-0.0805,  0.0685,  0.7223,  0.4202, -0.0804, -0.0165, -0.0031, -0.2157],
       device='cuda:0')
tensor([0., 0., 1., 0., 0., 0., 0., 0.], device='cuda:0')
tensor([ 0.0846,  0.1497,  0.7223, -0.2732, -0.4120,  0.2289, -0.0682, -0.0804],
       device='cuda:0')
tensor([0., 0., 0., 0.

100%|██████████| 2/2 [00:00<00:00, 31.36it/s]

tensor([1., 1., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       device='cuda:0')
tensor([ 0.6140,  0.2814,  0.0466,  0.1805,  0.3158,  0.5411,  0.2106, -0.0804,
         0.0591,  0.1652,  0.0846,  0.6543, -0.1230,  0.4593, -0.3122,  0.2013],
       device='cuda:0')
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0.], device='cuda:0')
tensor([ 0.0562,  0.0094,  0.1523,  0.1680, -0.0922,  0.1607,  0.5349,  0.1764,
        -0.1382], device='cuda:0')





In [13]:
y_train, y_test



(tensor([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 1., 0.,
         0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
         0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
         0.]),
 Use Case ID
 86      low-risk
 57      low-risk
 8       low-risk
 30      low-risk
 114     low-risk
 36     high-risk
 56     high-risk
 87      low-risk
 62      low-risk
 92      low-risk
 91      low-risk
 80      low-risk
 108     low-risk
 89      low-risk
 7       low-risk
 5      high-risk
 68      low-risk
 85      low-risk
 32      low-risk
 42     high-risk
 21      low-risk
 76      low-risk
 49      low-risk
 34      low-risk
 117     low-risk
 Name: risk, dtype: object)

In [14]:
pred_train

tensor([1., 1., 0., 1., 1., 1., 0., 1., 1., 0., 0., 0., 1., 1., 1., 1., 1., 0.,
        0., 1., 1., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 1., 0., 1., 1., 1.,
        0., 0., 0., 0., 1., 1., 1., 0., 0., 1., 0., 0., 1., 1., 1., 1., 1., 1.,
        0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0., 0., 1., 1.,
        0.])

In [15]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [17]:
for name, pred, y in zip(["TRAIN", "VAL"], [pred_train, pred_val], [y_train, y_val]):
    print(f"Summary on {name} set:")
    print()
    print(classification_report(y.ravel(), pred.ravel(), target_names=["low-res", "high-res"]))
    print(confusion_matrix(y.ravel(), pred.ravel()))
    print("*"*53)
    print()

Summary on TRAIN set:

              precision    recall  f1-score   support

     low-res       1.00      0.57      0.73        63
    high-res       0.27      1.00      0.43        10

    accuracy                           0.63        73
   macro avg       0.64      0.79      0.58        73
weighted avg       0.90      0.63      0.69        73

[[36 27]
 [ 0 10]]
*****************************************************

Summary on VAL set:

              precision    recall  f1-score   support

     low-res       1.00      0.24      0.38        21
    high-res       0.20      1.00      0.33         4

    accuracy                           0.36        25
   macro avg       0.60      0.62      0.36        25
weighted avg       0.87      0.36      0.38        25

[[ 5 16]
 [ 0  4]]
*****************************************************

