In [1]:
!git clone https://github.com/tblock/10kGNAD.git

fatal: destination path '10kGNAD' already exists and is not an empty directory.


In [1]:
import torch
import pandas as pd
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from sklearn import preprocessing
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from torch.optim import AdamW
from transformers import get_scheduler
from torch.utils.tensorboard import SummaryWriter

In [2]:
# Both split files use the same layout: ';' as separator, single quotes as the
# quote character and no header row, so the column names are supplied here.
csv_options = dict(sep=';', quotechar="'", header=None, names=['labels', 'text'])
train = pd.read_csv('10kGNAD/train.csv', **csv_options)
test = pd.read_csv('10kGNAD/test.csv', **csv_options)

In [3]:
# Name of the pretrained checkpoint used for both the model and the tokenizer.
model_name = 'bert-base-german-cased'

# Distinct category names found in the training split; their count sizes the
# classification head.
labels = train['labels'].unique().tolist()
num_labels = len(labels)

In [4]:
# Load the pretrained encoder with a classification head sized to `num_labels`.
# Use the GPU when one is available and fall back to the CPU otherwise, so this
# cell does not raise on a machine without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels).to(device)

Some weights of the model checkpoint at bert-base-german-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoi

In [5]:
# Load the tokenizer for the same checkpoint name that the model was loaded from.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [16]:
# Tokenize both splits with identical settings: PyTorch tensors as output,
# padding to the maximum length and truncation of over-long texts.
tokenizer_options = dict(return_tensors="pt", padding='max_length', truncation=True)
train_texts = train["text"].to_list()
test_texts = test["text"].to_list()
tokenized_train = tokenizer(train_texts, **tokenizer_options)
tokenized_test = tokenizer(test_texts, **tokenizer_options)

In [17]:
# Map the string category names to integer class ids.
le = preprocessing.LabelEncoder()
# Fit the mapping on the training labels only ...
train_tokenized_labels = le.fit_transform(train["labels"])
# ... and reuse that same mapping for the test labels. Re-fitting on the test
# split could assign different ids to the same category if the two splits do
# not contain exactly the same set of labels; `transform` raises on an unseen
# label instead of silently remapping.
test_tokenized_labels = le.transform(test["labels"])

In [19]:
# Store the encoded class ids next to the token tensors under the 'labels' key,
# so each split carries its inputs and targets in one mapping.
for encoding, encoded_labels in (
    (tokenized_train, train_tokenized_labels),
    (tokenized_test, test_tokenized_labels),
):
    encoding['labels'] = torch.tensor(encoded_labels)

In [9]:
# TensorBoard writer created with default settings; the training loop logs the
# loss and learning-rate scalars to it.
writer = SummaryWriter()

In [10]:
# Training runs with one sample per optimizer step, so the total number of
# steps is (number of training rows) x (number of epochs). The epoch count must
# be part of this figure: the linear schedule decays the learning rate to zero
# over `num_training_steps`, and would reach zero after the first epoch if only
# the row count were used.
num_epochs = 1
num_training_steps = num_epochs * len(train)

optimizer = AdamW(model.parameters(), lr=5e-5)

# Linear learning-rate decay without warm-up over the whole training run.
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [11]:
# Fine-tune the model one training row at a time (batch size 1).
progress_bar = tqdm(range(num_training_steps))
model.train()

# Put every input tensor on whatever device the model lives on instead of
# assuming a fixed CUDA device.
model_device = model.device

# Running step counter for TensorBoard. Using the row index alone would make
# the points of later epochs overwrite those of the first one.
global_step = 0

for epoch in range(num_epochs):
    for row in range(len(train.index)):
        # Slice with row:row+1 to keep the leading batch dimension.
        model_input = {
            key: tokenized_train[key][row:row + 1].to(model_device)
            for key in tokenized_train.keys()
        }

        optimizer.zero_grad()

        outputs = model(
            model_input['input_ids'],
            attention_mask=model_input['attention_mask'],
            labels=model_input['labels'],
        )
        loss = outputs.loss
        # Log a plain float rather than the graph-attached loss tensor.
        writer.add_scalar('Loss/train', loss.item(), global_step)
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        writer.add_scalar('LR/train', lr_scheduler.get_last_lr()[0], global_step)
        progress_bar.update(1)
        global_step += 1

# Finish the progress bar and make sure all logged scalars reach disk.
progress_bar.close()
writer.flush()

  0%|          | 0/9245 [00:00<?, ?it/s]

# Inference

In [24]:
# Switch the model to evaluation mode for inference (the counterpart of the
# model.train() call made before fine-tuning).
model = model.eval()

In [20]:
def get_sample(idx):
    """Return test sample `idx` as a batch of one, ready to feed to the model.

    Args:
        idx: Row position of the sample in the tokenized test split.

    Returns:
        A dict with one entry per key of `tokenized_test`; each value is the
        slice `[idx:idx+1]` of the corresponding tensor (so the leading batch
        dimension is kept), moved to the device the model is on.
    """
    # Iterate over the keys of the split that is actually indexed (the test
    # split), and follow the model's device instead of a hard-coded one.
    target_device = model.device
    return {
        key: tokenized_test[key][idx:idx + 1].to(target_device)
        for key in tokenized_test.keys()
    }

In [92]:
from torch import nn

# Example index kept at module level; predict_label does not read it, because
# its parameter of the same name shadows it.
sample_idx = 23

def predict_label(sample_idx):
    """Classify one test sample and return the prediction with its ground truth.

    Args:
        sample_idx: Row position of the sample in the test split.

    Returns:
        Tuple `(pred_label, true_label)`: the predicted class id as a Python
        int and the encoded label stored for that row in
        `test_tokenized_labels`.
    """
    sample = get_sample(sample_idx)
    # Inference only: skip autograd bookkeeping so no graph is built per call.
    with torch.no_grad():
        prediction = model(**sample)

    # Softmax is monotonic, so the arg-max of the raw logits already is the
    # most probable class; the batch holds exactly one sample.
    pred_label = prediction.logits.argmax(dim=-1).item()
    true_label = test_tokenized_labels[sample_idx]
    return pred_label, true_label

In [113]:
# Run the model over every test row and collect predictions and ground truth.
true_list = list()
pred_list = list()
for idx in tqdm(range(len(test.index))):
    # predict_label returns (pred_label, true_label) -- unpack in that order so
    # that `target` really holds the ground truth and `preds` the predictions.
    pred, true = predict_label(idx)
    true_list.append(true)
    pred_list.append(pred)

target = torch.tensor(true_list)
preds = torch.tensor(pred_list)

  0%|          | 0/1028 [00:00<?, ?it/s]

In [114]:
# Compute the share of test samples whose predicted class id equals the true one.
# NOTE(review): Accuracy() is built without arguments here; newer torchmetrics
# releases appear to require a `task` argument -- confirm against the installed version.
from torchmetrics import Accuracy
accuracy = Accuracy()
model_acc = accuracy(preds, target)
print(f'Model accuracy is {model_acc}')

Model accuracy is 0.8803501725196838
