In [1]:
# `%pip` installs into the environment of the running kernel, whereas `!pip`
# runs whichever `pip` is first on the shell PATH.
# NOTE(review): presumably the upgrade is needed for
# nn.CrossEntropyLoss(label_smoothing=...) used further down - confirm and pin a version.
%pip install --upgrade torch

In [2]:
from collections import defaultdict
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import f1_score

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

from transformers import AutoTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup

In [3]:
def _read_split(path):
    """Read one tab-separated split, using `tweet_id` as the index."""
    return pd.read_csv(path, sep='\t', index_col='tweet_id')


train = _read_split('../input/classification/train.tsv')
dev = _read_split('../input/classification/dev.tsv')
test = _read_split('../input/classification/test.tsv')

In [4]:
# Alternative checkpoint: 'DeepPavlov/rubert-base-cased-conversational'
PRE_TRAINED_MODEL_NAME = 'cimm-kzn/enrudr-bert'

DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# Seed before the model is instantiated so that a fresh
# "Restart & Run All" starts from the same random state every time.
SEED = 42

MAX_LEN = 128
BATCH_SIZE = 32
EPOCHS = 5
LR = 3e-5
LABEL_SMOOTHING = 0.2

np.random.seed(SEED)
torch.manual_seed(SEED)

DEVICE

In [5]:
# Tokenizer and classifier are both loaded from the same pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    PRE_TRAINED_MODEL_NAME,
)
model = BertForSequenceClassification.from_pretrained(
    PRE_TRAINED_MODEL_NAME,
)

In [6]:
class TweetDataset(Dataset):
    """Map-style dataset that tokenizes one tweet per item.

    Reads the ``tweet`` column of ``df`` and, when it exists, the ``label``
    column. A frame without a ``label`` column is accepted so the same class
    can feed inference on unlabeled data; its items then have no ``'label'``
    key.

    Args:
        df: DataFrame with a ``tweet`` column and optionally a ``label`` column.
        tokenizer: Callable tokenizer; called once per item with the tweet text.
        max_len: Passed to the tokenizer as ``max_length``; every item is
            padded/truncated to this length.
    """

    def __init__(self, df, tokenizer, max_len):
        self.tweets = df['tweet'].tolist()
        # Labels are optional so that unlabeled frames can be used for inference.
        self.labels = df['label'].tolist() if 'label' in df.columns else None
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.tweets)

    def __getitem__(self, item):
        """Return the raw tweet, its encoded tensors and (if available) its label."""
        tweet = self.tweets[item]

        # Call the tokenizer directly: this is the documented entry point and
        # replaces the legacy `encode_plus` method.
        encoding = self.tokenizer(
            tweet,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt',
            )

        sample = {
            'tweet': tweet,
            # Flatten so each item is 1-D and the DataLoader adds the batch axis.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            }

        if self.labels is not None:
            sample['label'] = torch.tensor(self.labels[item], dtype=torch.long)

        return sample

In [7]:
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False, num_workers=0):
    """Wrap ``df`` in a ``TweetDataset`` and return a ``DataLoader`` over it.

    Args:
        df: Frame handed to ``TweetDataset``.
        tokenizer: Tokenizer handed to ``TweetDataset``.
        max_len: Maximum sequence length handed to ``TweetDataset``.
        batch_size: Number of items per batch.
        shuffle: Reshuffle the data every epoch. Defaults to ``False`` so
            existing callers keep the sequential order; pass ``True`` for
            the training split.
        num_workers: Number of loader worker processes. Defaults to ``0``
            (load in the main process), as before.

    Returns:
        A ``DataLoader`` over the wrapped dataset.
    """
    ds = TweetDataset(df, tokenizer=tokenizer, max_len=max_len)

    return DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        )

In [8]:
# Only the training split is shuffled: feeding batches in file order every
# epoch biases the updates if the file is sorted or grouped in any way.
# The loader is built directly so shuffling can be switched on here.
train_data_loader = DataLoader(
    TweetDataset(train, tokenizer=tokenizer, max_len=MAX_LEN),
    batch_size=BATCH_SIZE,
    shuffle=True,
)
# Validation and test keep file order so predictions line up with the frames.
val_data_loader = create_data_loader(dev, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(test, tokenizer, MAX_LEN, BATCH_SIZE)

In [9]:
# Move the model's parameters and buffers onto the selected device.
model = model.to(DEVICE)

In [10]:
# One scheduler step is taken per batch, so the schedule spans every batch of every epoch.
total_steps = EPOCHS * len(train_data_loader)

optimizer = AdamW(model.parameters(), lr=LR)

# The first 10% of the steps are used as warmup.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_training_steps=total_steps,
    num_warmup_steps=int(0.1 * total_steps),
)

loss_fn = nn.CrossEntropyLoss(label_smoothing=LABEL_SMOOTHING)
loss_fn = loss_fn.to(DEVICE)

In [22]:
def train_epoch(
  model, 
  data_loader, 
  loss_fn, 
  optimizer, 
  device, 
  scheduler, 
  n_examples
):
    """Run one optimisation pass over ``data_loader``.

    Each batch must be a mapping with ``"input_ids"``, ``"attention_mask"``
    and ``"label"`` tensors. For every batch the loss is back-propagated,
    gradients are clipped to a norm of 1.0, and both the optimizer and the
    scheduler take one step.

    Args:
        model: Model whose call returns an object with a ``.logits`` attribute.
        data_loader: Iterable of batches as described above.
        loss_fn: Callable ``loss_fn(logits, targets)`` returning a scalar tensor.
        optimizer: Optimizer over ``model``'s parameters.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler, stepped once per batch.
        n_examples: Unused; kept so existing callers keep working.

    Returns:
        Tuple ``(f1, mean_loss)``: the F1 score over all batches and the mean
        of the per-batch losses.
    """
    model = model.train()

    losses = []
    predictions = []
    labels = []

    # A previously interrupted cell may have left gradients behind between
    # `backward()` and `zero_grad()`; clear them so they cannot leak into the
    # first update of this epoch.
    optimizer.zero_grad()

    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["label"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask).logits

        preds = torch.argmax(outputs, dim=1)
        loss = loss_fn(outputs, targets)

        # Collect plain Python ints rather than one 0-d tensor per example.
        predictions.extend(preds.tolist())
        labels.extend(d['label'].tolist())
        losses.append(loss.item())

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    f1 = f1_score(labels, predictions, zero_division=0)

    return f1, np.mean(losses)

In [23]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Score ``model`` on ``data_loader`` without updating it.

    Each batch must be a mapping with ``"input_ids"``, ``"attention_mask"``
    and ``"label"`` tensors. The model is put in eval mode and run with
    gradient tracking disabled.

    Args:
        model: Model whose call returns an object with a ``.logits`` attribute.
        data_loader: Iterable of batches as described above.
        loss_fn: Callable ``loss_fn(logits, targets)`` returning a scalar tensor.
        device: Device the batch tensors are moved to.
        n_examples: Unused; kept so existing callers keep working.

    Returns:
        Tuple ``(f1, mean_loss)``: the F1 score over all batches and the mean
        of the per-batch losses.
    """
    model = model.eval()

    losses = []
    predictions = []
    labels = []

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["label"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask).logits
            preds = torch.argmax(outputs, dim=1)

            loss = loss_fn(outputs, targets)

            # Collect plain Python ints rather than one 0-d tensor per example.
            predictions.extend(preds.tolist())
            labels.extend(d['label'].tolist())
            losses.append(loss.item())

    f1 = f1_score(labels, predictions, zero_division=0)

    return f1, np.mean(losses)

In [24]:
%%time

history = defaultdict(list)
# Start below any value the comparison can see, so the first epoch always
# writes a checkpoint. Starting at 0 with a strict `>` never saves anything
# when validation F1 stays at 0, and the reload cell then fails or picks up
# a stale file from an earlier run.
best_f1 = float('-inf')

for epoch in range(EPOCHS):

    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    train_f1, train_loss = train_epoch(
        model,
        train_data_loader,    
        loss_fn, 
        optimizer, 
        DEVICE, 
        scheduler, 
        len(train)
        )

    print(f'Train loss {train_loss} f1_score {train_f1}')

    val_f1, val_loss = eval_model(
        model,
        val_data_loader,
        loss_fn, 
        DEVICE, 
        len(dev)
        )

    print(f'Val   loss {val_loss} f1_score {val_f1}')
    print()

    history['train_f1'].append(train_f1)
    history['train_loss'].append(train_loss)
    history['val_f1'].append(val_f1)
    history['val_loss'].append(val_loss)

    # Keep only the weights of the epoch with the best validation F1.
    if val_f1 > best_f1:
        torch.save(model.state_dict(), 'model_state.bin')
        best_f1 = val_f1

In [25]:
# Overlay the per-epoch F1 of both splits on one set of axes.
for history_key, legend_label in (
    ('train_f1', 'train f1 score'),
    ('val_f1', 'validation f1 score'),
):
    plt.plot(history[history_key], label=legend_label)

plt.xlabel('Epoch')
plt.ylabel('F1 score')
plt.title('Training history')
plt.legend()
plt.ylim([0, 1]);

In [27]:
# `map_location` remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU also loads on a CPU-only kernel.
model.load_state_dict(torch.load('model_state.bin', map_location=DEVICE))
model = model.to(DEVICE)

In [None]:
def get_predictions(model, data_loader, device):
    """Return the arg-max class index for every example in ``data_loader``.

    Each batch must be a mapping with ``"input_ids"`` and ``"attention_mask"``
    tensors. A ``"label"`` entry is not required, so unlabeled data can be
    scored. The model is put in eval mode and run with gradient tracking
    disabled.

    Args:
        model: Model whose call returns an object with a ``.logits`` attribute.
        data_loader: Iterable of batches as described above.
        device: Device the batch tensors are moved to.

    Returns:
        1-D CPU tensor of predicted class indices in loader order; an empty
        ``torch.long`` tensor when the loader yields no batches.
    """
    model = model.eval()

    batch_predictions = []

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask).logits
            # Keep one tensor per batch instead of one 0-d tensor per example.
            batch_predictions.append(torch.argmax(outputs, dim=1).cpu())

    if not batch_predictions:
        # `torch.cat`/`torch.stack` reject an empty list.
        return torch.empty(0, dtype=torch.long)

    return torch.cat(batch_predictions)

In [None]:
# Predictions for the test split from the current `model`.
y_test = get_predictions(model=model, data_loader=test_data_loader, device=DEVICE)

In [None]:
# F1 of the predictions against the test split's labels.
f1_score(y_true=test['label'], y_pred=y_test)

In [None]:
# Store the predictions as a `pred` column and write the frame out as TSV.
test['pred'] = y_test
test.to_csv(
    'predictions.tsv',
    sep='\t',
)