In [1]:
import pandas as pd
import numpy as np

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

import warnings
warnings.filterwarnings("ignore")

In [2]:
device = 'cuda:2' if torch.cuda.is_available() else 'cpu'

model_checkpoint = 'cointegrated/rubert-base-cased-nli-threeway'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=13, ignore_mismatched_sizes=True)
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-base-cased-nli-threeway and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([13, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([13]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [3]:
df = pd.read_csv('../data/raw/orgs_df.csv').drop(columns=['ogrn'])
df.aimid = df.aimid - np.repeat(1, df.shape[0])

In [4]:
input_ids = df.fullname.apply(lambda x: tokenizer.encode(x, add_special_tokens=True))
max_len = max(input_ids.apply(len))
max_len

84

In [5]:
full_tokenized_text = df.fullname.apply(
    lambda x: tokenizer.encode_plus(
        x, 
        add_special_tokens=True, 
        max_length=max_len, 
        pad_to_max_length=True, # padding='longest' does not work correctly in this version
        return_attention_mask=True,
        truncation=True
    )
)

input_ids = torch.tensor(full_tokenized_text.apply(lambda x: x['input_ids']), dtype=torch.float64)
token_type_ids = torch.tensor(full_tokenized_text.apply(lambda x: x['token_type_ids']), dtype=torch.float64)
attention_mask = torch.tensor(full_tokenized_text.apply(lambda x: x['attention_mask']), dtype=torch.float64)
labels = torch.tensor(df.aimid, dtype=torch.float64)

In [6]:
from torch.utils.data import TensorDataset
from sklearn.model_selection import train_test_split

dataset = TensorDataset(input_ids, token_type_ids, attention_mask, labels)
train_indices, val_indices = train_test_split(list(range(len(labels))), test_size=0.3, stratify=labels)

train_dataset = torch.utils.data.Subset(dataset, train_indices)
val_dataset = torch.utils.data.Subset(dataset, val_indices)

In [7]:
from torch.utils.data import DataLoader, SequentialSampler

batch_size = 32

train_dataloader = DataLoader(
    train_dataset,
    sampler=SequentialSampler(train_dataset),
    batch_size=batch_size
)

validation_dataloader = DataLoader(
    val_dataset,
    sampler=SequentialSampler(val_dataset),
    batch_size=batch_size
)

In [8]:
from transformers import get_linear_schedule_with_warmup
from transformers import AdamW

optimizer = AdamW(
    model.parameters(),
    lr=5e-5,
    eps=1e-8
)

epochs = 3
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(
    optimizer, 
    num_warmup_steps=0,
    num_training_steps=total_steps
)

In [9]:
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score

def get_preds_and_labels_flatted(logits_pred, true_ids):
    preds_flat = np.argmax(logits_pred, axis=1).flatten()
    labels_flat = true_ids.flatten()
    
    return preds_flat, labels_flat


def get_all_metrics(logits_flatted, true_ids_flatted):
    preds_flat, labels_flat = get_preds_and_labels_flatted(logits_flatted, true_ids_flatted)
    
    return {
        'accuracy_score': accuracy_score(preds_flat, labels_flat),
        'recall_score': recall_score(preds_flat, labels_flat, average='micro'),
        'precision_score': precision_score(preds_flat, labels_flat, average='micro'),
        'f1_score': f1_score(preds_flat, labels_flat, average='micro'),
    }

In [10]:
import random

seed_val = 66
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [12]:
import time

for epoch_i in range(epochs):
    t0 = time.time()
    train_loss = 0

    model.train()

    for step, batch in enumerate(train_dataloader):
        b_input_ids = torch.tensor(batch[0]).to(device).long()
        b_token_type_ids = torch.tensor(batch[1]).to(device).long()
        b_attention_mask = torch.tensor(batch[2]).to(device).long()
        b_labels = torch.tensor(batch[3]).to(device).long()

        model.zero_grad()

        outputs = model(
            input_ids=b_input_ids,
            token_type_ids=b_token_type_ids,
            attention_mask=b_attention_mask,
            labels=b_labels
        )
        
        loss = outputs.loss
        logits = outputs.logits
        
        train_loss = loss.item()
        loss.backward()
        
        optimizer.step()
        scheduler.step()
        
        if step % 50 == 0 and not step == 0:
            spent = time.time() - t0
            
            current_loss = train_loss / batch_size
            
            print('Batch {:} of {:}. Spent: {:}. Current_loss {:}'.format(step, len(train_dataloader), spent, current_loss))
        
    avg_train_loss = train_loss / len(train_dataloader)
    training_time = time.time() - t0
    
    print("Average training loss: {0:.2f}".format(avg_train_loss))
    print("Training epcoh took: {:}".format(training_time))
    
    print("Validation:")
    
    t0 = time.time()
    model.eval()
    
    eval_metrics = {
        'accuracy_score': 0,
        'recall_score': 0,
        'precision_score': 0,
        'f1_score': 0,
    }
    
    eval_loss = 0
    
    for batch in validation_dataloader:
        
        b_input_ids = torch.tensor(batch[0]).to(device).long()
        b_token_type_ids = torch.tensor(batch[1]).to(device).long()
        b_attention_mask = torch.tensor(batch[2]).to(device).long()
        b_labels = torch.tensor(batch[3]).to(device).long()
        
        with torch.no_grad():
            
            outputs = model(
                b_input_ids,
                token_type_ids=b_token_type_ids,
                attention_mask=b_attention_mask,
                labels=b_labels
            )
            
        
        loss = outputs.loss
        logits = outputs.logits
        
        eval_loss += loss.item()
        
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        metric_results = get_all_metrics(logits, label_ids)
        
        for metric in metric_results.keys():
            eval_metrics[metric] += metric_results[metric]
            

    
    for metric in eval_metrics.keys():
        metric_value = eval_metrics[metric] / len(validation_dataloader)
        print(metric, ": {0:.4f}".format(metric_value))

Batch 50 of 628. Spent: 19.787189483642578. Current_loss 0.029003862291574478
Batch 100 of 628. Spent: 39.15810441970825. Current_loss 0.03434387594461441
Batch 150 of 628. Spent: 58.517911195755005. Current_loss 0.030229099094867706
Batch 200 of 628. Spent: 77.94353175163269. Current_loss 0.025907428935170174
Batch 250 of 628. Spent: 97.37931275367737. Current_loss 0.028067566454410553
Batch 300 of 628. Spent: 116.80711507797241. Current_loss 0.01740529015660286
Batch 350 of 628. Spent: 136.31088018417358. Current_loss 0.02136034145951271
Batch 400 of 628. Spent: 155.82962679862976. Current_loss 0.01788063533604145
Batch 450 of 628. Spent: 175.32253861427307. Current_loss 0.021248163655400276
Batch 500 of 628. Spent: 194.85252714157104. Current_loss 0.022216472774744034
Batch 550 of 628. Spent: 214.40735006332397. Current_loss 0.024685952812433243
Batch 600 of 628. Spent: 233.90070748329163. Current_loss 0.03277542442083359
Average training loss: 0.00
Training epcoh took: 244.32013630