In [1]:
import torch
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
import numpy as np
import pandas as pd

from torch.utils.data import DataLoader, SequentialSampler
from torch.utils.data import TensorDataset
from sklearn.model_selection import train_test_split

from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score

import random
import time

from warnings import filterwarnings
filterwarnings("ignore")

In [2]:
def init_random_seed(seed_val: int = 66) -> None:
    """
    Fixes random seed
    @param seed_val: value for random seed
    """
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)
    
init_random_seed()

In [5]:
df = pd.read_csv('best_df.csv')
df.target = df.target - np.repeat(1, df.shape[0])
text_column = df.text


def get_input_ids_max_len(df_text_column: pd.Series, tokenizer: any) -> int:
    """
    @param df_text_column: column to be processed by the tokenizer
    @param tokenizer: tokenizer object
    @return: max length of input ids from the tokenizer
    """
    input_ids = df_text_column.apply(
        lambda x: tokenizer.encode(x, add_special_tokens=True)
    )
    return max(input_ids.apply(len))


def get_tokenizer(model_checkpoint: str) -> any:
    """
    @param model_checkpoint: name of the model checkpoint
    @return: tokenizer object
    """
    return AutoTokenizer.from_pretrained(model_checkpoint)

model_checkpoint = 'cointegrated/rubert-base-cased-nli-threeway'
tokenizer = get_tokenizer(model_checkpoint)
max_len = get_input_ids_max_len(text_column, tokenizer)
max_len

84

In [6]:
def tokenize_text_field(df_text_column: pd.Series, tokenizer: any, max_len: int) -> pd.Series:
    """
    @param df_text_column: column to be processed by the tokenizer
    @param tokenizer: tokenizer object
    @param max_len: max length of input ids from the tokenizer
    @return: pandas column with dictionary of input_ids, token_type_ids and attention_mask
    for each text in the dataframe
    """
    return df_text_column.apply(
        lambda x: tokenizer.encode_plus(
            x,
            add_special_tokens=True,
            max_length=max_len,
            pad_to_max_length=True,  # padding='longest' does not work correctly in this version
            return_attention_mask=True,
            truncation=True,
        )
    )


tokenized_text = tokenize_text_field(text_column, tokenizer, max_len)
tokenized_text[0]

{'input_ids': [101, 55669, 7471, 43148, 12945, 10167, 11466, 852, 10770, 37823, 1388, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}

In [7]:
def get_offset_and_specify_torch_datatype(
    tokenized_text: pd.Series, offset: str
) -> torch.Tensor:
    """
    @param tokenized_text: pandas column with dictionary of input_ids, token_type_ids and attention_mask
    for each text in the dataframe
    @param offset: offset from the tokenized text dictionary
    @return: torch tensor of specified data
    """
    return torch.tensor(tokenized_text.apply(lambda x: x[offset]), dtype=torch.float64)


input_ids = get_offset_and_specify_torch_datatype(
    tokenized_text, "input_ids"
)
token_type_ids = get_offset_and_specify_torch_datatype(
    tokenized_text, "token_type_ids"
)
attention_mask = get_offset_and_specify_torch_datatype(
    tokenized_text, "attention_mask"
)
labels = torch.tensor(df.target, dtype=torch.float64)
input_ids[0]

tensor([  101., 55669.,  7471., 43148., 12945., 10167., 11466.,   852., 10770.,
        37823.,  1388.,   102.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.], dtype=torch.float64)

In [8]:
def get_dataset(
    input_ids: torch.tensor,
    token_type_ids: torch.tensor,
    attention_mask: torch.tensor,
    labels: torch.tensor,
) -> TensorDataset:
    """
    @param input_ids: input_ids from the tokenizer
    @param token_type_ids: token_type_ids from the tokenizer
    @param attention_mask: attention_mask from the tokenizer
    @param labels: object labels
    @return: full TensorDataset object
    """
    return TensorDataset(input_ids, token_type_ids, attention_mask, labels)


def get_train_val_stratified_dataset(
    full_dataset: TensorDataset,
    labels: torch.tensor,
    test_size: float,
) -> tuple[torch.utils.data.Subset, torch.utils.data.Subset]:
    """
    @param full_dataset: full TensorDataset object
    @param labels: object labels
    @param test_size: test sample size
    @return: stratified train and test datasets
    """
    train_indices, val_indices = train_test_split(
        list(range(len(labels))), test_size=test_size, stratify=labels
    )

    train_dataset = torch.utils.data.Subset(full_dataset, train_indices)
    val_dataset = torch.utils.data.Subset(full_dataset, val_indices)

    return train_dataset, val_dataset


def get_data_loader(dataset, batch_size: int) -> DataLoader:
    """
    @param dataset: train or test dataset
    @param batch_size: size of one batch in dataloader object
    @return: dataloader object
    """
    return DataLoader(
        dataset, sampler=SequentialSampler(dataset), batch_size=batch_size
    )


test_size = 0.1
batch_size = 32

full_dataset = get_dataset(input_ids, token_type_ids, attention_mask, labels)
train_dataset, val_dataset = get_train_val_stratified_dataset(
    full_dataset, labels, test_size
)
train_dataloader = get_data_loader(train_dataset, batch_size)
validation_dataloader = get_data_loader(val_dataset, batch_size)
# full_dataloader = get_data_loader(full_dataset, batch_size)

train_dataset[0]

(tensor([  101., 71911., 12945., 18551., 39701.,   852., 13236.,  7494., 12986.,
           862.,   102.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
             0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
             0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
             0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
             0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
             0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
             0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
             0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
             0.,     0.,     0.], dtype=torch.float64),
 tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
     

In [9]:
def get_preds_and_labels_flatted(
    logits: np.ndarray, true_ids: np.ndarray
) -> tuple[np.ndarray, np.ndarray]:
    """
    @param logits: model predictions
    @param true_ids: true labels
    @return: flatted model predictions and true labels
    """
    preds_flat = np.argmax(logits, axis=1).flatten()
    labels_flat = true_ids.flatten()

    return preds_flat, labels_flat


def get_all_metrics(preds: np.ndarray, true_label_ids: np.ndarray) -> dict:
    """
    Returns dictionary of all necessary metrics
    @param preds: flatted model predictions
    @param true_label_ids: flatted true labels
    @return: metric values
    """

    return {
        "accuracy_score": accuracy_score(preds, true_label_ids),
        "recall_score": recall_score(preds, true_label_ids, average="micro"),
        "precision_score": precision_score(preds, true_label_ids, average="micro"),
        "f1_score": f1_score(preds, true_label_ids, average="micro"),
    }

In [10]:
def get_device() -> str:
    """
    @return: device name
    """
    return "cuda:2" if torch.cuda.is_available() else "cpu"


def get_model(model_checkpoint: str, num_labels: int) -> any:
    """
    @param model_checkpoint: name of the model checkpoint
    @param num_labels: number of labels in the dataframe
    @return: model object
    """
    return AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint, num_labels=num_labels, ignore_mismatched_sizes=True
    )


device = get_device()
model = get_model(model_checkpoint, 13)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-base-cased-nli-threeway and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([13, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([13]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
def val_model(
    model: any,
    validation_dataloader: DataLoader,
    device: str,
) -> None:

    model.eval()

    eval_metrics = {
        "accuracy_score": 0,
        "recall_score": 0,
        "precision_score": 0,
        "f1_score": 0,
    }

    eval_loss = 0

    for batch in validation_dataloader:

        b_input_ids = torch.tensor(batch[0]).to(device).long()
        b_token_type_ids = torch.tensor(batch[1]).to(device).long()
        b_attention_mask = torch.tensor(batch[2]).to(device).long()
        b_labels = torch.tensor(batch[3]).to(device).long()

        with torch.no_grad():

            outputs = model(
                b_input_ids,
                token_type_ids=b_token_type_ids,
                attention_mask=b_attention_mask,
                labels=b_labels,
            )

        loss = outputs.loss
        logits = outputs.logits

        eval_loss += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to("cpu").numpy()

        flat_preds, flat_label_ids = get_preds_and_labels_flatted(logits, label_ids)
        metric_results = get_all_metrics(flat_preds, flat_label_ids)

        for metric in metric_results.keys():
            eval_metrics[metric] += metric_results[metric]

    for metric in eval_metrics.keys():
        metric_value = eval_metrics[metric] / len(validation_dataloader)
        print(f'{metric}: {metric_value:.4f}')

    return

In [None]:
def train_model(
    model: any,
    device: str,
    num_epochs: int,
    train_dataloader: DataLoader,
    validation_dataloader: DataLoader,
) -> any:
    """
    @param model: model object
    @param device: device name
    @param num_epochs: number of model learning epochs
    @param train_dataloader: processed dataloader object for model training
    @return: trained model object
    """

    batch_size = train_dataloader.batch_size
    total_steps = len(train_dataloader) * num_epochs

    optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=total_steps
    )

    model.to(device)

    for epoch_i in range(num_epochs):

        t0 = time.time()
        train_loss = 0

        model.train()

        for step, batch in enumerate(train_dataloader):
            b_input_ids = torch.tensor(batch[0]).to(device).long()
            b_token_type_ids = torch.tensor(batch[1]).to(device).long()
            b_attention_mask = torch.tensor(batch[2]).to(device).long()
            b_labels = torch.tensor(batch[3]).to(device).long()

            model.zero_grad()

            outputs = model(
                input_ids=b_input_ids,
                token_type_ids=b_token_type_ids,
                attention_mask=b_attention_mask,
                labels=b_labels,
            )

            loss = outputs.loss

            train_loss = loss.item()
            loss.backward()

            optimizer.step()
            scheduler.step()

            if step % 50 == 0 and not step == 0:
                spent = time.time() - t0

                current_loss = train_loss / batch_size

                print(
                    "Batch {:}  of  {:}.    Spent: {:}. Current_loss {:}".format(
                        step, len(train_dataloader), spent, current_loss
                    )
                )

        avg_train_loss = train_loss / len(train_dataloader)
        training_time = time.time() - t0

        print("Average training loss: {0:.2f}".format(avg_train_loss))
        print("Training epcoh took: {:}".format(training_time))
        val_model(model, validation_dataloader, device)

    return model


train_model(model, device, 3, train_dataloader, validation_dataloader)

Batch 50  of  807.    Spent: 19.881248474121094. Current_loss 0.036512114107608795
Batch 100  of  807.    Spent: 39.22720432281494. Current_loss 0.02678094431757927
Batch 150  of  807.    Spent: 58.609485387802124. Current_loss 0.0207226425409317
Batch 200  of  807.    Spent: 78.03159832954407. Current_loss 0.03630485758185387
Batch 250  of  807.    Spent: 97.50578308105469. Current_loss 0.017175067216157913
Batch 300  of  807.    Spent: 116.91290426254272. Current_loss 0.015692181885242462
Batch 350  of  807.    Spent: 136.40206289291382. Current_loss 0.03139745816588402
Batch 400  of  807.    Spent: 155.90159344673157. Current_loss 0.024047108367085457
Batch 450  of  807.    Spent: 175.41085481643677. Current_loss 0.030946440994739532
Batch 500  of  807.    Spent: 194.85094666481018. Current_loss 0.020916113629937172
Batch 550  of  807.    Spent: 214.31190395355225. Current_loss 0.03688619285821915
Batch 600  of  807.    Spent: 233.79784870147705. Current_loss 0.028216492384672165
Ba

In [None]:
# 66 - 0.7893