<a href="https://colab.research.google.com/github/mathewpolonsky/Marketplace-Item-Matching/blob/main/Training_ruBert_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install evaluate
!pip install transformers[torch]

In [None]:
# Fetch the dataset archive with gdown (the argument is presumably a Google Drive file ID).
!gdown 1-vKglR5qwkmXGT8EK2La2eiNy1ApKLPD
# Extract the archive into the current working directory.
!unzip merged_datasets4nlp.zip
# Put the extracted folder at datasets/ — a later cell reads datasets/merged_train4nlp.parquet.
# NOTE(review): if datasets/ already exists (e.g. on a re-run) mv nests the folder inside it instead — confirm.
!mv merged_datasets4nlp datasets/

In [None]:
# Colab-only: mount Google Drive at /content/drive. Later cells write model
# checkpoints and the output CSV to relative paths under drive/MyDrive/.
from google.colab import drive
drive.mount('/content/drive')

---

In [None]:
import torch
from torch.utils.data import DataLoader, Dataset

import numpy as np
import pandas as pd

from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split

import evaluate

from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import AutoTokenizer

In [None]:
# Identifier of the pretrained checkpoint shared by the tokenizer and the model.
checkpoint = "ai-forever/ruBert-base"

# Tokenizer and sequence-classification model are loaded from the same checkpoint;
# num_labels=2 requests a two-class output.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

In [None]:
# Read the merged training table from the datasets/ folder.
data = pd.read_parquet("datasets/merged_train4nlp.parquet")

# Hold out 10% of the rows for validation; the fixed seed keeps the split reproducible.
train_data, val_data = train_test_split(data, test_size=0.1, random_state=42)

In [None]:
class TextDataset(Dataset):
    """Labelled text-pair dataset that tokenizes each pair on access.

    Args:
        data_df: DataFrame providing the "full_text_1", "full_text_2" and
            "target" columns.
        tokenizer: Callable invoked as
            ``tokenizer(text1, text2, truncation=..., padding=..., max_length=...)``
            that returns a mapping of feature name -> sequence of ints.
        max_length: Length every encoded pair is padded/truncated to.
    """

    def __init__(self, data_df, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length

        self.sentences1 = data_df["full_text_1"].values
        self.sentences2 = data_df["full_text_2"].values
        self.labels = data_df['target'].values

    def __len__(self):
        """Return the number of labelled pairs."""
        return len(self.labels)

    def __getitem__(self, i):
        """Return the i-th pair as a dict of long tensors, including a 'labels' entry."""
        # Use the tokenizer handed to the constructor, not whatever global
        # happens to be named `tokenizer` in the notebook namespace.
        encoded = self.tokenizer(
            self.sentences1[i],
            self.sentences2[i],
            truncation="longest_first",
            padding="max_length",
            max_length=self.max_length,
        )

        item = {key: torch.tensor(val).long() for key, val in encoded.items()}
        item['labels'] = torch.tensor(self.labels[i]).long()

        return item


# Wrap both splits with the same tokenizer (default max_length).
train_dataset, val_dataset = (
    TextDataset(split_df, tokenizer) for split_df in (train_data, val_data)
)

In [None]:
# Metric objects consumed by compute_metrics during evaluation.
accuracy, f1_metric = (
    evaluate.load(metric_name) for metric_name in ("accuracy", "f1")
)

In [None]:
def compute_metrics(eval_pred):
    """Turn raw model outputs into a single metrics dict.

    Args:
        eval_pred: Pair ``(predictions, labels)``; the predicted class is the
            argmax of ``predictions`` along axis 1.

    Returns:
        dict: The result of the accuracy metric merged with the result of the
        F1 metric (later keys win on collision).
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)

    return {
        **accuracy.compute(predictions=predicted_classes, references=references),
        **f1_metric.compute(predictions=predicted_classes, references=references),
    }

In [None]:
# Fine-tuning configuration passed to the Trainer below.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy`; the install cell does not pin a version — confirm against the installed one.
training_args = TrainingArguments(
    # Relative path: resolves into the mounted Drive only if the working directory is /content — TODO confirm.
    output_dir="drive/MyDrive/models/rubert-base-class-full",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=1,
    weight_decay=0.01,
    # Evaluate and checkpoint on the same schedule (once per epoch).
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # With a single epoch there is only one evaluated checkpoint to choose from.
    load_best_model_at_end=True,
)

In [None]:
# Assemble the Trainer from the model, the configuration, both dataset splits
# and the metric callback defined in earlier cells.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # presumably passed so the tokenizer is saved alongside checkpoints — verify against the Trainer docs
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the configuration above (one epoch, evaluated and checkpointed per epoch).
trainer.train()

## Making Predictions for CatBoost Training

In [None]:
# Inspect the full frame (train + validation rows) that is scored below.
data

In [None]:
class TextTestDataset(Dataset):
    """Unlabelled text-pair dataset that tokenizes each pair on access.

    Args:
        data_df: DataFrame providing the "full_text_1" and "full_text_2" columns.
        tokenizer: Callable invoked as
            ``tokenizer(text1, text2, truncation=..., padding=..., max_length=...)``
            that returns a mapping of feature name -> sequence of ints.
        max_length: Length every encoded pair is padded/truncated to.
    """

    def __init__(self, data_df, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length

        self.sentences1 = data_df["full_text_1"].values
        self.sentences2 = data_df['full_text_2'].values

    def __len__(self):
        """Return the number of text pairs."""
        return len(self.sentences1)

    def __getitem__(self, i):
        """Return the i-th pair as a dict of long tensors (no label entry)."""
        # Use the tokenizer handed to the constructor, not whatever global
        # happens to be named `tokenizer` in the notebook namespace.
        encoded = self.tokenizer(
            self.sentences1[i],
            self.sentences2[i],
            truncation="longest_first",
            padding="max_length",
            max_length=self.max_length,
        )

        return {key: torch.tensor(val).long() for key, val in encoded.items()}


# Wrap the complete `data` frame — the same frame the train/validation split was drawn from.
# NOTE(review): most of these rows were used for fine-tuning, so the resulting scores may be
# optimistic as features for a downstream model — confirm this is intended.
test_dataset = TextTestDataset(data, tokenizer)

In [None]:
# shuffle=False and drop_last=False: the predictions are later assigned back to `data`
# by position, so every row must be scored exactly once and in the original order.
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=False, drop_last=False)

In [None]:
# Place the model on the GPU for inference (requires a CUDA-enabled runtime).
model = model.to("cuda")

In [None]:
# Collected per-row logits, in dataloader order.
all_preds = []

# Switch to inference mode explicitly so dropout is disabled no matter which
# mode earlier cells left the model in.
model.eval()

# Send each batch to whichever device the model currently lives on instead of
# hard-coding "cuda".
device = next(model.parameters()).device

with torch.no_grad():
    for tokens in tqdm(test_dataloader):
        tokens = {key: value.to(device) for key, value in tokens.items()}

        logits = model(**tokens)['logits']

        # No .detach() needed: nothing is tracked under no_grad.
        all_preds.extend(logits.cpu().tolist())

  0%|          | 0/283 [00:00<?, ?it/s]

In [None]:
# Build the logits array once instead of once per column.
rubert_logits = np.asarray(all_preds)

# One row of two logits per row of `data` (the model was created with num_labels=2).
assert rubert_logits.shape == (len(data), 2), rubert_logits.shape

# Column 0 / column 1 are the raw logits for class 0 / class 1.
data['rubert_pred_1'] = rubert_logits[:, 0]
data['rubert_pred_2'] = rubert_logits[:, 1]

In [None]:
# The raw texts are not written to the exported table; keep everything else.
data = data.drop(columns=['full_text_1', 'full_text_2'])
data

In [None]:
# Write the remaining columns plus the two ruBert logit columns to Drive, without the index.
# NOTE(review): the file name says "2_eps" while TrainingArguments above sets num_train_epochs=1 — confirm which is intended.
data.to_csv("drive/MyDrive/datasets/train_cb_rubert_base_class_full_2_eps.csv", index=False)