<a href="https://colab.research.google.com/github/mathewpolonsky/Marketplace-Item-Matching/blob/main/Training_XLM_Roberta_Regression_on_Differences.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install evaluate
!pip install transformers[torch]

In [None]:
!gdown 1N8fUo17cyn4UTSfuebL6RS3-tFb3nWe9
!unzip merged_t.zip
!mkdir datasets/
!mv merged_train4nlp_w_diff.parquet datasets/
!mv merged_test4nlp_w_diff.parquet datasets/

Archive:  merged_t.zip
  inflating: merged_test4nlp_w_diff.parquet  
  inflating: merged_train4nlp_w_diff.parquet  


---

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

from tqdm.notebook import tqdm

import torch
from torch.utils.data import DataLoader, Dataset

import evaluate

from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer, AutoTokenizer

from sklearn.model_selection import train_test_split

In [None]:
# Load the training pairs; the model input used below is the `text_difference` column
# and the regression target is the `target` column.
data = pd.read_parquet("datasets/merged_train4nlp_w_diff.parquet")

In [None]:
# Preview the loaded frame (pandas truncates the display to the first and last rows).
data

Unnamed: 0,target,variantid1,variantid2,full_text_1,full_text_2,text_difference
0,0.0,51197862,51198054,name: Удлинитель TDM Electric Люкс УЛ05В 5 м (...,name: Удлинитель TDM Electric Люкс УЛ05В 1.5 м...,Товар 1 | name: Удлинитель TDM Electric Люкс У...
1,0.0,51197862,51199884,name: Удлинитель TDM Electric Люкс УЛ05В 5 м (...,name: Удлинитель TDM Electric Люкс УЛ05В 3 м (...,Товар 1 | name: Удлинитель TDM Electric Люкс У...
2,1.0,53062686,536165289,name: Картридж лазерный Комус 729 (4370B002) ч...,name: Картридж лазерный Комус 729 (4368B002) п...,"Товар 1 | color: черный, чер\nТовар 2 | color:..."
3,1.0,53602615,587809782,name: Картридж лазерный Комус 729 (4368B002) п...,name: Картридж лазерный Комус 729 (4370B002) ч...,Товар 1 | color: пурпурный\nТовар 2 | color: ч...
4,1.0,53602615,615149925,name: Картридж лазерный Комус 729 (4368B002) п...,name: Картридж лазерный Комус 729 (4368B002) п...,Товар 1 | name: Картридж лазерный Комус 729 (4...
...,...,...,...,...,...,...
306535,0.0,810680230,820119986,"name: Комплект 4 шт, Картридж лазерный NV Prin...","name: Комплект 2 шт, Картридж лазерный NV Prin...","Товар 1 | name: Комплект 4 шт, Картридж лазерн..."
306536,0.0,812434186,815345877,name: Для iPhone 7 плюс 8 плюс военная броня г...,name: Для iPhone 7 8 военная броня корпус гибр...,Товар 1 | color: желтый\nТовар 2 | color: сере...
306537,0.0,815637954,817550808,name: Карта памяти 64 ГБ\ncat3: Карты памяти и...,name: Карта памяти 1 ТБ\ncat3: Карты памяти и ...,Товар 1 | name: Карта памяти 64 ГБ\nТовар 2 | ...
306538,0.0,817327230,822083612,"name: Смартфон Mate48 Pro.. 8/256 ГБ, зеленый\...","name: Смартфон Mate48 Pro.. 10/512 ГБ, зеленый...",Товар 1 | name: Смартфон Mate48 Pro.. 8/256 ГБ...


---

In [None]:
# Pretrained checkpoint shared by the tokenizer and the model.
checkpoint = "xlm-roberta-base"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# num_labels=1 requests a single-output head, used here as a regression score.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1)

Downloading (…)lve/main/config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.den

In [None]:
# Hold out 10% of the rows for validation; the fixed seed makes the split repeatable.
train_data, val_data = train_test_split(data, test_size=0.1, random_state=42)

In [None]:
class TextDataset(Dataset):
    """Map-style dataset yielding tokenized `text_difference` rows with their target.

    Args:
        data_df: DataFrame with a "text_difference" column (model input) and a
            "target" column (label).
        tokenizer: Callable tokenizer applied to each text; it is called with
            `truncation`, `padding` and `max_length` keyword arguments.
        max_length: Length every sequence is padded/truncated to.

    Each item is a dict of tensors: every field returned by the tokenizer plus
    "labels". The label tensor keeps the dtype of the "target" column.
    """

    def __init__(self, data_df, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length

        self.sentences = data_df["text_difference"].values
        self.labels = data_df["target"].values

    def __len__(self):
        """Return the number of rows in the dataset."""
        return self.labels.shape[0]

    def __getitem__(self, i):
        """Tokenize row `i` and return its fields (plus "labels") as tensors."""
        sentence, label = self.sentences[i], self.labels[i]

        # Use the tokenizer passed to the constructor rather than a notebook-level
        # global, so the dataset works with any tokenizer and outside this notebook.
        tokens = self.tokenizer(
            sentence,
            truncation="longest_first",
            padding="max_length",
            max_length=self.max_length,
        )

        tokens["labels"] = label

        return {key: torch.tensor(val) for key, val in tokens.items()}


# Wrap both splits in TextDataset, sharing the same tokenizer.
train_dataset, val_dataset = (
    TextDataset(frame, tokenizer) for frame in (train_data, val_data)
)

In [None]:
# Load the "mse" metric from the `evaluate` library; compute_metrics uses it during evaluation.
mse_metric = evaluate.load("mse")

Downloading builder script:   0%|          | 0.00/4.55k [00:00<?, ?B/s]

In [None]:
def compute_metrics(eval_pred):
    """Compute validation MSE and RMSE for the Trainer.

    Args:
        eval_pred: Pair `(predictions, labels)` as handed over by the Trainer.

    Returns:
        Dict with scalar floats under "mse" and "rmse".
    """
    predictions, labels = eval_pred

    # Flatten both arrays so a trailing singleton dimension on the model output
    # cannot cause a shape mismatch with the 1-D labels.
    predictions = np.asarray(predictions).reshape(-1)
    labels = np.asarray(labels).reshape(-1)

    # `compute` returns a dict ({"mse": value}); unwrap it so the logged metrics
    # are plain numbers instead of a nested dict.
    mse = float(mse_metric.compute(predictions=predictions, references=labels)["mse"])

    # "rmse" now really is the root of the MSE (it used to carry the MSE result).
    return {"mse": mse, "rmse": float(np.sqrt(mse))}

In [None]:
# Fine-tuning configuration; checkpoints are written under output_dir.
training_args = TrainingArguments(
    output_dir="drive/MyDrive/models/xlm-roberta-base-regr-diff",
    # Optimisation settings.
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and save once per epoch, then reload the best checkpoint at the end.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [None]:
# Tie together the model, its tokenizer, both datasets and the metric callback.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning; evaluation and checkpointing happen once per epoch (see training_args).
trainer.train()

### Making Predictions for CatBoost Training

In [None]:
class TextTestDataset(Dataset):
    """Map-style dataset yielding tokenized `text_difference` rows without labels.

    Args:
        data_df: DataFrame with a "text_difference" column (model input).
        tokenizer: Callable tokenizer applied to each text; it is called with
            `truncation`, `padding` and `max_length` keyword arguments.
        max_length: Length every sequence is padded/truncated to.

    Each item is a dict holding every field returned by the tokenizer as a tensor.
    """

    def __init__(self, data_df, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length

        self.sentences = data_df["text_difference"].values

    def __len__(self):
        """Return the number of rows in the dataset."""
        return self.sentences.shape[0]

    def __getitem__(self, i):
        """Tokenize row `i` and return the tokenizer fields as tensors."""
        sentence = self.sentences[i]

        # Use the tokenizer passed to the constructor rather than a notebook-level
        # global, so the dataset works with any tokenizer and outside this notebook.
        tokens = self.tokenizer(
            sentence,
            truncation="longest_first",
            padding="max_length",
            max_length=self.max_length,
        )

        return {key: torch.tensor(val) for key, val in tokens.items()}


# Wrap the full frame (training and validation rows together) for inference.
# NOTE(review): the model was fine-tuned on ~90% of these rows (train_data), so their
# predictions are in-sample — confirm this is acceptable for the downstream model.
cb_train_dataset = TextTestDataset(data, tokenizer)

In [None]:
# shuffle=False and drop_last=False keep one prediction per row, in the row order of
# `data`, which the positional column assignment further down relies on.
cb_train_dataloader = DataLoader(cb_train_dataset, batch_size=64, shuffle=False, drop_last=False)

In [None]:
# Make sure the model is on the GPU for inference (requires a CUDA device).
model = model.to("cuda")

In [None]:
# Switch to evaluation mode so layers such as dropout use their inference behaviour.
model.eval()

In [None]:
# Collect the model output for every row, batch by batch, in dataloader order.
all_preds = []

# Follow the model's own device instead of repeating a hard-coded "cuda" string,
# so the inputs always land wherever the model currently lives.
device = model.device

with torch.no_grad():
    for tokens in tqdm(cb_train_dataloader):
        tokens = {key: val.to(device) for key, val in tokens.items()}

        # No .detach() needed: nothing is tracked by autograd inside no_grad().
        logits = model(**tokens)["logits"]

        all_preds.extend(logits.cpu().numpy().tolist())

  0%|          | 0/4790 [00:00<?, ?it/s]

In [None]:
# Collapse the collected per-row outputs into a 1-D array and store it as a new column.
data["xlm_roberta_pred"] = np.asarray(all_preds).reshape(-1)
data.head(2)

Unnamed: 0,target,variantid1,variantid2,full_text_1,full_text_2,text_difference,xlm_roberta_pred
0,0.0,51197862,51198054,name: Удлинитель TDM Electric Люкс УЛ05В 5 м (...,name: Удлинитель TDM Electric Люкс УЛ05В 1.5 м...,Товар 1 | name: Удлинитель TDM Electric Люкс У...,0.017825
1,0.0,51197862,51199884,name: Удлинитель TDM Electric Люкс УЛ05В 5 м (...,name: Удлинитель TDM Electric Люкс УЛ05В 3 м (...,Товар 1 | name: Удлинитель TDM Electric Люкс У...,0.021375


In [None]:
# Drop the raw text columns; the ids, the target and the prediction remain.
data = data.drop(columns=["full_text_1", "full_text_2", "text_difference"])
data

Unnamed: 0,target,variantid1,variantid2,xlm_roberta_pred
0,0.0,51197862,51198054,0.017825
1,0.0,51197862,51199884,0.021375
2,1.0,53062686,536165289,0.175937
3,1.0,53602615,587809782,0.194001
4,1.0,53602615,615149925,1.008360
...,...,...,...,...
306535,0.0,810680230,820119986,0.016274
306536,0.0,812434186,815345877,0.170437
306537,0.0,815637954,817550808,0.000303
306538,0.0,817327230,822083612,-0.003481


In [None]:
# Save the ids, target and model prediction for the downstream CatBoost training.
# NOTE(review): no cell in this notebook mounts Google Drive — confirm it is mounted,
# otherwise this path is an ordinary local directory.
output_path = Path("drive/MyDrive/datasets/train_cb_xlm_roberta_base_regr_diff_2_eps.csv")

# Create the target folder first so a missing directory cannot make the write fail
# after the long inference run.
output_path.parent.mkdir(parents=True, exist_ok=True)

data.to_csv(output_path, index=False)