## Imports

In [None]:
!pip install datasets transformers==4.28.0 evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers==4.28.0
  Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m81.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0 (from transformers==4.28.0)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.1

In [None]:
from datasets import load_dataset
from transformers import BertTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
import pandas as pd
from datasets import ClassLabel
from transformers import DataCollatorWithPadding
import torch
import gc
torch.cuda.empty_cache()
gc.collect()
from datasets import load_metric
import numpy as np
import evaluate


In [None]:
# Mount Google Drive at /content/drive; the CSV inputs and the model output
# paths used below all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Dataset

In [None]:
# Columns removed from the combined frame before it is turned into a dataset.
unused_columns = [
    'id',
    'domain',
    'it_1_confidence',
    'it_2_confidence',
    'better_count',
    'worse_count',
    'none_count',
    'most_frequent_count',
    'it_1_judgments',
    'it_2_judgments',
    'sentence_html',
    'judgments',
    'confidence',
    'dconfidence',
]

# Three source CSVs; the French and Russian files carry their index in the
# first column, the ACQuA file does not.
df = pd.read_csv("/content/drive/MyDrive/final_thesis/ACQuA.csv", header=0)
df_french = pd.read_csv("/content/drive/MyDrive/final_thesis/french_df.csv", header=0, index_col=0)
df_russian = pd.read_csv("/content/drive/MyDrive/final_thesis/russian_df.csv", header=0, index_col=0)

# Stack the three frames, drop the unused columns, then discard every row
# that still has a missing value.
df_en_fr_ru = (
    pd.concat([df, df_french, df_russian], ignore_index=True)
    .drop(columns=unused_columns)
    .dropna()
)

In [None]:
dataset = Dataset.from_pandas(df_en_fr_ru)

# Build one model input per row: "<object_a> [SEP] <object_b> [SEP] <sentence>".
# Each column is read once and formatted with an f-string, so a non-string cell
# is stringified rather than skipped. Skipping rows (as a swallowed exception
# would) leaves `sentences` shorter than `dataset` and breaks `add_column`.
sentences = [
    f"{object_a} [SEP] {object_b} [SEP] {text}"
    for object_a, object_b, text in zip(
        dataset['object_a'], dataset['object_b'], dataset['sentence']
    )
]
dataset = dataset.add_column('full_sentences', sentences)

tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

def tokenize_function(dataset):
  """Tokenize the ``full_sentences`` field of a batch with truncation.

  Used as the callback of ``Dataset.map(..., batched=True)`` below. The
  parameter is the batch handed over by ``map``; inside this function it
  shadows the module-level ``dataset``.

  No padding is applied here; only truncation is requested.
  """
  return tokenizer(dataset['full_sentences'], truncation=True)

dataset = dataset.map(tokenize_function, batched=True)

# Expose the label column as a 3-class ClassLabel named `labels`; the split
# cell stratifies on this column.
dataset = dataset.rename_column('most_frequent_label', 'labels')
new_features = dataset.features.copy()
new_features['labels'] = ClassLabel(num_classes=3, names=['WORSE', 'BETTER', 'NONE'])
dataset = dataset.cast(new_features)

Map:   0%|          | 0/21554 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/21554 [00:00<?, ? examples/s]

In [None]:
# Seed the shuffle too: an unseeded shuffle reorders the rows differently on
# every run, so the seeded split below would still produce different
# train/test sets between runs.
dataset = dataset.shuffle(seed=777)
# 80/20 split, stratified on the label column.
dataset = dataset.train_test_split(test_size=0.2, stratify_by_column='labels', seed=777)

## Multilingual BERT Training

In [None]:
# Use the first GPU when one is present; otherwise fall back to CPU so that
# `.to(device)` calls do not fail on a runtime without CUDA.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Metric objects loaded once and reused by compute_metrics on every evaluation.
f1_metric = evaluate.load("f1")
recall_metric = evaluate.load("recall")
precision_metric = evaluate.load("precision")
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Turn an evaluation prediction into a dict of scores.

    Args:
        eval_pred: a ``(logits, labels)`` pair; the predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        A dict merging the outputs of the accuracy metric and of the
        precision, recall and F1 metrics, the latter three computed with
        ``average="weighted"``.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    # Accuracy takes no averaging argument, so it seeds the result dict.
    scores = dict(accuracy_metric.compute(predictions=preds, references=labels))
    for weighted_metric in (precision_metric, recall_metric, f1_metric):
        scores.update(
            weighted_metric.compute(predictions=preds, references=labels, average="weighted")
        )
    return scores

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
# Padding collator built on the same tokenizer; the tokenization step requests
# truncation only, so padding is left to this collator.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/final_thesis',
    save_total_limit=1,
    load_best_model_at_end=True,
    # Make the "best model" criterion explicit. Without it the Trainer ranks
    # checkpoints by evaluation loss, which is not the weighted F1 reported by
    # compute_metrics and can pick a much earlier epoch.
    metric_for_best_model='f1',
    greater_is_better=True,
    num_train_epochs=10,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Evaluate and checkpoint once per epoch. Step-based intervals
    # (eval_steps / save_steps) only apply to the 'steps' strategy, so they
    # are not set here.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    warmup_steps=10,
)

# Multilingual BERT with a fresh 3-way classification head.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=3).to(device)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)

trainer.train()

Downloading pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4803,0.354518,0.864997,0.859889,0.864997,0.860944
2,0.3053,0.310636,0.892368,0.887163,0.892368,0.888923
3,0.2072,0.311459,0.892832,0.897262,0.892832,0.894675
4,0.141,0.367441,0.901415,0.904375,0.901415,0.90273
5,0.0966,0.449822,0.906982,0.909061,0.906982,0.90792
6,0.0606,0.444457,0.911621,0.911499,0.911621,0.91155
7,0.0421,0.549978,0.91023,0.914722,0.91023,0.912062
8,0.0222,0.545782,0.913941,0.915417,0.913941,0.914577
9,0.0209,0.552253,0.916261,0.916447,0.916261,0.916346
10,0.0125,0.560524,0.915797,0.916416,0.915797,0.916091


TrainOutput(global_step=5390, training_loss=0.12938047163119343, metrics={'train_runtime': 3844.7214, 'train_samples_per_second': 44.849, 'train_steps_per_second': 1.402, 'total_flos': 9920816291528982.0, 'train_loss': 0.12938047163119343, 'epoch': 10.0})

In [None]:
# Save the trainer's current model to Drive. The training arguments set
# load_best_model_at_end=True.
trainer.save_model("/content/drive/MyDrive/final_thesis/model/")

In [None]:
def pipeline(obj1: str, obj2: str, sent : str, device = torch.device('cuda:0')) -> int:
  """Predict the comparison class for two objects mentioned in a sentence.

  The input text is assembled as ``obj1 [SEP] obj2 [SEP] sent`` and scored
  with the module-level ``tokenizer`` and ``model``.

  Args:
    obj1: first compared object.
    obj2: second compared object.
    sent: sentence in which the two objects are compared.
    device: device the encoded input is moved to; it has to be the device
      the model is on.

  Returns:
    Index of the highest-scoring class as a plain ``int``.
  """
  tokens = tokenizer(obj1 + ' [SEP] ' + obj2 + ' [SEP] ' + sent, truncation=True, return_tensors='pt')
  tokens = tokens.to(device)
  # Inference only: disable dropout and skip gradient bookkeeping.
  model.eval()
  with torch.no_grad():
    logits = model(**tokens).logits
  # Single example in the batch, so take the argmax of row 0 explicitly.
  return int(logits[0].argmax().item())

In [None]:
# Russian example: 'мама' = "mom", 'папа' = "dad".
obj1 = 'мама'
obj2 = 'папа'

# "Mom is much worse than dad!!!"
sentence = 'мама гораздо хуже чем папа!!!'

In [None]:
# Classify the example; the recorded output 0 is the position of 'WORSE' in the
# ClassLabel names used for the `labels` feature.
pipeline(obj1, obj2, sentence)

0

## BERT from pretrained

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m63.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m25.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m93.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.2


In [None]:
from transformers import BertForSequenceClassification, BertTokenizer
from scipy.special import softmax

In [None]:
# The tokenizer comes from the base multilingual checkpoint; the classifier
# weights come from a training checkpoint stored on Drive.
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
model = BertForSequenceClassification.from_pretrained(
    "/content/drive/MyDrive/final_thesis/mbert_pretrained_2/checkpoint-4312"
)
# Move the loaded classifier onto the first GPU.
model = model.to('cuda:0')

In [None]:
def pipeline(obj1: str, obj2: str, sent : str, device = torch.device('cuda:0')) -> tuple:
  """Predict the comparison class and its probability for one sentence.

  The input text is assembled as ``obj1 [SEP] obj2 [SEP] sent`` and scored
  with the module-level ``tokenizer`` and ``model``.

  Args:
    obj1: first compared object.
    obj2: second compared object.
    sent: sentence in which the two objects are compared.
    device: device the encoded input is moved to; it has to be the device
      the model is on.

  Returns:
    ``(class_index, probability)``: the index of the highest-scoring class
    as an ``int`` and its softmax probability as a ``float``.
  """
  tokens = tokenizer(obj1 + ' [SEP] ' + obj2 + ' [SEP] ' + sent, truncation=True, return_tensors='pt')
  tokens = tokens.to(device)
  # Inference only: disable dropout and skip gradient bookkeeping.
  model.eval()
  with torch.no_grad():
    logits = model(**tokens).logits
  # Work on row 0 (the single example) for both the argmax and the softmax so
  # the index and the probability always refer to the same vector.
  scores = logits[0].cpu().numpy()
  probabilities = softmax(scores)
  pred = int(np.argmax(scores))
  return pred, float(probabilities[pred])

In [None]:
# Russian example: 'машина' = "car", 'автобус' = "bus".
obj1 = 'машина'
obj2 = 'автобус'
# "A car and a bus are basically equal"
sent = 'Машина и автобус впринципе равны'

In [None]:
# Returns (class index, softmax probability of that class); the recorded output
# is (2, 0.892411). NOTE(review): index 2 would be 'NONE' if this checkpoint
# uses the same label order as the training section — confirm.
pipeline(obj1, obj2, sent)

(2, 0.892411)