In [3]:
import math
import pandas as pd
from torch.utils.data import DataLoader
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CESoftmaxAccuracyEvaluator
from sentence_transformers.readers import InputExample
from sklearn import metrics

Loading and preprocessing

In [5]:
# Read the raw training CSV; later cells refer to this frame as df_train.
df_train = pd.read_csv(filepath_or_buffer='data/train.csv')

In [6]:
def pre_proc(df):
    """Clean a raw title/label DataFrame in place.

    Steps, in order:
      1. Drop the Chinese title columns and the id columns.
      2. Lower-case both English title columns and strip every character
         that is neither a word character nor whitespace.
      3. Replace the string labels with integer class ids:
         'disagreed' -> 0, 'unrelated' -> 1, 'agreed' -> 2.

    Args:
        df: DataFrame containing at least the columns 'id', 'tid1', 'tid2',
            'title1_zh', 'title2_zh', 'title1_en' and 'title2_en'.
            It is modified in place.

    Returns:
        None. The caller keeps using the same (mutated) DataFrame object.
    """
    df.drop(columns=['title1_zh', 'title2_zh', 'id', 'tid1', 'tid2'], inplace=True)

    for title_col in ('title1_en', 'title2_en'):
        # regex=True must be explicit: pandas >= 2.0 treats the pattern as a
        # literal string by default, which would leave punctuation untouched.
        # The raw string avoids invalid-escape warnings for \w and \s.
        df[title_col] = (
            df[title_col]
            .str.lower()
            .str.replace(r'[^\w\s]', '', regex=True)
        )

    # Apply the mapping to the argument itself, not to a global frame, so the
    # function works for any DataFrame passed in.
    # NOTE(review): the replacement is frame-wide, so a title that is exactly
    # one of these words would also be rewritten — confirm this is acceptable.
    for label_name, label_id in (('disagreed', 0), ('unrelated', 1), ('agreed', 2)):
        df.replace(label_name, label_id, inplace=True)

In [7]:
# Clean df_train in place; pre_proc mutates its argument and returns nothing.
pre_proc(df_train)


10% of the rows are randomly sampled and held out for evaluation; the remaining rows are used for training.

In [8]:
# Hold out a random 10% of the rows for evaluation. The fixed random_state
# makes the split (and therefore the reported metrics) reproducible across
# kernel restarts.
df_eval = df_train.sample(frac=0.1, random_state=42)

In [9]:
# Keep only the rows that were not sampled into df_eval.
df_train = df_train.drop(index=df_eval.index)

In [None]:
# Empty containers for the InputExample objects built in the following cells.
train_samples, eval_samples = [], []

In [None]:
# itertuples() yields the index at position 0, so positions 1-3 are the first
# three columns (presumably title1_en, title2_en and the label after pre_proc).
train_samples.extend(
    InputExample(texts=[record[1], record[2]], label=record[3])
    for record in df_train.itertuples()
)

In [None]:
# Same conversion as for the training rows: index at position 0, then the
# first three columns supply the two texts and the label.
eval_samples.extend(
    InputExample(texts=[record[1], record[2]], label=record[3])
    for record in df_eval.itertuples()
)

In [None]:
# Training configuration: mini-batch size, number of passes over the training
# data, and the directory the fine-tuned model is written to.
batch_size, epochs = 16, 1
save_path = "data/transformers_model"

In [None]:
# Cross-encoder built on the distilroberta-base checkpoint with a 3-way
# classification head (one output per label id 0, 1, 2).
model = CrossEncoder(
    'distilroberta-base',
    num_labels=3,
)

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 

In [None]:
# Shuffled mini-batches over the training examples.
train_dataloader = DataLoader(
    dataset=train_samples,
    batch_size=batch_size,
    shuffle=True,
)

In [None]:
# Accuracy evaluator over the held-out examples, reported under the name
# 'data-eval'.
evaluator = CESoftmaxAccuracyEvaluator.from_input_examples(
    eval_samples,
    name='data-eval',
)

In [None]:
# Warm up for 10% of all optimisation steps (batches per epoch * epochs),
# rounded up to a whole step.
total_training_steps = len(train_dataloader) * epochs
warmup_steps = math.ceil(total_training_steps * 0.1)

Training model

In [None]:
# Fine-tune the cross-encoder and write the result to save_path.
# evaluation_steps=10000 presumably runs the evaluator every 10000 training
# steps — confirm against the sentence-transformers version in use.
fit_settings = {
    'train_dataloader': train_dataloader,
    'evaluator': evaluator,
    'epochs': epochs,
    'evaluation_steps': 10000,
    'warmup_steps': warmup_steps,
    'output_path': save_path,
}
model.fit(**fit_settings)



Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/18032 [00:00<?, ?it/s]

After training, the model can be loaded locally from "data/transformers_model".

In [None]:
# Reload the fine-tuned model from the directory model.fit() wrote to.
# Using save_path (defined in the configuration cell as
# "data/transformers_model") keeps the training output path and the reload
# path from drifting apart.
model = CrossEncoder(save_path)

In [12]:
# Features are every column except the last; the last column is the target.
# x_eval is copied because a later cell adds a 'label' column to it: assigning
# into a bare slice of df_eval triggers SettingWithCopyWarning.
x_eval = df_eval.iloc[:, :-1].copy()
y_eval = df_eval.iloc[:, -1]

In [11]:
def predict(segment1, segment2):
    """Return the predicted class index for a single pair of texts.

    Scores the pair with the notebook-level `model` and returns the index of
    the highest score in the first (only) row of the result.
    """
    pair_scores = model.predict([(segment1, segment2)])
    best_class_per_pair = pair_scores.argmax(axis=1)
    return best_class_per_pair[0]

In [13]:
# Score all evaluation pairs in one batched call instead of one
# model.predict() call per row: the model batches the pairs internally, which
# is far faster than a row-wise apply over the whole evaluation set.
# argmax(axis=1) picks the highest-scoring class index for every pair.
eval_pairs = list(zip(x_eval['title1_en'], x_eval['title2_en']))
x_eval['label'] = model.predict(eval_pairs).argmax(axis=1)

In [15]:
# Predicted class ids as a plain Python list.
predictions = x_eval.loc[:, 'label'].to_list()

In [16]:
# Ground-truth class ids as a plain Python list, in the same row order as the
# predictions.
references = y_eval.to_list()

In [17]:
# Per-class precision / recall / F1 for the class ids 0, 1, 2.
cf = metrics.classification_report(y_true=references, y_pred=predictions)

Classification Report

In [18]:
# print() renders the report text with its line breaks; a bare `cf` expression
# would display the escaped string repr instead.
print(cf)

              precision    recall  f1-score   support

           0       0.65      0.39      0.48       807
           1       0.90      0.91      0.90     21938
           2       0.80      0.79      0.80      9310

    accuracy                           0.86     32055
   macro avg       0.78      0.70      0.73     32055
weighted avg       0.86      0.86      0.86     32055



In [23]:
# Rows are true classes, columns are predicted classes. The label order
# 2, 0, 1 corresponds to agreed, disagreed, unrelated under the pre_proc
# encoding.
cm = metrics.confusion_matrix(
    y_true=references,
    y_pred=predictions,
    labels=[2, 0, 1],
)

Confusion Matrix

In [24]:
# Label rows (true class) and columns (predicted class) with the class names,
# in the same order as the labels used to build cm.
class_names = ['agreed', 'disagreed', 'unrelated']
pd.DataFrame(cm, index=class_names, columns=class_names)

Unnamed: 0,agreed,disagreed,unrelated
agreed,7398,6,1906
disagreed,63,311,433
unrelated,1784,159,19995
