In [9]:
import csv
from itertools import combinations
import pandas as pd
from sklearn.model_selection import train_test_split
import evaluate
from sentence_transformers import CrossEncoder
import numpy as np

In [3]:
rouge = evaluate.load("rouge")



In [7]:
model = CrossEncoder("BAAI/bge-reranker-v2-m3")

In [4]:
dev = pd.read_csv('ru-dev.csv', sep='\t', quoting=csv.QUOTE_NONE)

In [5]:
def create_dataset(dev, lang):
    to_return = {'gloss':[], 'Generated_Definition':[], 'score':[]}
    for word in dev.word.unique():
        this_word = dev[dev.word == word]
        for sense in this_word.sense_id.unique():
            this_sense = this_word[this_word.sense_id==sense]

            to_predict = list(this_sense[['gloss', 'Generated_Definition']].dropna().itertuples(index=False, name=None))
            if to_predict:
                sims = model.predict(to_predict)
                most_similar = to_predict[np.argmax(sims)]
                to_return['gloss'].append(most_similar[0])
                to_return['Generated_Definition'].append(most_similar[1])
                to_return['score'].append(1)
            
            other_sense = this_word[this_word.sense_id!=sense].Generated_Definition.dropna()
            for other, this in zip(other_sense, this_sense):
                to_return['gloss'].append(this)
                to_return['Generated_Definition'].append(other)
                to_return['score'].append(0)
    to_return = pd.DataFrame(to_return)
    to_return['lang'] = [lang for i in range(to_return.shape[0])]
    return to_return

In [10]:
ru = create_dataset(dev, 'ru')

In [11]:
ru.shape

(8369, 4)

In [12]:
fin = pd.read_csv('../dev_set/mt0-xl/ax_fi_2.tsv.gz', sep='\t', quoting=csv.QUOTE_NONE, compression='gzip')
fi = create_dataset(fin, 'fi')

In [13]:
dataset = pd.concat((ru, fi))

In [14]:
train, test = train_test_split(dataset, shuffle=True, stratify=dataset['score'], random_state=42)

In [15]:
def calc_rouge(ds):
    ds['rouge'] = ds.apply(lambda x: rouge.compute(
    predictions=[x.gloss+'\n'], references=[x.Generated_Definition+'\n'], tokenizer=lambda y: y.split()
), axis=1)
    ds['rouge1'] = [x['rouge1'] if isinstance(x, dict) else 0 for x in ds['rouge']]
    ds['rouge2'] = [x['rouge2'] if isinstance(x, dict) else 0 for x in ds['rouge']]
    ds['rougeL'] = [x['rougeL'] if isinstance(x, dict) else 0 for x in ds['rouge']]
    ds['rougeLsum'] = [x['rougeLsum'] if isinstance(x, dict) else 0 for x in ds['rouge']]
    return ds

In [16]:
train = calc_rouge(train)
test = calc_rouge(test)

In [17]:
train.to_csv('train.csv')
test.to_csv('test.csv')

In [20]:
train.columns

Index(['gloss', 'Generated_Definition', 'score', 'lang', 'rouge', 'rouge1',
       'rouge2', 'rougeL', 'rougeLsum'],
      dtype='object')

In [22]:
def predict_sims(ds):
    to_predict = list(ds[['gloss', 'Generated_Definition']].itertuples(index=False, name=None))
    sims = model.predict(to_predict)
    ds['sims'] = sims
    return ds

In [18]:
import torch
from sklearn import tree
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression

In [24]:
import joblib
clf =LogisticRegression()

train = predict_sims(train)
test = predict_sims(test)
train.to_csv('train-sims.csv')
test.to_csv('test-sims.csv')
clf = clf.fit(train[['sims', 'rouge1', 'rouge2', 'rougeL', 'rougeLsum']], train['score'])


predictions = clf.predict(test[[ 'sims', 'rouge1', 'rouge2', 'rougeL', 'rougeLsum']])
print(classification_report(test['score'], predictions))
joblib.dump(clf, 'mt0-fi-ru.joblib')

              precision    recall  f1-score   support

           0       0.94      0.99      0.97      3554
           1       0.89      0.51      0.65       422

    accuracy                           0.94      3976
   macro avg       0.92      0.75      0.81      3976
weighted avg       0.94      0.94      0.93      3976



['mt0-fi-ru.joblib']

In [101]:
dev = pd.read_csv('ru-dev-aya.csv', sep='\t', quoting=csv.QUOTE_NONE)
ru = create_dataset(dev, 'ru')
train, test = train_test_split(ru, shuffle=True, stratify=ru['score'], random_state=42)
train = predict_sims(train)
test = predict_sims(test)
clf = tree.DecisionTreeClassifier()

clf = clf.fit(train[['sims']], train['score'])
predictions = clf.predict(test[[ 'sims']])
print(classification_report(test['score'], predictions))

(1749, 3)
              precision    recall  f1-score   support

           0       0.51      0.58      0.55       448
           1       0.50      0.43      0.46       438

    accuracy                           0.51       886
   macro avg       0.51      0.51      0.51       886
weighted avg       0.51      0.51      0.51       886

