In [2]:
import pandas as pd

from datasets import Dataset, load_dataset, ClassLabel, DatasetDict

In [3]:
# ECCO splits: read each CSV and map its columns onto the "label" / "text" names.
ecco_train_df = pd.read_csv(
    '../data/multilingual-task-data/ecco_train_no_dupl.csv'
).rename(columns={"monolingual": "label", "ecco_full_title": "text"})
ecco_test_df = pd.read_csv(
    '../data/multilingual-task-data/ecco_test_no_dupl.csv'
).rename(columns={"monolingual": "label", "ecco_full_title": "text"})

# Convert each frame to a `datasets.Dataset` and group the two by split name.
ecco_train_dataset = Dataset.from_pandas(ecco_train_df)
ecco_test_dataset = Dataset.from_pandas(ecco_test_df)
ecco = DatasetDict({"train": ecco_train_dataset, "test": ecco_test_dataset})

In [4]:
# CAA splits: read each CSV and map its columns onto the "label" / "text" names.
caa_train_df = pd.read_csv(
    '../data/multilingual-task-data/caa_train_df.csv'
).rename(columns={"monolingual": "label", "title": "text"})
caa_test_df = pd.read_csv(
    '../data/multilingual-task-data/caa_test_df.csv'
).rename(columns={"monolingual": "label", "title": "text"})

# Convert each frame to a `datasets.Dataset` and group the two by split name.
caa_train_dataset = Dataset.from_pandas(caa_train_df)
caa_test_dataset = Dataset.from_pandas(caa_test_df)
caa = DatasetDict({"train": caa_train_dataset, "test": caa_test_dataset})

In [5]:
# Balanced CAA splits (few_shot_mono_* files): read each CSV and map its
# columns onto the "label" / "text" names.
balanced_caa_train_df = pd.read_csv(
    '../data/multilingual-task-data/few_shot_mono_train.csv'
).rename(columns={"monolingual": "label", "title": "text"})
balanced_caa_test_df = pd.read_csv(
    '../data/multilingual-task-data/few_shot_mono_test.csv'
).rename(columns={"monolingual": "label", "title": "text"})

# Convert each frame to a `datasets.Dataset` and group the two by split name.
balanced_caa_train_dataset = Dataset.from_pandas(balanced_caa_train_df)
balanced_caa_test_dataset = Dataset.from_pandas(balanced_caa_test_df)
balanced_caa = DatasetDict({"train": balanced_caa_train_dataset, "test": balanced_caa_test_dataset})

In [6]:
# Combined splits: read each CSV and map its columns onto the "label" / "text" names.
combined_train_df = pd.read_csv(
    '../data/multilingual-task-data/combined_train_no_dupl.csv'
).rename(columns={"monolingual": "label", "title": "text"})
combined_test_df = pd.read_csv(
    '../data/multilingual-task-data/combined_test_no_dupl.csv'
).rename(columns={"monolingual": "label", "title": "text"})

# Convert each frame to a `datasets.Dataset` and group the two by split name.
combined_train_dataset = Dataset.from_pandas(combined_train_df)
combined_test_dataset = Dataset.from_pandas(combined_test_df)
combined = DatasetDict({"train": combined_train_dataset, "test": combined_test_dataset})

# Trained on balanced CAA (few-shot split)

In [8]:
from transformers import AutoTokenizer

# Load the tokenizer for the same checkpoint that the SetFit model below is initialised from.
# NOTE(review): `tokenizer` is not referenced again in the visible cells — confirm it is still needed.
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/distiluse-base-multilingual-cased-v2")



In [9]:
from sentence_transformers.losses import CosineSimilarityLoss
from setfit import SetFitModel, SetFitTrainer

In [10]:
# Initialise the SetFit model from the multilingual sentence-transformer checkpoint.
setfitmodel = SetFitModel.from_pretrained(
    "sentence-transformers/distiluse-base-multilingual-cased-v2"
)

# Trainer over the balanced CAA splits: train on 'train', evaluate on 'test'.
trainer = SetFitTrainer(
    model=setfitmodel,
    loss_class=CosineSimilarityLoss,
    train_dataset=balanced_caa['train'],
    eval_dataset=balanced_caa['test'],
    num_epochs=3,       # Number of epochs to use for contrastive learning
    num_iterations=20,  # Number of text pairs to generate for contrastive learning
    batch_size=16,
)


model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
  trainer = SetFitTrainer(


Map:   0%|          | 0/80 [00:00<?, ? examples/s]

In [12]:
# Fine-tune the SetFit model with the trainer configured above.
trainer.train()
# Store the evaluation output under a dedicated name: a later cell runs
# `from sklearn import metrics`, which rebinds `metrics` to the sklearn module
# and would silently discard the value returned here.
trainer_metrics = trainer.evaluate()
# trainer.save_model("setfit_models_monolingual/balanced_caa")

***** Running training *****
  Num unique pairs = 3200
  Batch size = 16
  Num epochs = 3
  Total optimization steps = 600


Step,Training Loss


***** Running evaluation *****


In [13]:
from sklearn import metrics

from sklearn.metrics import classification_report

# Score the trained model on the balanced CAA test split.
true = balanced_caa['test']['label']
predictions = setfitmodel.predict(balanced_caa['test']['text'])
preds = predictions.tolist()

# output_dict=True returns the report as a dict instead of a formatted string.
setfit_eval_results = classification_report(y_true=true, y_pred=preds, output_dict=True)

In [14]:
# Tested on CAA: score the trained model on the CAA test split.
true = caa['test']['label']
predictions = setfitmodel.predict(caa['test']['text'])
preds = predictions.tolist()

# output_dict=True returns the report as a dict instead of a formatted string.
setfit_caa_eval_results = classification_report(y_true=true, y_pred=preds, output_dict=True)

In [15]:
# Tested on ECCO: score the trained model on the ECCO test split.
true = ecco['test']['label']
predictions = setfitmodel.predict(ecco['test']['text'])
preds = predictions.tolist()

# output_dict=True returns the report as a dict instead of a formatted string.
setfit_ecco_eval_results = classification_report(y_true=true, y_pred=preds, output_dict=True)

In [16]:
# Tested on Combined: score the trained model on the combined test split.
true = combined['test']['label']
predictions = setfitmodel.predict(combined['test']['text'])
preds = predictions.tolist()

# output_dict=True returns the report as a dict instead of a formatted string.
setfit_combined_eval_results = classification_report(y_true=true, y_pred=preds, output_dict=True)

In [17]:
def list_eval_results_dictionaries():
    """Collect the module-level dicts whose variable name contains 'eval_results'.

    Returns:
        dict: maps each matching global variable name to the dict bound to it.
        Globals that are not ``dict`` instances are ignored regardless of name.
    """
    matches = {}
    for var_name, obj in globals().items():
        if not isinstance(obj, dict):
            continue
        if 'eval_results' in var_name:
            matches[var_name] = obj
    return matches

# Gather every *eval_results* dict currently defined in the notebook namespace.
eval_result_dictionaries = list_eval_results_dictionaries()

# One record per report: the variable name under 'DictName' plus the report's
# own top-level entries.
dict_list = [
    {'DictName': report_name, **report}
    for report_name, report in eval_result_dictionaries.items()
]

dict_df = pd.DataFrame(dict_list)

#dict_df = dict_df[['DictName'] + sorted(dict_df.columns.drop('DictName'),tolist())]


def expand_dict_columns(df):
    """Flatten the dict-valued columns of ``df`` into one column per key.

    A column counts as dict-valued when the value in its first row is a
    ``dict``. Each such column is dropped and replaced by columns named
    ``"<column>_<key>"`` (key names are those produced by
    ``pd.json_normalize``). The expanded columns are appended after the
    remaining original columns, in the order the dict columns appeared.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that may contain dict-valued columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index as ``df``. A frame with no rows is
        returned as an unchanged copy because there is no first row to inspect.
    """
    if len(df) == 0:
        return df.copy()

    # Positional access: `df[col][0]` is a label lookup and raises KeyError
    # whenever the index has no label 0 (e.g. after filtering or re-indexing).
    dict_columns = [col for col in df.columns if isinstance(df[col].iloc[0], dict)]

    expanded_cols = []
    for col in dict_columns:
        expanded = pd.json_normalize(df[col].tolist())
        expanded.columns = [f"{col}_{key}" for key in expanded.columns]
        # Re-attach the caller's index so the axis=1 concat below aligns row
        # by row instead of matching on a fresh 0..n-1 index.
        expanded.index = df.index
        expanded_cols.append(expanded)

    df = df.drop(columns=dict_columns)
    if expanded_cols:
        df = pd.concat([df] + expanded_cols, axis=1)
    return df


# Flatten the dict-valued columns of `dict_df` into "<column>_<key>" columns.
dff = expand_dict_columns(dict_df)

# Bare last expression so the notebook renders the resulting table.
dff

Unnamed: 0,DictName,accuracy,0_precision,0_recall,0_f1-score,0_support,1_precision,1_recall,1_f1-score,1_support,macro avg_precision,macro avg_recall,macro avg_f1-score,macro avg_support,weighted avg_precision,weighted avg_recall,weighted avg_f1-score,weighted avg_support
0,setfit_eval_results,0.95,0.909091,1.0,0.952381,10.0,1.0,0.9,0.947368,10.0,0.954545,0.95,0.949875,20.0,0.954545,0.95,0.949875,20.0
1,setfit_caa_eval_results,0.818306,0.102362,0.40625,0.163522,32.0,0.968595,0.837143,0.898084,700.0,0.535479,0.621696,0.530803,732.0,0.930727,0.818306,0.865972,732.0
2,setfit_ecco_eval_results,0.658747,0.380734,0.783019,0.512346,106.0,0.906122,0.621849,0.737542,357.0,0.643428,0.702434,0.624944,463.0,0.785839,0.658747,0.685985,463.0
3,setfit_combined_eval_results,0.728033,0.295285,0.74375,0.422735,160.0,0.948232,0.725604,0.822113,1035.0,0.621759,0.734677,0.622424,1195.0,0.860808,0.728033,0.76864,1195.0


In [19]:
# Write the flattened results table to CSV; index=False leaves the row index out of the file.
dff.to_csv('../results/setfit-multilingual-task.csv',index=False)