In [1]:
import pandas as pd

from datasets import Dataset, load_dataset, ClassLabel, DatasetDict

In [2]:
ecco_train_df = pd.read_csv('../data/translation-task-data/ecco_monolingual_train_no_dupl.csv')
ecco_test_df = pd.read_csv('../data/translation-task-data/ecco_monolingual_test_no_dupl.csv')

ecco_train_df.rename(columns={"monolingual_translations": "label", "ecco_full_title": "text"}, inplace=True)
ecco_test_df.rename(columns={"monolingual_translations": "label", "ecco_full_title": "text"}, inplace=True)

ecco_train_dataset = Dataset.from_pandas(ecco_train_df)
ecco_test_dataset = Dataset.from_pandas(ecco_test_df)

ecco = DatasetDict({"train": ecco_train_dataset,
                       "test": ecco_test_dataset,})

In [3]:
caa_train_df = pd.read_csv('../data/translation-task-data/caa_monolingual_train.csv')
caa_test_df = pd.read_csv('../data/translation-task-data/caa_monolingual_test.csv')

caa_train_df.rename(columns={"monolingual_translations": "label", "title": "text"}, inplace=True)
caa_test_df.rename(columns={"monolingual_translations": "label", "title": "text"}, inplace=True)

caa_train_dataset = Dataset.from_pandas(caa_train_df)
caa_test_dataset = Dataset.from_pandas(caa_test_df)

caa = DatasetDict({"train": caa_train_dataset,
                       "test": caa_test_dataset,})

In [5]:
balanced_caa_train_df = pd.read_csv('../data/translation-task-data/balanced_data_both_language_train_df.csv')
balanced_caa_test_df = pd.read_csv('../data/translation-task-data/balanced_data_both_language_test_df.csv')

balanced_caa_train_df.rename(columns={"monolingual_translations": "label", "title": "text"}, inplace=True)
balanced_caa_test_df.rename(columns={"monolingual_translations": "label", "title": "text"}, inplace=True)

balanced_caa_train_dataset = Dataset.from_pandas(balanced_caa_train_df)
balanced_caa_test_dataset = Dataset.from_pandas(balanced_caa_test_df)

balanced_caa = DatasetDict({"train": balanced_caa_train_dataset,
                       "test": balanced_caa_test_dataset,})

In [6]:
combined_train_df = pd.read_csv('../data/translation-task-data/combined_monolingual_train_no_dupl.csv')
combined_test_df = pd.read_csv('../data/translation-task-data/combined_monolingual_test_no_dupl.csv')

combined_train_df.rename(columns={"monolingual_translations": "label", "title": "text"}, inplace=True)
combined_test_df.rename(columns={"monolingual_translations": "label", "title": "text"}, inplace=True)

combined_train_dataset = Dataset.from_pandas(combined_train_df)
combined_test_dataset = Dataset.from_pandas(combined_test_df)

combined = DatasetDict({"train": combined_train_dataset,
                       "test": combined_test_dataset,})

# Trained on CAA

In [6]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/distiluse-base-multilingual-cased-v2")



In [7]:
from sentence_transformers.losses import CosineSimilarityLoss
from setfit import SetFitModel, SetFitTrainer, Trainer

In [8]:
from setfit import SetFitModel
from typing import Dict, Any

def model_init(params: Dict[str, Any]) -> SetFitModel:
    params = params or {}
    max_iter = params.get("max_iter", 100)
    solver = params.get("solver", "liblinear")
    params = {
        "head_params": {
            "max_iter": max_iter,
            "solver": solver,
        }
    }
    return SetFitModel.from_pretrained("sentence-transformers/distiluse-base-multilingual-cased-v2", **params)

In [9]:
from optuna import Trial
from typing import Dict, Union

def hp_space(trial: Trial) -> Dict[str, Union[float, int, str]]:
    return {
        "body_learning_rate": trial.suggest_float("body_learning_rate", 1e-6, 1e-3, log=True),
        "num_epochs": trial.suggest_int("num_epochs", 1, 3),
        "batch_size": trial.suggest_categorical("batch_size", [16, 32, 64]),
        "seed": trial.suggest_int("seed", 1, 40),
        "max_iter": trial.suggest_int("max_iter", 50, 300),
        "solver": trial.suggest_categorical("solver", ["newton-cg", "lbfgs", "liblinear"]),
    }

In [10]:
trainer = Trainer(
    train_dataset=balanced_caa['train'],
    eval_dataset=balanced_caa['test'],
    model_init=model_init,
)
best_run = trainer.hyperparameter_search(direction="maximize", hp_space=hp_space, n_trials=10)

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


Map:   0%|          | 0/308 [00:00<?, ? examples/s]

[I 2024-10-16 16:26:39,310] A new study created in memory with name: no-name-d32e5336-7b69-40f4-a98b-cb6e1d6d2ff4
Trial: {'body_learning_rate': 0.0009800033214644025, 'num_epochs': 1, 'batch_size': 64, 'seed': 4, 'max_iter': 191, 'solver': 'newton-cg'}
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
***** Running training *****
  Num unique pairs = 60222
  Batch size = 64
  Num epochs = 1
  Total optimization steps = 941


Step,Training Loss


***** Running evaluation *****
[I 2024-10-16 16:35:26,385] Trial 0 finished with value: 0.8333333333333334 and parameters: {'body_learning_rate': 0.0009800033214644025, 'num_epochs': 1, 'batch_size': 64, 'seed': 4, 'max_iter': 191, 'solver': 'newton-cg'}. Best is trial 0 with value: 0.8333333333333334.
Trial: {'body_learning_rate': 5.860823205538936e-06, 'num_epochs': 2, 'batch_size': 32, 'seed': 3, 'max_iter': 164, 'solver': 'newton-cg'}
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
***** Running training *****
  Num unique pairs = 60222
  Batch size = 32
  Num epochs = 2
  Total optimization steps = 3764


Step,Training Loss


***** Running evaluation *****
[I 2024-10-16 16:54:31,094] Trial 1 finished with value: 0.8205128205128205 and parameters: {'body_learning_rate': 5.860823205538936e-06, 'num_epochs': 2, 'batch_size': 32, 'seed': 3, 'max_iter': 164, 'solver': 'newton-cg'}. Best is trial 0 with value: 0.8333333333333334.
Trial: {'body_learning_rate': 0.00014785884631160625, 'num_epochs': 1, 'batch_size': 32, 'seed': 2, 'max_iter': 114, 'solver': 'liblinear'}
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
***** Running training *****
  Num unique pairs = 60222
  Batch size = 32
  Num epochs = 1
  Total optimization steps = 1882


Step,Training Loss


***** Running evaluation *****
[I 2024-10-16 17:04:03,280] Trial 2 finished with value: 0.8205128205128205 and parameters: {'body_learning_rate': 0.00014785884631160625, 'num_epochs': 1, 'batch_size': 32, 'seed': 2, 'max_iter': 114, 'solver': 'liblinear'}. Best is trial 0 with value: 0.8333333333333334.
Trial: {'body_learning_rate': 2.782848732335074e-06, 'num_epochs': 1, 'batch_size': 16, 'seed': 37, 'max_iter': 246, 'solver': 'lbfgs'}
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
***** Running training *****
  Num unique pairs = 60222
  Batch size = 16
  Num epochs = 1
  Total optimization steps = 3764


Step,Training Loss


***** Running evaluation *****
[I 2024-10-16 17:14:49,164] Trial 3 finished with value: 0.8205128205128205 and parameters: {'body_learning_rate': 2.782848732335074e-06, 'num_epochs': 1, 'batch_size': 16, 'seed': 37, 'max_iter': 246, 'solver': 'lbfgs'}. Best is trial 0 with value: 0.8333333333333334.
Trial: {'body_learning_rate': 4.047590131223615e-05, 'num_epochs': 3, 'batch_size': 64, 'seed': 6, 'max_iter': 158, 'solver': 'lbfgs'}
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
***** Running training *****
  Num unique pairs = 60222
  Batch size = 64
  Num epochs = 3
  Total optimization steps = 2823


Step,Training Loss


***** Running evaluation *****
[I 2024-10-16 17:41:17,168] Trial 4 finished with value: 0.8333333333333334 and parameters: {'body_learning_rate': 4.047590131223615e-05, 'num_epochs': 3, 'batch_size': 64, 'seed': 6, 'max_iter': 158, 'solver': 'lbfgs'}. Best is trial 0 with value: 0.8333333333333334.
Trial: {'body_learning_rate': 1.4818154521353039e-06, 'num_epochs': 2, 'batch_size': 16, 'seed': 13, 'max_iter': 67, 'solver': 'liblinear'}
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
***** Running training *****
  Num unique pairs = 60222
  Batch size = 16
  Num epochs = 2
  Total optimization steps = 7528


Step,Training Loss


[W 2024-10-16 18:02:01,871] Trial 5 failed with parameters: {'body_learning_rate': 1.4818154521353039e-06, 'num_epochs': 2, 'batch_size': 16, 'seed': 13, 'max_iter': 67, 'solver': 'liblinear'} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/home/pricie/mfantoli/miniconda3/lib/python3.9/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "/home/pricie/mfantoli/miniconda3/lib/python3.9/site-packages/setfit/integrations.py", line 27, in _objective
    trainer.train(trial=trial)
  File "/home/pricie/mfantoli/miniconda3/lib/python3.9/site-packages/setfit/trainer.py", line 410, in train
    self.train_embeddings(*full_parameters, args=args)
  File "/home/pricie/mfantoli/miniconda3/lib/python3.9/site-packages/setfit/trainer.py", line 462, in train_embeddings
    self._train_sentence_transformer(
  File "/home/pricie/mfantoli/miniconda3/lib/python3.9/site-packages/setfit/trainer.py", line 62

In [None]:
print(best_run)

In [11]:
BestRun(run_id='8', objective=0.4785, hyperparameters={'body_learning_rate': 0.0005575631179396824, 'num_epochs': 1, 'batch_size': 32, 'seed': 31, 'max_iter': 264, 'solver': 'newton-cg'}, backend=<optuna.study.study.Study object at 0x000001E088B8C310>)



SyntaxError: invalid syntax (2958636736.py, line 1)

In [12]:
trainer.train()
metrics = trainer.evaluate()

NameError: name 'best_run' is not defined

In [13]:
# Load SetFit model from Hub
setfitmodel = SetFitModel.from_pretrained("sentence-transformers/distiluse-base-multilingual-cased-v2")

# Create trainer
trainer = SetFitTrainer(
    model=setfitmodel,
    train_dataset=balanced_caa['train'],
    eval_dataset=balanced_caa['test'],
    loss_class=CosineSimilarityLoss,
    batch_size=16,
    num_iterations=20, # Number of text pairs to generate for contrastive learning
    num_epochs=3 # Number of epochs to use for contrastive learning
)


model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
  trainer = SetFitTrainer(


Map:   0%|          | 0/308 [00:00<?, ? examples/s]

In [14]:
trainer.train()
metrics = trainer.evaluate()

***** Running training *****
  Num unique pairs = 12320
  Batch size = 16
  Num epochs = 3
  Total optimization steps = 2310


Step,Training Loss


***** Running evaluation *****


In [15]:
from sklearn import metrics

from sklearn.metrics import classification_report

predictions = setfitmodel.predict(balanced_caa['test']['text'])

preds = predictions.tolist()

true = balanced_caa['test']['label']

setfit_eval_results = classification_report(true, preds, output_dict=True)

In [16]:
# Tested on ECCO

predictions = setfitmodel.predict(ecco['test']['text'])
preds = predictions.tolist()
true = ecco['test']['label']

setfit_ecco_eval_results = classification_report(true, preds, output_dict=True)

In [17]:
# Tested on CAA

predictions = setfitmodel.predict(caa['test']['text'])
preds = predictions.tolist()
true = caa['test']['label']

setfit_caa_eval_results = classification_report(true, preds, output_dict=True)

In [18]:
# Tested on combined

predictions = setfitmodel.predict(combined['test']['text'])
preds = predictions.tolist()
true = combined['test']['label']

setfit_combined_eval_results = classification_report(true, preds, output_dict=True)

In [19]:
def list_eval_results_dictionaries():
    return {name: value for name, value in globals().items() if isinstance(value, dict) and 'eval_results' in name}

eval_result_dictionaries = list_eval_results_dictionaries()

dict_list = [{'DictName': name, ** eval_result_dictionaries[name]} for name in eval_result_dictionaries]

dict_df = pd.DataFrame(dict_list)

#dict_df = dict_df[['DictName'] + sorted(dict_df.columns.drop('DictName'),tolist())]


def expand_dict_columns(df):
    dict_columns = [col for col in df.columns if isinstance(df[col][0], dict)]
    expanded_cols = []
    for col in dict_columns:
        expanded = pd.json_normalize(df[col])
        expanded.columns = [f"{col}_{key}" for key in expanded.columns]
        expanded_cols.append(expanded)
    df = df.drop(columns=dict_columns)
    if expanded_cols:
        expanded_cols_df = pd.concat(expanded_cols, axis=1)
        df = pd.concat([df, expanded_cols_df], axis=1)
    return df


dff = expand_dict_columns(dict_df)

dff

Unnamed: 0,DictName,accuracy,0_precision,0_recall,0_f1-score,0_support,1_precision,1_recall,1_f1-score,1_support,macro avg_precision,macro avg_recall,macro avg_f1-score,macro avg_support,weighted avg_precision,weighted avg_recall,weighted avg_f1-score,weighted avg_support
0,setfit_eval_results,0.833333,0.847222,0.968254,0.903704,63.0,0.666667,0.266667,0.380952,15.0,0.756944,0.61746,0.642328,78.0,0.8125,0.833333,0.803175,78.0
1,setfit_ecco_eval_results,0.487252,0.414239,1.0,0.585812,128.0,1.0,0.195556,0.327138,225.0,0.70712,0.597778,0.456475,353.0,0.7876,0.487252,0.420935,353.0
2,setfit_caa_eval_results,0.956772,0.95671,1.0,0.977876,663.0,1.0,0.032258,0.0625,31.0,0.978355,0.516129,0.520188,694.0,0.958644,0.956772,0.936988,694.0
3,setfit_combined_eval_results,0.683761,0.669415,0.996652,0.800897,896.0,0.957143,0.13189,0.231834,508.0,0.813279,0.564271,0.516365,1404.0,0.773522,0.683761,0.594997,1404.0


In [20]:
dff.to_csv('../results/setfit-translation-task.csv',index=False)