In [1]:
import pandas as pd

from datasets import Dataset, load_dataset, ClassLabel, DatasetDict

Import training datasets:

In [2]:
ecco_train_df = pd.read_csv('../data/translation-task-data/ecco_monolingual_train_no_dupl.csv')
ecco_test_df = pd.read_csv('../data/translation-task-data/ecco_monolingual_test_no_dupl.csv')

ecco_train_df.rename(columns={"monolingual_translations": "label", "ecco_full_title": "text"}, inplace=True)
ecco_test_df.rename(columns={"monolingual_translations": "label", "ecco_full_title": "text"}, inplace=True)

ecco_train_dataset = Dataset.from_pandas(ecco_train_df)
ecco_test_dataset = Dataset.from_pandas(ecco_test_df)

ecco = DatasetDict({"train": ecco_train_dataset,
                       "test": ecco_test_dataset,})

In [3]:
caa_train_df = pd.read_csv('../data/translation-task-data/caa_monolingual_train.csv')
caa_test_df = pd.read_csv('../data/translation-task-data/caa_monolingual_test.csv')

caa_train_df.rename(columns={"monolingual_translations": "label", "title": "text"}, inplace=True)
caa_test_df.rename(columns={"monolingual_translations": "label", "title": "text"}, inplace=True)

caa_train_dataset = Dataset.from_pandas(caa_train_df)
caa_test_dataset = Dataset.from_pandas(caa_test_df)

caa = DatasetDict({"train": caa_train_dataset,
                       "test": caa_test_dataset,})

In [4]:
balanced_caa_train_df = pd.read_csv('../data/translation-task-data/caa_monolingual_balanced_train.csv')
balanced_caa_test_df = pd.read_csv('../data/translation-task-data/caa_monolingual_balanced_test.csv')

balanced_caa_train_df.rename(columns={"monolingual_translations": "label", "title": "text"}, inplace=True)
balanced_caa_test_df.rename(columns={"monolingual_translations": "label", "title": "text"}, inplace=True)

balanced_caa_train_dataset = Dataset.from_pandas(balanced_caa_train_df)
balanced_caa_test_dataset = Dataset.from_pandas(balanced_caa_test_df)

balanced_caa = DatasetDict({"train": balanced_caa_train_dataset,
                       "test": balanced_caa_test_dataset,})

In [5]:
combined_train_df = pd.read_csv('../data/translation-task-data/combined_monolingual_train_no_dupl.csv')
combined_test_df = pd.read_csv('../data/translation-task-data/combined_monolingual_test_no_dupl.csv')

combined_train_df.rename(columns={"monolingual_translations": "label", "title": "text"}, inplace=True)
combined_test_df.rename(columns={"monolingual_translations": "label", "title": "text"}, inplace=True)

combined_train_dataset = Dataset.from_pandas(combined_train_df)
combined_test_dataset = Dataset.from_pandas(combined_test_df)

combined = DatasetDict({"train": combined_train_dataset,
                       "test": combined_test_dataset,})

In [6]:
combined_balanced_caa_train_df = pd.read_csv('../data/translation-task-data/balanced_data_both_language_train_df.csv')
combined_balanced_caa_test_df = pd.read_csv('../data/translation-task-data/balanced_data_both_language_test_df.csv')

combined_balanced_caa_train_df.rename(columns={"monolingual_translations": "label", "title": "text"}, inplace=True)
combined_balanced_caa_test_df.rename(columns={"monolingual_translations": "label", "title": "text"}, inplace=True)

combined_balanced_caa_train_dataset = Dataset.from_pandas(combined_balanced_caa_train_df)
combined_balanced_caa_test_dataset = Dataset.from_pandas(combined_balanced_caa_test_df)

combined_balanced_caa = DatasetDict({"train": combined_balanced_caa_train_dataset,
                       "test": combined_balanced_caa_test_dataset,})

# Trained on CAA - Standard BERT model

In [7]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-multilingual-cased")



In [8]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [9]:
tokenized_caa = caa.map(preprocess_function, batched=True)

Map:   0%|          | 0/2772 [00:00<?, ? examples/s]

Map:   0%|          | 0/694 [00:00<?, ? examples/s]

In [10]:
import evaluate

accuracy = evaluate.load("accuracy")

In [11]:
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [12]:
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

In [13]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained( "google-bert/bert-base-multilingual-cased", num_labels=2, id2label=id2label, label2id=label2id)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
import torch
torch.cuda.empty_cache()

In [15]:
training_args = TrainingArguments(
    output_dir = "./results_mono",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    evaluation_strategy="epoch")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_caa["train"],
    eval_dataset=tokenized_caa["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics)

trainer.train()

trainer.save_model("bert_models_translations/caa")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.101781,0.956772
2,No log,0.076747,0.976945
3,0.125500,0.095643,0.976945
4,0.125500,0.089424,0.979827
5,0.125500,0.109641,0.975504




In [16]:
model = AutoModelForSequenceClassification.from_pretrained('bert_models_translations/caa/', local_files_only=True)

model = model.to('cuda')

trainer = Trainer(model=model)

trainer.model = model.cuda()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [17]:
from sklearn.metrics import classification_report

max_length = 512

def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=max_length)

new_text_list = caa_test_df["text"].tolist()

new_dataset = Dataset.from_dict({"text": new_text_list})

tokenized_new_dataset = new_dataset.map(tokenize_function, batched=False)

predictions = trainer.predict(tokenized_new_dataset)

logits = predictions.predictions

import numpy as np

predicted_labels = np.argmax(logits, axis=1).tolist()

true_labels = caa_test_df["label"]

eval_results_caa = classification_report(true_labels, predicted_labels, output_dict=True)

Map:   0%|          | 0/694 [00:00<?, ? examples/s]

# tested on ECCO

In [19]:
new_text_list = ecco_test_df["text"].tolist()

new_dataset = Dataset.from_dict({"text": new_text_list})

tokenized_new_dataset = new_dataset.map(tokenize_function, batched=False)

predictions = trainer.predict(tokenized_new_dataset)

logits = predictions.predictions

import numpy as np

predicted_labels = np.argmax(logits, axis=1).tolist()

true_labels = ecco_test_df["label"]

caa_ecco_eval_results = classification_report(true_labels, predicted_labels, output_dict=True)

Map:   0%|          | 0/353 [00:00<?, ? examples/s]

# Tested on combined

In [20]:
from sklearn.metrics import classification_report

max_length = 512

def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=max_length)


new_text_list = combined_test_df["text"].tolist()

new_dataset = Dataset.from_dict({"text": new_text_list})

tokenized_new_dataset = new_dataset.map(tokenize_function, batched=False)

predictions = trainer.predict(tokenized_new_dataset)

logits = predictions.predictions

import numpy as np

predicted_labels = np.argmax(logits, axis=1).tolist()

true_labels = combined_test_df["label"]

caa_combined_eval_results = classification_report(true_labels, predicted_labels, output_dict=True)

Map:   0%|          | 0/1404 [00:00<?, ? examples/s]

# tested on caa balanced

In [21]:
new_text_list = balanced_caa_test_df["text"].tolist()

new_dataset = Dataset.from_dict({"text": new_text_list})

tokenized_new_dataset = new_dataset.map(tokenize_function, batched=False)

predictions = trainer.predict(tokenized_new_dataset)

logits = predictions.predictions

import numpy as np

predicted_labels = np.argmax(logits, axis=1).tolist()

true_labels = balanced_caa_test_df["label"]

caa_caa_balanced_eval_results = classification_report(true_labels, predicted_labels, output_dict=True)

Map:   0%|          | 0/105 [00:00<?, ? examples/s]

# Trained on ECCO

In [22]:
model = AutoModelForSequenceClassification.from_pretrained(
    "google-bert/bert-base-multilingual-cased", num_labels=2, id2label=id2label, label2id=label2id
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
tokenized_ecco = ecco.map(preprocess_function, batched=True)

Map:   0%|          | 0/1412 [00:00<?, ? examples/s]

Map:   0%|          | 0/353 [00:00<?, ? examples/s]

In [24]:
training_args = TrainingArguments(
    output_dir = "./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    evaluation_strategy="epoch")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ecco["train"],
    eval_dataset=tokenized_ecco["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics)

trainer.train()

trainer.save_model("bert_models_translations/ecco")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.241524,0.943343
2,No log,0.224832,0.943343
3,No log,0.202349,0.951841
4,No log,0.193637,0.957507
5,No log,0.196267,0.957507




In [25]:
new_text_list = ecco_test_df["text"].tolist()

new_dataset = Dataset.from_dict({"text": new_text_list})

tokenized_new_dataset = new_dataset.map(tokenize_function, batched=False)

predictions = trainer.predict(tokenized_new_dataset)

logits = predictions.predictions

import numpy as np

predicted_labels = np.argmax(logits, axis=1).tolist()

true_labels = ecco_test_df["label"]

ecco_eval_results = classification_report(true_labels, predicted_labels, output_dict=True)



Map:   0%|          | 0/353 [00:00<?, ? examples/s]

In [63]:
ecco_eval_results

{'0': {'precision': 0.9747899159663865,
  'recall': 0.90625,
  'f1-score': 0.9392712550607287,
  'support': 128.0},
 '1': {'precision': 0.9487179487179487,
  'recall': 0.9866666666666667,
  'f1-score': 0.9673202614379085,
  'support': 225.0},
 'accuracy': 0.9575070821529745,
 'macro avg': {'precision': 0.9617539323421676,
  'recall': 0.9464583333333334,
  'f1-score': 0.9532957582493187,
  'support': 353.0},
 'weighted avg': {'precision': 0.9581718065304135,
  'recall': 0.9575070821529745,
  'f1-score': 0.9571495169158716,
  'support': 353.0}}

# Tested on CAA

In [26]:
new_text_list = caa_test_df["text"].tolist()

new_dataset = Dataset.from_dict({"text": new_text_list})

tokenized_new_dataset = new_dataset.map(tokenize_function, batched=False)

predictions = trainer.predict(tokenized_new_dataset)

logits = predictions.predictions

import numpy as np

predicted_labels = np.argmax(logits, axis=1).tolist()

true_labels = caa_test_df["label"]

ecco_caa_eval_results = classification_report(true_labels, predicted_labels, output_dict=True)





Map:   0%|          | 0/694 [00:00<?, ? examples/s]

# Tested on combined 

In [27]:
new_text_list = combined_test_df["text"].tolist()

new_dataset = Dataset.from_dict({"text": new_text_list})

tokenized_new_dataset = new_dataset.map(tokenize_function, batched=False)

predictions = trainer.predict(tokenized_new_dataset)

logits = predictions.predictions

import numpy as np

predicted_labels = np.argmax(logits, axis=1).tolist()

true_labels = combined_test_df["label"]

ecco_combined_eval_results = classification_report(true_labels, predicted_labels, output_dict=True)



Map:   0%|          | 0/1404 [00:00<?, ? examples/s]

# Tested on balanced

In [28]:
new_text_list = balanced_caa_test_df["text"].tolist()

new_dataset = Dataset.from_dict({"text": new_text_list})

tokenized_new_dataset = new_dataset.map(tokenize_function, batched=False)

predictions = trainer.predict(tokenized_new_dataset)

logits = predictions.predictions

import numpy as np

predicted_labels = np.argmax(logits, axis=1).tolist()

true_labels = balanced_caa_test_df["label"]

ecco_caa_balanced_eval_results = classification_report(true_labels, predicted_labels, output_dict=True)




Map:   0%|          | 0/105 [00:00<?, ? examples/s]

# Trained on both

In [29]:
model = AutoModelForSequenceClassification.from_pretrained(
    "google-bert/bert-base-multilingual-cased", num_labels=2, id2label=id2label, label2id=label2id
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
tokenized_combined = combined.map(preprocess_function, batched=True)

Map:   0%|          | 0/5616 [00:00<?, ? examples/s]

Map:   0%|          | 0/1404 [00:00<?, ? examples/s]

In [31]:
training_args = TrainingArguments(
    output_dir = "./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    evaluation_strategy="epoch")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_combined["train"],
    eval_dataset=tokenized_combined["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics)

trainer.train()

trainer.save_model("bert_models_translations/combined")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.13572,0.960826
2,0.161900,0.124386,0.973647
3,0.077400,0.123403,0.975071
4,0.077400,0.142816,0.972222
5,0.031800,0.130325,0.972934




In [32]:
new_text_list = combined_test_df["text"].tolist()

new_dataset = Dataset.from_dict({"text": new_text_list})

tokenized_new_dataset = new_dataset.map(tokenize_function, batched=False)

predictions = trainer.predict(tokenized_new_dataset)

logits = predictions.predictions

import numpy as np

predicted_labels = np.argmax(logits, axis=1).tolist()

true_labels = combined_test_df["label"]

combined_eval_results = classification_report(true_labels, predicted_labels, output_dict=True)


Map:   0%|          | 0/1404 [00:00<?, ? examples/s]

# Tested on CAA

In [33]:
new_text_list = caa_test_df["text"].tolist()

new_dataset = Dataset.from_dict({"text": new_text_list})

tokenized_new_dataset = new_dataset.map(tokenize_function, batched=False)

predictions = trainer.predict(tokenized_new_dataset)

logits = predictions.predictions

import numpy as np

predicted_labels = np.argmax(logits, axis=1).tolist()

true_labels = caa_test_df["label"]

combined_caa_eval_results = classification_report(true_labels, predicted_labels, output_dict=True)




Map:   0%|          | 0/694 [00:00<?, ? examples/s]

# Tested on ECCO

In [34]:
new_text_list = ecco_test_df["text"].tolist()

new_dataset = Dataset.from_dict({"text": new_text_list})

tokenized_new_dataset = new_dataset.map(tokenize_function, batched=False)

predictions = trainer.predict(tokenized_new_dataset)

logits = predictions.predictions

import numpy as np

predicted_labels = np.argmax(logits, axis=1).tolist()

true_labels = ecco_test_df["label"]

combined_ecco_eval_results = classification_report(true_labels, predicted_labels, output_dict=True)



Map:   0%|          | 0/353 [00:00<?, ? examples/s]

# Tested on CAA balanced

In [35]:
new_text_list = balanced_caa_test_df["text"].tolist()

new_dataset = Dataset.from_dict({"text": new_text_list})

tokenized_new_dataset = new_dataset.map(tokenize_function, batched=False)

predictions = trainer.predict(tokenized_new_dataset)

logits = predictions.predictions

import numpy as np

predicted_labels = np.argmax(logits, axis=1).tolist()

true_labels = balanced_caa_test_df["label"]

combined_caa_balanced_eval_results = classification_report(true_labels, predicted_labels, output_dict=True)




Map:   0%|          | 0/105 [00:00<?, ? examples/s]

# Trained on CAA Balanced

In [36]:
model = AutoModelForSequenceClassification.from_pretrained(
    "google-bert/bert-base-multilingual-cased", num_labels=2, id2label=id2label, label2id=label2id
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [37]:
tokenized_balanced_caa = balanced_caa.map(preprocess_function, batched=True)

Map:   0%|          | 0/420 [00:00<?, ? examples/s]

Map:   0%|          | 0/105 [00:00<?, ? examples/s]

In [38]:
training_args = TrainingArguments(
    output_dir = "./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=8,
    weight_decay=0.01,
    evaluation_strategy="epoch")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_balanced_caa["train"],
    eval_dataset=tokenized_balanced_caa["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics)

trainer.train()

trainer.save_model("bert_models_translations/balanced_caa")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.434376,0.857143
2,No log,0.354425,0.857143
3,No log,0.355673,0.885714
4,No log,0.477703,0.847619
5,No log,0.500385,0.87619
6,No log,0.618087,0.87619
7,No log,0.605793,0.866667
8,No log,0.62557,0.866667




In [39]:
new_text_list = balanced_caa_test_df["text"].tolist()

new_dataset = Dataset.from_dict({"text": new_text_list})

tokenized_new_dataset = new_dataset.map(tokenize_function, batched=False)

predictions = trainer.predict(tokenized_new_dataset)

logits = predictions.predictions

import numpy as np

predicted_labels = np.argmax(logits, axis=1).tolist()

true_labels = balanced_caa_test_df["label"]

caa_balanced_eval_results = classification_report(true_labels, predicted_labels, output_dict=True)



Map:   0%|          | 0/105 [00:00<?, ? examples/s]

# Tested on CAA

In [40]:
new_text_list = caa_test_df["text"].tolist()

new_dataset = Dataset.from_dict({"text": new_text_list})

tokenized_new_dataset = new_dataset.map(tokenize_function, batched=False)

predictions = trainer.predict(tokenized_new_dataset)

logits = predictions.predictions

import numpy as np

predicted_labels = np.argmax(logits, axis=1).tolist()

true_labels = caa_test_df["label"]

balanced_caa_caa_eval_results = classification_report(true_labels, predicted_labels, output_dict=True)






Map:   0%|          | 0/694 [00:00<?, ? examples/s]

# Tested on ECCO

In [41]:
new_text_list = ecco_test_df["text"].tolist()

new_dataset = Dataset.from_dict({"text": new_text_list})

tokenized_new_dataset = new_dataset.map(tokenize_function, batched=False)

predictions = trainer.predict(tokenized_new_dataset)

logits = predictions.predictions

import numpy as np

predicted_labels = np.argmax(logits, axis=1).tolist()

true_labels = ecco_test_df["label"]

balanced_caa_ecco_eval_results = classification_report(true_labels, predicted_labels, output_dict=True)




Map:   0%|          | 0/353 [00:00<?, ? examples/s]

# Tested on combined

In [42]:
new_text_list = combined_test_df["text"].tolist()

new_dataset = Dataset.from_dict({"text": new_text_list})

tokenized_new_dataset = new_dataset.map(tokenize_function, batched=False)

predictions = trainer.predict(tokenized_new_dataset)

logits = predictions.predictions

import numpy as np

predicted_labels = np.argmax(logits, axis=1).tolist()

true_labels = combined_test_df["label"]

balanced_caa_combined_eval_results = classification_report(true_labels, predicted_labels, output_dict=True)





Map:   0%|          | 0/1404 [00:00<?, ? examples/s]

# Trained on combined_balanced

In [43]:
model = AutoModelForSequenceClassification.from_pretrained(
    "google-bert/bert-base-multilingual-cased", num_labels=2, id2label=id2label, label2id=label2id
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [44]:
tokenized_combined_balanced_caa = combined_balanced_caa.map(preprocess_function, batched=True)

Map:   0%|          | 0/308 [00:00<?, ? examples/s]

Map:   0%|          | 0/78 [00:00<?, ? examples/s]

In [45]:
training_args = TrainingArguments(
    output_dir = "./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=8,
    weight_decay=0.01,
    evaluation_strategy="epoch")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_combined_balanced_caa["train"],
    eval_dataset=tokenized_combined_balanced_caa["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics)

trainer.train()

trainer.save_model("bert_models_translations/balanced_combined")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.395702,0.871795
2,No log,0.486099,0.705128
3,No log,0.395641,0.807692
4,No log,0.412061,0.820513
5,No log,0.531718,0.833333
6,No log,0.593832,0.833333
7,No log,0.779273,0.833333
8,No log,0.806122,0.833333




In [46]:
new_text_list = combined_balanced_caa_test_df["text"].tolist()

new_dataset = Dataset.from_dict({"text": new_text_list})

tokenized_new_dataset = new_dataset.map(tokenize_function, batched=False)

predictions = trainer.predict(tokenized_new_dataset)

logits = predictions.predictions

import numpy as np

predicted_labels = np.argmax(logits, axis=1).tolist()

true_labels = combined_balanced_caa_test_df["label"]

combined_balanced_eval_results = classification_report(true_labels, predicted_labels, output_dict=True)

Map:   0%|          | 0/78 [00:00<?, ? examples/s]

# Tested on CAA

In [47]:
new_text_list = caa_test_df["text"].tolist()

new_dataset = Dataset.from_dict({"text": new_text_list})

tokenized_new_dataset = new_dataset.map(tokenize_function, batched=False)

predictions = trainer.predict(tokenized_new_dataset)

logits = predictions.predictions

import numpy as np

predicted_labels = np.argmax(logits, axis=1).tolist()

true_labels = caa_test_df["label"]

combined_balanced_caa_eval_results = classification_report(true_labels, predicted_labels, output_dict=True)


Map:   0%|          | 0/694 [00:00<?, ? examples/s]

# Tested on ECCO

In [48]:
new_text_list = ecco_test_df["text"].tolist()

new_dataset = Dataset.from_dict({"text": new_text_list})

tokenized_new_dataset = new_dataset.map(tokenize_function, batched=False)

predictions = trainer.predict(tokenized_new_dataset)

logits = predictions.predictions

import numpy as np

predicted_labels = np.argmax(logits, axis=1).tolist()

true_labels = ecco_test_df["label"]

combined_balanced_ecco_eval_results = classification_report(true_labels, predicted_labels, output_dict=True)



Map:   0%|          | 0/353 [00:00<?, ? examples/s]

# Tested on combined

In [49]:
new_text_list = combined_test_df["text"].tolist()

new_dataset = Dataset.from_dict({"text": new_text_list})

tokenized_new_dataset = new_dataset.map(tokenize_function, batched=False)

predictions = trainer.predict(tokenized_new_dataset)

logits = predictions.predictions

import numpy as np

predicted_labels = np.argmax(logits, axis=1).tolist()

true_labels = combined_test_df["label"]

combined_balanced_combined_eval_results = classification_report(true_labels, predicted_labels, output_dict=True)




Map:   0%|          | 0/1404 [00:00<?, ? examples/s]

# Tested on CAA balanced

In [50]:
new_text_list = combined_balanced_caa_test_df["text"].tolist()

new_dataset = Dataset.from_dict({"text": new_text_list})

tokenized_new_dataset = new_dataset.map(tokenize_function, batched=False)

predictions = trainer.predict(tokenized_new_dataset)

logits = predictions.predictions

import numpy as np

predicted_labels = np.argmax(logits, axis=1).tolist()

true_labels = combined_balanced_caa_test_df["label"]

combined_balanced_caa_balanced_eval_results = classification_report(true_labels, predicted_labels, output_dict=True)


Map:   0%|          | 0/78 [00:00<?, ? examples/s]

In [56]:
def list_eval_results_dictionaries():
    return {name: value for name, value in globals().items() if isinstance(value, dict) and 'eval_results' in name}

eval_result_dictionaries = list_eval_results_dictionaries()

dict_list = [{'DictName': name, ** eval_result_dictionaries[name]} for name in eval_result_dictionaries]

dict_df = pd.DataFrame(dict_list)

#dict_df = dict_df[['DictName'] + sorted(dict_df.columns.drop('DictName'),tolist())]


def expand_dict_columns(df):
    dict_columns = [col for col in df.columns if isinstance(df[col][0], dict)]
    expanded_cols = []
    for col in dict_columns:
        expanded = pd.json_normalize(df[col])
        expanded.columns = [f"{col}_{key}" for key in expanded.columns]
        expanded_cols.append(expanded)
    df = df.drop(columns=dict_columns)
    if expanded_cols:
        expanded_cols_df = pd.concat(expanded_cols, axis=1)
        df = pd.concat([df, expanded_cols_df], axis=1)
    return df


dff = expand_dict_columns(dict_df)

dff

Unnamed: 0,DictName,accuracy,0_precision,0_recall,0_f1-score,0_support,1_precision,1_recall,1_f1-score,1_support,macro avg_precision,macro avg_recall,macro avg_f1-score,macro avg_support,weighted avg_precision,weighted avg_recall,weighted avg_f1-score,weighted avg_support
0,eval_results_caa,0.975504,0.987915,0.986425,0.98717,663.0,0.71875,0.741935,0.730159,31.0,0.853333,0.86418,0.858664,694.0,0.975892,0.975504,0.975689,694.0
1,caa_ecco_eval_results,0.745042,0.591346,0.960938,0.732143,128.0,0.965517,0.622222,0.756757,225.0,0.778432,0.79158,0.74445,353.0,0.82984,0.745042,0.747832,353.0
2,caa_combined_eval_results,0.865385,0.828837,0.99442,0.90411,896.0,0.984802,0.637795,0.774194,508.0,0.90682,0.816107,0.839152,1404.0,0.885269,0.865385,0.857103,1404.0
3,caa_caa_balanced_eval_results,0.971429,0.96,1.0,0.979592,72.0,1.0,0.909091,0.952381,33.0,0.98,0.954545,0.965986,105.0,0.972571,0.971429,0.97104,105.0
4,ecco_eval_results,0.957507,0.97479,0.90625,0.939271,128.0,0.948718,0.986667,0.96732,225.0,0.961754,0.946458,0.953296,353.0,0.958172,0.957507,0.95715,353.0
5,ecco_caa_eval_results,0.65562,0.990741,0.645551,0.781735,663.0,0.103053,0.870968,0.1843,31.0,0.546897,0.758259,0.483018,694.0,0.951089,0.65562,0.755049,694.0
6,ecco_combined_eval_results,0.825499,0.99095,0.733259,0.842848,896.0,0.677463,0.988189,0.803843,508.0,0.834207,0.860724,0.823346,1404.0,0.877523,0.825499,0.828735,1404.0
7,ecco_caa_balanced_eval_results,0.609524,0.918919,0.472222,0.623853,72.0,0.441176,0.909091,0.594059,33.0,0.680048,0.690657,0.608956,105.0,0.768771,0.609524,0.614489,105.0
8,combined_eval_results,0.972934,0.985294,0.972098,0.978652,896.0,0.951923,0.974409,0.963035,508.0,0.968609,0.973254,0.970843,1404.0,0.97322,0.972934,0.973001,1404.0
9,combined_caa_eval_results,0.991354,0.995475,0.995475,0.995475,663.0,0.903226,0.903226,0.903226,31.0,0.94935,0.94935,0.94935,694.0,0.991354,0.991354,0.991354,694.0


In [64]:
dff.to_csv('../results/bert-translation-task.csv', index=False)