In [2]:
import pandas as pd

from datasets import Dataset, load_dataset, ClassLabel, DatasetDict

In [3]:
ecco_train_df = pd.read_csv('../data/multilingual-task-data/ecco_train_no_dupl.csv')
ecco_test_df = pd.read_csv('../data/multilingual-task-data/ecco_test_no_dupl.csv')

ecco_train_df.rename(columns={"monolingual": "label","ecco_full_title": "text"}, inplace=True)
ecco_test_df.rename(columns={"monolingual": "label","ecco_full_title": "text"}, inplace=True)

ecco_train_dataset = Dataset.from_pandas(ecco_train_df)
ecco_test_dataset = Dataset.from_pandas(ecco_test_df)

ecco = DatasetDict({"train": ecco_train_dataset,
                       "test": ecco_test_dataset,})

In [4]:
caa_train_df = pd.read_csv('../data/multilingual-task-data/caa_train_df.csv')
caa_test_df = pd.read_csv('../data/multilingual-task-data/caa_test_df.csv')

caa_train_df.rename(columns={"monolingual": "label", "title": "text"}, inplace=True)
caa_test_df.rename(columns={"monolingual": "label", "title": "text"}, inplace=True)

caa_train_dataset = Dataset.from_pandas(caa_train_df)
caa_test_dataset = Dataset.from_pandas(caa_test_df)

caa = DatasetDict({"train": caa_train_dataset,
                       "test": caa_test_dataset,})

In [5]:
balanced_caa_train_df = pd.read_csv('../data/multilingual-task-data/balanced_caa_train_df.csv')
balanced_caa_test_df = pd.read_csv('../data/multilingual-task-data/balanced_caa_test_df.csv')

balanced_caa_train_df.rename(columns={"monolingual": "label", "title": "text"}, inplace=True)
balanced_caa_test_df.rename(columns={"monolingual": "label", "title": "text"}, inplace=True)

balanced_caa_train_dataset = Dataset.from_pandas(balanced_caa_train_df)
balanced_caa_test_dataset = Dataset.from_pandas(balanced_caa_test_df)

balanced_caa = DatasetDict({"train": balanced_caa_train_dataset,
                       "test": balanced_caa_test_dataset,})

In [6]:
combined_train_df = pd.read_csv('../data/multilingual-task-data/combined_train_no_dupl.csv')
combined_test_df = pd.read_csv('../data/multilingual-task-data/combined_test_no_dupl.csv')

combined_train_df.rename(columns={"monolingual": "label", "title": "text"}, inplace=True)
combined_test_df.rename(columns={"monolingual": "label", "title": "text"}, inplace=True)

combined_train_dataset = Dataset.from_pandas(combined_train_df)
combined_test_dataset = Dataset.from_pandas(combined_test_df)

combined = DatasetDict({"train": combined_train_dataset,
                       "test": combined_test_dataset,})

In [7]:
combined_balanced_caa_train_df = pd.read_csv('../data/multilingual-task-data/balanced_data_both_language_train_df.csv')
combined_balanced_caa_test_df = pd.read_csv('../data/multilingual-task-data/balanced_data_both_language_test_df.csv')

combined_balanced_caa_train_df.rename(columns={"monolingual": "label", "title": "text"}, inplace=True)
combined_balanced_caa_test_df.rename(columns={"monolingual": "label", "title": "text"}, inplace=True)

combined_balanced_caa_train_dataset = Dataset.from_pandas(combined_balanced_caa_train_df)
combined_balanced_caa_test_dataset = Dataset.from_pandas(combined_balanced_caa_test_df)

combined_balanced_caa = DatasetDict({"train": combined_balanced_caa_train_dataset,
                       "test": combined_balanced_caa_test_dataset,})

# Trained on CAA - Standard BERT model

In [8]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-multilingual-cased")



In [9]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [10]:
tokenized_caa = caa.map(preprocess_function, batched=True)

Map:   0%|          | 0/2928 [00:00<?, ? examples/s]

Map:   0%|          | 0/732 [00:00<?, ? examples/s]

In [11]:
import evaluate

accuracy = evaluate.load("accuracy")

In [12]:
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [13]:
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

In [14]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-multilingual-cased", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
import torch
torch.cuda.empty_cache()

In [16]:
training_args = TrainingArguments(
     output_dir = "./results_mono",
     learning_rate=2e-5,
     per_device_train_batch_size=8,
     per_device_eval_batch_size=8,
     num_train_epochs=5,
     weight_decay=0.01,
     evaluation_strategy="epoch")

trainer = Trainer(
     model=model,
     args=training_args,
     train_dataset=tokenized_caa["train"],
     eval_dataset=tokenized_caa["test"],
     tokenizer=tokenizer,
     data_collator=data_collator,
     compute_metrics=compute_metrics)

trainer.train()

trainer.save_model("bert_models_monolingual/caa")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.140605,0.967213
2,No log,0.114863,0.969945
3,0.153500,0.120606,0.965847
4,0.153500,0.146879,0.961749
5,0.153500,0.134911,0.964481




In [17]:
#model = AutoModelForSequenceClassification.from_pretrained('bert_models_monolingual/caa/', local_files_only=True)

#model = model.to('cuda')

#trainer = Trainer(model=model)

#trainer.model = model.cuda()

In [18]:
from sklearn.metrics import classification_report

max_length = 512

def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=max_length)

new_text_list = caa_test_df["text"].tolist()

new_dataset = Dataset.from_dict({"text": new_text_list})

tokenized_new_dataset = new_dataset.map(tokenize_function, batched=False)

predictions = trainer.predict(tokenized_new_dataset)

logits = predictions.predictions

import numpy as np

predicted_labels = np.argmax(logits, axis=1).tolist()

true_labels = caa_test_df["label"]

eval_results_caa = classification_report(true_labels, predicted_labels, output_dict=True)

Map:   0%|          | 0/732 [00:00<?, ? examples/s]

In [19]:
eval_results_caa

{'0': {'precision': 0.5882352941176471,
  'recall': 0.625,
  'f1-score': 0.6060606060606061,
  'support': 32.0},
 '1': {'precision': 0.9828080229226361,
  'recall': 0.98,
  'f1-score': 0.9814020028612304,
  'support': 700.0},
 'accuracy': 0.9644808743169399,
 'macro avg': {'precision': 0.7855216585201417,
  'recall': 0.8025,
  'f1-score': 0.7937313044609182,
  'support': 732.0},
 'weighted avg': {'precision': 0.965558941881981,
  'recall': 0.9644808743169399,
  'f1-score': 0.964993635787979,
  'support': 732.0}}

# tested on ECCO

In [20]:
new_text_list = ecco_test_df["text"].tolist()

new_dataset = Dataset.from_dict({"text": new_text_list})

tokenized_new_dataset = new_dataset.map(tokenize_function, batched=False)

predictions = trainer.predict(tokenized_new_dataset)

logits = predictions.predictions

import numpy as np

predicted_labels = np.argmax(logits, axis=1).tolist()

true_labels = ecco_test_df["label"]

caa_ecco_eval_results = classification_report(true_labels, predicted_labels, output_dict=True)

Map:   0%|          | 0/463 [00:00<?, ? examples/s]

In [21]:
caa_ecco_eval_results

{'0': {'precision': 0.47619047619047616,
  'recall': 0.18867924528301888,
  'f1-score': 0.2702702702702703,
  'support': 106.0},
 '1': {'precision': 0.7957244655581948,
  'recall': 0.938375350140056,
  'f1-score': 0.8611825192802056,
  'support': 357.0},
 'accuracy': 0.7667386609071274,
 'macro avg': {'precision': 0.6359574708743354,
  'recall': 0.5635272977115374,
  'f1-score': 0.565726394775238,
  'support': 463.0},
 'weighted avg': {'precision': 0.7225698157245487,
  'recall': 0.7667386609071274,
  'f1-score': 0.7258980735025531,
  'support': 463.0}}

# Tested on combined

In [22]:
new_text_list = combined_test_df["text"].tolist()

new_dataset = Dataset.from_dict({"text": new_text_list})

tokenized_new_dataset = new_dataset.map(tokenize_function, batched=False)

predictions = trainer.predict(tokenized_new_dataset)

logits = predictions.predictions

import numpy as np

predicted_labels = np.argmax(logits, axis=1).tolist()

true_labels = combined_test_df["label"]

caa_combined_eval_results = classification_report(true_labels, predicted_labels, output_dict=True)

Map:   0%|          | 0/1195 [00:00<?, ? examples/s]

In [23]:
caa_combined_eval_results

{'0': {'precision': 0.691358024691358,
  'recall': 0.35,
  'f1-score': 0.46473029045643155,
  'support': 160.0},
 '1': {'precision': 0.9066427289048474,
  'recall': 0.9758454106280193,
  'f1-score': 0.9399720800372267,
  'support': 1035.0},
 'accuracy': 0.8920502092050209,
 'macro avg': {'precision': 0.7990003767981027,
  'recall': 0.6629227053140097,
  'f1-score': 0.7023511852468292,
  'support': 1195.0},
 'weighted avg': {'precision': 0.8778179986335852,
  'recall': 0.8920502092050209,
  'f1-score': 0.8763413801770366,
  'support': 1195.0}}

# tested on caa balanced

In [24]:
new_text_list = balanced_caa_test_df["text"].tolist()

new_dataset = Dataset.from_dict({"text": new_text_list})

tokenized_new_dataset = new_dataset.map(tokenize_function, batched=False)

predictions = trainer.predict(tokenized_new_dataset)

logits = predictions.predictions

import numpy as np

predicted_labels = np.argmax(logits, axis=1).tolist()

true_labels = balanced_caa_test_df["label"]

caa_caa_balanced_eval_results = classification_report(true_labels, predicted_labels, output_dict=True)

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

# Trained on ECCO

In [25]:
model = AutoModelForSequenceClassification.from_pretrained(
    "google-bert/bert-base-multilingual-cased", num_labels=2, id2label=id2label, label2id=label2id
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
tokenized_ecco = ecco.map(preprocess_function, batched=True)

Map:   0%|          | 0/1852 [00:00<?, ? examples/s]

Map:   0%|          | 0/463 [00:00<?, ? examples/s]

In [27]:
import torch
torch.cuda.empty_cache()

training_args = TrainingArguments(
    output_dir = "./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    evaluation_strategy="epoch")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ecco["train"],
    eval_dataset=tokenized_ecco["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics)

trainer.train()

trainer.save_model("bert_models_monolingual/ecco")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.283713,0.917927
2,No log,0.268061,0.930886
3,No log,0.287628,0.930886
4,No log,0.3227,0.928726
5,0.207300,0.32304,0.928726




In [28]:
new_text_list = ecco_test_df["text"].tolist()

new_dataset = Dataset.from_dict({"text": new_text_list})

tokenized_new_dataset = new_dataset.map(tokenize_function, batched=False)

predictions = trainer.predict(tokenized_new_dataset)

logits = predictions.predictions

import numpy as np

predicted_labels = np.argmax(logits, axis=1).tolist()

true_labels = ecco_test_df["label"]

ecco_eval_results = classification_report(true_labels, predicted_labels, output_dict=True)


Map:   0%|          | 0/463 [00:00<?, ? examples/s]

# Tested on CAA

In [29]:
new_text_list = caa_test_df["text"].tolist()

new_dataset = Dataset.from_dict({"text": new_text_list})

tokenized_new_dataset = new_dataset.map(tokenize_function, batched=False)

predictions = trainer.predict(tokenized_new_dataset)

logits = predictions.predictions

import numpy as np

predicted_labels = np.argmax(logits, axis=1).tolist()

true_labels = caa_test_df["label"]

ecco_caa_eval_results = classification_report(true_labels, predicted_labels, output_dict=True)





Map:   0%|          | 0/732 [00:00<?, ? examples/s]

# Tested on combined 

In [30]:
new_text_list = combined_test_df["text"].tolist()

new_dataset = Dataset.from_dict({"text": new_text_list})

tokenized_new_dataset = new_dataset.map(tokenize_function, batched=False)

predictions = trainer.predict(tokenized_new_dataset)

logits = predictions.predictions

import numpy as np

predicted_labels = np.argmax(logits, axis=1).tolist()

true_labels = combined_test_df["label"]

ecco_combined_eval_results = classification_report(true_labels, predicted_labels, output_dict=True)



Map:   0%|          | 0/1195 [00:00<?, ? examples/s]

# Tested on balanced

In [31]:
new_text_list = balanced_caa_test_df["text"].tolist()

new_dataset = Dataset.from_dict({"text": new_text_list})

tokenized_new_dataset = new_dataset.map(tokenize_function, batched=False)

predictions = trainer.predict(tokenized_new_dataset)

logits = predictions.predictions

import numpy as np

predicted_labels = np.argmax(logits, axis=1).tolist()

true_labels = balanced_caa_test_df["label"]

ecco_caa_balanced_eval_results = classification_report(true_labels, predicted_labels, output_dict=True)




Map:   0%|          | 0/40 [00:00<?, ? examples/s]

# Trained on both

In [32]:
model = AutoModelForSequenceClassification.from_pretrained(
    "google-bert/bert-base-multilingual-cased", num_labels=2, id2label=id2label, label2id=label2id
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
tokenized_combined = combined.map(preprocess_function, batched=True)

Map:   0%|          | 0/4778 [00:00<?, ? examples/s]

Map:   0%|          | 0/1195 [00:00<?, ? examples/s]

In [34]:
import torch
torch.cuda.empty_cache()

training_args = TrainingArguments(
    output_dir = "./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    evaluation_strategy="epoch")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_combined["train"],
    eval_dataset=tokenized_combined["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics)

trainer.train()

trainer.save_model("bert_models_monolingual/combined")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.196145,0.937238
2,0.239300,0.194198,0.945607
3,0.239300,0.198522,0.949791
4,0.123200,0.211805,0.94477
5,0.123200,0.243415,0.943096




In [35]:
new_text_list = combined_test_df["text"].tolist()

new_dataset = Dataset.from_dict({"text": new_text_list})

tokenized_new_dataset = new_dataset.map(tokenize_function, batched=False)

predictions = trainer.predict(tokenized_new_dataset)

logits = predictions.predictions

import numpy as np

predicted_labels = np.argmax(logits, axis=1).tolist()

true_labels = combined_test_df["label"]

combined_eval_results = classification_report(true_labels, predicted_labels, output_dict=True)


Map:   0%|          | 0/1195 [00:00<?, ? examples/s]

# Tested on CAA

In [36]:
new_text_list = caa_test_df["text"].tolist()

new_dataset = Dataset.from_dict({"text": new_text_list})

tokenized_new_dataset = new_dataset.map(tokenize_function, batched=False)

predictions = trainer.predict(tokenized_new_dataset)

logits = predictions.predictions

import numpy as np

predicted_labels = np.argmax(logits, axis=1).tolist()

true_labels = caa_test_df["label"]

combined_caa_eval_results = classification_report(true_labels, predicted_labels, output_dict=True)


Map:   0%|          | 0/732 [00:00<?, ? examples/s]

# Tested on ECCO

In [37]:
new_text_list = ecco_test_df["text"].tolist()

new_dataset = Dataset.from_dict({"text": new_text_list})

tokenized_new_dataset = new_dataset.map(tokenize_function, batched=False)

predictions = trainer.predict(tokenized_new_dataset)

logits = predictions.predictions

import numpy as np

predicted_labels = np.argmax(logits, axis=1).tolist()

true_labels = ecco_test_df["label"]

combined_ecco_eval_results = classification_report(true_labels, predicted_labels, output_dict=True)



Map:   0%|          | 0/463 [00:00<?, ? examples/s]

# Tested on CAA balanced

In [38]:
new_text_list = balanced_caa_test_df["text"].tolist()

new_dataset = Dataset.from_dict({"text": new_text_list})

tokenized_new_dataset = new_dataset.map(tokenize_function, batched=False)

predictions = trainer.predict(tokenized_new_dataset)

logits = predictions.predictions

import numpy as np

predicted_labels = np.argmax(logits, axis=1).tolist()

true_labels = balanced_caa_test_df["label"]

combined_caa_balanced_eval_results = classification_report(true_labels, predicted_labels, output_dict=True)




Map:   0%|          | 0/40 [00:00<?, ? examples/s]

# Trained on CAA Balanced

In [39]:
model = AutoModelForSequenceClassification.from_pretrained(
    "google-bert/bert-base-multilingual-cased", num_labels=2, id2label=id2label, label2id=label2id
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [40]:
tokenized_balanced_caa = balanced_caa.map(preprocess_function, batched=True)

Map:   0%|          | 0/160 [00:00<?, ? examples/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

In [41]:
import torch
torch.cuda.empty_cache()

training_args = TrainingArguments(
    output_dir = "./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=8,
    weight_decay=0.01,
    evaluation_strategy="epoch")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_balanced_caa["train"],
    eval_dataset=tokenized_balanced_caa["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics)

trainer.train()

trainer.save_model("bert_models_monolingual/balanced_caa/")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.695817,0.425
2,No log,0.574828,0.75
3,No log,0.540289,0.7
4,No log,0.590447,0.7
5,No log,0.625232,0.75
6,No log,0.824046,0.675
7,No log,0.728272,0.75
8,No log,0.745822,0.75




In [42]:
new_text_list = balanced_caa_test_df["text"].tolist()

new_dataset = Dataset.from_dict({"text": new_text_list})

tokenized_new_dataset = new_dataset.map(tokenize_function, batched=False)

predictions = trainer.predict(tokenized_new_dataset)

logits = predictions.predictions

import numpy as np

predicted_labels = np.argmax(logits, axis=1).tolist()

true_labels = balanced_caa_test_df["label"]

caa_balanced_eval_results = classification_report(true_labels, predicted_labels, output_dict=True)



Map:   0%|          | 0/40 [00:00<?, ? examples/s]

# Tested on CAA

In [43]:
new_text_list = caa_test_df["text"].tolist()

new_dataset = Dataset.from_dict({"text": new_text_list})

tokenized_new_dataset = new_dataset.map(tokenize_function, batched=False)

predictions = trainer.predict(tokenized_new_dataset)

logits = predictions.predictions

import numpy as np

predicted_labels = np.argmax(logits, axis=1).tolist()

true_labels = caa_test_df["label"]

balanced_caa_caa_eval_results = classification_report(true_labels, predicted_labels, output_dict=True)






Map:   0%|          | 0/732 [00:00<?, ? examples/s]

# Tested on ECCO

In [44]:
new_text_list = ecco_test_df["text"].tolist()

new_dataset = Dataset.from_dict({"text": new_text_list})

tokenized_new_dataset = new_dataset.map(tokenize_function, batched=False)

predictions = trainer.predict(tokenized_new_dataset)

logits = predictions.predictions

import numpy as np

predicted_labels = np.argmax(logits, axis=1).tolist()

true_labels = ecco_test_df["label"]

balanced_caa_ecco_eval_results = classification_report(true_labels, predicted_labels, output_dict=True)




Map:   0%|          | 0/463 [00:00<?, ? examples/s]

# Tested on combined

In [45]:
new_text_list = combined_test_df["text"].tolist()

new_dataset = Dataset.from_dict({"text": new_text_list})

tokenized_new_dataset = new_dataset.map(tokenize_function, batched=False)

predictions = trainer.predict(tokenized_new_dataset)

logits = predictions.predictions

import numpy as np

predicted_labels = np.argmax(logits, axis=1).tolist()

true_labels = combined_test_df["label"]

balanced_caa_combined_eval_results = classification_report(true_labels, predicted_labels, output_dict=True)





Map:   0%|          | 0/1195 [00:00<?, ? examples/s]

# Trained on combined_balanced

In [46]:
model = AutoModelForSequenceClassification.from_pretrained(
    "google-bert/bert-base-multilingual-cased", num_labels=2, id2label=id2label, label2id=label2id
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [47]:
tokenized_combined_balanced_caa = combined_balanced_caa.map(preprocess_function, batched=True)

Map:   0%|          | 0/308 [00:00<?, ? examples/s]

Map:   0%|          | 0/78 [00:00<?, ? examples/s]

In [48]:
import torch
torch.cuda.empty_cache()

training_args = TrainingArguments(
    output_dir = "./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=8,
    weight_decay=0.01,
    evaluation_strategy="epoch")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_combined_balanced_caa["train"],
    eval_dataset=tokenized_combined_balanced_caa["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics)

trainer.train()

trainer.save_model("bert_models_monolingual/balanced_combined/")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.686123,0.512821
2,No log,0.727558,0.551282
3,No log,0.67252,0.564103
4,No log,0.678008,0.628205
5,No log,0.714965,0.602564
6,No log,0.609603,0.641026
7,No log,0.67756,0.628205
8,No log,0.673988,0.615385




In [49]:
new_text_list = combined_balanced_caa_test_df["text"].tolist()

new_dataset = Dataset.from_dict({"text": new_text_list})

tokenized_new_dataset = new_dataset.map(tokenize_function, batched=False)

predictions = trainer.predict(tokenized_new_dataset)

logits = predictions.predictions

import numpy as np

predicted_labels = np.argmax(logits, axis=1).tolist()

true_labels = combined_balanced_caa_test_df["label"]

combined_balanced_eval_results = classification_report(true_labels, predicted_labels, output_dict=True)

Map:   0%|          | 0/78 [00:00<?, ? examples/s]

# Tested on CAA

In [50]:
new_text_list = caa_test_df["text"].tolist()

new_dataset = Dataset.from_dict({"text": new_text_list})

tokenized_new_dataset = new_dataset.map(tokenize_function, batched=False)

predictions = trainer.predict(tokenized_new_dataset)

logits = predictions.predictions

import numpy as np

predicted_labels = np.argmax(logits, axis=1).tolist()

true_labels = caa_test_df["label"]

combined_balanced_caa_eval_results = classification_report(true_labels, predicted_labels, output_dict=True)


Map:   0%|          | 0/732 [00:00<?, ? examples/s]

# Tested on ECCO

In [51]:
new_text_list = ecco_test_df["text"].tolist()

new_dataset = Dataset.from_dict({"text": new_text_list})

tokenized_new_dataset = new_dataset.map(tokenize_function, batched=False)

predictions = trainer.predict(tokenized_new_dataset)

logits = predictions.predictions

import numpy as np

predicted_labels = np.argmax(logits, axis=1).tolist()

true_labels = ecco_test_df["label"]

combined_balanced_ecco_eval_results = classification_report(true_labels, predicted_labels, output_dict=True)



Map:   0%|          | 0/463 [00:00<?, ? examples/s]

# Tested on combined

In [52]:
new_text_list = combined_test_df["text"].tolist()

new_dataset = Dataset.from_dict({"text": new_text_list})

tokenized_new_dataset = new_dataset.map(tokenize_function, batched=False)

predictions = trainer.predict(tokenized_new_dataset)

logits = predictions.predictions

import numpy as np

predicted_labels = np.argmax(logits, axis=1).tolist()

true_labels = combined_test_df["label"]

combined_balanced_combined_eval_results = classification_report(true_labels, predicted_labels, output_dict=True)




Map:   0%|          | 0/1195 [00:00<?, ? examples/s]

# Tested on CAA balanced

In [53]:
new_text_list = combined_balanced_caa_test_df["text"].tolist()

new_dataset = Dataset.from_dict({"text": new_text_list})

tokenized_new_dataset = new_dataset.map(tokenize_function, batched=False)

predictions = trainer.predict(tokenized_new_dataset)

logits = predictions.predictions

import numpy as np

predicted_labels = np.argmax(logits, axis=1).tolist()

true_labels = combined_balanced_caa_test_df["label"]

combined_balanced_caa_balanced_eval_results = classification_report(true_labels, predicted_labels, output_dict=True)


Map:   0%|          | 0/78 [00:00<?, ? examples/s]

In [59]:
def list_eval_results_dictionaries():
    return {name: value for name, value in globals().items() if isinstance(value, dict) and 'eval_results' in name}

eval_result_dictionaries = list_eval_results_dictionaries()

dict_list = [{'DictName': name, ** eval_result_dictionaries[name]} for name in eval_result_dictionaries]

dict_df = pd.DataFrame(dict_list)

#dict_df = dict_df[['DictName'] + sorted(dict_df.columns.drop('DictName'),tolist())]


def expand_dict_columns(df):
    dict_columns = [col for col in df.columns if isinstance(df[col][0], dict)]
    expanded_cols = []
    for col in dict_columns:
        expanded = pd.json_normalize(df[col])
        expanded.columns = [f"{col}_{key}" for key in expanded.columns]
        expanded_cols.append(expanded)
    df = df.drop(columns=dict_columns)
    if expanded_cols:
        expanded_cols_df = pd.concat(expanded_cols, axis=1)
        df = pd.concat([df, expanded_cols_df], axis=1)
    return df


dff = expand_dict_columns(dict_df)

dff

Unnamed: 0,DictName,accuracy,0_precision,0_recall,0_f1-score,0_support,1_precision,1_recall,1_f1-score,1_support,macro avg_precision,macro avg_recall,macro avg_f1-score,macro avg_support,weighted avg_precision,weighted avg_recall,weighted avg_f1-score,weighted avg_support
0,eval_results_caa,0.964481,0.588235,0.625,0.606061,32.0,0.982808,0.98,0.981402,700.0,0.785522,0.8025,0.793731,732.0,0.965559,0.964481,0.964994,732.0
1,caa_ecco_eval_results,0.766739,0.47619,0.188679,0.27027,106.0,0.795724,0.938375,0.861183,357.0,0.635957,0.563527,0.565726,463.0,0.72257,0.766739,0.725898,463.0
2,caa_combined_eval_results,0.89205,0.691358,0.35,0.46473,160.0,0.906643,0.975845,0.939972,1035.0,0.799,0.662923,0.702351,1195.0,0.877818,0.89205,0.876341,1195.0
3,caa_caa_balanced_eval_results,0.85,0.9,0.642857,0.75,14.0,0.833333,0.961538,0.892857,26.0,0.866667,0.802198,0.821429,40.0,0.856667,0.85,0.842857,40.0
4,ecco_eval_results,0.928726,0.884211,0.792453,0.835821,106.0,0.940217,0.969188,0.954483,357.0,0.912214,0.88082,0.895152,463.0,0.927395,0.928726,0.927316,463.0
5,ecco_caa_eval_results,0.748634,0.104167,0.625,0.178571,32.0,0.977778,0.754286,0.851613,700.0,0.540972,0.689643,0.515092,732.0,0.939587,0.748634,0.82219,732.0
6,ecco_combined_eval_results,0.823431,0.417476,0.80625,0.550107,160.0,0.965011,0.826087,0.890161,1035.0,0.691244,0.816168,0.720134,1195.0,0.891701,0.823431,0.844631,1195.0
7,ecco_caa_balanced_eval_results,0.55,0.375,0.428571,0.4,14.0,0.666667,0.615385,0.64,26.0,0.520833,0.521978,0.52,40.0,0.564583,0.55,0.556,40.0
8,combined_eval_results,0.943096,0.815068,0.74375,0.777778,160.0,0.960915,0.973913,0.96737,1035.0,0.887992,0.858832,0.872574,1195.0,0.941388,0.943096,0.941986,1195.0
9,combined_caa_eval_results,0.983607,0.833333,0.78125,0.806452,32.0,0.990028,0.992857,0.991441,700.0,0.911681,0.887054,0.898946,732.0,0.983178,0.983607,0.983354,732.0


In [60]:
dff.to_csv('../results/bert-multilingual-task', index=False)