In [3]:
import pandas as pd

def _read_tsv(path):
    """Read one tab-separated split file into a DataFrame."""
    return pd.read_csv(path, sep="\t")


# German splits (kept with all of their columns).
de_train = _read_tsv("./train_dev_test_splits/de.train.csv")
de_val = _read_tsv("./train_dev_test_splits/de.valid.csv")
de_test = _read_tsv("./train_dev_test_splits/de.test.csv")

# French splits.
fr_train = _read_tsv("./train_dev_test_splits/fr.train.csv")
fr_val = _read_tsv("./train_dev_test_splits/fr.valid.csv")
fr_test = _read_tsv("./train_dev_test_splits/fr.test.csv")

# Every column of the French training frame except the text ("content")
# and the target ("e1") is scheduled for removal.
drop_cols = fr_train.columns.tolist()
for kept_column in ("content", "e1"):
    drop_cols.remove(kept_column)

# Apply the same reduction to all three French frames and expose the
# target column under the name "label".
fr_train, fr_val, fr_test = [
    frame.drop(columns=drop_cols).rename(columns={'e1': 'label'})
    for frame in (fr_train, fr_val, fr_test)
]

In [4]:
from datasets import Dataset

# Rebind each pandas frame to its Hugging Face Dataset counterpart,
# one language at a time.
de_train, de_val, de_test = map(Dataset.from_pandas, (de_train, de_val, de_test))

fr_train, fr_val, fr_test = map(Dataset.from_pandas, (fr_train, fr_val, fr_test))

We will reuse the existing train/validation/test splits (the same ones the original authors use) instead of creating our own.

In [6]:
from datasets import DatasetDict

# Bundle the three splits of each language into a single DatasetDict.
train_valid_test_dataset_de = DatasetDict(
    dict(train=de_train, validation=de_val, test=de_test)
)

train_valid_test_dataset_fr = DatasetDict(
    dict(train=fr_train, validation=fr_val, test=fr_test)
)

In [7]:
# Both languages currently point at the same multilingual checkpoint.
# A French-specific alternative for model_name_fr would be "camembert-base".
model_name_de = model_name_fr = "distilbert-base-multilingual-cased"

In [8]:
from transformers import AutoTokenizer

# Load the tokenizer belonging to the checkpoint named by model_name_fr.
tokenizer = AutoTokenizer.from_pretrained(model_name_fr)


Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

In [9]:
def preprocess_function(sample):
    """Tokenize the ``content`` field of ``sample`` with truncation enabled.

    Args:
        sample: Mapping that provides a ``"content"`` entry (the text to
            tokenize).

    Returns:
        Whatever the module-level ``tokenizer`` returns for that text.
    """
    texts = sample["content"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [10]:
# Apply preprocess_function to every split of the French DatasetDict;
# batched=True makes map() pass groups of rows instead of single rows.
tokenized_dataset = train_valid_test_dataset_fr.map(preprocess_function, batched=True)

Map:   0%|          | 0/2178 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [11]:
from transformers import AutoModelForSequenceClassification

# Load the checkpoint named by model_name_fr with a sequence-classification head of three outputs.
# NOTE(review): num_labels=3 is hard-coded; presumably it equals the number of distinct
# values in the "label" column — confirm against the data.
model = AutoModelForSequenceClassification.from_pretrained(model_name_fr, num_labels=3)

Downloading pytorch_model.bin:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-multilingual-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.we

In [12]:
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
from datasets import load_metric
import numpy as np

# F1 metric object consumed by compute_metrics.
# NOTE(review): load_metric appears to be deprecated in newer `datasets` releases
# (the separate `evaluate` package is the suggested replacement) — confirm before upgrading.
metric = load_metric("f1")

def compute_metrics(eval_pred):
    """Compute the macro-averaged F1 score for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class of each
            row is the arg-max over the last axis of ``logits``.

    Returns:
        The result of ``metric.compute`` for the predicted class ids
        against ``labels``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # The classifier is built with three labels, and the F1 metric's default
    # binary averaging rejects multiclass targets, so an explicit average is
    # required. "macro" matches the sklearn scores printed for the test set.
    return metric.compute(
        predictions=predictions, references=labels, average="macro"
    )

# Collator that pads each batch with the help of the tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Fine-tuning configuration.
training_args = TrainingArguments(
    output_dir="./results",
    # optimisation
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    # batching
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # validate and checkpoint once per epoch, then reload the best checkpoint
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Trainer wired to the train / validation splits of the tokenized dataset.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
)

  metric = load_metric("f1")


Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

In [13]:
# Start fine-tuning with the model, data and arguments held by `trainer`.
trainer.train()



  0%|          | 0/411 [00:00<?, ?it/s]

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
# Persist the trainer's model; no path is given, so it presumably lands in the
# configured output_dir ("./results") — confirm.
trainer.save_model()

In [None]:
# Run the trainer's prediction pass over the tokenized test split; the returned
# value is displayed as this cell's output.
trainer.predict(test_dataset=tokenized_dataset["test"])

In [None]:
import torch

# Predict one class id per test document with the fine-tuned model.
model.eval()  # switch off dropout so repeated runs give the same predictions

y_pred = []
with torch.no_grad():  # inference only: no gradients need to be tracked
    for text in tokenized_dataset['test']['content']:
        # Truncate exactly like preprocess_function does, so long documents
        # stay within the model's maximum input length, and move the tensors
        # to the model's device (the Trainer may have placed the model on GPU).
        encoded = tokenizer(text, truncation=True, return_tensors="pt").to(model.device)
        logits = model(**encoded).logits
        # Store a plain Python int (not a 0-d tensor) so the list can be fed
        # directly to sklearn; the ids range over the model's three classes.
        y_pred.append(int(logits.argmax(dim=-1).item()))

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

# Gold labels of the test split, compared against the predictions in y_pred.
y_test = tokenized_dataset['test']['label']

print(confusion_matrix(y_test, y_pred))
print('Accuracy: ', accuracy_score(y_test, y_pred))

# The remaining scores share the same call shape and use macro averaging.
for heading, scorer in (
    ('Precision: ', precision_score),
    ('Recall: ', recall_score),
    ('F1: ', f1_score),
):
    print(heading, scorer(y_test, y_pred, average='macro'))