In [1]:
import pandas as pd
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import confusion_matrix
import torch
from datasets import Dataset, DatasetDict
import evaluate
import numpy as np
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [8]:
clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return clf_metrics.compute(predictions=predictions, references=labels)

In [2]:
df = pd.read_csv('entities.csv', sep=',')
df.rename(columns={'chef':'label', 'texte':'text'}, inplace=True)
df.label = df.label.astype('int')
df

Unnamed: 0,text,label
0,Breton Cyrille menuisier 25 Garçon française,0
1,Ferazzi Auguste vitrier 30 Garçon Piémontaise,1
2,Machol Pierre vitrier 24 Garçon Piémontaise,1
3,Desbois Alexandre prop re 48 Homme marié franç...,1
4,Vignat Zélie prop re sa fe 30 française,0
...,...,...
25075,Chameton-Dideron Marie chef 1869 idem Pailharès,1
25076,Ode Marie ouv chaus res chef Cara 1863 idem St...,1
25077,Berni Nello manoeuvre chef Baretto 1886 italie...,1
25078,Berni-Laureti Annunziata épouse 1887 idem idem,0


In [3]:
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")


In [4]:
def preprocess_function(examples, batched = True):
    return tokenizer(examples["text"], truncation=True)

In [15]:
dataset = Dataset.from_pandas(df)

In [14]:
dataset["train"][0]

{'text': 'Rabier néant femme 1859 française', 'label': 0}

In [16]:
train_testvalid = dataset.train_test_split(0.2, seed=42)
# Split the 10% test + valid in half test, half valid
test_valid = train_testvalid['test'].train_test_split(0.5)
# gather everyone if you want to have a single DatasetDict
train_test_valid_dataset = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})

dataset = train_test_valid_dataset
tokenized_text = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/20064 [00:00<?, ? examples/s]

Map:   0%|          | 0/2508 [00:00<?, ? examples/s]

Map:   0%|          | 0/2508 [00:00<?, ? examples/s]

In [7]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding = 'longest')

In [11]:
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

training_args = TrainingArguments(
    output_dir="distilbert",
    learning_rate=1e-6,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.1,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    fp16 = True)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [17]:
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
).to(device)

training_args = TrainingArguments(
    output_dir="distilbert",
    learning_rate=1e-6,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.1,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    fp16 = True)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_text["train"],
    eval_dataset=tokenized_text["valid"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4156,0.228706,0.917065,0.828947,0.791209,0.870466
2,0.2396,0.200801,0.926635,0.849673,0.806202,0.8981
3,0.2211,0.191919,0.931021,0.859464,0.81135,0.913644
4,0.204,0.18784,0.933413,0.864558,0.814985,0.920553
5,0.1949,0.183977,0.934211,0.867043,0.812689,0.929188
6,0.1991,0.182843,0.933812,0.866559,0.810526,0.930915
7,0.1923,0.180792,0.933413,0.866932,0.804734,0.939551
8,0.1856,0.181291,0.935008,0.868654,0.814199,0.930915
9,0.1919,0.180251,0.934609,0.8688,0.80924,0.937824
10,0.1839,0.180036,0.934211,0.867682,0.80988,0.93437


TrainOutput(global_step=6270, training_loss=0.21743381019604452, metrics={'train_runtime': 369.9584, 'train_samples_per_second': 542.331, 'train_steps_per_second': 16.948, 'total_flos': 1194052852272768.0, 'train_loss': 0.21743381019604452, 'epoch': 10.0})

In [21]:
#Evaluate on test set 
predictions = trainer.predict(tokenized_text["test"])
print(np.mean(predictions.predictions.argmax(axis=1) != predictions.label_ids))



0.07177033492822966


In [23]:
confusion_matrix(predictions.label_ids, predictions.predictions.argmax(axis=1))

array([[1830,  147],
       [  33,  498]])

### Estimate variance

In [27]:
training_args = TrainingArguments(
    output_dir="distilbert",
    learning_rate=1e-6,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.1,
    evaluation_strategy="epoch",
    save_strategy="no",
    fp16 = True)

all_test_metrics = []

for seed in range(5):
    cv_model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id).to(device)
    

    ##### Split the data #####
    dataset = Dataset.from_pandas(df)
    train_testvalid = dataset.train_test_split(0.2, seed=seed)
    # Split the 10% test + valid in half test, half valid
    test_valid = train_testvalid['test'].train_test_split(0.5)
    # gather everyone if you want to have a single DatasetDict
    train_test_valid_dataset = DatasetDict({
        'train': train_testvalid['train'],
        'test': test_valid['test'],
        'valid': test_valid['train']})

    dataset = train_test_valid_dataset
    tokenized_text = dataset.map(preprocess_function, batched=True)

    trainer = Trainer(
        model=cv_model,
        args=training_args,
        train_dataset=tokenized_text["train"],
        eval_dataset=tokenized_text["valid"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    #Evaluate on test set
    predictions = trainer.predict(tokenized_text["test"])
    all_test_metrics.append(predictions.metrics)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/20064 [00:00<?, ? examples/s]

Map:   0%|          | 0/2508 [00:00<?, ? examples/s]

Map:   0%|          | 0/2508 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3934,0.224687,0.913078,0.813675,0.80678,0.82069
2,0.2326,0.197551,0.924242,0.847512,0.792793,0.910345
3,0.2122,0.192087,0.927831,0.853441,0.80458,0.908621
4,0.2031,0.18659,0.933812,0.866774,0.810811,0.931034
5,0.1973,0.187413,0.930223,0.8583,0.80916,0.913793
6,0.1897,0.182363,0.933812,0.866987,0.80988,0.932759
7,0.1939,0.182802,0.934211,0.86747,0.81203,0.931034
8,0.188,0.181591,0.933812,0.8672,0.808955,0.934483
9,0.188,0.181384,0.933812,0.866987,0.80988,0.932759
10,0.1844,0.181484,0.933812,0.866987,0.80988,0.932759


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/20064 [00:00<?, ? examples/s]

Map:   0%|          | 0/2508 [00:00<?, ? examples/s]

Map:   0%|          | 0/2508 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.384,0.246059,0.909091,0.798942,0.773038,0.826642
2,0.2282,0.230042,0.916268,0.815141,0.787415,0.844891
3,0.2114,0.221751,0.92185,0.826549,0.802405,0.85219
4,0.1956,0.213415,0.927033,0.840731,0.803661,0.881387
5,0.1933,0.207909,0.928628,0.844753,0.804959,0.888686
6,0.191,0.212107,0.927432,0.841187,0.80602,0.879562
7,0.1903,0.209662,0.92823,0.843478,0.805648,0.885036
8,0.1873,0.208596,0.927831,0.842472,0.805324,0.883212
9,0.1823,0.207074,0.928628,0.844483,0.80597,0.886861
10,0.1875,0.20706,0.92823,0.843478,0.805648,0.885036


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/20064 [00:00<?, ? examples/s]

Map:   0%|          | 0/2508 [00:00<?, ? examples/s]

Map:   0%|          | 0/2508 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3831,0.243398,0.904306,0.792746,0.758678,0.830018
2,0.2362,0.215203,0.919856,0.826275,0.791391,0.864376
3,0.2157,0.203493,0.924242,0.836207,0.799012,0.877034
4,0.1978,0.196594,0.925837,0.840753,0.798374,0.887884
5,0.1923,0.19625,0.926635,0.842466,0.8,0.889693
6,0.1965,0.189247,0.929825,0.850594,0.8016,0.905967
7,0.1897,0.187894,0.929825,0.850847,0.800638,0.907776
8,0.1886,0.187772,0.931818,0.855207,0.80414,0.913201
9,0.1855,0.188695,0.930223,0.851317,0.802885,0.905967
10,0.1832,0.187734,0.930622,0.852542,0.802233,0.909584


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/20064 [00:00<?, ? examples/s]

Map:   0%|          | 0/2508 [00:00<?, ? examples/s]

Map:   0%|          | 0/2508 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.378,0.242089,0.905502,0.798982,0.757235,0.845601
2,0.2337,0.2047,0.925837,0.842373,0.797753,0.89228
3,0.2044,0.192277,0.931818,0.856664,0.803459,0.917415
4,0.1989,0.18911,0.934609,0.862647,0.808477,0.924596
5,0.1939,0.184624,0.933413,0.86164,0.8,0.933573
6,0.1828,0.18242,0.935407,0.865225,0.806202,0.933573
7,0.1947,0.180426,0.935008,0.86473,0.804012,0.935368
8,0.1831,0.17998,0.934609,0.864013,0.802773,0.935368
9,0.1785,0.180068,0.935008,0.86473,0.804012,0.935368
10,0.1875,0.180183,0.935008,0.86473,0.804012,0.935368


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/20064 [00:00<?, ? examples/s]

Map:   0%|          | 0/2508 [00:00<?, ? examples/s]

Map:   0%|          | 0/2508 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3859,0.234158,0.907895,0.801034,0.761047,0.845455
2,0.2395,0.211402,0.920255,0.829352,0.78135,0.883636
3,0.2132,0.202619,0.924242,0.83705,0.792208,0.887273
4,0.1981,0.199678,0.926635,0.841924,0.798046,0.890909
5,0.1922,0.193613,0.927033,0.84557,0.788976,0.910909
6,0.1965,0.192572,0.929426,0.85038,0.794629,0.914545
7,0.1886,0.190747,0.929426,0.850633,0.793701,0.916364
8,0.1889,0.18884,0.927432,0.847059,0.7875,0.916364
9,0.1812,0.189435,0.929825,0.851351,0.794953,0.916364
10,0.1867,0.189231,0.929825,0.851351,0.794953,0.916364


In [37]:
res_table = pd.DataFrame(all_test_metrics)

full_res = pd.concat([res_table.mean(), res_table.std()], axis=1)
full_res.columns = ['mean', 'std']
full_res

Unnamed: 0,mean,std
test_loss,0.183553,0.013981
test_accuracy,0.931499,0.004755
test_f1,0.856752,0.008005
test_precision,0.793588,0.007768
test_recall,0.930851,0.008842
test_runtime,0.91148,0.00486
test_samples_per_second,2751.6332,14.688787
test_steps_per_second,172.2512,0.91969


## BERT (with LoRA)

In [4]:
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

training_args = TrainingArguments(
    output_dir="distilbert",
    learning_rate=1e-6,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.1,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    fp16 = True)


In [9]:
model = AutoModelForSequenceClassification.from_pretrained(
    "google-bert/bert-base-multilingual-cased", num_labels=2, id2label=id2label, label2id=label2id
).to(device)

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-multilingual-cased")

def preprocess_function(examples, batched = True):
    return tokenizer(examples["text"], truncation=True)


dataset = Dataset.from_pandas(df)
train_testvalid = dataset.train_test_split(0.2, seed=42)
# Split the 10% test + valid in half test, half valid
test_valid = train_testvalid['test'].train_test_split(0.5)
# gather everyone if you want to have a single DatasetDict
train_test_valid_dataset = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})

dataset = train_test_valid_dataset
tokenized_text = dataset.map(preprocess_function, batched=True)

training_args = TrainingArguments(
    output_dir="bert",
    learning_rate=1e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.1,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    fp16 = True)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_text["train"],
    eval_dataset=tokenized_text["valid"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/20064 [00:00<?, ? examples/s]

Map:   0%|          | 0/2508 [00:00<?, ? examples/s]

Map:   0%|          | 0/2508 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2264,0.183298,0.929426,0.85456,0.781955,0.942029
2,0.1909,0.198609,0.924242,0.848967,0.756374,0.967391
3,0.1727,0.183502,0.931021,0.858313,0.783259,0.949275
4,0.1507,0.191341,0.926635,0.843003,0.796774,0.894928
5,0.1298,0.180579,0.933014,0.860465,0.794479,0.938406
6,0.1277,0.193539,0.931021,0.847845,0.823932,0.873188
