In [3]:
import pandas as pd
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import confusion_matrix
import torch
from datasets import Dataset, DatasetDict
import evaluate
import numpy as np
from sklearn.model_selection import StratifiedKFold
device = 'cuda' if torch.cuda.is_available() else 'cpu'

clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return clf_metrics.compute(predictions=predictions, references=labels)

- CamemBERT
- DistillBERT version fr (https://huggingface.co/cmarkea/distilcamembert-base)
    - Comparer avec distillbert de base
- LoRA avec Bert ou CamemBERT, faire varier le rang
- Baseline avec sklearn


- Prendre les milleures préd et compiler quelqes stats dessus (notamment les erreurs)




In [10]:
df = pd.read_csv('entities.csv', sep=',')
df.rename(columns={'chef':'label', 'texte':'text'}, inplace=True)
df.label = df.label.astype('int')
df

Unnamed: 0,text,label
0,Breton Cyrille menuisier 25 Garçon française,0
1,Ferazzi Auguste vitrier 30 Garçon Piémontaise,1
2,Machol Pierre vitrier 24 Garçon Piémontaise,1
3,Desbois Alexandre prop re 48 Homme marié franç...,1
4,Vignat Zélie prop re sa fe 30 française,0
...,...,...
25075,Chameton-Dideron Marie chef 1869 idem Pailharès,1
25076,Ode Marie ouv chaus res chef Cara 1863 idem St...,1
25077,Berni Nello manoeuvre chef Baretto 1886 italie...,1
25078,Berni-Laureti Annunziata épouse 1887 idem idem,0


In [3]:
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
def preprocess_function(examples, batched = True):
    return tokenizer(examples["text"], truncation=True)

In [4]:
dataset = Dataset.from_pandas(df)

train_testvalid = dataset.train_test_split(0.2, seed=42)
# Split the 10% test + valid in half test, half valid
train_valid = train_testvalid['train'].train_test_split(0.2, seed=42)
# gather everyone if you want to have a single DatasetDict
train_test_valid_dataset = DatasetDict({
    'train': train_valid['train'],
    'test': train_testvalid['test'],
    'valid': train_valid['test']})


dataset = train_test_valid_dataset
tokenized_text = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/16051 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/5016 [00:00<?, ? examples/s]

Map:   0%|          | 0/4013 [00:00<?, ? examples/s]

In [48]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding = 'longest')

In [5]:
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

# Vanilla DistilBERT

In [6]:
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
).to(device)

training_args = TrainingArguments(
    output_dir="distilbert",
    learning_rate=6e-6,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.05,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=1,
    fp16 = True)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_text["train"],
    eval_dataset=tokenized_text["valid"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2618,0.184132,0.93222,0.853921,0.795796,0.921205
2,0.1934,0.179823,0.930725,0.85353,0.782609,0.938586
3,0.1813,0.186108,0.933466,0.856683,0.798,0.924681
4,0.172,0.18269,0.935958,0.860553,0.809184,0.918888
5,0.1657,0.183073,0.934961,0.859903,0.801,0.928158


TrainOutput(global_step=2510, training_loss=0.1949544667247757, metrics={'train_runtime': 159.5191, 'train_samples_per_second': 503.106, 'train_steps_per_second': 15.735, 'total_flos': 478475089924740.0, 'train_loss': 0.1949544667247757, 'epoch': 5.0})

In [59]:
#Evaluate on test set 
predictions = trainer.predict(tokenized_text["test"])
print(np.mean(predictions.predictions.argmax(axis=1) != predictions.label_ids))



0.06957735247208932


In [60]:
confusion_matrix(predictions.label_ids, predictions.predictions.argmax(axis=1))

array([[3611,  295],
       [  54, 1056]])

### Estimate variance

In [7]:
str = "distilbert/distilbert-base-uncased"

dataset = Dataset.from_pandas(df)

tokenizer = AutoTokenizer.from_pretrained(str)


def preprocess_function(examples, batched = True):
    return tokenizer(examples["text"], truncation=True)


id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

training_args = TrainingArguments(
    output_dir="distilbert",
    learning_rate=6e-6,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.05,
    evaluation_strategy="epoch",
    save_strategy="no",
    fp16 = True)

all_test_metrics = []

df = pd.read_csv('entities.csv', sep=',')
df.rename(columns={'chef':'label', 'texte':'text'}, inplace=True)
df.label = df.label.astype('int')

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
dataset = Dataset.from_pandas(df)
for i, (train_index, test_index) in enumerate(skf.split(df.text, df.label)):
    dataset_train = dataset.select(train_index)
    dataset_test = dataset.select(test_index)
    dataset_train_val = dataset_train.train_test_split(0.05, seed=42)
    split_dataset = DatasetDict({
        'train': dataset_train_val['train'],
        'valid': dataset_train_val['test'],
        'test': dataset_test
    })

    cv_model = AutoModelForSequenceClassification.from_pretrained(
    str, num_labels=2, id2label=id2label, label2id=label2id).to(device)
    
    tokenized_text = split_dataset.map(preprocess_function, batched=True)

    trainer = Trainer(
        model=cv_model,
        args=training_args,
        train_dataset=tokenized_text["train"],
        eval_dataset=tokenized_text["valid"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    #Evaluate on test set
    predictions = trainer.predict(tokenized_text["test"])
    print("FOLD ", i)
    print(predictions.metrics)
    all_test_metrics.append(predictions.metrics)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/19060 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/1004 [00:00<?, ? examples/s]

Map:   0%|          | 0/5016 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2546,0.180369,0.940239,0.881423,0.83209,0.936975
2,0.1868,0.167976,0.943227,0.887574,0.836431,0.945378
3,0.1794,0.160885,0.942231,0.887597,0.823741,0.962185
4,0.1689,0.162238,0.945219,0.891945,0.837638,0.953782


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


FOLD  0
{'test_loss': 0.19174401462078094, 'test_accuracy': 0.9284290271132376, 'test_f1': 0.8540056933712892, 'test_precision': 0.7888805409466566, 'test_recall': 0.9308510638297872, 'test_runtime': 1.7646, 'test_samples_per_second': 2842.637, 'test_steps_per_second': 177.948}


Map:   0%|          | 0/19060 [00:00<?, ? examples/s]

Map:   0%|          | 0/1004 [00:00<?, ? examples/s]

Map:   0%|          | 0/5016 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2465,0.186793,0.931275,0.863905,0.793478,0.948052
2,0.1909,0.183038,0.933267,0.867327,0.79927,0.948052
3,0.1754,0.179777,0.929283,0.860511,0.78777,0.948052
4,0.1743,0.178519,0.932271,0.866142,0.794224,0.952381


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


FOLD  1
{'test_loss': 0.19034463167190552, 'test_accuracy': 0.9344098883572568, 'test_f1': 0.8633153302866639, 'test_precision': 0.8123534010946052, 'test_recall': 0.9210992907801419, 'test_runtime': 1.787, 'test_samples_per_second': 2807.0, 'test_steps_per_second': 175.717}


Map:   0%|          | 0/19060 [00:00<?, ? examples/s]

Map:   0%|          | 0/1004 [00:00<?, ? examples/s]

Map:   0%|          | 0/5016 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2491,0.202319,0.930279,0.857724,0.790262,0.937778
2,0.1868,0.194275,0.931275,0.861167,0.786765,0.951111
3,0.1821,0.19263,0.932271,0.860656,0.798479,0.933333
4,0.1687,0.192298,0.933267,0.862986,0.799242,0.937778


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


FOLD  2
{'test_loss': 0.18135252594947815, 'test_accuracy': 0.9302232854864434, 'test_f1': 0.8579545454545454, 'test_precision': 0.7911676646706587, 'test_recall': 0.9370567375886525, 'test_runtime': 1.7767, 'test_samples_per_second': 2823.189, 'test_steps_per_second': 176.731}


Map:   0%|          | 0/19060 [00:00<?, ? examples/s]

Map:   0%|          | 0/1004 [00:00<?, ? examples/s]

Map:   0%|          | 0/5016 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2556,0.177844,0.938247,0.869198,0.834008,0.907489
2,0.188,0.167225,0.940239,0.877049,0.819923,0.942731
3,0.1763,0.168645,0.938247,0.873984,0.811321,0.947137
4,0.1739,0.167532,0.942231,0.881148,0.823755,0.947137


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


FOLD  3
{'test_loss': 0.1807997226715088, 'test_accuracy': 0.9322169059011164, 'test_f1': 0.862570735650768, 'test_precision': 0.7927191679049034, 'test_recall': 0.9459219858156028, 'test_runtime': 1.7811, 'test_samples_per_second': 2816.182, 'test_steps_per_second': 176.292}


Map:   0%|          | 0/19060 [00:00<?, ? examples/s]

Map:   0%|          | 0/1004 [00:00<?, ? examples/s]

Map:   0%|          | 0/5016 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2505,0.179783,0.936255,0.867769,0.820312,0.921053
2,0.189,0.17362,0.941235,0.877847,0.831373,0.929825
3,0.1792,0.170909,0.943227,0.881988,0.835294,0.934211
4,0.1704,0.170942,0.941235,0.878351,0.828794,0.934211


FOLD  4
{'test_loss': 0.1854034811258316, 'test_accuracy': 0.9316188197767146, 'test_f1': 0.8586732591676968, 'test_precision': 0.8021555042340262, 'test_recall': 0.9237588652482269, 'test_runtime': 1.8042, 'test_samples_per_second': 2780.229, 'test_steps_per_second': 174.041}


In [8]:
res_table = pd.DataFrame(all_test_metrics)

full_res = pd.concat([res_table.mean(), res_table.std()], axis=1)
full_res.columns = ['mean', 'std']
full_res

Unnamed: 0,mean,std
test_loss,0.185929,0.005021
test_accuracy,0.93138,0.002235
test_f1,0.859304,0.003777
test_precision,0.797455,0.009741
test_recall,0.931738,0.010088
test_runtime,1.78272,0.014551
test_samples_per_second,2813.8474,22.905461
test_steps_per_second,176.1458,1.434034


## DistilCamemBERT

In [None]:
str = "cmarkea/distilcamembert-base"

dataset = Dataset.from_pandas(df)

tokenizer = AutoTokenizer.from_pretrained(str)


def preprocess_function(examples, batched = True):
    return tokenizer(examples["text"], truncation=True)


id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

train_testvalid = dataset.train_test_split(0.2, seed=42)
# Split the 10% test + valid in half test, half valid
train_valid = train_testvalid['train'].train_test_split(0.2, seed=42)
# gather everyone if you want to have a single DatasetDict
train_test_valid_dataset = DatasetDict({
    'train': train_valid['train'],
    'test': train_testvalid['test'],
    'valid': train_valid['test']})


dataset = train_test_valid_dataset
print(dataset)
tokenized_text = dataset.map(preprocess_function, batched=True)

model = AutoModelForSequenceClassification.from_pretrained(
    str, num_labels=2, id2label=id2label, label2id=label2id
).to(device)


training_args = TrainingArguments(
    output_dir="distilCAMEMbert",
    learning_rate=1e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.1,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=1,
    fp16 = True)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_text["train"],
    eval_dataset=tokenized_text["valid"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16051
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 5016
    })
    valid: Dataset({
        features: ['text', 'label'],
        num_rows: 4013
    })
})


Map:   0%|          | 0/16051 [00:00<?, ? examples/s]

Map:   0%|          | 0/5016 [00:00<?, ? examples/s]

Map:   0%|          | 0/4013 [00:00<?, ? examples/s]

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at cmarkea/distilcamembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.25,0.177436,0.930725,0.851337,0.790467,0.922364
2,0.1884,0.176616,0.931473,0.854574,0.785992,0.936269
3,0.1774,0.173453,0.933466,0.857296,0.795635,0.929316
4,0.1708,0.172576,0.934712,0.860341,0.796644,0.93511


TrainOutput(global_step=2008, training_loss=0.19646803334177254, metrics={'train_runtime': 129.6097, 'train_samples_per_second': 495.364, 'train_steps_per_second': 15.493, 'total_flos': 341023605394308.0, 'train_loss': 0.19646803334177254, 'epoch': 4.0})

In [40]:
#Evaluate on test set 
predictions = trainer.predict(tokenized_text["test"])
print(np.mean(predictions.predictions.argmax(axis=1) != predictions.label_ids))



0.06778309409888357


In [41]:
predictions

PredictionOutput(predictions=array([[-0.9848633,  1.0458984],
       [ 3.0234375, -3.375    ],
       [-1.1279297,  1.2763672],
       ...,
       [ 2.984375 , -3.3515625],
       [ 2.9316406, -3.3574219],
       [ 2.6601562, -2.9082031]], dtype=float32), label_ids=array([1, 0, 0, ..., 0, 0, 0]), metrics={'test_loss': 0.17468680441379547, 'test_accuracy': 0.9322169059011164, 'test_f1': 0.8614506927465363, 'test_precision': 0.7864583333333334, 'test_recall': 0.9522522522522523, 'test_runtime': 1.7809, 'test_samples_per_second': 2816.554, 'test_steps_per_second': 176.315})

In [11]:
str = "cmarkea/distilcamembert-base"

dataset = Dataset.from_pandas(df)

tokenizer = AutoTokenizer.from_pretrained(str)


def preprocess_function(examples, batched = True):
    return tokenizer(examples["text"], truncation=True)


id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

training_args = TrainingArguments(
    output_dir="distilbert",
    learning_rate=8e-6,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.05,
    evaluation_strategy="epoch",
    save_strategy="no",
    fp16 = True)

all_test_metrics = []

df = pd.read_csv('entities.csv', sep=',')
df.rename(columns={'chef':'label', 'texte':'text'}, inplace=True)
df.label = df.label.astype('int')

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
dataset = Dataset.from_pandas(df)
for i, (train_index, test_index) in enumerate(skf.split(df.text, df.label)):
    dataset_train = dataset.select(train_index)
    dataset_test = dataset.select(test_index)
    dataset_train_val = dataset_train.train_test_split(0.01, seed=42)
    split_dataset = DatasetDict({
        'train': dataset_train_val['train'],
        'valid': dataset_train_val['test'],
        'test': dataset_test
    })

    cv_model = AutoModelForSequenceClassification.from_pretrained(
    str, num_labels=2, id2label=id2label, label2id=label2id).to(device)
    
    tokenized_text = split_dataset.map(preprocess_function, batched=True)

    trainer = Trainer(
        model=cv_model,
        args=training_args,
        train_dataset=tokenized_text["train"],
        eval_dataset=tokenized_text["valid"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    #Evaluate on test set
    predictions = trainer.predict(tokenized_text["test"])
    print("FOLD ", i)
    print(predictions.metrics)
    all_test_metrics.append(predictions.metrics)


Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at cmarkea/distilcamembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/19863 [00:00<?, ? examples/s]

Map:   0%|          | 0/201 [00:00<?, ? examples/s]

Map:   0%|          | 0/5016 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2623,0.159148,0.935323,0.871287,0.862745,0.88
2,0.1908,0.159403,0.945274,0.888889,0.897959,0.88
3,0.1769,0.166142,0.940299,0.877551,0.895833,0.86
4,0.1772,0.172803,0.940299,0.877551,0.895833,0.86
5,0.1652,0.165153,0.945274,0.888889,0.897959,0.88


Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at cmarkea/distilcamembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


FOLD  0
{'test_loss': 0.18460464477539062, 'test_accuracy': 0.9288277511961722, 'test_f1': 0.8548190321268808, 'test_precision': 0.7896318557475582, 'test_recall': 0.9317375886524822, 'test_runtime': 1.7547, 'test_samples_per_second': 2858.648, 'test_steps_per_second': 178.95}


Map:   0%|          | 0/19863 [00:00<?, ? examples/s]

Map:   0%|          | 0/201 [00:00<?, ? examples/s]

Map:   0%|          | 0/5016 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2622,0.182556,0.935323,0.87619,0.793103,0.978723
2,0.1888,0.17149,0.935323,0.87619,0.793103,0.978723
3,0.1807,0.171524,0.925373,0.859813,0.766667,0.978723
4,0.1677,0.166233,0.940299,0.884615,0.807018,0.978723
5,0.1634,0.166313,0.935323,0.87619,0.793103,0.978723


Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at cmarkea/distilcamembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


FOLD  1
{'test_loss': 0.18357910215854645, 'test_accuracy': 0.9350079744816587, 'test_f1': 0.8663934426229508, 'test_precision': 0.805640243902439, 'test_recall': 0.9370567375886525, 'test_runtime': 1.7855, 'test_samples_per_second': 2809.345, 'test_steps_per_second': 175.864}


Map:   0%|          | 0/19863 [00:00<?, ? examples/s]

Map:   0%|          | 0/201 [00:00<?, ? examples/s]

Map:   0%|          | 0/5016 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2583,0.156931,0.940299,0.884615,0.836364,0.938776
2,0.1894,0.138456,0.950249,0.903846,0.854545,0.959184
3,0.1799,0.137356,0.950249,0.903846,0.854545,0.959184
4,0.1724,0.134795,0.950249,0.903846,0.854545,0.959184
5,0.1593,0.136748,0.950249,0.903846,0.854545,0.959184


Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at cmarkea/distilcamembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


FOLD  2
{'test_loss': 0.18177413940429688, 'test_accuracy': 0.930622009569378, 'test_f1': 0.8592233009708737, 'test_precision': 0.7901785714285714, 'test_recall': 0.9414893617021277, 'test_runtime': 1.8505, 'test_samples_per_second': 2710.63, 'test_steps_per_second': 169.685}


Map:   0%|          | 0/19863 [00:00<?, ? examples/s]

Map:   0%|          | 0/201 [00:00<?, ? examples/s]

Map:   0%|          | 0/5016 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2546,0.179219,0.940299,0.87234,0.854167,0.891304
2,0.1929,0.159892,0.945274,0.884211,0.857143,0.913043
3,0.1771,0.144239,0.950249,0.895833,0.86,0.934783
4,0.1771,0.133372,0.955224,0.905263,0.877551,0.934783
5,0.1649,0.132404,0.955224,0.905263,0.877551,0.934783


Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at cmarkea/distilcamembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


FOLD  3
{'test_loss': 0.1750919222831726, 'test_accuracy': 0.9346092503987241, 'test_f1': 0.8674211802748585, 'test_precision': 0.7971768202080238, 'test_recall': 0.9512411347517731, 'test_runtime': 1.7541, 'test_samples_per_second': 2859.52, 'test_steps_per_second': 179.005}


Map:   0%|          | 0/19863 [00:00<?, ? examples/s]

Map:   0%|          | 0/201 [00:00<?, ? examples/s]

Map:   0%|          | 0/5016 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2598,0.160656,0.940299,0.88,0.814815,0.956522
2,0.1884,0.154001,0.945274,0.888889,0.830189,0.956522
3,0.1726,0.152318,0.940299,0.88,0.814815,0.956522
4,0.1764,0.151265,0.940299,0.88,0.814815,0.956522
5,0.1643,0.148314,0.940299,0.88,0.814815,0.956522


FOLD  4
{'test_loss': 0.18896128237247467, 'test_accuracy': 0.9302232854864434, 'test_f1': 0.8562037797863599, 'test_precision': 0.7978560490045942, 'test_recall': 0.9237588652482269, 'test_runtime': 1.7683, 'test_samples_per_second': 2836.698, 'test_steps_per_second': 177.576}


In [12]:
res_table = pd.DataFrame(all_test_metrics)

full_res = pd.concat([res_table.mean(), res_table.std()], axis=1)
full_res.columns = ['mean', 'std']
full_res

Unnamed: 0,mean,std
test_loss,0.182802,0.005057
test_accuracy,0.931858,0.002778
test_f1,0.860812,0.005799
test_precision,0.796097,0.00656
test_recall,0.937057,0.01032
test_runtime,1.78262,0.040044
test_samples_per_second,2814.9682,61.812322
test_steps_per_second,176.216,3.869132


# CamemBERT

In [5]:
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

training_args = TrainingArguments(
    output_dir="distilbert",
    learning_rate=1e-6,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.1,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    fp16 = True)


In [8]:
model = AutoModelForSequenceClassification.from_pretrained(
    "camembert-base", num_labels=2, id2label=id2label, label2id=label2id
).to(device)

sum([param.nelement() for param in model.parameters()])

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


110623490

In [2]:
model = AutoModelForSequenceClassification.from_pretrained(
    "camembert-base'", num_labels=2, id2label=id2label, label2id=label2id
).to(device)

tokenizer = AutoTokenizer.from_pretrained("camembert-base'")

def preprocess_function(examples, batched = True):
    return tokenizer(examples["text"], truncation=True)


dataset = Dataset.from_pandas(df)
train_testvalid = dataset.train_test_split(0.2, seed=42)
# Split the 10% test + valid in half test, half valid
test_valid = train_testvalid['test'].train_test_split(0.5)
# gather everyone if you want to have a single DatasetDict
train_test_valid_dataset = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})

dataset = train_test_valid_dataset
tokenized_text = dataset.map(preprocess_function, batched=True)

training_args = TrainingArguments(
    output_dir="camembert",
    learning_rate=1e-6,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.1,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=1,
    fp16 = True)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_text["train"],
    eval_dataset=tokenized_text["valid"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

NameError: name 'AutoModelForSequenceClassification' is not defined