# Imports and metrics

In [None]:
!pip install datasets
!pip install transformers

import csv
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import transformers as tr
from transformers import CamembertTokenizerFast, AutoTokenizer
from datasets import load_metric, load_dataset
import torch
import re

Collecting datasets
  Downloading datasets-2.0.0-py3-none-any.whl (325 kB)
[?25l[K     |█                               | 10 kB 17.8 MB/s eta 0:00:01[K     |██                              | 20 kB 11.0 MB/s eta 0:00:01[K     |███                             | 30 kB 8.1 MB/s eta 0:00:01[K     |████                            | 40 kB 6.4 MB/s eta 0:00:01[K     |█████                           | 51 kB 4.8 MB/s eta 0:00:01[K     |██████                          | 61 kB 5.6 MB/s eta 0:00:01[K     |███████                         | 71 kB 5.7 MB/s eta 0:00:01[K     |████████                        | 81 kB 5.7 MB/s eta 0:00:01[K     |█████████                       | 92 kB 6.3 MB/s eta 0:00:01[K     |██████████                      | 102 kB 5.4 MB/s eta 0:00:01[K     |███████████                     | 112 kB 5.4 MB/s eta 0:00:01[K     |████████████                    | 122 kB 5.4 MB/s eta 0:00:01[K     |█████████████                   | 133 kB 5.4 MB/s eta 0:00:01[

In [None]:
def compute_metrics(p):
    """Score one evaluation pass with accuracy, precision, recall and F1.

    Args:
        p: A two-element sequence ``(scores, labels)``. ``scores`` is reduced
            with ``np.argmax(..., axis=1)`` to get one predicted class id per
            row; ``labels`` holds the reference class ids.

    Returns:
        dict: The keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"``, each mapped to the value returned by the corresponding
        scikit-learn scorer (called with its default settings).
    """
    scores, labels = p
    predicted_ids = np.argmax(scores, axis=1)

    # Insertion order fixes the key order of the returned dict.
    scorers = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
    }
    return {
        metric_name: scorer(y_true=labels, y_pred=predicted_ids)
        for metric_name, scorer in scorers.items()
    }

# Load data

In [None]:
from sklearn.utils import shuffle

# Number of tweets drawn from each label class (25000 was the alternative
# size previously kept as a commented-out line in this cell).
SAMPLES_PER_CLASS = 10000
# Fixed seed so the same balanced subset and row order are produced on every
# run; without it the sampled training set changes between kernel restarts and
# the recorded metrics cannot be reproduced. 69 is the seed the notebook
# already uses for its train/validation split.
SAMPLING_SEED = 69

df = pd.read_csv('train_data_tweetsFR.csv')
data = df[['text', 'label']]

# Balanced subset: the same number of rows for label 0 and label 1
# (the variable names suggest 0 = negative, 1 = positive - confirm against the dataset).
data_neg = data[data['label'] == 0].sample(SAMPLES_PER_CLASS, random_state=SAMPLING_SEED)
data_pos = data[data['label'] == 1].sample(SAMPLES_PER_CLASS, random_state=SAMPLING_SEED)

# Interleave the two classes so the rows are not grouped by label.
train_data = shuffle(pd.concat([data_pos, data_neg]), random_state=SAMPLING_SEED)
train_data.head()

Unnamed: 0,text,label
327471,Nouveau sur Twitter pas encore d'amis,0
183862,La photo a fonctionné la deuxième fois que j'a...,0
794902,Va regarder 'glee' et le 'you've got mail' ave...,1
476436,Hey tweethearts !! À la journée de Stathe! All...,1
508458,Je n'ai plus de travail. Gf veut sortir. Mais ...,0


In [None]:
# Tweet texts and their labels as plain Python lists, in the row order of
# the shuffled `train_data` frame.
x_train_data, y_train_data = (
    list(train_data[column]) for column in ('text', 'label')
)

In [None]:
# Test split read from its own CSV file; texts and labels are converted to
# plain Python lists, mirroring the training lists.
test_data = pd.read_csv('test_data_tweetsFR.csv')

x_test, y_test = map(list, (test_data['text'], test_data['label']))

# Tokenize data

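- **Do not forget** to select the tokenizer that matches the transformer you are fine-tuning (XLM-RoBERTa or CamemBERT) before running the cell below; the datasets must be re-tokenized whenever you switch models.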

In [None]:
# Hold out 20% of the sampled tweets for validation; the fixed random_state
# keeps the split identical between runs.
x_train, x_val, y_train, y_val = train_test_split(x_train_data, y_train_data, test_size=0.20, random_state=69)

# Single switch for the tokenizer. Set this to the checkpoint of the model you
# are about to fine-tune ("xlm-roberta-base" or "camembert-base") instead of
# commenting/uncommenting tokenizer lines; AutoTokenizer picks the tokenizer
# class that belongs to the named checkpoint.
TOKENIZER_CHECKPOINT = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

# Same options for every split: truncate over-long inputs and pad each split
# to a common length.
encode_options = dict(truncation=True, padding=True)
train_encodings = tokenizer(x_train, **encode_options)
val_encodings = tokenizer(x_val, **encode_options)
test_encodings = tokenizer(x_test, **encode_options)

Downloading:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/8.68M [00:00<?, ?B/s]

In [None]:
class TweetsDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing per-example encodings with their labels.

    Args:
        encodings: Mapping (anything exposing ``.items()``) from feature name
            to a per-example indexable collection of values.
        labels: Indexable collection with one label per example; its length
            defines the dataset length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        Every feature in ``encodings`` is converted to a tensor under its own
        name, and the label is added last under the key ``'labels'``.
        """
        sample = {}
        for feature_name, feature_values in self.encodings.items():
            sample[feature_name] = torch.tensor(feature_values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each (encodings, labels) pair in a TweetsDataset, in the order
# train / validation / test.
train_dataset, val_dataset, test_dataset = (
    TweetsDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, y_train),
        (val_encodings, y_val),
        (test_encodings, y_test),
    )
)

# XLM-RoBERTa

## Experiment 1

In [None]:
from transformers import CamembertForSequenceClassification, XLMRobertaForSequenceClassification, Trainer, TrainingArguments, set_seed

# Experiment 1: fine-tune XLM-RoBERTa on the tokenized tweet datasets.
model_checkpoint = "xlm-roberta-base"

# The datasets were encoded by the notebook-level `tokenizer`. Fail fast with
# a clear message if it belongs to a different checkpoint than the model
# trained here, instead of training on token ids from the wrong vocabulary.
if tokenizer.name_or_path != model_checkpoint:
    raise ValueError(
        f"Datasets were tokenized with '{tokenizer.name_or_path}' but this experiment "
        f"fine-tunes '{model_checkpoint}'; re-run the tokenization cell with the matching tokenizer."
    )

training_args = TrainingArguments(
    output_dir='./results',          # output directory for checkpoints
    evaluation_strategy="steps",     # evaluate every `eval_steps` training steps
    eval_steps = 500,                # interval (in training steps) between evaluations
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=32,  # batch size per device during training
    per_device_eval_batch_size=32,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    load_best_model_at_end=True,     # reload the best checkpoint once training finishes
)

# The classification head is newly (randomly) initialised by from_pretrained,
# so seed the RNGs before instantiating the model to make runs repeatable.
# This reuses the seed the Trainer itself will use.
set_seed(training_args.seed)
model = XLMRobertaForSequenceClassification.from_pretrained(model_checkpoint)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset
    compute_metrics=compute_metrics,     # custom metrics for evaluation
)

trainer.train()

Downloading:   0%|          | 0.00/1.04G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
500,0.5863,0.496481,0.772,0.791712,0.732593,0.761006
1000,0.4575,0.487666,0.7715,0.831677,0.67558,0.745546
1500,0.3286,0.504613,0.79375,0.786528,0.801211,0.793802


***** Running Evaluation *****
  Num examples = 4000
  Batch size = 32
Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 4000
  Batch size = 32
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 4000
  Batch size = 32
Saving model checkpoint to ./results/checkpoint-1500
Configuration saved in ./results/checkpoint-1500/config.json
Model weights saved in ./results/checkpoint-1500/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./results/checkpoint-1000 (score: 0.48766574263572693).


TrainOutput(global_step=1500, training_loss=0.45745875040690104, metrics={'train_runtime': 2330.0821, 'train_samples_per_second': 20.6, 'train_steps_per_second': 0.644, 'total_flos': 3403999278720000.0, 'train_loss': 0.45745875040690104, 'epoch': 3.0})

In [None]:
# Evaluate the current `trainer` on the test set. The result is kept in a
# named variable so the logits and label ids stay available without repeating
# the prediction pass; only the metrics dict is displayed.
exp1_test_output = trainer.predict(test_dataset)
exp1_test_output.metrics

***** Running Prediction *****
  Num examples = 10000
  Batch size = 32


PredictionOutput(predictions=array([[-0.7495206 ,  0.8564812 ],
       [ 2.0985467 , -1.8531848 ],
       [-0.41320696,  0.49717987],
       ...,
       [ 0.44347495, -0.26459935],
       [ 2.529474  , -2.279411  ],
       [-0.48522452,  0.5688889 ]], dtype=float32), label_ids=array([1, 0, 1, ..., 0, 0, 0]), metrics={'test_loss': 0.4672415554523468, 'test_accuracy': 0.7848, 'test_precision': 0.8441275978733688, 'test_recall': 0.6986, 'test_f1': 0.7644998905668636, 'test_runtime': 126.7177, 'test_samples_per_second': 78.916, 'test_steps_per_second': 2.47})

## Experiment 2

In [None]:
from transformers import CamembertForSequenceClassification, XLMRobertaForSequenceClassification, Trainer, TrainingArguments, set_seed

# Experiment 2: fine-tune XLM-RoBERTa on the tokenized tweet datasets.
# NOTE(review): the recorded output for this cell reports 10000 evaluation
# examples, so it was presumably run after re-sampling a larger training set
# (25000 tweets per class) - confirm; the training configuration itself is the
# same as in the first XLM-RoBERTa run.
model_checkpoint = "xlm-roberta-base"

# The datasets were encoded by the notebook-level `tokenizer`. Fail fast with
# a clear message if it belongs to a different checkpoint than the model
# trained here, instead of training on token ids from the wrong vocabulary.
if tokenizer.name_or_path != model_checkpoint:
    raise ValueError(
        f"Datasets were tokenized with '{tokenizer.name_or_path}' but this experiment "
        f"fine-tunes '{model_checkpoint}'; re-run the tokenization cell with the matching tokenizer."
    )

training_args = TrainingArguments(
    output_dir='./results',          # output directory for checkpoints
    evaluation_strategy="steps",     # evaluate every `eval_steps` training steps
    eval_steps = 500,                # interval (in training steps) between evaluations
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=32,  # batch size per device during training
    per_device_eval_batch_size=32,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    load_best_model_at_end=True,     # reload the best checkpoint once training finishes
)

# The classification head is newly (randomly) initialised by from_pretrained,
# so seed the RNGs before instantiating the model to make runs repeatable.
# This reuses the seed the Trainer itself will use.
set_seed(training_args.seed)
model = XLMRobertaForSequenceClassification.from_pretrained(model_checkpoint)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset
    compute_metrics=compute_metrics,     # custom metrics for evaluation
)

trainer.train()

Downloading:   0%|          | 0.00/1.04G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_p

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
500,0.5757,0.485639,0.7777,0.745209,0.84219,0.790737
1000,0.5096,0.472725,0.771,0.720668,0.883096,0.793657
1500,0.452,0.458687,0.7873,0.747148,0.866854,0.802562
2000,0.4163,0.445242,0.8014,0.803438,0.796671,0.80004
2500,0.4121,0.431662,0.8092,0.798874,0.825145,0.811797
3000,0.3293,0.456372,0.8104,0.804532,0.818729,0.811568
3500,0.3172,0.454183,0.8088,0.808921,0.807299,0.808109


***** Running Evaluation *****
  Num examples = 10000
  Batch size = 32
Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 32
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 32
Saving model checkpoint to ./results/checkpoint-1500
Configuration saved in ./results/checkpoint-1500/config.json
Model weights saved in ./results/checkpoint-1500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 32
Saving model checkpoint to ./results/checkpoint-2000
Configuration saved in ./results/checkpoint-2000/config.json
Model weights saved in ./results/checkpoint-2000/pytorch_m

TrainOutput(global_step=3750, training_loss=0.4221941426595052, metrics={'train_runtime': 3998.5925, 'train_samples_per_second': 30.011, 'train_steps_per_second': 0.938, 'total_flos': 4563332366400000.0, 'train_loss': 0.4221941426595052, 'epoch': 3.0})

In [None]:
# Evaluate the current `trainer` on the test set. The result is kept in a
# named variable so the logits and label ids stay available without repeating
# the prediction pass; only the metrics dict is displayed.
exp2_test_output = trainer.predict(test_dataset)
exp2_test_output.metrics

***** Running Prediction *****
  Num examples = 10000
  Batch size = 32


PredictionOutput(predictions=array([[-1.9863065 ,  1.7563646 ],
       [ 1.5380038 , -0.93808967],
       [-0.21578398,  0.42242977],
       ...,
       [ 1.2280968 , -0.6616828 ],
       [ 2.55148   , -1.9837543 ],
       [-1.1319151 ,  1.1132224 ]], dtype=float32), label_ids=array([1, 0, 1, ..., 0, 0, 0]), metrics={'test_loss': 0.42216241359710693, 'test_accuracy': 0.8144, 'test_precision': 0.8054799844539449, 'test_recall': 0.829, 'test_f1': 0.8170707668046521, 'test_runtime': 126.0172, 'test_samples_per_second': 79.354, 'test_steps_per_second': 2.484})

# CamemBERT

## Experiment 3

In [None]:
from transformers import CamembertForSequenceClassification, XLMRobertaForSequenceClassification, Trainer, TrainingArguments, set_seed

# Experiment 3: fine-tune CamemBERT on the tokenized tweet datasets.
model_checkpoint = "camembert-base"

# The datasets were encoded by the notebook-level `tokenizer`. Fail fast with
# a clear message if it belongs to a different checkpoint than the model
# trained here (e.g. the XLM-RoBERTa tokenizer is still active), instead of
# training on token ids from the wrong vocabulary.
if tokenizer.name_or_path != model_checkpoint:
    raise ValueError(
        f"Datasets were tokenized with '{tokenizer.name_or_path}' but this experiment "
        f"fine-tunes '{model_checkpoint}'; re-run the tokenization cell with the matching tokenizer."
    )

training_args = TrainingArguments(
    output_dir='./results',          # output directory for checkpoints
    evaluation_strategy="steps",     # evaluate every `eval_steps` training steps
    eval_steps = 500,                # interval (in training steps) between evaluations
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=32,  # batch size per device during training
    per_device_eval_batch_size=32,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    load_best_model_at_end=True,     # reload the best checkpoint once training finishes
)

# The classification head is newly (randomly) initialised by from_pretrained,
# so seed the RNGs before instantiating the model to make runs repeatable.
# This reuses the seed the Trainer itself will use.
set_seed(training_args.seed)
model = CamembertForSequenceClassification.from_pretrained(model_checkpoint)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset
    compute_metrics=compute_metrics,     # custom metrics for evaluation
)

trainer.train()

In [None]:
# Evaluate the current `trainer` on the test set. The result is kept in a
# named variable so the logits and label ids stay available without repeating
# the prediction pass; only the metrics dict is displayed.
exp3_test_output = trainer.predict(test_dataset)
exp3_test_output.metrics

## Experiment 4

In [None]:
from transformers import CamembertForSequenceClassification, XLMRobertaForSequenceClassification, Trainer, TrainingArguments, set_seed

# Experiment 4: fine-tune CamemBERT on the tokenized tweet datasets.
# NOTE(review): the recorded output for this cell reports 10000 evaluation
# examples, so it was presumably run on the larger training sample
# (25000 tweets per class) - confirm.
model_checkpoint = "camembert-base"

# The datasets were encoded by the notebook-level `tokenizer`. Fail fast with
# a clear message if it belongs to a different checkpoint than the model
# trained here (e.g. the XLM-RoBERTa tokenizer is still active), instead of
# training on token ids from the wrong vocabulary.
if tokenizer.name_or_path != model_checkpoint:
    raise ValueError(
        f"Datasets were tokenized with '{tokenizer.name_or_path}' but this experiment "
        f"fine-tunes '{model_checkpoint}'; re-run the tokenization cell with the matching tokenizer."
    )

training_args = TrainingArguments(
    output_dir='./results',          # output directory for checkpoints
    evaluation_strategy="steps",     # evaluate every `eval_steps` training steps
    eval_steps = 500,                # interval (in training steps) between evaluations
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=32,  # batch size per device during training
    per_device_eval_batch_size=32,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    load_best_model_at_end=True,     # reload the best checkpoint once training finishes
)

# The classification head is newly (randomly) initialised by from_pretrained,
# so seed the RNGs before instantiating the model to make runs repeatable.
# This reuses the seed the Trainer itself will use.
set_seed(training_args.seed)
model = CamembertForSequenceClassification.from_pretrained(model_checkpoint)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset
    compute_metrics=compute_metrics,     # custom metrics for evaluation
)

trainer.train()

Downloading:   0%|          | 0.00/424M [00:00<?, ?B/s]

Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.out_proj.weig

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
500,0.5499,0.447422,0.7946,0.792601,0.809673,0.801046
1000,0.4488,0.438032,0.7966,0.777798,0.842373,0.808799
1500,0.3959,0.452537,0.8007,0.852741,0.737028,0.790673
2000,0.3517,0.425724,0.8096,0.824256,0.797141,0.810472
2500,0.3478,0.432124,0.8103,0.807589,0.825142,0.816271
3000,0.2379,0.514079,0.8109,0.833748,0.786567,0.809471
3500,0.2336,0.506004,0.81,0.824005,0.798512,0.811058


***** Running Evaluation *****
  Num examples = 10000
  Batch size = 32
Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 32
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 32
Saving model checkpoint to ./results/checkpoint-1500
Configuration saved in ./results/checkpoint-1500/config.json
Model weights saved in ./results/checkpoint-1500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 32
Saving model checkpoint to ./results/checkpoint-2000
Configuration saved in ./results/checkpoint-2000/config.json
Model weights saved in ./results/checkpoint-2000/pytorch_m

TrainOutput(global_step=3750, training_loss=0.3581574717203776, metrics={'train_runtime': 3869.998, 'train_samples_per_second': 31.008, 'train_steps_per_second': 0.969, 'total_flos': 5179998902400000.0, 'train_loss': 0.3581574717203776, 'epoch': 3.0})

In [None]:
# Evaluate the current `trainer` on the test set. The result is kept in a
# named variable so the logits and label ids stay available without repeating
# the prediction pass; only the metrics dict is displayed.
exp4_test_output = trainer.predict(test_dataset)
exp4_test_output.metrics

***** Running Prediction *****
  Num examples = 10000
  Batch size = 32


PredictionOutput(predictions=array([[-1.3034062 ,  1.0951177 ],
       [ 1.0603892 , -1.0462689 ],
       [-1.1909091 ,  1.038684  ],
       ...,
       [ 0.7031059 , -0.72442216],
       [ 2.1067336 , -1.9692304 ],
       [-1.3259143 ,  1.1134901 ]], dtype=float32), label_ids=array([1, 0, 1, ..., 0, 0, 0]), metrics={'test_loss': 0.40727323293685913, 'test_accuracy': 0.8207, 'test_precision': 0.822700744616623, 'test_recall': 0.8176, 'test_f1': 0.8201424415688635, 'test_runtime': 116.3104, 'test_samples_per_second': 85.977, 'test_steps_per_second': 2.691})

# Save model

In [None]:
# Serialize the object currently bound to `model` to the file 'modelthirdFR'.
# `model` is whichever experiment cell ran last; when the notebook is run top
# to bottom that is the CamemBERT model created in the final experiment.
# NOTE(review): torch.save on a whole model object pickles it, so loading it
# back presumably requires the same model classes to be importable. Saving via
# the model's `save_pretrained` would be more portable, but it would change the
# on-disk format - confirm what downstream code loads before switching.
torch.save(model, 'modelthirdFR')