### Import depedencies 

In [None]:
import pandas as pd
import re
from transformers import CamembertTokenizer
from sklearn.model_selection import train_test_split
from transformers import CamembertModel, CamembertForSequenceClassification, TrainingArguments, Trainer
import torch
import torch.nn as nn
import torch.optim as optim
from datasets import Dataset


  from .autonotebook import tqdm as notebook_tqdm


### Import twitter data, preprocess, tokenize, divide in different sets

In [2]:
# Load Data
data = pd.read_csv('sampled_tweets.csv')

In [31]:
data.head()

Unnamed: 0,label,text,stemmed_content
0,0,Je pense que je devrais embaucher un de ces tr...,pens devrai embauch tranlateur parc personn a ...
1,0,"Et la pauvre ruth, vous voulez des tissus?",pauvr ruth voulez tissu
2,1,- a dessiné une carte pour vous - 6 baguettes ...,a dessin cart baguett connu comm seigneur vict...
3,0,"Mcfly 7 / juin / 2009 au Mexique, la grippe po...",mcfli juin mexiqu gripp porcin truit tout coup...
4,1,"Bon, l'hummer sort des rues - il faut que les ...",bon hummer sort rue faut voitur placent bon ma...


In [3]:
# Preprocess Data 
def preprocess_text(text):
    text = text.lower()  # Convertir en minuscules
    text = re.sub(r'[^a-zA-Zà-ÿ\s]', '', text)  # Supprimer les caractères spéciaux
    return text

data['stemmed_content'] = data['text'].apply(preprocess_text)

In [33]:
data.head()

Unnamed: 0,label,text,stemmed_content
0,0,Je pense que je devrais embaucher un de ces tr...,je pense que je devrais embaucher un de ces tr...
1,0,"Et la pauvre ruth, vous voulez des tissus?",et la pauvre ruth vous voulez des tissus
2,1,- a dessiné une carte pour vous - 6 baguettes ...,a dessiné une carte pour vous baguettes co...
3,0,"Mcfly 7 / juin / 2009 au Mexique, la grippe po...",mcfly juin au mexique la grippe porcine le...
4,1,"Bon, l'hummer sort des rues - il faut que les ...",bon lhummer sort des rues il faut que les voi...


In [4]:
data = data.drop(columns=['text'])


In [5]:
data = data.rename(columns={'label': 'labels'})

In [36]:
data.head()

Unnamed: 0,labels,stemmed_content
0,0,je pense que je devrais embaucher un de ces tr...
1,0,et la pauvre ruth vous voulez des tissus
2,1,a dessiné une carte pour vous baguettes co...
3,0,mcfly juin au mexique la grippe porcine le...
4,1,bon lhummer sort des rues il faut que les voi...


In [6]:
dataset = Dataset.from_pandas(data)


In [7]:
# Load tokenizer
# It is what is used to turn data into tokens
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')

# Tokenisation of data
def tokenize_function(examples):
    return tokenizer(
        examples['stemmed_content'],  # Column containing the text
        padding='max_length',        # Pad to the maximum sequence length
        truncation=True,             # Truncate sequences longer than the model's max length
        max_length=512,              # Set the maximum token sequence length
    )

tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 10000/10000 [00:02<00:00, 3834.45 examples/s]


In [8]:
# Split into train and temp datasets (80% train, 20% temp)
train_test_split = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_data = train_test_split['train']
temp_data = train_test_split['test']

# Further split temp_data into validation and test datasets (50% each of the remaining 20%)
valid_test_split = temp_data.train_test_split(test_size=0.5, seed=42)
valid_data = valid_test_split['train']
test_data = valid_test_split['test']

In [40]:
print(train_data[0])

{'labels': 0, 'stemmed_content': 'ok merde une partie de cela est assez dur', 'input_ids': [5, 7330, 7894, 28, 245, 8, 207, 30, 424, 2498, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

### Initially import CamemBERT model to fine-tune it 

In [None]:
# Import camemBERT pretrained model
# model = CamembertForSequenceClassification.from_pretrained('camembert-base', num_labels=2)


Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### If there is already a fine-tuned model

In [None]:
save_path = r'C:\Users\matth\Documents\Project\fine_tuned_camembert'

# Charger le tokenizer
tokenizer = CamembertTokenizer.from_pretrained(save_path)

# Charger le modèle fine-tuné
model = CamembertForSequenceClassification.from_pretrained(save_path)

In [None]:
# Définir les arguments d'entraînement
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1, # One epoch runs in 6 hours on my hardware
    weight_decay=0.01,
    disable_tqdm=False
)

# Initialiser le Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=valid_data,
)

# Entraîner le modèle
trainer.train()


  0%|          | 3/3000 [21:29<357:55:43, 429.94s/it]
 50%|█████     | 500/1000 [2:21:34<2:54:06, 20.89s/it]
 50%|█████     | 500/1000 [2:21:34<2:54:06, 20.89s/it]

{'loss': 0.5383, 'grad_norm': 14.526749610900879, 'learning_rate': 1e-05, 'epoch': 0.5}


100%|██████████| 1000/1000 [5:58:35<00:00, 16.85s/it] 
100%|██████████| 1000/1000 [5:58:35<00:00, 16.85s/it]

{'loss': 0.4879, 'grad_norm': 2.422028064727783, 'learning_rate': 0.0, 'epoch': 1.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "c:\Users\matth\Documents\Project\venv\Lib\site-packages\IPython\core\interactiveshell.py", line 3577, in run_code
  File "C:\Users\matth\AppData\Local\Temp\ipykernel_21416\1073847356.py", line 22, in <module>
    trainer.train()
  File "c:\Users\matth\Documents\Project\venv\Lib\site-packages\transformers\trainer.py", line 2164, in train
    return inner_training_loop(
           ^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\matth\Documents\Project\venv\Lib\site-packages\transformers\trainer.py", line 2616, in _inner_training_loop
    self._maybe_log_save_evaluate(tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval, start_time)
  File "c:\Users\matth\Documents\Project\venv\Lib\site-packages\transformers\trainer.py", line 3047, in _maybe_log_save_evaluate
    metrics = self._evaluate(trial, ignore_keys_for_eval)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\matth\Documents\Project\venv\Lib\site-packages\transformer

In [10]:
model2 = CamembertForSequenceClassification.from_pretrained(save_path)

In [None]:
text = "hello guys"

inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

with torch.no_grad():
    outputs2 = model2(**inputs)

logits2 = outputs2.logits

print(f"Logits: {logits2}")

# Convert logits to probabilities
probabilities2 = torch.nn.functional.softmax(logits2, dim=-1)

# Predicted class
predicted_class2 = torch.argmax(probabilities2, dim=-1)


print(f"Probabilities2: {probabilities2}")
print(f"Predicted class: {predicted_class2.item()}")

Logits: tensor([[ 0.5342, -0.5230]])
Logits: tensor([[ 0.5342, -0.5230]])
Probabilities1: tensor([[0.7421, 0.2579]])
Predicted class: 0
Probabilities2: tensor([[0.7421, 0.2579]])
Predicted class: 0


### Save the model 

In [46]:
# Chemin d'accès pour sauvegarder le modèle CamemBERT 
save_path = r'C:\Users\matth\Documents\Project\fine_tuned_camembert'

# Sauvegarder le modèle fine-tuné
model.save_pretrained(save_path)

# Sauvegarder le tokenizer (optionnel, mais recommandé)
tokenizer.save_pretrained(save_path)

('C:\\Users\\matth\\Documents\\Project\\fine_tuned_camembert\\tokenizer_config.json',
 'C:\\Users\\matth\\Documents\\Project\\fine_tuned_camembert\\special_tokens_map.json',
 'C:\\Users\\matth\\Documents\\Project\\fine_tuned_camembert\\sentencepiece.bpe.model',
 'C:\\Users\\matth\\Documents\\Project\\fine_tuned_camembert\\added_tokens.json')