### Import depedencies 

In [12]:
import pandas as pd
import re
from transformers import CamembertTokenizer
from sklearn.model_selection import train_test_split
from transformers import CamembertModel, CamembertForSequenceClassification, TrainingArguments, Trainer
import torch
import torch.nn as nn
import torch.optim as optim
from datasets import Dataset


### Part 1 : preproccess the twitter data

In [2]:
# Load Data
data = pd.read_csv('sampled_tweets.csv')

In [31]:
data.head()

Unnamed: 0,label,text,stemmed_content
0,0,Je pense que je devrais embaucher un de ces tr...,pens devrai embauch tranlateur parc personn a ...
1,0,"Et la pauvre ruth, vous voulez des tissus?",pauvr ruth voulez tissu
2,1,- a dessiné une carte pour vous - 6 baguettes ...,a dessin cart baguett connu comm seigneur vict...
3,0,"Mcfly 7 / juin / 2009 au Mexique, la grippe po...",mcfli juin mexiqu gripp porcin truit tout coup...
4,1,"Bon, l'hummer sort des rues - il faut que les ...",bon hummer sort rue faut voitur placent bon ma...


In [3]:
# Preprocess Data 
def preprocess_text(text):
    text = text.lower()  # Convertir en minuscules
    text = re.sub(r'[^a-zA-Zà-ÿ\s]', '', text)  # Supprimer les caractères spéciaux
    return text

data['stemmed_content'] = data['text'].apply(preprocess_text)

In [33]:
data.head()

Unnamed: 0,label,text,stemmed_content
0,0,Je pense que je devrais embaucher un de ces tr...,je pense que je devrais embaucher un de ces tr...
1,0,"Et la pauvre ruth, vous voulez des tissus?",et la pauvre ruth vous voulez des tissus
2,1,- a dessiné une carte pour vous - 6 baguettes ...,a dessiné une carte pour vous baguettes co...
3,0,"Mcfly 7 / juin / 2009 au Mexique, la grippe po...",mcfly juin au mexique la grippe porcine le...
4,1,"Bon, l'hummer sort des rues - il faut que les ...",bon lhummer sort des rues il faut que les voi...


In [4]:
data = data.drop(columns=['text'])


In [5]:
# For the model need a column named labels
data = data.rename(columns={'label': 'labels'})

In [36]:
data.head()

Unnamed: 0,labels,stemmed_content
0,0,je pense que je devrais embaucher un de ces tr...
1,0,et la pauvre ruth vous voulez des tissus
2,1,a dessiné une carte pour vous baguettes co...
3,0,mcfly juin au mexique la grippe porcine le...
4,1,bon lhummer sort des rues il faut que les voi...


In [6]:
# Convert the type to Dataset
dataset = Dataset.from_pandas(data)


### Part 2 : Tokenize and split the data 

In [None]:
# Load tokenizer
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')

# OR if we have already imported and saved a tokenizer 
# save_path = r'C:\Users\matth\Documents\Project\fine_tuned_camembert'
# tokenizer = CamembertTokenizer.from_pretrained(save_path)


# Tokenisation of data
def tokenize_function(examples):
    return tokenizer(
        examples['stemmed_content'],  # Column containing the text
        padding='max_length',        # Pad to the maximum sequence length
        truncation=True,             # Truncate sequences longer than the model's max length
        max_length=512,              # Set the maximum token sequence length
    )

tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 10000/10000 [00:02<00:00, 4644.79 examples/s]


In [8]:
# Split into train and temp datasets (80% train, 20% temp)
train_test_split = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_data = train_test_split['train']
temp_data = train_test_split['test']

# Further split temp_data into validation and test datasets (50% each of the remaining 20%)
valid_test_split = temp_data.train_test_split(test_size=0.5, seed=42)
valid_data = valid_test_split['train']
test_data = valid_test_split['test']

In [40]:
print(train_data[0])

{'labels': 0, 'stemmed_content': 'ok merde une partie de cela est assez dur', 'input_ids': [5, 7330, 7894, 28, 245, 8, 207, 30, 424, 2498, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

### Part 3 : import the model 

In [13]:
# If no model was previously imported, import camemBERT pretrained model from HuggingFace
# model = CamembertForSequenceClassification.from_pretrained('camembert-base', num_labels=2)

# If there is already a saved model, import the saved model
# Don't forget to change the save_path 
save_path = r'C:\Users\matth\Documents\Project\fine_tuned_camembert' #
model = CamembertForSequenceClassification.from_pretrained(save_path)


In [None]:
# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",
    learning_rate=4e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1, # One epoch runs in 6 hours on my hardware
    weight_decay=0.01,
    disable_tqdm=False
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=valid_data,
)

# Train the model 
trainer.train()


                                             
100%|██████████| 1/1 [09:38<00:00, 578.78s/it]   

{'eval_loss': 0.48083239793777466, 'eval_runtime': 559.9305, 'eval_samples_per_second': 1.786, 'eval_steps_per_second': 0.223, 'epoch': 0.0}
{'train_runtime': 578.7801, 'train_samples_per_second': 0.001, 'train_steps_per_second': 0.002, 'train_loss': 0.35079216957092285, 'epoch': 0.0}





TrainOutput(global_step=1, training_loss=0.35079216957092285, metrics={'train_runtime': 578.7801, 'train_samples_per_second': 0.001, 'train_steps_per_second': 0.002, 'total_flos': 2104888442880.0, 'train_loss': 0.35079216957092285, 'epoch': 0.001})

In [None]:
text = "hello guys"

inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

with torch.no_grad():
    outputs2 = model(**inputs)

logits2 = outputs2.logits

print(f"Logits: {logits2}")

# Convert logits to probabilities
probabilities2 = torch.nn.functional.softmax(logits2, dim=-1)

# Predicted class
predicted_class2 = torch.argmax(probabilities2, dim=-1)


print(f"Probabilities2: {probabilities2}")
print(f"Predicted class: {predicted_class2.item()}")

Logits: tensor([[ 0.5342, -0.5230]])
Logits: tensor([[ 0.5342, -0.5230]])
Probabilities1: tensor([[0.7421, 0.2579]])
Predicted class: 0
Probabilities2: tensor([[0.7421, 0.2579]])
Predicted class: 0


### Part 4 : Save the model 

In [46]:
save_path = r'C:\Users\matth\Documents\Project\fine_tuned_camembert'

# Save the fine-tuned model 
model.save_pretrained(save_path)

# Save the tokenizer (optionnal)
tokenizer.save_pretrained(save_path)

('C:\\Users\\matth\\Documents\\Project\\fine_tuned_camembert\\tokenizer_config.json',
 'C:\\Users\\matth\\Documents\\Project\\fine_tuned_camembert\\special_tokens_map.json',
 'C:\\Users\\matth\\Documents\\Project\\fine_tuned_camembert\\sentencepiece.bpe.model',
 'C:\\Users\\matth\\Documents\\Project\\fine_tuned_camembert\\added_tokens.json')