In [1]:
%pip uninstall -y pyarrow
%pip install pyarrow==19.0.0
%pip install datasets
%pip install accelerate>=0.26.0

Found existing installation: pyarrow 22.0.0
Uninstalling pyarrow-22.0.0:
  Successfully uninstalled pyarrow-22.0.0
Note: you may need to restart the kernel to use updated packages.
Collecting pyarrow==19.0.0
  Using cached pyarrow-19.0.0-cp313-cp313-win_amd64.whl.metadata (3.4 kB)
Using cached pyarrow-19.0.0-cp313-cp313-win_amd64.whl (25.2 MB)
Installing collected packages: pyarrow
Successfully installed pyarrow-19.0.0
Note: you may need to restart the kernel to use updated packages.


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datasets 4.4.2 requires pyarrow>=21.0.0, but you have pyarrow 19.0.0 which is incompatible.



Collecting pyarrow>=21.0.0 (from datasets)
  Using cached pyarrow-22.0.0-cp313-cp313-win_amd64.whl.metadata (3.3 kB)
Using cached pyarrow-22.0.0-cp313-cp313-win_amd64.whl (28.0 MB)
Installing collected packages: pyarrow
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 19.0.0
    Uninstalling pyarrow-19.0.0:
      Successfully uninstalled pyarrow-19.0.0
Successfully installed pyarrow-22.0.0
Note: you may need to restart the kernel to use updated packages.


In [2]:
import torch
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    AutoModelForSequenceClassification, 
    Trainer, 
    TrainingArguments,
    DataCollatorWithPadding
)
from sentence_transformers import SentenceTransformer




In [3]:
# Infrastructure
torch.cuda.is_available()   # False: pas de GPU sur ma machine
device = torch.device('cpu')
device

device(type='cpu')

In [4]:
# Importation des donnees
from datasets import load_dataset

train_split = 'train[:15%]'
test_split = 'test[:5%]'

dataset = load_dataset('imdb', split={ 'train': train_split, 'test': test_split })

In [5]:
dataset['train']['text'][10]  

'It was great to see some of my favorite stars of 30 years ago including John Ritter, Ben Gazarra and Audrey Hepburn. They looked quite wonderful. But that was it. They were not given any characters or good lines to work with. I neither understood or cared what the characters were doing.<br /><br />Some of the smaller female roles were fine, Patty Henson and Colleen Camp were quite competent and confident in their small sidekick parts. They showed some talent and it is sad they didn\'t go on to star in more and better films. Sadly, I didn\'t think Dorothy Stratten got a chance to act in this her only important film role.<br /><br />The film appears to have some fans, and I was very open-minded when I started watching it. I am a big Peter Bogdanovich fan and I enjoyed his last movie, "Cat\'s Meow" and all his early ones from "Targets" to "Nickleodeon". So, it really surprised me that I was barely able to keep awake watching this one.<br /><br />It is ironic that this movie is about a de

In [6]:
# Modèle et tokenizer à utiliser
model_name = "bert-base-uncased"

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Exple
texte = 'This movie was fantastic! I really loved it.'
tokens = tokenizer.tokenize(texte)
tokens

['this', 'movie', 'was', 'fantastic', '!', 'i', 'really', 'loved', 'it', '.']

In [7]:
# Tokenizer mes donnees

MAX_LENGTH =256
def tokenize_function(texte):
    return tokenizer(texte, padding='max_length', truncation=True, max_length=MAX_LENGTH)

In [8]:
dataset_tokenized = dataset.map(lambda x: tokenize_function(x['text']))
dataset_tokenized['train'][0]

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

In [9]:
# Modèle de génération de texte
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, 
    num_labels=2,
    problem_type="single_label_classification" 
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
model.config    # Affiche la configuration du modèle
model.num_parameters()  # Nombre de paramètres du modèle

# Mettre le modèle sur le bon device
model = model.to(device)

In [11]:
# Test du modèle avec un texte d'exemple

example_text = ["This movie is beautiful.", "This movie is terrible."]
# Tokenization
tokens = tokenizer(example_text, padding='max_length', truncation=True, max_length=MAX_LENGTH, return_tensors="pt").to(device)
tokens
model(**tokens).logits

tensor([[0.1516, 0.3402],
        [0.1528, 0.3460]], grad_fn=<AddmmBackward0>)

In [12]:
# Fine-tuning du modèle avec Trainer
# Configuration de l'entraînement
variable_entrainnement = TrainingArguments(
    # Repértoires de sortie et de logs
    output_dir="./results",
    logging_dir="./logs",
    # Paramètres d'entraînement
    num_train_epochs=3,                 # nombre d'époques
    per_device_train_batch_size=2,      # taille de batch pour l'entraînement
    per_device_eval_batch_size=2,       # taille de batch pour l'évaluation
    learning_rate=2e-5,                 # taux d'apprentissage (il doit être petit pour éviter de détruire les connaissances préalables du modèle)
    weight_decay=0.01,                  # taux de décroissance du poids
    # Optimisation du modèle
    warmup_steps=100,                   # nombre de pas de warm-up(Optimise du modele 100fois)
    lr_scheduler_type="linear",         # type de scheduler (l'optimisation se fait de manière linéaire)
    # Evaluation et sauvegarde du modèle
    eval_strategy="epoch",              # évaluer à la fin de chaque époque
    save_strategy="epoch",              # sauvegarder le modèle à la fin de chaque époque
    load_best_model_at_end=True,        # charger le meilleur modèle à la fin de l'entraînement
    metric_for_best_model="accuracy",   # métrique pour déterminer le meilleur modèle
    
    save_total_limit=2,                 # nombre maximum de modèles sauvegardés
    
    # Enregistrement des logs
    logging_strategy="steps",           # stratégie de logging (enregistrer les logs à chaque étape)
    logging_steps=10,                   # enregistrer les logs tous les 10 pas
    logging_first_step=True,            # enregistrer le premier pas
    report_to="tensorboard"             # utiliser TensorBoard pour le reporting
)
    

In [13]:
from sklearn.metrics import accuracy_score
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(-1)
    accuracy = accuracy_score(labels, predictions)
    return {'accuracy': accuracy}

In [None]:
# Data collator pour le padding : cela permet de gérer les séquences de longueurs différentes dans un batch
datacollator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding=True,          # activer le padding
    return_tensors='pt'    # retourner des tenseurs PyTorch
    )

# Entrainement du modèle
trainer = Trainer(
    model=model,                                # le modèle à entraîner
    args=variable_entrainnement,                # les arguments d'entraînement
    train_dataset=dataset_tokenized['train'],   # les donnees d'entraînement
    eval_dataset=dataset_tokenized['test'] ,    # les donnees d'évaluation
    tokenizer=tokenizer,                        # le tokenizer utilisé
    compute_metrics=compute_metrics,            # fonction de calcul des métriques
    data_collator=datacollator                  # le data collator pour le padding( pour gérer les séquences de longueurs différentes)
)

# Fine-tuning du modèle
fineModel = trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss


In [None]:
fineModel.metrics

{'train_runtime': 5938.8443,
 'train_samples_per_second': 1.894,
 'train_steps_per_second': 0.947,
 'total_flos': 1479999686400000.0,
 'train_loss': 0.0035185262354989794,
 'epoch': 3.0}

In [None]:
# Evaluation du modèle
eval_results = trainer.evaluate()
print(eval_results)



{'eval_loss': 1.4687998373119626e-05, 'eval_accuracy': 1.0, 'eval_runtime': 57.2391, 'eval_samples_per_second': 21.838, 'eval_steps_per_second': 10.919, 'epoch': 3.0}


In [None]:
# Enregistrement du modèle
path = './models'
trainer.save_model(path)

In [None]:
# test du modèle enregistré
model = AutoModelForSequenceClassification.from_pretrained(path)
tokenizer = AutoTokenizer.from_pretrained(path)
# model
text_1 = "I really enjoyed this movie. It was fantastic!"
text_2 = "I hated this movie. It was awful!"
inputs = tokenizer(text_1, return_tensors="pt", padding='max_length', truncation=True, max_length=MAX_LENGTH)
model(**inputs).logits

tensor([[ 5.0844, -5.4209]], grad_fn=<AddmmBackward0>)

In [None]:
def prediction_text(text, model, tokenizer, device):
    model.eval()
    text_input = tokenizer(text, return_tensors="pt", padding='max_length', truncation=True, max_length=MAX_LENGTH).to(device)
    pred = model(**text_input)
    prediction = torch.nn.functional.softmax(pred.logits, dim=-1)
    predict_class = torch.argmax(prediction, dim=1).item()
    # print(predict_class)
    return predict_class


In [None]:
prediction_text("I hated this movie. It was awful!", model, tokenizer, device)

0

# Exercice 1: Fine-tuning d'un modèle multilingue pour le wolof
1. Telecharger un model multilangue sur huggingface
2. Tester le modele avec un texte en wolof
3. Creer un dataset en wolof sous format: description, reponse
4. Fine-tuner le modele sur ce dataset
5. tester le nouveau modele avec un texte en wolof

# Exercice 2: Fine-tuning d'un modèle de question reponse pour le wolof
1. Telecharger un model question reponse sur huggingface (Anglais, Francais, vice versa) ex: distilbert-base-uncased-distilled-squad
2. Creer un dataset de question reponse en (Francais-Wolof, vice versa)
3. Fine-tuner le modele sur ce dataset

In [None]:
# Exo1
#1 Telechargement du model
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModelForSeq2SeqLM

model_name_for_classification = "FacebookAI/xlm-roberta-base"
model_name_for_gereneration = "google/mt5-base"
model_name_mistralai = "mistralai/Mistral-7B-instruct-v0.2"

tokenizer_mistralai = AutoTokenizer.from_pretrained(model_name_mistralai)
model_mistralai = 

# model_for_classification = AutoModelForSequenceClassification.from_pretrained(model_name_for_classification)
# model_for_generation = AutoModelForSeq2SeqLM.from_pretrained(model_name_for_gereneration)

# tokenizer_for_classification = AutoTokenizer.from_pretrained(model_name_for_classification)
# tokenizer_for_generation = AutoTokenizer.from_pretrained(model_name_for_gereneration)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
#2 Tester le model avec un texte en Wolof
texte_1 = "Taay dama contane"

print("="*50)
print("TEST 1: Modèle de CLASSIFICATION")
print("="*50)
# Test du modèle de classification
tokens_class = tokenizer_for_classification(
    texte_1, 
    return_tensors="pt", 
    padding='max_length', 
    truncation=True, 
    max_length=MAX_LENGTH
    )
print(f"Texte: {texte_1}")
predit_class = model_for_classification(**tokens_class).logits
print(predit_class.argmax(-1))


TEST 1: Modèle de CLASSIFICATION
Texte: Taay dama contane
tensor([1])


In [None]:
# 3. Creer un dataset en wolof sous format: texte, sentiment (0=négatif, 1=positif)
data = {
    "text": [
        # Sentiments POSITIFS (label = 1)
        "Dama contane lool, sama liggéey dafa baax bi.",
        "Film bi dafa interessant, man dama ko bëgg.",
        "Sama xarit dafa baax lool, dama ko sopp.",
        "Lekk bi dafa neex, jërëjëf.",
        "Jàng bi dafa am solo, prof bi dafa baax.",
        
        # Sentiments NÉGATIFS (label = 0)
        "Dama sonn lool, tee sama xol néxoul.",
        "Dama togg, liggéey bi dafa méti.",
        "Maa ngi dem ci université, te dama bëggul liggéey.",
        "Film bi dafa ndaw, man dama soone ko.",
        "Sama xarit dafa mënul, te dama sonnal."
    ],
    "label": [
        # Labels pour sentiments positifs
        1, 1, 1, 1, 1,
        # Labels pour sentiments négatifs
        0, 0, 0, 0, 0
    ]
}


# Créer un DataFrame pandas pour visualiser
import pandas as pd
dataset_wolof = pd.DataFrame(data)
print(f"Nombre d'exemples: {len(dataset_wolof)}")
print(f"Positifs: {sum(dataset_wolof['label'] == 1)}")
print(f"Négatifs: {sum(dataset_wolof['label'] == 0)}")
print("\nPremiers exemples:")
dataset_wolof.head(10)

Nombre d'exemples: 20
Positifs: 10
Négatifs: 10

Premiers exemples:


Unnamed: 0,text,label
0,"Dama contane lool, sama liggéey dafa baax bi.",1
1,"Film bi dafa interessant, man dama ko bëgg.",1
2,"Sama xarit dafa baax lool, dama ko sopp.",1
3,"Lekk bi dafa neex, jërëjëf.",1
4,"Jàng bi dafa am solo, prof bi dafa baax.",1
5,"Dama bëgg Dakar, dëkk bi dafa rafet.",1
6,"Yaram dafa fi rekk, alhumdulillah.",1
7,"Làmb bi dafa am solo lool, ñu baax.",1
8,Suma waxtu ci université bi dafa nekk baax.,1
9,"Dama contane ci sama ndey, dafa ma jëkkër.",1


In [None]:
# 4. Fine-tuner le modele sur ce dataset

In [None]:
# print("\n" + "="*50)
# print("TEST 2: Modèle de GÉNÉRATION")
# print("="*50)
# # Test du modèle de génération
# tokens_gen = tokenizer_for_generation(texte_2, return_tensors="pt", padding='max_length', truncation=True, max_length=MAX_LENGTH)
# print(f"Texte d'entrée: {texte_2[:50]}...")
# model_for_generation.eval()
# with torch.no_grad():
#     # Générer une réponse
#     generated_ids = model_for_generation.generate(
#         tokens_gen['input_ids'], 
#         max_length=50,
#         num_beams=4,           # beam search pour meilleure qualité
#         early_stopping=True
#     )
    
# # Décoder la sortie générée
# generated_text = tokenizer_for_generation.decode(generated_ids[0], skip_special_tokens=True)
# print(f"Texte généré: {generated_text}")
# print("="*50)