In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments


In [2]:
# Read the merged fake/real news dataset from disk.
df = pd.read_csv(filepath_or_buffer="data/fake_news.csv")

# Preview the first five rows.
df.head(n=5)


Unnamed: 0,title,text,subject,date,label
0,Senate panel to vote next week on online sex-t...,WASHINGTON (Reuters) - The U.S. Senate Commerc...,politicsNews,"November 1, 2017",0
1,"Republicans Surrender To Trump, Fear Historic...",The Washington Post reports that the Republica...,News,"March 24, 2016",1
2,WOW! LIBERAL Fox News Host Geraldo Rivera SLAM...,"Liberal FOX News host, Geraldo Rivera, took to...",left-news,"Nov 10, 2017",1
3,Former key ally of Nigeria's Buhari joins oppo...,"ABUJA (Reuters) - Nigeria s Atiku Abubakar, a ...",worldnews,"December 3, 2017",0
4,NICOLE KIDMAN BREAKS RANKS With Hollywood Left...,She s travelling the world to promote her new ...,left-news,"Jan 12, 2017",1


In [3]:
# Build the model input by joining title and body text.
# Missing titles/bodies are replaced with empty strings first: a NaN on either
# side would turn the whole concatenation into NaN, and the tokenizer only
# accepts str inputs.
texts = df['title'].fillna("") + ". " + df['text'].fillna("")

# Labels: 1 = fake, 0 = real
labels = df['label']

# Hold out 20% of the rows for evaluation. Stratifying on the label keeps the
# fake/real proportion the same in the train and test splits.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42, stratify=labels
)


In [4]:
# Print the installed transformers version for reference.
import transformers
print(transformers.__version__)


4.57.1


In [5]:
# Load the pretrained DistilBERT tokenizer (uncased vocabulary).
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# Encode both splits with identical settings: truncate to at most 128 tokens
# and pad. The train split is encoded first, then the test split.
train_encodings, test_encodings = (
    tokenizer(list(split_texts), truncation=True, padding=True, max_length=128)
    for split_texts in (train_texts, test_texts)
)


In [6]:
import torch

class FakeNewsDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with classification labels.

    Args:
        encodings: Mapping from feature name (e.g. ``input_ids``) to a sequence
            holding one entry per example.
        labels: One label per example, in the same order as the encodings.
            Any iterable is accepted (list, NumPy array, pandas Series, ...).
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Materialise the labels as a list so that __getitem__ always indexes
        # by position. A pandas Series coming out of a shuffled split keeps its
        # original index, and ``series[idx]`` would then look up by index label
        # instead of by position (KeyError or, worse, the wrong label).
        self.labels = list(labels)

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including its ``labels`` entry."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Wrap each split's encodings and labels in a FakeNewsDataset.
train_dataset = FakeNewsDataset(
    encodings=train_encodings,
    labels=list(train_labels),
)
test_dataset = FakeNewsDataset(
    encodings=test_encodings,
    labels=list(test_labels),
)


In [7]:
from transformers import DistilBertForSequenceClassification

# Pretrained DistilBERT with a two-class classification head
# (label 1 = fake, label 0 = real).
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
import inspect
from transformers import TrainingArguments

# Print the installed transformers version and the constructor signature of
# TrainingArguments.
print("version transformers:", __import__("transformers").__version__)
print("TrainingArguments signature:\n", inspect.signature(TrainingArguments))
# Optional (if you want to see the source code — it can be very long):
# import textwrap
# print(textwrap.shorten(inspect.getsource(TrainingArguments), width=2000))


version transformers: 4.57.1
TrainingArguments signature:


In [9]:
pip install accelerate>=0.26.0

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [10]:
from transformers import TrainingArguments

# Hyper-parameters and bookkeeping options for the fine-tuning run.
training_args = TrainingArguments(
    # Output locations and logging cadence
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=50,
    # Optimisation settings
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint at the end of every epoch; save_total_limit caps
    # how many checkpoints are retained
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # When training finishes, reload the checkpoint that scored best on eval_loss
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

print("TrainingArguments créé :", training_args)


TrainingArguments créé : TrainingArguments(
_n_gpu=0,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=True,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=IntervalStrategy.EPOCH,
eval

In [11]:
from transformers import Trainer

def compute_metrics(eval_pred):
    """Compute accuracy from the predictions the Trainer collects during evaluation.

    Args:
        eval_pred: Evaluation result passed in by the Trainer; ``predictions``
            holds the model logits and ``label_ids`` the true labels.

    Returns:
        dict: ``{"accuracy": fraction of correctly classified examples}``.
    """
    # The predicted class is the index of the largest logit for each example.
    predicted = eval_pred.predictions.argmax(axis=-1)
    return {"accuracy": accuracy_score(eval_pred.label_ids, predicted)}


# Build the Trainer. Without compute_metrics, evaluation reports only the loss;
# passing it adds accuracy to every evaluation pass.
trainer = Trainer(
    model=model,                         # the DistilBERT classifier
    args=training_args,                  # training hyper-parameters
    train_dataset=train_dataset,         # training split
    eval_dataset=test_dataset,           # evaluation split
    compute_metrics=compute_metrics      # report accuracy alongside the loss
)

print("Trainer créé :", trainer)


Trainer créé : <transformers.trainer.Trainer object at 0x000002970B869550>


In [12]:
trainer.train()      # start the actual training run




Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 