In [None]:
# Dependencies: uncomment and run once on a fresh runtime (e.g. Colab) before the imports below.
# !pip install accelerate -U
# !pip install transformers[torch]
# !pip install datasets


In [2]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

In [3]:
# Location of the raw CSV on the mounted Drive.
file_path = '/content/drive/MyDrive/notebooks/fakenews.csv'

# Read the file, keep only the two columns used downstream, and drop any
# row with a missing value in either of them.
df = (
    pd.read_csv(file_path)
    [['text', 'label']]
    .dropna()
)


### **Preprocessing for model training**

In [4]:
# Map the raw label values to integer class ids. The fitted encoder is kept
# so the mapping can be inspected (or inverted) later via `label_encoder`.
label_encoder = preprocessing.LabelEncoder()
encoded_labels = label_encoder.fit_transform(df['label'].tolist())
df['label'] = encoded_labels

In [5]:
# Both splits shuffle with the same fixed seed so the partition is reproducible.
split_options = dict(shuffle=True, random_state=0)

# 70 % of the rows go to training; the remaining 30 % is held out ...
train_df, holdout_df = train_test_split(df, train_size=0.7, **split_options)
# ... and then halved into validation and test sets.
val_df, test_df = train_test_split(holdout_df, train_size=0.5, **split_options)

In [None]:
from datasets import Dataset
from transformers import AutoTokenizer
import tensorflow as tf

# Longest sequence (in tokens) passed to the model; longer texts are truncated.
MAX_LENGTH = 512

# Load the tokenizer belonging to the checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Wrap each pandas split in a Hugging Face Dataset so it can be mapped and
# handed to the Trainer.
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

def tokenize_data(examples):
    """Tokenize a batch of rows taken from the ``text`` column.

    Sequences are truncated to ``MAX_LENGTH`` tokens but intentionally not
    padded here. Padding every example to the maximum length up front makes
    all batches full-length regardless of their content; instead, padding is
    left to the ``DataCollatorWithPadding`` given to the Trainer, which pads
    each batch only to the length of its longest member.

    Args:
        examples: A batch from ``Dataset.map(..., batched=True)``; its
            ``"text"`` entry is the list of strings to tokenize.

    Returns:
        The tokenizer output for the batch (unpadded, truncated encodings).
    """
    return tokenizer(examples["text"], truncation=True, max_length=MAX_LENGTH)

# Add the tokenizer outputs as new columns on every split.
tokenized_train = train_dataset.map(tokenize_data, batched=True)
tokenized_val = val_dataset.map(tokenize_data, batched=True)
tokenized_test = test_dataset.map(tokenize_data, batched=True)

### **Training the model**

In [None]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding


# Load DistilBERT with a 2-label sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
)

# Collator that pads the examples of each batch when the batch is assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Hyperparameters for the run, gathered in one mapping so they can be read
# (and tuned) at a glance. Evaluation and logging both happen once per epoch.
training_config = {
    "output_dir": "/content/drive/MyDrive/results",
    "learning_rate": 2e-5,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "num_train_epochs": 5,
    "weight_decay": 0.01,
    "evaluation_strategy": "epoch",
    "logging_strategy": "epoch",
}
training_args = TrainingArguments(**training_config)

# The Trainer ties together the model, the data splits and the collator;
# the validation split is what gets evaluated during training.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
)

# Fine-tune, then persist the final model to Drive.
trainer.train()
trainer.save_model("/content/drive/MyDrive/model")



Training log (copied from the Colab output):

| Epoch | Training Loss | Validation Loss |
|-------|---------------|-----------------|
| 1     | 0.090100      | 0.047792        |
| 2     | 0.012500      | 0.047118        |
| 3     | 0.005000      | 0.015286        |
| 4     | 0.001800      | 0.014778        |
| 5     | 0.000400      | 0.014350        |


### **Evaluate on test set**

In [10]:
from sklearn.metrics import classification_report
import numpy as np


# Run inference on the held-out test split.
predictions = trainer.predict(tokenized_test)
# Pick the highest-scoring class index for each example.
preds = np.argmax(predictions.predictions, axis=1)


# Ground-truth labels, in the same order as the predictions.
labels = tokenized_test['label']

# This cell scores `tokenized_test`, so the heading names the test split
# rather than the validation split.
print("Test Classification Report:")
print(classification_report(labels, preds, target_names=["Class 0", "Class 1"]))


Validation Classification Report:
              precision    recall  f1-score   support

     Class 0       0.99      0.99      0.99      1567
     Class 1       0.99      0.99      0.99      1548

    accuracy                           0.99      3115
   macro avg       0.99      0.99      0.99      3115
weighted avg       0.99      0.99      0.99      3115

