In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [11]:
df = pd.read_csv("../Dataset/fake_and_real_news.csv")

# Encode the string labels as integers: 'Fake' -> 0, 'Real' -> 1.
df['label_num'] = df['label'].map({'Fake':0,'Real':1})

# Series.map leaves NaN for any value missing from the mapping (different
# casing, stray whitespace, empty cell). Fail here with a clear message instead
# of passing float/NaN labels on to training.
unmapped_rows = df['label_num'].isna()
if unmapped_rows.any():
    unexpected_labels = sorted(df.loc[unmapped_rows, 'label'].astype(str).unique())
    raise ValueError(
        f"Unexpected values in 'label' column (expected 'Fake' or 'Real'): {unexpected_labels}"
    )
df['label_num'] = df['label_num'].astype('int64')

# Hold out 20% of the rows; the fixed random_state keeps the split reproducible.
train_texts,test_texts,train_labels,test_labels = train_test_split(
    df['Text'].tolist(),
    df['label_num'].tolist(),
    test_size=0.2,
    random_state=42
)

Tokenization with Hugging Face Tokenizer
We'll use `AutoTokenizer`, which loads the tokenizer that matches the `prajjwal1/bert-tiny` checkpoint, to tokenize the text data:


In [12]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "prajjwal1/bert-tiny"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Both splits are tokenized with the same settings: truncate to at most
# 64 tokens and pad the sequences within each call to a common length.
tokenizer_kwargs = dict(truncation=True, padding=True, max_length=64)
train_encodings = tokenizer(train_texts, **tokenizer_kwargs)
test_encodings = tokenizer(test_texts, **tokenizer_kwargs)


In [None]:
# Disabled cell: everything below is one bare triple-quoted string, so nothing
# here executes. It keeps a DistilBERT tokenizer variant (max_length=128) as an
# alternative to the bert-tiny tokenizer created in the active cell.
""" from transformers import DistilBertTokenizerFast

# Intiallize the Tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Tokenize the Data 
train_encodings = tokenizer(train_texts,truncation=True,padding=True,max_length =128)
test_encodings = tokenizer(test_texts,truncation=True,padding=True,max_length =128) """

  from .autonotebook import tqdm as notebook_tqdm
W0517 00:05:48.404000 9532 site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


In [13]:
# Show every encoding field (input ids, masks, ...) for the first training example
first_encoding = dict(
    (field_name, field_values[0])
    for field_name, field_values in train_encodings.items()
)
print(first_encoding)

{'input_ids': [101, 2004, 2047, 10807, 2095, 6440, 2015, 1010, 3246, 2005, 4307, 5166, 11737, 2015, 3190, 1006, 26665, 1007, 1011, 26875, 1037, 3181, 4530, 2000, 18015, 5416, 3570, 1010, 4307, 2211, 2049, 2353, 3442, 10807, 2095, 2302, 1037, 5166, 2006, 5095, 2004, 2576, 17519, 2075, 11737, 7583, 8069, 2005, 1037, 12170, 26053, 5938, 1998, 6599, 7427, 15933, 2574, 1012, 2096, 1996, 2160, 5115, 5219, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [14]:
# Preview each encoding field; only the first two examples are shown for brevity
for key in train_encodings.keys():
    value = train_encodings[key]
    print(f"Key: {key}")
    print(f"Value: {value[:2]} ...")

Key: input_ids
Value: [[101, 2004, 2047, 10807, 2095, 6440, 2015, 1010, 3246, 2005, 4307, 5166, 11737, 2015, 3190, 1006, 26665, 1007, 1011, 26875, 1037, 3181, 4530, 2000, 18015, 5416, 3570, 1010, 4307, 2211, 2049, 2353, 3442, 10807, 2095, 2302, 1037, 5166, 2006, 5095, 2004, 2576, 17519, 2075, 11737, 7583, 8069, 2005, 1037, 12170, 26053, 5938, 1998, 6599, 7427, 15933, 2574, 1012, 2096, 1996, 2160, 5115, 5219, 102], [101, 8040, 28600, 2121, 2758, 1057, 1012, 1055, 1012, 5166, 3066, 2079, 3085, 2065, 8398, 12237, 2041, 1997, 2009, 2899, 1006, 26665, 1007, 1011, 4001, 3537, 3003, 8057, 8040, 28600, 2121, 2056, 2006, 4465, 2002, 7164, 2375, 12088, 2071, 3362, 1037, 2460, 1011, 2744, 1057, 1012, 1055, 1012, 5166, 3066, 2011, 5958, 2065, 2343, 6221, 8398, 2106, 2025, 19960, 10362, 1999, 2037, 7566, 102]] ...
Key: token_type_ids
Value: [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

Create a PyTorch Dataset
Convert the tokenized inputs into a PyTorch dataset:

In [15]:
import torch

In [16]:
class FakeNewsDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping from field name to a per-example sequence of
            values; anything exposing ``.items()`` whose values can be indexed.
        labels: Per-example labels, indexable by position.
    """

    def __init__(self,encodings,labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, index):
        """Return one example as a dict of tensors, with its label under 'labels'."""
        sample = {}
        for field_name, field_values in self.encodings.items():
            sample[field_name] = torch.tensor(field_values[index])
        sample['labels'] = torch.tensor(self.labels[index])
        return sample

# Wrap each split's encodings and labels in a dataset object
train_dataset, test_dataset = (
    FakeNewsDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (test_encodings, test_labels),
    )
)


Load Pretrained BERT Model for Sequence Classification
We'll use `AutoModelForSequenceClassification`, which resolves to `BertForSequenceClassification` for this checkpoint:

In [17]:
# Load the pretrained checkpoint with a two-class sequence-classification head
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Disabled cell: everything below is one bare triple-quoted string, so nothing
# here executes. It keeps a DistilBERT classifier as an alternative to the
# bert-tiny model loaded in the active cell.
""" from transformers import DistilBertForSequenceClassification

# load the model
model =DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased',num_labels=2) """

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training the Model

In [24]:
from transformers import Trainer, TrainingArguments

# Hyperparameters and bookkeeping options for the Hugging Face Trainer.
training_args = TrainingArguments(
    # Checkpoints go to a project-relative folder. The previous value '/'
    # pointed at the filesystem (or drive) root, which is usually not writable
    # and should never be filled with training artifacts.
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    eval_strategy="epoch",
    save_strategy="epoch",  # must match eval_strategy for load_best_model_at_end
    logging_dir='./logs',   # same reasoning as output_dir: keep logs out of '/'
    logging_steps=10,
    # Restore the checkpoint with the highest "accuracy" (the key returned by
    # compute_metrics) once training finishes.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True
)

from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Args:
        eval_pred: A pair ``(logits, labels)``; the predicted class for each
            example is the index of its largest logit along the last axis.

    Returns:
        A dict with the single key ``'accuracy'``.
    """
    logits, labels = eval_pred
    predicted_classes = torch.tensor(logits).argmax(dim=-1)
    accuracy = accuracy_score(labels, predicted_classes)
    return {'accuracy': accuracy}

# Gather the Trainer inputs in one place before constructing it.
# NOTE(review): the test split doubles as the evaluation set, so it also drives
# best-checkpoint selection; a separate validation split would keep the test
# score unbiased.
trainer_kwargs = {
    'model': model,
    'args': training_args,
    'train_dataset': train_dataset,
    'eval_dataset': test_dataset,
    'compute_metrics': compute_metrics,
}
trainer = Trainer(**trainer_kwargs)

# Fine-tune the model; the returned training summary is displayed as the cell output
trainer.train()




Epoch,Training Loss,Validation Loss,Accuracy
1,0.003,0.01501,0.997475
2,0.0362,0.000932,1.0
3,0.0008,0.001175,0.999495




TrainOutput(global_step=1485, training_loss=0.009167637690111544, metrics={'train_runtime': 99.7494, 'train_samples_per_second': 238.197, 'train_steps_per_second': 14.887, 'total_flos': 3773346508800.0, 'train_loss': 0.009167637690111544, 'epoch': 3.0})

In [25]:
# Score the trained model on the dataset given to the Trainer as eval_dataset
eval_result = trainer.evaluate()
test_accuracy = eval_result['eval_accuracy']
print(f"Test Accuracy: {test_accuracy:.4f}")



Test Accuracy: 1.0000


Make Predictions on New Data

In [26]:
new_text = ["The World Health Organization declared COVID-19 a global pandemic in March 2020."]

# Tokenize the text with the same truncation length (64) used for the training
# data, and move the tensors to the model's device: the Trainer may have placed
# the model on an accelerator, and CPU inputs would then raise a device mismatch.
inputs = tokenizer(new_text, return_tensors="pt", truncation=True, padding=True, max_length=64)
inputs = inputs.to(model.device)

# Make prediction
model.eval()           # switch off dropout so the prediction is deterministic
with torch.no_grad():  # inference only: skip building the autograd graph
    outputs = model(**inputs)
predictions = torch.argmax(outputs.logits, dim=-1)

# Map prediction to label (matches the 'Fake' -> 0, 'Real' -> 1 training encoding)
label_map = {0: "FAKE", 1: "REAL"}
predicted_label = label_map[predictions.item()]
print(f"Predicted Label: {predicted_label}")

Predicted Label: REAL
