In [1]:
import inspect

import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm





In [2]:
# Load dataset
def load_data(filepath, text_col="text", label_col="label"):
    """Read a CSV file and return its text and label columns as Python lists.

    Args:
        filepath: Location of the CSV file; passed straight to ``pd.read_csv``.
        text_col: Name of the column holding the input text (default ``"text"``).
        label_col: Name of the column holding the labels (default ``"label"``).

    Returns:
        A ``(texts, labels)`` tuple of two lists, one entry per CSV row.

    Raises:
        KeyError: If ``text_col`` or ``label_col`` is not a column of the file.
    """
    df = pd.read_csv(filepath)

    # Check up front so the error names the file and every missing column,
    # instead of the bare column-name KeyError pandas would raise.
    missing = [col for col in (text_col, label_col) if col not in df.columns]
    if missing:
        raise KeyError(f"{filepath!r} is missing required column(s): {missing}")

    return df[text_col].tolist(), df[label_col].tolist()

In [3]:
# Custom Dataset class
class FakeNewsDataset(Dataset):
    """Map-style dataset that tokenizes one text per item, on access.

    Each item is a dict with the keys ``"input_ids"``, ``"attention_mask"``
    and ``"labels"``.

    Args:
        texts: Indexable, sized collection of input texts.
        labels: Indexable, sized collection of labels, aligned with ``texts``.
        tokenizer: Callable invoked as
            ``tokenizer(text, padding='max_length', truncation=True,
            max_length=..., return_tensors='pt')``; its result must expose
            ``"input_ids"`` and ``"attention_mask"`` tensors.
        max_length: Length every example is padded/truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        # Fail at construction time: a mismatch would otherwise only show up
        # as an IndexError deep inside a training run (or silently ignore
        # surplus labels).
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples (one per text)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at ``idx`` and return it together with its label."""
        encoding = self.tokenizer(
            self.texts[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        # squeeze(0) removes the leading dimension of the tokenizer output
        # (assumed to be a size-1 batch axis for a single text - TODO confirm
        # for non-Hugging-Face tokenizers).
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long),
        }

In [5]:
# Build the tokenizer and the 2-class classification model from the same
# pretrained checkpoint name so the two cannot drift apart.
pretrained_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(pretrained_name)
model = BertForSequenceClassification.from_pretrained(
    pretrained_name,
    num_labels=2,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Load dataset with sampling
SAMPLE_FRAC = 0.1  # Use only 10% data
SAMPLE_SEED = 42   # Fixed seed so the same rows are drawn on every run


def _load_sampled_split(csv_path, frac=SAMPLE_FRAC, seed=SAMPLE_SEED):
    """Read ``csv_path``, draw a reproducible random sample, drop incomplete rows.

    Args:
        csv_path: CSV file with ``text`` and ``label`` columns.
        frac: Fraction of rows to keep.
        seed: ``random_state`` passed to ``DataFrame.sample``.

    Returns:
        The sampled DataFrame without rows whose ``text`` or ``label`` is missing.
    """
    sampled = pd.read_csv(csv_path).sample(frac=frac, random_state=seed)
    # Drop AFTER sampling so the row selection is unchanged for complete data;
    # a row with a missing text or label cannot become a valid training example.
    return sampled.dropna(subset=["text", "label"])


train_df = _load_sampled_split("../train.csv")
test_df = _load_sampled_split("../test.csv")

train_texts, train_labels = train_df["text"].tolist(), train_df["label"].tolist()
test_texts, test_labels = test_df["text"].tolist(), test_df["label"].tolist()


In [7]:
# Wrap each split in a FakeNewsDataset; both share the same tokenizer and the
# dataset's default max_length.
train_dataset = FakeNewsDataset(
    texts=train_texts,
    labels=train_labels,
    tokenizer=tokenizer,
)
test_dataset = FakeNewsDataset(
    texts=test_texts,
    labels=test_labels,
    tokenizer=tokenizer,
)

In [7]:
%pip install accelerate>=0.26.0

^C
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [8]:
# Training arguments
# transformers renamed `evaluation_strategy` to `eval_strategy`; older releases
# only know the former and newer ones reject it. Pick whichever keyword the
# installed TrainingArguments accepts so this cell runs on both.
_eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./bert_model",        # checkpoints are written here
    save_strategy="epoch",            # checkpoint once per epoch
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    **{_eval_strategy_kwarg: "epoch"},  # evaluate once per epoch
)



In [9]:
# Wire the model, the training configuration and both data splits into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)

In [None]:
# Train model
# Starts the training run on the `trainer` object built in an earlier cell.
# As the last expression of the cell, the call's return value is displayed.
trainer.train()

Epoch,Training Loss,Validation Loss


In [5]:
from pathlib import Path

from transformers import AutoModelForSequenceClassification

# Checkpoint directory to export, and where the exported copy is written.
checkpoint = "../content/bert_model/checkpoint-696"  # Adjust this if needed
final_model_dir = "./final_model"

# Fail early with a clear message: a non-existent local path passed to
# from_pretrained is otherwise reported as a confusing model-lookup error.
if not Path(checkpoint).is_dir():
    raise FileNotFoundError(f"Checkpoint directory not found: {checkpoint!r}")

model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# Save the model (config + weights). NOTE(review): the weights file name
# depends on the installed transformers version (e.g. model.safetensors or
# pytorch_model.bin) - confirm before relying on a specific file.
model.save_pretrained(final_model_dir)

print("✅ Model and weights saved successfully.")


✅ Model and weights saved successfully.
