In [3]:
!pip install torch transformers scikit-learn pandas numpy





In [4]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader
from torch.nn import CrossEntropyLoss
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from transformers import BertTokenizer, BertForSequenceClassification


In [5]:
# Load both CSVs
df_true = pd.read_csv("/content/True.csv")
df_fake = pd.read_csv("/content/Fake.csv")

# Add a label column: 0 for Real, 1 for Fake
df_true['label'] = 0
df_fake['label'] = 1

# Combine datasets
df = pd.concat([df_true, df_fake]).reset_index(drop=True)

# Example: use 'title' + 'text' for best results
df['content'] = df['title'] + " " + df['text']
df['content'] = df['content'].apply(lambda x: str(x).lower())

print(df.head())


                                               title  \
0  As U.S. budget fight looms, Republicans flip t...   
1  U.S. military to accept transgender recruits o...   
2  Senior U.S. Republican senator: 'Let Mr. Muell...   
3  FBI Russia probe helped by Australian diplomat...   
4  Trump wants Postal Service to charge 'much mor...   

                                                text       subject  \
0  WASHINGTON (Reuters) - The head of a conservat...  politicsNews   
1  WASHINGTON (Reuters) - Transgender people will...  politicsNews   
2  WASHINGTON (Reuters) - The special counsel inv...  politicsNews   
3  WASHINGTON (Reuters) - Trump campaign adviser ...  politicsNews   
4  SEATTLE/WASHINGTON (Reuters) - President Donal...  politicsNews   

                 date  label  \
0  December 31, 2017       0   
1  December 29, 2017       0   
2  December 31, 2017       0   
3  December 30, 2017       0   
4  December 29, 2017       0   

                                             cont

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    df['content'],
    df['label'],
    test_size=0.2,
    stratify=df['label'],
    random_state=42
)



In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.to(device)

Using device: cuda


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [11]:
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
weights = torch.tensor(class_weights, dtype=torch.float).to(device)
loss_fn = CrossEntropyLoss(weight=weights)



In [12]:
train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True, max_length=128)

train_dataset = TensorDataset(
    torch.tensor(train_encodings['input_ids']),
    torch.tensor(train_encodings['attention_mask']),
    torch.tensor(y_train.values)
)

train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)



In [14]:
optimizer = AdamW(model.parameters(), lr=2e-5)



In [15]:
model.train()

for epoch in range(2):  # increase if needed
    total_loss = 0
    for batch in train_dataloader:
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1} - Loss: {avg_loss:.4f}")



Epoch 1 - Loss: 0.0091
Epoch 2 - Loss: 0.0030


In [17]:
test_encodings = tokenizer(X_test.tolist(), truncation=True, padding=True, max_length=128)

# Create a TensorDataset and DataLoader for the test set
test_dataset = TensorDataset(
    torch.tensor(test_encodings['input_ids']),
    torch.tensor(test_encodings['attention_mask']),
    torch.tensor(y_test.values)
)

test_dataloader = DataLoader(test_dataset, batch_size=8) # Use the same batch size as training

model.eval()
with torch.no_grad():
    all_preds = []
    all_labels = []
    for batch in test_dataloader:
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())


y_pred = np.array(all_preds)
y_true = np.array(all_labels)

print("Accuracy:", accuracy_score(y_true, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
print("Classification Report:\n", classification_report(y_true, y_pred))

Accuracy: 0.999554565701559
Confusion Matrix:
 [[4283    1]
 [   3 4693]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      4284
           1       1.00      1.00      1.00      4696

    accuracy                           1.00      8980
   macro avg       1.00      1.00      1.00      8980
weighted avg       1.00      1.00      1.00      8980



In [24]:
def predict_article(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits
        pred = torch.argmax(logits, dim=1).item()
    return "✅ Real" if pred == 0 else "🚫 Fake"

print(predict_article("The World Health Organization issues guidelines for post-pandemic recovery plans"))
print(predict_article("Aliens spotted living secretly under the Pacific Ocean, whistleblower claims."))



✅ Real
🚫 Fake


In [19]:
model.save_pretrained("./fine_tuned_bert_news")
tokenizer.save_pretrained("./fine_tuned_bert_news")

!zip -r fine_tuned_bert_news.zip fine_tuned_bert_news


  adding: fine_tuned_bert_news/ (stored 0%)
  adding: fine_tuned_bert_news/config.json (deflated 48%)
  adding: fine_tuned_bert_news/tokenizer_config.json (deflated 75%)
  adding: fine_tuned_bert_news/model.safetensors (deflated 7%)
  adding: fine_tuned_bert_news/vocab.txt (deflated 53%)
  adding: fine_tuned_bert_news/special_tokens_map.json (deflated 42%)


In [20]:
from google.colab import files
files.download("fine_tuned_bert_news.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>