In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [3]:
# ============================================
# CORE PYTHON & DATA HANDLING
# ============================================
import numpy as np
import pandas as pd
import os
import re
from collections import Counter

# ============================================
# VISUALIZATION
# ============================================
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# ============================================
# NLP PREPROCESSING (NLTK)
# ============================================
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize

#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('wordnet')

# ============================================
# CLASSICAL MACHINE LEARNING
# ============================================
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix
)

# ============================================
# STATISTICAL TESTS
# ============================================
from scipy.stats import ttest_ind, mannwhitneyu, chi2_contingency, shapiro

# ============================================
# SPARSE MATRIX HANDLING
# ============================================
from scipy.sparse import hstack

# ============================================
# DEEP LEARNING (TF/KERAS)
# ============================================
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import (
    Embedding, LSTM, GRU, Bidirectional,
    Conv1D, GlobalMaxPooling1D, Dense, Dropout
)


# Hugging Face
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
import evaluate
import torch

# ============================================
# TRANSFORMERS (HUGGINGFACE)
# ============================================
from transformers import (
    DistilBertTokenizerFast,
    TFDistilBertForSequenceClassification
)

# ============================================
# MODEL EXPLAINABILITY
# ============================================
# import shap
# from lime.lime_text import LimeTextExplainer

# ============================================
# UTILITY
# ============================================
import warnings
warnings.filterwarnings("ignore")


In [4]:
## 2. Load cleaned datasets

isot = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/data_clean/clean_isot.csv")
wel  = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/data_clean/clean_welfake.csv")

# Merge datasets
df = pd.concat([isot, wel], ignore_index=True)

# Features and labels
X = df["text_clean"].astype(str)
y = df["label"].astype(int)

X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20,
    random_state=47,
    stratify=y
)

print("Merged shape:", df.shape)
print("Train size:", len(X_train_text))
print("Test size:", len(X_test_text))


Merged shape: (101131, 4)
Train size: 80904
Test size: 20227


In [5]:
# -------------------------
# Tokenizer
# -------------------------
VOCAB_SIZE = 50000
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train_text)

# Convert to sequences
X_train_seq = tokenizer.texts_to_sequences(X_train_text)
X_test_seq  = tokenizer.texts_to_sequences(X_test_text)

# -------------------------
# Padding
# -------------------------
MAX_LEN = 300  # good length for news data

X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding="post", truncating="post")
X_test_pad  = pad_sequences(X_test_seq,  maxlen=MAX_LEN, padding="post", truncating="post")

# -------------------------
# Compute Class Weights
# -------------------------
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(y_train),
    y=y_train
)

class_weights = dict(enumerate(class_weights))

print("Class Weights:", class_weights)


Class Weights: {0: np.float64(0.9726844282004424), 1: np.float64(1.0288940889205413)}


In [9]:
# ============================================================
# Build BiLSTM Model
# ============================================================

EMBED_DIM = 128
LSTM_UNITS = 128

model_bilstm = Sequential([
    Embedding(input_dim=VOCAB_SIZE, output_dim=EMBED_DIM, input_length=MAX_LEN),

    # Bidirectional LSTM
    tf.keras.layers.Bidirectional(LSTM(LSTM_UNITS, return_sequences=False)),

    Dropout(0.3),
    Dense(64, activation="relu"),

    Dropout(0.3),
    Dense(1, activation="sigmoid")
])

# Compile
model_bilstm.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"]
)

model_bilstm.summary()

# ============================================================
# Train the Model
# ============================================================

early_stop = EarlyStopping(
    monitor="val_loss",
    patience=2,
    restore_best_weights=True
)

history_bilstm = model_bilstm.fit(
    X_train_pad,
    y_train,
    validation_split=0.1,
    epochs=5,
    batch_size=128,
    class_weight=class_weights,
    callbacks=[early_stop],
    verbose=1
)

# ============================================================
# Evaluate on Test Set
# ============================================================

pred_probs = model_bilstm.predict(X_test_pad)
y_pred = (pred_probs > 0.5).astype(int)

print("\n=== BiLSTM Results ===")
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))


Epoch 1/5
[1m569/569[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 29ms/step - accuracy: 0.5666 - loss: 0.6479 - val_accuracy: 0.5981 - val_loss: 0.6088
Epoch 2/5
[1m569/569[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 28ms/step - accuracy: 0.5920 - loss: 0.6006 - val_accuracy: 0.5862 - val_loss: 0.6121
Epoch 3/5
[1m569/569[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 28ms/step - accuracy: 0.6113 - loss: 0.5719 - val_accuracy: 0.5674 - val_loss: 0.6307
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 9ms/step

=== BiLSTM Results ===
              precision    recall  f1-score   support

           0       0.57      0.89      0.69     10397
           1       0.70      0.28      0.40      9830

    accuracy                           0.59     20227
   macro avg       0.63      0.58      0.55     20227
weighted avg       0.63      0.59      0.55     20227

Accuracy: 0.5915360656548179


In [10]:
# =======================================================
#  CNN + LSTM Hybrid Model
# =======================================================

def build_cnn_lstm(vocab_size, max_len):
    model = Sequential([
        Embedding(input_dim=vocab_size, output_dim=128, input_length=max_len),

        # 1D CNN layer
        Conv1D(filters=128, kernel_size=5, activation='relu'),

        # Max pooling to reduce sequence size
        GlobalMaxPooling1D(),

        # LSTM block
        tf.keras.layers.Reshape((1, 128)),  # reshaping for LSTM
        LSTM(64, return_sequences=False),

        # Dense layers
        Dense(64, activation='relu'),
        Dropout(0.3),

        Dense(1, activation='sigmoid')
    ])

    model.compile(
        loss="binary_crossentropy",
        optimizer="adam",
        metrics=["accuracy"]
    )

    return model

cnn_lstm_model = build_cnn_lstm(VOCAB_SIZE, MAX_LEN)
cnn_lstm_model.summary()

history_cnn_lstm = cnn_lstm_model.fit(
    X_train_pad, y_train,
    epochs=3,
    batch_size=128,
    validation_split=0.2,
    class_weight=class_weights,
    callbacks=[
        EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
    ]
)

# Evaluation
y_pred_probs = cnn_lstm_model.predict(X_test_pad)
y_pred = (y_pred_probs > 0.5).astype(int)

print("\n=== CNN-LSTM Results ===")
print(classification_report(y_test, y_pred))

acc_cnn_lstm = accuracy_score(y_test, y_pred)
print("Accuracy:", acc_cnn_lstm)


Epoch 1/3
[1m506/506[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 11ms/step - accuracy: 0.5550 - loss: 0.6559 - val_accuracy: 0.5951 - val_loss: 0.5815
Epoch 2/3
[1m506/506[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 10ms/step - accuracy: 0.6109 - loss: 0.5605 - val_accuracy: 0.5954 - val_loss: 0.5803
Epoch 3/3
[1m506/506[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 10ms/step - accuracy: 0.6180 - loss: 0.5377 - val_accuracy: 0.5953 - val_loss: 0.6084
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step

=== CNN-LSTM Results ===
              precision    recall  f1-score   support

           0       0.95      0.23      0.37     10397
           1       0.55      0.99      0.70      9830

    accuracy                           0.60     20227
   macro avg       0.75      0.61      0.54     20227
weighted avg       0.75      0.60      0.53     20227

Accuracy: 0.598160874079201


In [11]:
import os
import numpy as np
from sklearn.model_selection import train_test_split
import evaluate
os.environ["WANDB_DISABLED"] = "true"  # No WandB prompt

# Prepare data for HF
df[['text_clean', 'label']].rename(columns={'text_clean': 'text'}).to_csv('merged_hf.csv', index=False)
data = pd.read_csv('merged_hf.csv')

# Stratified split (rubric-approved: handles imbalance)
train_df, test_df = train_test_split(
    data, test_size=0.2, random_state=47, stratify=data['label']
)

train_ds = Dataset.from_pandas(train_df)
test_ds  = Dataset.from_pandas(test_df)

# Tokenization
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True, max_length=512)

tokenized_train = train_ds.map(tokenize_function, batched=True)
tokenized_test  = test_ds.map(tokenize_function, batched=True)

# Dynamic padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Accuracy metric
accuracy = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

# Model
model = AutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-uncased', num_labels=2
)

# Training arguments (correct parameter names for 2025 versions)
training_args = TrainingArguments(
    output_dir='./distilbert-fake-news',
    num_train_epochs=2,                  # 2 is enough for capstone
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    eval_strategy='epoch',         # Corrected from evaluation_strategy
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    report_to="none",                    # Extra safety - no WandB
    fp16=True,                           # Use GPU mixed precision (faster on Colab)
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train!
trainer.train()

# Final evaluation
results = trainer.evaluate()
print("=== DistilBERT Final Results ===")
print(results)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/80904 [00:00<?, ? examples/s]

Map:   0%|          | 0/20227 [00:00<?, ? examples/s]

Downloading builder script: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5579,0.565966,0.608148
2,0.5409,0.55351,0.608543


=== DistilBERT Final Results ===
{'eval_loss': 0.5535104870796204, 'eval_accuracy': 0.608543036535324, 'eval_runtime': 45.7364, 'eval_samples_per_second': 442.251, 'eval_steps_per_second': 27.658, 'epoch': 2.0}


In [7]:
!pip install transformers datasets evaluate accelerate --quiet

import os
import pandas as pd
import numpy as np
from datasets import Dataset
from sklearn.model_selection import train_test_split
import evaluate

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)

os.environ["WANDB_DISABLED"] = "true"   # Disable W&B

# === Load your cleaned CSV ===
df[['text_clean', 'label']].rename(columns={'text_clean': 'text'}).to_csv("merged_hf.csv", index=False)
data = pd.read_csv("merged_hf.csv")

# === Stratified train/test split ===
train_df, test_df = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
    stratify=data['label']
)

train_ds = Dataset.from_pandas(train_df)
test_ds  = Dataset.from_pandas(test_df)

# === Tokenizer (RoBERTa-LARGE) ===
tokenizer = AutoTokenizer.from_pretrained("roberta-large")

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=512)

tokenized_train = train_ds.map(tokenize_function, batched=True)
tokenized_test  = test_ds.map(tokenize_function, batched=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# === Metrics ===
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return accuracy.compute(predictions=preds, references=labels)

# === Model (RoBERTa-LARGE) ===
model = AutoModelForSequenceClassification.from_pretrained(
    "roberta-large",
    num_labels=2
)

# === Training arguments ===
training_args = TrainingArguments(
    output_dir="./roberta-large-fake-news",
    num_train_epochs=2,
    per_device_train_batch_size=4,         # IMPORTANT for large model VRAM
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,         # Effective batch size = 16
    learning_rate=2e-5,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    fp16=True,
    report_to="none",
)

# === Trainer ===
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# === Train ===
trainer.train()

# === Final Evaluation ===
results = trainer.evaluate()
print("=== RoBERTa-LARGE Final Results ===")
print(results)


Map:   0%|          | 0/80904 [00:00<?, ? examples/s]

Map:   0%|          | 0/20227 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5572,0.545571,0.615761


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5572,0.545571,0.615761
2,0.5411,0.540575,0.616602


=== RoBERTa-LARGE Final Results ===
{'eval_loss': 0.5405745506286621, 'eval_accuracy': 0.6166015721560291, 'eval_runtime': 152.2225, 'eval_samples_per_second': 132.878, 'eval_steps_per_second': 33.221, 'epoch': 2.0}


In [18]:
from google.colab import drive
drive.mount('/content/drive')

!find "/content/drive" -name "*.ipynb"




Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Colab Notebooks/Untitled0.ipynb
/content/drive/MyDrive/Colab Notebooks/Untitled1.ipynb
/content/drive/MyDrive/Colab Notebooks/Untitled2.ipynb
/content/drive/MyDrive/Colab Notebooks/02_feature_engineering_modeling_part2.ipynb
/content/drive/MyDrive/Colab Notebooks/Untitled3.ipynb
/content/drive/MyDrive/Colab Notebooks/Untitled4.ipynb
/content/drive/MyDrive/Colab Notebooks/Untitled5.ipynb
/content/drive/MyDrive/Colab Notebooks/02_Feature_Advanced_Engineering_Modeling.ipynb
