# PROJECT | Natural Language Processing Challenge


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from nltk.corpus import stopwords
import re, nltk, string
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
snowball = SnowballStemmer('english')

# Load & explore data


In [102]:
import pandas as pd

file_path = "/Users/madahbar/Documents/IRONHACK/Week 4/PROJECT/project-nlp-challenge/dataset/training_data_lowercase.csv"

# Read with no header and proper separator (tab-separated)
data = pd.read_csv(file_path, encoding="utf-8", engine="python", sep="\t", header=None)

# Rename columns manually
data.columns = ["label", "text"]   # first column = labels (0/1), second = news text

print("Shape:", data.shape)
print(data.head())

# Optional: Check balance
print("\nLabel distribution:")
print(data['label'].value_counts(normalize=True))




Shape: (34152, 2)
   label                                               text
0      0  donald trump sends out embarrassing new year‚s...
1      0  drunk bragging trump staffer started russian c...
2      0  sheriff david clarke becomes an internet joke ...
3      0  trump is so obsessed he even has obama‚s name ...
4      0  pope francis just called out donald trump duri...

Label distribution:
label
0    0.514523
1    0.485477
Name: proportion, dtype: float64


# Stratified Split


In [105]:
# First define features (X) and target (y) from the data
X = data['text']  # Using text column as features
y = data['label'] # Using label column as target

# Split the data into train and validation sets
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X, y,
    test_size=0.15,           # 15% of the data becomes validation
    stratify=y,               # keeps class proportions balanced
    random_state=42
)

print("Train split:", len(X_train_split))
print("Validation split:", len(X_val))


Train split: 29029
Validation split: 5123


# TF-IDF + Logistic Regression


# Preprocess Text


In [106]:
# Clean Text Function

def clean_for_fakenews(text):
    text = text.lower()                                        # 0 Lowercase
    text = re.sub(r"https?://\S+|www\.\S+", " <URL> ", text)   # 1 Replaces URLs with tokens
    text = re.sub(r"@\w+", " <USER> ", text)                   # 2 Replaces user names with tokens
    text = re.sub(r"\d+", " <NUM> ", text)                     # 3 Replaces numbers with tokens
    text = re.sub(r"[^\w\s'<NUM><URL><USER>]", " ", text)      # 4 Replaces punctuation except special tokens
    text = re.sub(r"\s+", " ", text).strip()                   # 5 Replaces extra whitespace
    return text

# We keep cleaning minimal because TF-IDF is robust.
# You can add lemmatization/stopwords later if CV shows gains.

print(stopwords.words('english'))

print("FULL DATA:", data['label'].value_counts(dropna=False))
print("After cleaning (X,y lengths):", len(X), len(y))



['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

# Model


In [107]:
# Clean the text data
X_train = X_train_split.apply(clean_for_fakenews)
X_test = X_val.apply(clean_for_fakenews)
y_train = y_train_split
y_test = y_val

model = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=10000, ngram_range=(1,2))),
    ('clf', LogisticRegression(max_iter=1000, solver='liblinear'))
])

model.fit(X_train, y_train)



0,1,2
,steps,"[('tfidf', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'liblinear'
,max_iter,1000


# Evaluate the model


In [108]:
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
    
# Evaluate
print("\n=== Logistic Regression Results ===")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred):.4f}")
print("\nClassification Report:")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.94      0.95      2636
           1       0.93      0.95      0.94      2487

    accuracy                           0.94      5123
   macro avg       0.94      0.94      0.94      5123
weighted avg       0.94      0.94      0.94      5123

Accuracy: 0.944563732188171

=== Logistic Regression Results ===
Accuracy: 0.9446
F1 Score: 0.9436

Classification Report:

Confusion Matrix:
[[2465  171]
 [ 113 2374]]


# Validate the model


In [110]:
val['clean_text'] = val['title'].fillna('') + ' ' + val['text'].fillna('')
val['clean_text'] = val['clean_text'].apply(clean_for_fakenews)

preds = lr_model.predict(val['clean_text'])
val['label'] = preds

val.to_csv("outputs/predictions.csv", index=False)


# TF-IDF + SVM (LinearSVC)

In [111]:
# =========================
# SVM (TF-IDF) — Full Cell

# Why it’s different (but similar to your LR):

# Same preprocessing and TF-IDF features; only the classifier changes.

# LinearSVC optimizes a hinge loss (margin maximization). It often gets a tiny accuracy bump vs. Logistic Regression on sparse TF-IDF.

# Needs tuning of C; doesn’t output probabilities by default.

# =========================

import os, re, string
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from nltk.corpus import stopwords
from nltk import download


# --- Pipeline + small tuning ---
pipe = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=40000, ngram_range=(1,2), sublinear_tf=True)),
    ('svm', LinearSVC())
])

param_grid = {
    'svm__C': [0.5, 1.0, 2.0],
    'svm__loss': ['hinge', 'squared_hinge']
}

gs = GridSearchCV(pipe, param_grid, cv=3, n_jobs=-1, verbose=1)
gs.fit(X_train, y_train)

print("Best params:", gs.best_params_)
best_svm = gs.best_estimator_


Fitting 3 folds for each of 6 candidates, totalling 18 fits
Best params: {'svm__C': 1.0, 'svm__loss': 'squared_hinge'}


In [112]:
# --- Evaluate ---
y_pred = best_svm.predict(X_test)

# Evaluate
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, digits=4))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.9539
F1 Score: 0.9526

Classification Report:
              precision    recall  f1-score   support

           0     0.9563    0.9541    0.9552      2636
           1     0.9515    0.9538    0.9526      2487

    accuracy                         0.9539      5123
   macro avg     0.9539    0.9539    0.9539      5123
weighted avg     0.9539    0.9539    0.9539      5123


Confusion Matrix:
[[2515  121]
 [ 115 2372]]


In [115]:
# ========================
# DistilBERT fine-tuning —

# Why it’s different:

# Don’t heavy-clean the text. Transformers learn subword patterns; removing punctuation/casing often hurts.

# We do a light merge of fields and normalize whitespace only.

# Tokenization is handled by the model’s tokenizer; we train with Trainer.

# It’s slower than TF-IDF, but tends to reach the highest accuracy.
# ========================


%pip install transformers datasets accelerate torch tf-keras

import os, re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
import torch
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          DataCollatorWithPadding, TrainingArguments, Trainer,
                          pipeline)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

os.makedirs("outputs", exist_ok=True)

MODEL_NAME = "distilbert-base-uncased"  # keep uncased; cleaning remains LIGHT

def light_merge_for_transformer(title, text, subject=None):
    """
    Transformer-friendly: keep punctuation/case (model handles it),
    just remove HTML and normalize whitespace. Add subject tag lightly.
    """
    subj = f" [SUBJ={subject}]" if pd.notna(subject) and str(subject).strip() else ""
    s = f"{title or ''}. {text or ''}"
    s = re.sub(r'<[^>]+>', ' ', s)
    s = re.sub(r'\s+', ' ', s).strip()
    return subj + " " + s

# --- Load ---
# Use existing data variable that's already loaded
# data is already available from previous cells
data['label'] = data['label'].astype(int)
# Add missing columns with empty strings if they don't exist
if 'title' not in data.columns:
    data['title'] = ''
if 'subject' not in data.columns:
    data['subject'] = ''
for c in ['title','text','subject']:
    data[c] = data[c].fillna('')

# --- Merge (LIGHT) ---
data['merged'] = data.apply(lambda r: light_merge_for_transformer(r['title'], r['text'], r['subject']), axis=1)

# --- Split (stratified) ---
X_train, X_test, y_train, y_test = train_test_split(
    data['merged'], data['label'], test_size=0.2, random_state=42, stratify=data['label']
)

train_df = pd.DataFrame({'text': X_train.values, 'label': y_train.values})
test_df  = pd.DataFrame({'text': X_test.values,  'label': y_test.values})

ds = DatasetDict({
    "train": Dataset.from_pandas(train_df, preserve_index=False),
    "test":  Dataset.from_pandas(test_df,  preserve_index=False)
})

# --- Tokenizer & tokenization ---
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_fn(batch):
    return tokenizer(batch["text"], truncation=True, max_length=512)

tokenized = ds.map(tokenize_fn, batched=True, remove_columns=['text'])
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# --- Metrics callback ---
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, preds)
    p, r, f, _ = precision_recall_fscore_support(labels, preds, average='binary', zero_division=0)
    return {"accuracy": acc, "precision": p, "recall": r, "f1": f}

# --- Model ---
id2label = {0: "fake", 1: "real"}
label2id = {"fake": 0, "real": 1}
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=2, id2label=id2label, label2id=label2id
)

# --- Device select ---
if torch.cuda.is_available():
    device = 0                  # CUDA
elif torch.backends.mps.is_available():
    device = "mps"              # Apple Silicon
else:
    device = -1                 # CPU

# --- Training args (good defaults) ---
args = TrainingArguments(
    output_dir="outputs/bert_runs",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    num_train_epochs=3,                  # bump to 4–5 if you have time/compute
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=50,
    fp16=torch.cuda.is_available(),      # mixed precision on CUDA
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()
print(trainer.evaluate())

# --- Inference pipeline for validation ---
clf = pipeline(
    "text-classification",
    model=trainer.model,
    tokenizer=tokenizer,
    device=device,    # 0 for CUDA, "mps" for Apple Silicon, -1 for CPU
    truncation=True
)

# Use existing val DataFrame that's already loaded
# val is already available from previous cells
for c in ['title','text','subject']:
    val[c] = val[c].fillna('')

val['merged'] = val.apply(lambda r: light_merge_for_transformer(r['title'], r['text'], r['subject']), axis=1)

labels = []
batch_size = 64
texts = val['merged'].tolist()
for i in range(0, len(texts), batch_size):
    out = clf(texts[i:i+batch_size])
    for o in out:
        lab = o['label']
        # Map HF labels to 0/1 robustly
        if lab in ('LABEL_0', 'fake'):
            labels.append(0)
        elif lab in ('LABEL_1', 'real'):
            labels.append(1)
        else:
            labels.append(int(lab.split('_')[-1]))

val['label'] = labels
val.to_csv("outputs/predictions_distilbert.csv", index=False)
print("Saved: outputs/predictions_distilbert.csv")




Note: you may need to restart the kernel to use updated packages.


Map: 100%|██████████| 27321/27321 [00:01<00:00, 20175.06 examples/s]
Map: 100%|██████████| 6831/6831 [00:00<00:00, 20349.07 examples/s]
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0719,0.054277,0.984629,0.982277,0.986128,0.984199
2,0.0368,0.083588,0.980823,0.967146,0.99427,0.98052
3,0.0107,0.065885,0.986386,0.983789,0.988239,0.986009




Device set to use mps


{'eval_loss': 0.06588546186685562, 'eval_accuracy': 0.9863855950812472, 'eval_precision': 0.9837886520564395, 'eval_recall': 0.988238841978287, 'eval_f1': 0.9860087257409358, 'eval_runtime': 5.1343, 'eval_samples_per_second': 1330.471, 'eval_steps_per_second': 41.681, 'epoch': 3.0}
Saved: outputs/predictions_distilbert.csv


In [116]:
# =======================================
# MODEL COMPARISON SECTION
# =======================================

from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
import pandas as pd

# ---- 1. Logistic Regression (TF-IDF) ----
# Recreate the original TF-IDF + Logistic Regression model since 'model' was overwritten
lr_model = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=10000, ngram_range=(1,2))),
    ('clf', LogisticRegression(max_iter=1000, solver='liblinear'))
])
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)
acc_lr = accuracy_score(y_test, y_pred_lr)
p_lr, r_lr, f_lr, _ = precision_recall_fscore_support(y_test, y_pred_lr, average='binary')

# ---- 2. SVM (TF-IDF) ----
y_pred_svm = best_svm.predict(X_test)
acc_svm = accuracy_score(y_test, y_pred_svm)
p_svm, r_svm, f_svm, _ = precision_recall_fscore_support(y_test, y_pred_svm, average='binary')

# ---- 3. DistilBERT ----
# For BERT, we can reuse the trainer’s eval results (already computed)
bert_eval = trainer.evaluate()
acc_bert = bert_eval.get("eval_accuracy", np.nan)
p_bert = bert_eval.get("eval_precision", np.nan)
r_bert = bert_eval.get("eval_recall", np.nan)
f_bert = bert_eval.get("eval_f1", np.nan)

# ---- Combine results into a table ----
compare_df = pd.DataFrame({
    "Model": ["TF-IDF + Logistic Regression", "TF-IDF + SVM", "DistilBERT Transformer"],
    "Accuracy": [acc_lr, acc_svm, acc_bert],
    "Precision": [p_lr, p_svm, p_bert],
    "Recall": [r_lr, r_svm, r_bert],
    "F1-Score": [f_lr, f_svm, f_bert]
}).set_index("Model")

# ---- Display nicely ----
print("\n🔍 Model Comparison Summary:\n")
display(compare_df.style
        .format("{:.3f}")
        .background_gradient(cmap="Greens", subset=["Accuracy", "F1-Score"])
        .set_caption("Model Performance on Test Set"))





🔍 Model Comparison Summary:



Unnamed: 0_level_0,Accuracy,Precision,Recall,F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
TF-IDF + Logistic Regression,0.945,0.933,0.954,0.943
TF-IDF + SVM,0.965,0.962,0.966,0.964
DistilBERT Transformer,0.986,0.984,0.988,0.986
