# Spam vs Ham classification
Dataset: Enron emails or Kaggle spam dataset.

Instructions:
- Prepare a `spam.csv` under `spam_classification/data/` with columns `text`, `label` (`spam`/`ham`).
- Run the pipelines below (Logistic Regression and Naive Bayes) and report Precision, Recall, and F1 for each model.


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

# Data file: labelled CSV with a `text` column and a `label` column.
csv_path = '../data/spam.csv'

df = pd.read_csv(csv_path)

# Validate the schema with an explicit exception rather than `assert`:
# assertions are stripped when Python runs with -O, which would let a
# malformed CSV fall through to a less helpful KeyError further down.
if not {'text', 'label'}.issubset(df.columns):
    raise ValueError('CSV must contain columns text and label')

# Rows with a missing text or label cannot be used for training or
# evaluation (the TF-IDF vectorizer rejects NaN documents), so drop them
# up front. A CSV without gaps is unaffected.
df = df.dropna(subset=['text', 'label'])

# Hold out 20% of the rows for evaluation. Stratifying on the label keeps the
# class proportions the same in both splits; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

# Classifiers to compare, in the order they will be trained and reported.
_classifiers = [
    ('LogReg', LogisticRegression(max_iter=200)),
    ('NaiveBayes', MultinomialNB()),
]

# One pipeline per classifier, keyed by display name. Every pipeline gets its
# own TF-IDF stage (unigrams and bigrams, min_df=3) followed by the
# classifier, so no fitted state is shared between models.
pipelines = {
    label: Pipeline(steps=[
        ('tfidf', TfidfVectorizer(min_df=3, ngram_range=(1, 2))),
        ('clf', estimator),
    ])
    for label, estimator in _classifiers
}

# Fit each pipeline on the training split, predict on the held-out split and
# keep the plain-text classification report (precision / recall / F1) per model.
reports = {}
for name, pipe in pipelines.items():
    # `fit` returns the fitted pipeline itself, so prediction can be chained.
    y_pred = pipe.fit(X_train, y_train).predict(X_test)
    reports[name] = classification_report(y_test, y_pred, output_dict=False)

# Print one titled section per model, separated by a blank line.
print(
    *(f'== {title} ==\n{body}' for title, body in reports.items()),
    sep='\n\n',
)
