In [1]:
import pandas as pd
import numpy as np
import random
import re
import requests, zipfile
from io import BytesIO
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from scipy.sparse import hstack, csr_matrix
import pickle

# --------------------------
# NLTK stopwords
# --------------------------
try:
    import nltk
    nltk.data.find('corpora/stopwords')
except:
    import nltk
    nltk.download('stopwords')
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

# --------------------------
# Helper functions
# --------------------------
EMOJI_PATTERN = re.compile("[\U0001F300-\U0001F6FF\U0001F900-\U0001F9FF\U0001F1E0-\U0001F1FF]+", flags=re.UNICODE)
def count_emojis(s): return len(EMOJI_PATTERN.findall(str(s)))
def count_digits(s): return sum(c.isdigit() for c in str(s))
def uppercase_ratio(s):
    letters = [c for c in str(s) if c.isalpha()]
    return sum(1 for c in letters if c.isupper())/len(letters) if letters else 0
def tokenize_simple(s): return re.findall(r"\w+", str(s).lower())

# --------------------------
# Feature engineering
# --------------------------
def add_features(df):
    df = df.copy()
    df['char_count'] = df['message'].str.len()
    df['word_count'] = df['message'].apply(lambda s: len(tokenize_simple(s)))
    df['digit_count'] = df['message'].apply(count_digits)
    df['digit_ratio'] = df['digit_count']/df['char_count'].replace(0,1)
    df['emoji_count'] = df['message'].apply(count_emojis)
    df['upper_ratio'] = df['message'].apply(uppercase_ratio)
    df['punct_count'] = df['message'].apply(lambda s: sum(1 for c in str(s) if c in '!?.,:;'))
    df['stopword_count'] = df['message'].apply(lambda s: sum(1 for w in tokenize_simple(s) if w in STOPWORDS))
    return df

# --------------------------
# Load datasets
# --------------------------
SYNTH_PATH = Path(r"C:\Users\Lohith\OneDrive\Desktop\AI COURSE PROJECTS\synthetic-clean.csv")
if not SYNTH_PATH.exists(): raise FileNotFoundError(f"Synthetic dataset not found at {SYNTH_PATH}")
synth_df = pd.read_csv(SYNTH_PATH)
synth_df = synth_df.rename(columns={synth_df.columns[0]:'label', synth_df.columns[1]:'message'})
synth_df['label'] = synth_df['label'].str.lower()
synth_df['message'] = synth_df['message'].astype(str)

USER_PATH = Path(r"C:\Users\Lohith\OneDrive\Desktop\AI COURSE PROJECTS\spam.csv")
if not USER_PATH.exists(): raise FileNotFoundError(f"User dataset not found at {USER_PATH}")
user_df = pd.read_csv(USER_PATH, encoding='latin-1')
if 'message' not in user_df.columns:
    for c in user_df.columns:
        if 'message' in c.lower() or 'text' in c.lower():
            user_df.rename(columns={c:'message'}, inplace=True)
if 'label' not in user_df.columns:
    for c in user_df.columns:
        if user_df[c].astype(str).str.lower().isin(['ham','spam']).all():
            user_df.rename(columns={c:'label'}, inplace=True)
user_df = user_df[['label','message']]
user_df['label'] = user_df['label'].str.lower()
user_df['message'] = user_df['message'].astype(str)

# --------------------------
# Download UCI dataset
# --------------------------
UCI_ZIP_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip'
resp = requests.get(UCI_ZIP_URL, timeout=30)
resp.raise_for_status()
with zipfile.ZipFile(BytesIO(resp.content)) as z:
    with z.open('SMSSpamCollection') as f:
        uci_data = f.read().decode('utf-8')
rows = [line.split('\t',1) for line in uci_data.splitlines() if line.strip()]
uci_df = pd.DataFrame(rows, columns=['label','message'])
uci_df['label'] = uci_df['label'].str.lower()
uci_df['message'] = uci_df['message'].astype(str)

# --------------------------
# Generate OTP + emoji spam
# --------------------------
def make_otp_samples(n=300):
    samples = []
    banks = ['HDFC','SBI','ICICI','Axis','Bank','YourBank']
    for _ in range(n):
        otp = ''.join(str(random.randint(0,9)) for _ in range(random.choice([4,5,6])))
        bank = random.choice(banks)
        if random.random()<0.5:
            s = f"{bank}: Your OTP is {otp}. Do not share this with anyone."
        else:
            s = f"URGENT! Verify your account now using OTP {otp} to avoid suspension. Reply now."
        samples.append(('spam',s))
    return samples

emoji_spams = [
    ('spam',"WIN 🎉🎁! You have won a prize worth $1000. Click http://bit.ly/win-now"),
    ('spam',"Congratulations! 🎉 Claim your ₹5000 cashback now 💵. Reply YES to claim"),
    ('spam',"LIMITED OFFER 💥 Buy 1 get 1 FREE. Visit our site now!")
]

otp_samples = make_otp_samples(400)
synth_otp_emoji_df = pd.DataFrame(otp_samples + emoji_spams, columns=['label','message'])

# --------------------------
# Combine all datasets
# --------------------------
combined = pd.concat([synth_df, user_df, uci_df, synth_otp_emoji_df], ignore_index=True)
combined['message'] = combined['message'].astype(str)

# Balance ham and spam
counts = combined['label'].value_counts()
n_ham, n_spam = counts.get('ham',0), counts.get('spam',0)
if n_spam < n_ham:
    needed = n_ham - n_spam
    spam_pool = combined[combined['label']=='spam']
    sampled = spam_pool.sample(n=needed, replace=True, random_state=42)
    combined = pd.concat([combined, sampled], ignore_index=True)
elif n_ham < n_spam:
    needed = n_spam - n_ham
    ham_pool = combined[combined['label']=='ham']
    sampled = ham_pool.sample(n=needed, replace=True, random_state=42)
    combined = pd.concat([combined, sampled], ignore_index=True)

# --------------------------
# Feature engineering
# --------------------------
combined_feat = add_features(combined)
num_cols = ['char_count','word_count','digit_count','digit_ratio','emoji_count','upper_ratio','punct_count','stopword_count']

# --------------------------
# Train/test split
# --------------------------
X_text = combined_feat['message'].values
y = combined_feat['label'].map({'ham':0,'spam':1}).values
X_train_text, X_test_text, y_train, y_test, train_idx, test_idx = train_test_split(
    X_text, y, np.arange(len(y)), test_size=0.2, random_state=42, stratify=y
)

vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=10000)
X_train_tfidf = vectorizer.fit_transform(X_train_text)
X_test_tfidf = vectorizer.transform(X_test_text)

X_train_num = csr_matrix(combined_feat.iloc[train_idx][num_cols].values)
X_test_num = csr_matrix(combined_feat.iloc[test_idx][num_cols].values)

X_train = hstack([X_train_tfidf, X_train_num])
X_test = hstack([X_test_tfidf, X_test_num])

# --------------------------
# Train Logistic Regression
# --------------------------
clf = LogisticRegression(max_iter=5000, class_weight='balanced')
clf.fit(X_train, y_train)

probs = clf.predict_proba(X_test)[:,1]
preds = (probs >= 0.4).astype(int)

print("\nEvaluation on test set:")
print("Accuracy:", accuracy_score(y_test, preds))
print("ROC AUC:", roc_auc_score(y_test, probs))
print(classification_report(y_test, preds, target_names=['ham','spam']))

# --------------------------
# Prepare batch function
# --------------------------
def prepare_batch(emails):
    df = pd.DataFrame({'message': emails})
    df_feat = add_features(df)
    X_text_batch = vectorizer.transform(df_feat['message'])
    X_num_batch = csr_matrix(df_feat[num_cols].values)
    X_batch = hstack([X_text_batch, X_num_batch])
    return X_batch

# --------------------------
# Batch Prediction
# --------------------------
SPAM_KEYWORDS = ['offer','win','cash','prize','free','click','urgent','limited','buy now','reward','claim']

def predict_spam_batch(emails, clf, threshold=0.2):
    X_batch = prepare_batch(emails)
    probs = clf.predict_proba(X_batch)[:,1]
    preds = []
    for email, prob in zip(emails, probs):
        email_lower = email.lower()
        if prob >= threshold or any(k in email_lower for k in SPAM_KEYWORDS):
            preds.append(1)
        else:
            preds.append(0)
    return preds, probs

# --------------------------
# Runtime testing
# --------------------------
emails_batch = [
    "Congratulations! You won a brand new MacBook. Click here to claim 🎉",
    "Dear friend, I hope this email finds you well. Can we meet tomorrow?",
    "URGENT: Verify your bank account immediately to avoid suspension!",
    "Limited time offer! Buy 2 get 1 free on all electronics.",
    "Meeting reminder: Project discussion at 10 AM today.",
    "You have been selected for a $5000 cash prize. Reply now!",
    "Happy birthday! Wishing you a wonderful day filled with joy.",
    "Claim your reward points now before the offer expires 💥",
    "Can you review the attached document and send feedback?",
    "Free tickets to the concert! First come first serve!"
]

true_labels = [1,0,1,1,0,1,0,1,0,1]  # 1=spam, 0=ham

batch_preds, batch_probs = predict_spam_batch(emails_batch, clf)

print("\nBatch Prediction Results:")
for email, pred, prob in zip(emails_batch, batch_preds, batch_probs):
    label = 'spam' if pred==1 else 'ham'
    print(f"\nEmail: {email}")
    print(f"Predicted Label: {label}, Probability: {prob:.3f}")
    print("-"*50)

accuracy = accuracy_score(true_labels, batch_preds)
print(f"\nBatch Accuracy: {accuracy:.3f}")
print("\nClassification Report:")
print(classification_report(true_labels, batch_preds, target_names=['ham','spam']))

# --------------------------
# Save model, vectorizer, and numeric column info
# --------------------------
model_filename = "spam_classifier.pkl"
with open(model_filename, "wb") as f:
    pickle.dump({
        'model': clf,
        'vectorizer': vectorizer,
        'num_cols': num_cols
    }, f)

print(f"\nModel successfully saved to {model_filename}")

# --------------------------
# Load the model later
# --------------------------
with open(model_filename, "rb") as f:
    saved = pickle.load(f)

clf_loaded = saved['model']
vectorizer_loaded = saved['vectorizer']
num_cols_loaded = saved['num_cols']

print("\nModel successfully reloaded and ready for prediction.")



Evaluation on test set:
Accuracy: 0.989464245684824
ROC AUC: 0.9996492553963414
              precision    recall  f1-score   support

         ham       0.99      0.99      0.99      2231
        spam       0.99      0.99      0.99      2230

    accuracy                           0.99      4461
   macro avg       0.99      0.99      0.99      4461
weighted avg       0.99      0.99      0.99      4461


Batch Prediction Results:

Email: Congratulations! You won a brand new MacBook. Click here to claim 🎉
Predicted Label: spam, Probability: 0.918
--------------------------------------------------

Email: Dear friend, I hope this email finds you well. Can we meet tomorrow?
Predicted Label: ham, Probability: 0.010
--------------------------------------------------

Email: URGENT: Verify your bank account immediately to avoid suspension!
Predicted Label: spam, Probability: 0.811
--------------------------------------------------

Email: Limited time offer! Buy 2 get 1 free on all electron

In [2]:
import pandas as pd
from scipy.sparse import hstack, csr_matrix

# --------------------------
# Single email prediction
# --------------------------
def predict_single_email(email, clf, vectorizer, num_cols, threshold=0.2):
    df = pd.DataFrame({'message': [email]})
    df_feat = add_features(df)
    X_text = vectorizer.transform(df_feat['message'])
    X_num = csr_matrix(df_feat[num_cols].values)
    X = hstack([X_text, X_num])

    prob = clf.predict_proba(X)[0,1]
    label = "spam" if prob >= threshold else "ham"
    return label, prob

# --------------------------
# Batch prediction
# --------------------------
def predict_spam_batch(emails, clf, vectorizer, num_cols, threshold=0.2):
    df = pd.DataFrame({'message': emails})
    df_feat = add_features(df)
    X_text_batch = vectorizer.transform(df_feat['message'])
    X_num_batch = csr_matrix(df_feat[num_cols].values)
    X_batch = hstack([X_text_batch, X_num_batch])

    probs = clf.predict_proba(X_batch)[:,1]
    preds = ["spam" if p >= threshold else "ham" for p in probs]

    return pd.DataFrame({
        "Email": emails,
        "Predicted Label": preds,
        "Probability": probs.round(3)
    })

# --------------------------
# Test single email
# --------------------------
email = "Win a free iPhone now! 🎉"
label, prob = predict_single_email(email, clf_loaded, vectorizer_loaded, num_cols_loaded)
print(f"Single Email Test:\nEmail: {email}\nPredicted Label: {label}, Probability: {prob:.3f}\n")

# --------------------------
# Test batch emails
# --------------------------
batch_emails = [
    "Congratulations! You won a brand new MacBook. Click here to claim 🎉",
    "Dear friend, I hope this email finds you well. Can we meet tomorrow?",
    "URGENT: Verify your bank account immediately to avoid suspension!",
    "Meeting reminder: Project discussion at 10 AM today."
]

results = predict_spam_batch(batch_emails, clf_loaded, vectorizer_loaded, num_cols_loaded)
display(results)


Single Email Test:
Email: Win a free iPhone now! 🎉
Predicted Label: spam, Probability: 0.681



Unnamed: 0,Email,Predicted Label,Probability
0,Congratulations! You won a brand new MacBook. ...,spam,0.918
1,"Dear friend, I hope this email finds you well....",ham,0.01
2,URGENT: Verify your bank account immediately t...,spam,0.811
3,Meeting reminder: Project discussion at 10 AM ...,ham,0.004


In [None]:
# --------------------------
# Single email runtime input
# --------------------------
email = input("Enter a single email to classify: ")

label, prob = predict_single_email(email, clf_loaded, vectorizer_loaded, num_cols_loaded)
print(f"\nSingle Email Test:")
print(f"Email: {email}")
print(f"Predicted Label: {label}, Probability: {prob:.3f}")

# --------------------------
# Batch runtime input
# --------------------------
emails_text = input("\nEnter multiple emails separated by ';':\n")
batch_emails = [e.strip() for e in emails_text.split(";") if e.strip()]

if batch_emails:
    print("\nBatch Prediction Results:")
    results = predict_spam_batch(batch_emails, clf_loaded, vectorizer_loaded, num_cols_loaded)
    display(results)
