In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
import nltk
import gradio as gr

nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
# Simulate sample loading - replace this with your actual data
data = pd.read_csv('/content/drive/My Drive/emails.csv', header=None, names=['text', 'label'])  # If already combined


def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)  # URLs
    text = re.sub(r'\@w+|\#','', text)  # Mentions & hashtags
    text = re.sub(r'[^\w\s]', '', text)  # Punctuation
    text = re.sub(r'\d+', '', text)  # Numbers
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

data['clean_text'] = data['text'].apply(clean_text)


In [4]:
X = data['clean_text']
y = data['label']
data_balanced = data.groupby('label').filter(lambda x: len(x) > 1)
X = data_balanced['clean_text']
y = data_balanced['label']
X_vec = tfidf.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, stratify=y, random_state=42)

tfidf = TfidfVectorizer()
X_vec = tfidf.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, random_state=42)

model = MultinomialNB()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


NameError: name 'tfidf' is not defined

In [None]:
def predict_spam(email_text):
    cleaned = clean_text(email_text)
    vectorized = tfidf.transform([cleaned])
    prediction = model.predict(vectorized)[0]
    proba = model.predict_proba(vectorized)[0].max()
    label = "Spam" if prediction == 1 else "Not Spam"
    return f"{label} (Confidence: {round(proba * 100, 2)}%)"

interface = gr.Interface(fn=predict_spam,
                         inputs=gr.Textbox(lines=15, placeholder="Paste your email here..."),
                         outputs="text",
                         title="Email Spam Classifier")

interface.launch()


In [None]:
# 📦 Install Gradio if not already
!pip install -q gradio


In [None]:

# 📚 Imports
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
import gradio as gr
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# 📁 Load CSV from Google Drive
data = pd.read_csv('/content/drive/My Drive/emails.csv', header=None, names=['text', 'label'])

# 🧹 Clean text function
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)  # Remove URLs
    text = re.sub(r'\@w+|\#','', text)  # Remove mentions and hashtags
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove digits
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# 🧹 Apply cleaning
data['clean_text'] = data['text'].apply(clean_text)

# ✂️ Features & Labels
X = data['clean_text']
y = data['label']

# 🔢 TF-IDF Vectorization (n-grams help detect spam phrases)
tfidf = TfidfVectorizer(ngram_range=(1,2), stop_words='english', max_features=5000)
X_vec = tfidf.fit_transform(X)

# 🔀 Train-Test Split
# X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, random_state=42)


# 🤖 Train Naive Bayes Classifier
model = MultinomialNB()
model.fit(X_train, y_train)

# 📊 Evaluate
y_pred = model.predict(X_test)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# 📩 Spam Prediction Function
def predict_spam(email_text):
    cleaned = clean_text(email_text)
    vectorized = tfidf.transform([cleaned])
    prediction = model.predict(vectorized)[0]
    proba = model.predict_proba(vectorized)[0].max()
    label = "Spam" if prediction == 1 else "Not Spam"
    return f"{label} (Confidence: {round(proba * 100, 2)}%)"

# 🖼️ Gradio Interface
interface = gr.Interface(
    fn=predict_spam,
    inputs=gr.Textbox(lines=15, placeholder="Paste your email here..."),
    outputs="text",
    title="📧 Email Spam Classifier (MultinomialNB)",
    description="Paste a full email (subject + body) and get a spam prediction."
)

# 🚀 Launch app
interface.launch(debug=True, share=True)
