In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Fetch every NLTK resource the notebook relies on in one place:
#   - 'punkt' / 'punkt_tab': tokenizer data for word_tokenize (recent NLTK
#     releases look for 'punkt_tab'; older ones use 'punkt')
#   - 'stopwords': the stopword lists read when building the stopword set
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Toy corpus: five spam-like messages and five legitimate ("ham") messages.
spam_emails = [
    "Congratulations! You won a $1000 gift card. Click to claim!",
    "URGENT: Your account needs verification. Confirm now!",
    "Make money fast! Work from home. Earn $5000 monthly!",
    "Free iPhone! Just pay shipping. Limited time offer!",
    "Package delivery failed. Click link to reschedule.",
]

ham_emails = [
    "Hi John, let's meet for lunch tomorrow at 12 PM.",
    "Team meeting scheduled for Friday at 3 PM.",
    "Your Amazon order has been shipped and will arrive tomorrow.",
    "Monthly project report attached. Please review.",
    "Reminder: Doctor's appointment on Monday at 10 AM.",
]

# Spam first, then ham; the labels are generated so they line up with the
# emails index-for-index.
emails = spam_emails + ham_emails
labels = ['spam'] * len(spam_emails) + ['ham'] * len(ham_emails)

In [None]:
stemmer = PorterStemmer()
# Read the stopword list through `nltk.corpus` rather than through the bare
# `stopwords` name imported at the top: the assignment below rebinds
# `stopwords` to a plain set, so `stopwords.words(...)` would raise
# AttributeError ('set' object has no attribute 'words') the second time this
# cell is run. Going through the module keeps the cell safe to re-run while
# later cells can keep using `stopwords` as the set of English stopwords.
stopwords = set(nltk.corpus.stopwords.words('english'))

In [None]:
def preprocess_text(text):
    """Normalize one email into a space-separated string of stemmed tokens.

    The text is lowercased and tokenized with ``word_tokenize``; tokens found
    in the module-level ``stopwords`` collection are dropped and the remaining
    ones are stemmed with the module-level ``stemmer``.

    Args:
        text: Raw email text.

    Returns:
        The surviving stemmed tokens joined by single spaces.
    """
    tokens = word_tokenize(text.lower())
    stemmed_tokens = []
    for token in tokens:
        if token in stopwords:
            continue
        stemmed_tokens.append(stemmer.stem(token))
    return ' '.join(stemmed_tokens)


In [None]:
# Make sure the 'punkt_tab' tokenizer data is present before preprocess_text
# (which calls word_tokenize) runs for the first time.
nltk.download('punkt_tab')
# One cleaned string per email, in the same order as `emails`.
preprocessed_emails = list(map(preprocess_text, emails))

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
vectorizer = CountVectorizer()
x = vectorizer.fit_transform(preprocessed_emails)

In [None]:
# Hold out 30% of the emails for evaluation. Stratifying on the labels keeps
# the spam/ham ratio the same in the train and test splits; with only ten
# samples an unstratified draw can leave the training set skewed towards one
# class, which makes the reported accuracy even noisier than it already is.
X_train, X_test, y_train, y_test = train_test_split(
    x, labels, test_size=0.3, random_state=42, stratify=labels
)

# Multinomial Naive Bayes (default settings) trained on the token counts.
model = MultinomialNB()
model.fit(X_train, y_train)

# Accuracy on the held-out emails.
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.3333333333333333


In [None]:
# Unseen messages to classify with the trained model.
new_emails = [
    "You won a prize! Claim your free gift now!",
    "Meeting rescheduled to next week, please update calendar.",
    "Earn cash quickly with this amazing opportunity!"
]

# Apply the same preprocessing as for the training emails and encode with the
# already-fitted vectorizer (transform only, so the vocabulary is unchanged).
new_emails_processed = list(map(preprocess_text, new_emails))
new_X = vectorizer.transform(new_emails_processed)

# Predicted label and per-class probabilities for every message.
predictions = model.predict(new_X)
probabilities = model.predict_proba(new_X)

print("\nPredictions for new emails:")
for email, predicted_label, class_probs in zip(new_emails, predictions, probabilities):
    print(f"Email: {email}")
    print(f"Prediction: {predicted_label}")
    # Confidence is the largest of the per-class probabilities for this email.
    print(f"Confidence: {max(class_probs):.4f}")
    print("---")


Predictions for new emails:
Email: You won a prize! Claim your free gift now!
Prediction: spam
Confidence: 0.8690
---
Email: Meeting rescheduled to next week, please update calendar.
Prediction: ham
Confidence: 0.5466
---
Email: Earn cash quickly with this amazing opportunity!
Prediction: spam
Confidence: 0.6948
---


In [None]:
feature_names = vectorizer.get_feature_names_out()

# Rows of feature_log_prob_ follow the order of model.classes_, so look the
# spam row up by its label instead of hard-coding its position.
spam_class_index = list(model.classes_).index('spam')
# Log-probabilities of each token given the spam class: values are <= 0, and
# closer to 0 means the token is more frequent in the spam training emails.
# Note this ranks by frequency within spam only; it does not contrast with ham.
spam_probs = model.feature_log_prob_[spam_class_index]

# Ten tokens with the highest spam log-probability, strongest first.
top_spam_indices = spam_probs.argsort()[-10:][::-1]
print("\nTop spam indicators:")
for idx in top_spam_indices:
    print(f"{feature_names[idx]}: {spam_probs[idx]:.4f}")


Top spam indicators:
click: -3.3081
ship: -3.7136
work: -3.7136
money: -3.7136
packag: -3.7136
offer: -3.7136
time: -3.7136
monthli: -3.7136
pay: -3.7136
claim: -3.7136
