In [1]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load data
data = pd.read_csv('./spam.csv')  # Replace "your_data.csv" with your actual file path

# Pre-processing
def preprocess_text(text):
  """
  Preprocesses text data for spam classification.

  Args:
      text: The text to preprocess.

  Returns:
      The preprocessed text.
  """
  text = text.lower()
  text = text.translate(str.maketrans('', '', string.punctuation)).split()
  stopwords_set = set(stopwords.words('english'))
  stemmer = PorterStemmer()
  text = [stemmer.stem(word) for word in text if word not in stopwords_set]
  return ' '.join(text)

data['Message'] = data['Message'].apply(preprocess_text)

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(data['Message'], data['Category'], test_size=0.3, random_state=42)

# Feature Extraction
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Define Classifiers
classifiers = {
    "Random Forest": RandomForestClassifier(n_jobs=1),
    "Naive Bayes": MultinomialNB()
}

# Train Classifiers
for name, clf in classifiers.items():
  clf.fit(X_train, y_train)

# Evaluate Classifiers
for name, clf in classifiers.items():
  y_pred = clf.predict(X_test)
  accuracy = accuracy_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred, pos_label="spam")  # Assuming "spam" is positive
  recall = recall_score(y_test, y_pred, pos_label="spam")
  f1 = f1_score(y_test, y_pred, pos_label="spam")

  print(f"\n{name} Accuracy:", accuracy)
  print(f"{name} Precision:", precision)
  print(f"{name} Recall:", recall)
  print(f"{name} F1-score:", f1)

# Classify a New Email
def classify_new_email(email_text):
  """
  Classifies a new email as spam or ham.

  Args:
      email_text: The text of the new email.

  Returns:
      A string indicating the classification ("spam" or "ham").
  """
  email_text = preprocess_text(email_text)
  email_corpus = [email_text]
  X_email = vectorizer.transform(email_corpus)

  # Choose the best performing classifier based on your evaluation results
  best_clf = classifiers["Random Forest"]  # Replace with the best performing classifier

  prediction = best_clf.predict(X_email)[0]
  if prediction == 1:  # Assuming "spam" is encoded as 1
    return "spam"
  else:
    return "ham"

new_email = "This is an important work email. Please respond ASAP."
classification = classify_new_email(new_email)
print(f"\nNew Email Classification: {classification}")



Random Forest Accuracy: 0.9760765550239234
Random Forest Precision: 1.0
Random Forest Recall: 0.8214285714285714
Random Forest F1-score: 0.9019607843137255

Naive Bayes Accuracy: 0.9856459330143541
Naive Bayes Precision: 0.9629629629629629
Naive Bayes Recall: 0.9285714285714286
Naive Bayes F1-score: 0.9454545454545454

New Email Classification: ham
