In [1]:
import nltk
nltk.download('stopwords')
import nltk
nltk.download('punkt')
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

import pandas as pd

# Step 1: Dataset Collection
# Load the labelled messages from spam.csv. The rest of the script reads the
# class label ('ham' / 'spam') from column 'v1' and the raw message text from
# column 'v2'. The file is decoded as ISO-8859-1.
data = pd.read_csv(
    'spam.csv',
    encoding='ISO-8859-1',
)

# Step 2: Data Preprocessing
# English stop words, built lazily on first use and then shared by every call.
# preprocess_text is applied to a whole column, so rebuilding the same set for
# each message would be repeated, loop-invariant work.
_STOP_WORDS = None


def _english_stop_words():
    """Return the cached set of NLTK English stop words, building it once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def preprocess_text(text):
    """Normalise one message into a space-separated string of tokens.

    The text is stripped of ASCII punctuation, lower-cased, tokenised with
    NLTK's ``word_tokenize`` and filtered against the English stop-word list.

    Args:
        text: The raw message. Missing values (``None`` or NaN) are treated
            as an empty message; any other non-string value is converted
            with ``str()`` first.

    Returns:
        The remaining tokens joined by single spaces ('' when nothing is
        left or the input was missing).
    """
    # A missing cell is None/NaN rather than str and has no .translate();
    # NaN is the only value that compares unequal to itself.
    if text is None or text != text:
        return ''
    text = str(text)

    # Remove punctuation and convert to lowercase
    text = text.translate(str.maketrans('', '', string.punctuation)).lower()

    # Tokenize and remove stopwords
    # NOTE(review): punctuation (including apostrophes) is already gone at
    # this point, so any stop-word entries that contain an apostrophe
    # (e.g. contractions) can never match here.
    stop_words = _english_stop_words()
    tokens = word_tokenize(text)
    filtered_text = [word for word in tokens if word not in stop_words]

    return ' '.join(filtered_text)

# Clean every raw message (column 'v2') and store the result next to the
# original text in a new 'processed_text' column.
data['processed_text'] = data['v2'].apply(preprocess_text)


# Step 3: Split Data
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    data['processed_text'],
    data['v1'],
    test_size=0.2,
    random_state=42,
)

# Step 4: Feature Extraction
# The vectorizer is fitted on the training split only; the test split is
# merely transformed with the vocabulary and weights learned there.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Step 5: Model Selection and Training
# fit() returns the estimator itself, so construction and training chain.
model = MultinomialNB().fit(X_train_tfidf, y_train)

# Step 6: Model Evaluation
# Predict labels for the held-out messages, then report overall accuracy and
# the per-class precision / recall / F1 table.
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print(report)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


Accuracy: 0.967713004484305
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98       965
        spam       1.00      0.76      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.97      1115

