In [None]:
import pandas as pd
import numpy as np
import re
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score



In [None]:
# Read the spam/ham dataset from a CSV file (path is relative to the
# notebook's working directory) into a DataFrame.
file_path = 'spam_ham_dataset.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Preprocessing function
def preprocess(text):
    """Normalize one piece of text before vectorization.

    The text is lowercased, every character that is not an ASCII letter is
    replaced by a space, the result is split on whitespace, each token is
    reduced with the Porter stemmer, and the stems are joined back together
    with single spaces.

    Parameters
    ----------
    text : str or None
        Raw text. ``None`` and float NaN (how pandas represents a missing
        cell) are treated as empty text; any other non-string value is
        converted with ``str`` first.

    Returns
    -------
    str
        Space-separated stemmed tokens, or ``''`` when no letters remain.
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself. Without this
        # guard a missing cell would raise AttributeError on .lower().
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    # Lowercase, blank out digits/punctuation/non-ASCII, then tokenize.
    tokens = re.sub('[^a-zA-Z]', ' ', text.lower()).split()
    if not tokens:
        # Nothing to stem (empty or letter-free input).
        return ''

    # Stemming - reduce each word to its root form.
    stemmer = PorterStemmer()
    return ' '.join(stemmer.stem(token) for token in tokens)

In [None]:
# Run every email body through preprocess() and keep the cleaned text
# in a new column next to the original.
data['processed_text'] = data['text'].map(preprocess)

In [None]:
# Defining the target variable (spam or ham)
y = data['label_num']

# Split the processed text into training and test sets BEFORE extracting
# features. Fitting TF-IDF on the full corpus would let the vocabulary
# selection (max_features / min_df / max_df) and the IDF weights see the
# test documents, which leaks information and inflates the test score.
text_train, text_test, y_train, y_test = train_test_split(
    data['processed_text'], y, test_size=0.2, random_state=0
)

# Feature extraction with TF-IDF
# TF-IDF converts text to a meaningful representation of numbers.
# The vectorizer is fitted on the training text only; the test text is
# transformed with that already-fitted vocabulary and IDF weights.
tfidfconverter = TfidfVectorizer(max_features=1500, min_df=5, max_df=0.7)
X_train = tfidfconverter.fit_transform(text_train).toarray()
X_test = tfidfconverter.transform(text_test).toarray()


In [None]:
# Fit a support vector classifier with a linear kernel on the training
# split. fit() returns the estimator itself, so construction and training
# can be chained.
svm_classifier = SVC(kernel='linear').fit(X_train, y_train)

# Predict labels for the held-out test split.
y_pred = svm_classifier.predict(X_test)


In [None]:
# Evaluate the predictions against the true test labels.
# accuracy: overall fraction of correct predictions
accuracy = accuracy_score(y_test, y_pred)
# class_report: per-class precision / recall / f1 as formatted text
class_report = classification_report(y_test, y_pred)
# conf_matrix: counts of actual vs predicted labels
conf_matrix = confusion_matrix(y_test, y_pred)

# Print each result under its heading, then the accuracy as a percentage.
for heading, result in (
    ("Confusion Matrix:\n", conf_matrix),
    ("\nClassification Report:\n", class_report),
):
    print(heading, result)
print("\nAccuracy: {:.2f}%".format(accuracy * 100))

Confusion Matrix:
 [[717  15]
 [  8 295]]

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.98       732
           1       0.95      0.97      0.96       303

    accuracy                           0.98      1035
   macro avg       0.97      0.98      0.97      1035
weighted avg       0.98      0.98      0.98      1035


Accuracy: 97.78%
