# In [None]:
# ------------------------------------------------------------
# EMAIL SPAM DETECTION USING KNN & SVM
# ------------------------------------------------------------
# Aim: Classify emails as Spam (1) or Not Spam (0)
# Steps: Load → Preprocess → Train/Test Split → Train Models → Evaluate
# ------------------------------------------------------------

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Read the e-mail dataset from disk into a DataFrame.
df = pd.read_csv("emails.csv")
# Report (rows, columns) so the size of the loaded data is visible.
print("Dataset Shape:", df.shape)

# Preview the first five rows (five is also head()'s default).
print(df.head(n=5))

# -----------------------------
# 1. PRE-PROCESSING
# -----------------------------
# Feature matrix: every column except the row identifier "Email No." and the
# label column "Prediction".
X = df.drop(['Email No.', 'Prediction'], axis=1)

# Target vector: the "Prediction" column → 1 = Spam, 0 = Not Spam.
y = df['Prediction']

# -----------------------------
# 2. TRAIN-TEST SPLIT
# -----------------------------

'''This code divides the dataset into two subsets using the train_test_split() function.
X_train and y_train contain 80% of the data and are used for training the model, 
while X_test and y_test hold the remaining 20% for evaluating model performance.
The parameter test_size=0.2 specifies the split ratio, and random_state=42 ensures 
reproducibility by generating the same split each time the code runs.'''
# 80% for training, 20% for testing.
# stratify=y keeps the Spam / Not Spam proportion the same in both subsets, so
# the reported test accuracy is not skewed by an unlucky class mix in the split.
# random_state=42 keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# -----------------------------
# 3. MODEL TRAINING
# -----------------------------
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

'''This block initializes and trains two classification models on the training dataset.
KNeighborsClassifier creates a KNN model with n_neighbors=5, meaning the algorithm uses 
the 5 nearest data points to classify an email. The .fit(X_train, y_train) method trains 
the model by learning patterns from the training features and labels.
For SVM, the SVC(kernel='linear') initializes a Support Vector Machine with a linear kernel,
which is suitable for linearly separable text data. The .fit() function trains the SVM by finding 
the best decision boundary (hyperplane) that separates spam and non-spam classes.'''

# KNN with k=5: fit() returns the fitted estimator itself, so construction and
# training are chained into a single expression.
knn_model = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# SVM with a linear kernel, trained on the same features and labels.
svm_model = SVC(kernel='linear').fit(X_train, y_train)

# -----------------------------
# 4. MODEL EVALUATION
# -----------------------------
from sklearn.metrics import accuracy_score

'''The .predict() method is used on X_test to generate predicted class labels for KNN and SVM, stored in knn_pred and svm_pred.
The function accuracy_score() from sklearn.metrics compares the predicted labels with the true labels (y_test) and returns the 
fraction of correctly classified emails (a value between 0 and 1; multiply by 100 for a percentage).
Finally, the accuracy values for both models are printed, enabling a direct performance comparison between KNN and SVM.'''

# Label every test e-mail with each trained model (KNN first, then SVM).
knn_pred, svm_pred = (clf.predict(X_test) for clf in (knn_model, svm_model))

# Report the accuracy of each model on the held-out test set.
print("\n--- MODEL PERFORMANCE ---")
print("KNN Accuracy :", accuracy_score(y_true=y_test, y_pred=knn_pred))
print("SVM Accuracy :", accuracy_score(y_true=y_test, y_pred=svm_pred))

'''KNN and SVM models were used to classify emails into Spam and Not Spam. The dataset 
already contained numerical word frequency features, so both models were trained 
directly on these values. In the author's evaluation, SVM achieved higher accuracy than KNN 
(compare the two accuracy values printed above), which suggests that, on this dataset, 
a linear SVM separates the two classes better than KNN.'''

# -----------------------------
# 5. SHOW SAMPLE PREDICTIONS
# -----------------------------

'''This section creates a DataFrame to compare actual and predicted outputs.
We use .copy() to duplicate X_test and avoid modifying the original data.
New columns ('Actual', 'KNN_Pred', 'SVM_Pred') are added to store ground-truth labels and model predictions.
A label_map dictionary is defined to convert numeric class labels (0, 1) into categorical strings (“Not Spam”, “Spam”), 
and .map() is used to transform column values for better readability. Finally, .head(10) prints the first 10 rows 
of the full test set, enabling a side-by-side comparison of model predictions against actual labels.'''

# Work on a copy of the full test feature table so X_test itself stays untouched.
sample = X_test.copy()

# Numeric class → readable label: 0 → Not Spam, 1 → Spam.
label_map = {0: "Not Spam", 1: "Spam"}

# Attach the ground truth and both models' predictions as readable labels.
# The prediction arrays carry no index, so they are wrapped in a Series that
# shares the test rows' index before being mapped.
sample['Actual'] = y_test.map(label_map)
sample['KNN_Pred'] = pd.Series(knn_pred, index=sample.index).map(label_map)
sample['SVM_Pred'] = pd.Series(svm_pred, index=sample.index).map(label_map)

# Show the first 10 rows for a side-by-side comparison.
print("\nSample Predictions:")
print(sample.head(10))