# K-Means Email Spam Detection

K-Means is a clustering algorithm that learns to identify relevant features in a dataset without use of labels. We can use this for binary classification

Note: This notebook was run on kaggle

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
import numpy as np
from statistics import mode
from matplotlib import pyplot as plt

In [2]:
# Load your training data into a pandas DataFrame
train_df = pd.read_csv("/kaggle/input/email-spam/train.csv") 
train_df.rename(columns={'spam': 'labels'}, inplace=True)
train_df = train_df[['text', 'labels']]
train_df.head()

Unnamed: 0,text,labels
0,subject institute international finance annual...,0
1,subject mortgage even worst credit zwzm detail...,1
2,subject partnership mr edward moko independenc...,1
3,subject de la part de enfants ama rue de marty...,1
4,subject synfuel option valuation lenny believe...,0


In [3]:
# Load your training data into a pandas DataFrame
test_df = pd.read_csv("/kaggle/input/email-spam/test.csv") 
test_df.rename(columns={'spam': 'labels'}, inplace=True)
test_df = test_df[['text', 'labels']]
test_df.head()

Unnamed: 0,text,labels
0,subject perfect logo charset koi r thinking br...,1
1,subject storage model security stinson added t...,0
2,subject wall street micro news report homeland...,1
3,subject logo stationer website design much lt ...,1
4,subject video conference ross mcintyre vince r...,0


In [4]:
X_train = train_df['text']
y_train = train_df['labels']

X_test = test_df['text']
y_test = test_df['labels']

### Use Vectorizor to extract text features

In [5]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)

# Fit and transform the training data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Transform the test data using the same vectorizer
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [6]:
kmeans = KMeans(n_clusters=2)
kmeans.fit(X_test_tfidf)



In [7]:
# Predict the samples to a cluster
y_train_pred = kmeans.predict(X_train_tfidf)
y_test_pred = kmeans.predict(X_test_tfidf)

In [8]:
# Map the clusters to the corresponding classification
class_mapping = {}
for i, center in enumerate(kmeans.cluster_centers_):
    # Calculate Euclidean Distance of each point to the center
    distances = np.linalg.norm(center - X_train_tfidf, axis=1)
    
    # Get indices of the 5 closest points to each center
    closest_points_indices = np.argsort(distances)[:5]  
    closest_classes = y_train[closest_points_indices]
    
    # Get the most common class among the closest points
    class_mapping[i] = mode(closest_classes)
    print(f"Center {i} maps to class {class_mapping[i]}")

y_train_pred_mapped = np.array([class_mapping[label] for label in y_train_pred])
y_test_pred_mapped = np.array([class_mapping[label] for label in y_test_pred])


Center 0 maps to class 0
Center 1 maps to class 1


### Results

In [9]:
# Get Train Set Results

# Evaluate the performance of the classifier
accuracy = accuracy_score(y_train, y_train_pred_mapped)
conf_matrix = confusion_matrix(y_train, y_train_pred_mapped)
class_report = classification_report(y_train, y_train_pred_mapped)
f1 = f1_score(y_train, y_train_pred_mapped)

print("Train Set Results:")
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

Train Set Results:
Accuracy: 0.7416
F1 Score: 0.7557655954631379
Confusion Matrix:
[[1709 1291]
 [   1 1999]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.57      0.73      3000
           1       0.61      1.00      0.76      2000

    accuracy                           0.74      5000
   macro avg       0.80      0.78      0.74      5000
weighted avg       0.84      0.74      0.74      5000



In [10]:
# Get Test Set Results

# Evaluate the performance of the classifier
accuracy = accuracy_score(y_test, y_test_pred_mapped)
conf_matrix = confusion_matrix(y_test, y_test_pred_mapped)
class_report = classification_report(y_test, y_test_pred_mapped)
f1 = f1_score(y_test, y_test_pred_mapped)

print("Train Set Results:")
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

Train Set Results:
Accuracy: 0.7920353982300885
F1 Score: 0.8278388278388279
Confusion Matrix:
[[ 66  47]
 [  0 113]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.58      0.74       113
           1       0.71      1.00      0.83       113

    accuracy                           0.79       226
   macro avg       0.85      0.79      0.78       226
weighted avg       0.85      0.79      0.78       226

