# K-Means Email Spam Detection

K-Means is a clustering algorithm that learns to identify relevant features in a dataset without use of labels. We can use this for binary classification

Note: This notebook was run on kaggle

In [43]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
import numpy as np
from statistics import mode
from matplotlib import pyplot as plt

In [44]:
# Load your training data into a pandas DataFrame
train_df = pd.read_csv("/kaggle/input/email-spam/train.csv")
train_df.rename(columns={'spam': 'labels'}, inplace=True)
train_df = train_df[['text', 'labels']]
train_df.head()

Unnamed: 0,text,labels
0,subject institute international finance annual...,0
1,subject mortgage even worst credit zwzm detail...,1
2,subject partnership mr edward moko independenc...,1
3,subject de la part de enfants ama rue de marty...,1
4,subject synfuel option valuation lenny believe...,0


In [45]:
# Load your training data into a pandas DataFrame
test_df = pd.read_csv("test.csv")
test_df.rename(columns={'spam': 'labels'}, inplace=True)
test_df = test_df[['text', 'labels']]
test_df.head()

Unnamed: 0,text,labels
0,subject perfect logo charset koi r thinking br...,1
1,subject storage model security stinson added t...,0
2,subject wall street micro news report homeland...,1
3,subject logo stationer website design much lt ...,1
4,subject video conference ross mcintyre vince r...,0


In [46]:
X_train = train_df['text']
y_train = train_df['labels']

X_test = test_df['text']
y_test = test_df['labels']

### Use Vectorizor to extract text features

In [47]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)

# Fit and transform the training data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Transform the test data using the same vectorizer
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [48]:
kmeans = KMeans(n_clusters=2)
kmeans.fit(X_test_tfidf)



In [49]:
# Predict the samples to a cluster
y_train_pred = kmeans.predict(X_train_tfidf)
y_test_pred = kmeans.predict(X_test_tfidf)

In [50]:
# Map the clusters to the corresponding classification
class_mapping = {}
for i, center in enumerate(kmeans.cluster_centers_):
    # Calculate Euclidean Distance of each point to the center
    distances = np.linalg.norm(center - X_train_tfidf, axis=1)

    # Get indices of the 5 closest points to each center
    closest_points_indices = np.argsort(distances)[:5]
    closest_classes = y_train[closest_points_indices]

    # Get the most common class among the closest points
    class_mapping[i] = mode(closest_classes)
    print(f"Center {i} maps to class {class_mapping[i]}")

y_train_pred_mapped = np.array([class_mapping[label] for label in y_train_pred])
y_test_pred_mapped = np.array([class_mapping[label] for label in y_test_pred])


Center 0 maps to class 0
Center 1 maps to class 1


### Results

In [51]:
# Get Train Set Results

# Evaluate the performance of the classifier
accuracy = accuracy_score(y_train, y_train_pred_mapped)
conf_matrix = confusion_matrix(y_train, y_train_pred_mapped)
class_report = classification_report(y_train, y_train_pred_mapped)
f1 = f1_score(y_train, y_train_pred_mapped)

print("Train Set Results:")
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

Train Set Results:
Accuracy: 0.8612
F1 Score: 0.8520886615515771
Confusion Matrix:
[[2307  693]
 [   1 1999]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.77      0.87      3000
           1       0.74      1.00      0.85      2000

    accuracy                           0.86      5000
   macro avg       0.87      0.88      0.86      5000
weighted avg       0.90      0.86      0.86      5000



In [52]:
# Get Test Set Results

# Evaluate the performance of the classifier
accuracy = accuracy_score(y_test, y_test_pred_mapped)
conf_matrix = confusion_matrix(y_test, y_test_pred_mapped)
class_report = classification_report(y_test, y_test_pred_mapped)
f1 = f1_score(y_test, y_test_pred_mapped)

print("Train Set Results:")
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

Train Set Results:
Accuracy: 0.8938053097345132
F1 Score: 0.904
Confusion Matrix:
[[ 89  24]
 [  0 113]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.79      0.88       113
           1       0.82      1.00      0.90       113

    accuracy                           0.89       226
   macro avg       0.91      0.89      0.89       226
weighted avg       0.91      0.89      0.89       226



In [53]:
predicted_df = test_df.copy()
predicted_df['predicted_spam'] = y_test_pred_mapped
predicted_df['prediction'] = ['Spam' if x == 1 else 'Ham' for x in predicted_df['predicted_spam']]
predicted_df

Unnamed: 0,text,labels,predicted_spam,prediction
0,subject perfect logo charset koi r thinking br...,1,1,Spam
1,subject storage model security stinson added t...,0,1,Spam
2,subject wall street micro news report homeland...,1,1,Spam
3,subject logo stationer website design much lt ...,1,1,Spam
4,subject video conference ross mcintyre vince r...,0,0,Ham
...,...,...,...,...
221,subject sorry see hyatt lobby vince j kaminski...,0,0,Ham
222,subject yyyy know hgh difference hello jm netn...,1,1,Spam
223,subject try ouut hello welcome pharmon content...,1,1,Spam
224,subject department energy deploying corporate ...,0,1,Spam


In [54]:
# View mispredicted emails in testing dataset
mispredictions_df = predicted_df[predicted_df['labels'] != predicted_df['predicted_spam']]
mispredictions_df

Unnamed: 0,text,labels,predicted_spam,prediction
1,subject storage model security stinson added t...,0,1,Spam
7,subject erisk iconference please save e mail c...,0,1,Spam
13,subject ferc soft price cap mean cera conferen...,0,1,Spam
28,subject natural gas storage item vince somethi...,0,1,Spam
46,subject new website exciting conference applic...,0,1,Spam
53,subject wichai narongwanich dear toni please a...,0,1,Spam
61,subject internal guest access enrononline vinc...,0,1,Spam
74,subject garp convention dear garp speaker prog...,0,1,Spam
82,subject amerada hess day rate hedge number att...,0,1,Spam
88,subject new research tool cool tool really bre...,0,1,Spam


In [55]:
with open ("K-means_formatted_example_email_spam_predictions.txt", "w") as predictions_file:
  for i in range(0,50,2):
    pred = "Email: "+ predicted_df['text'][i] + ".\nPrediction: This is a "+ predicted_df['prediction'][i]+ " email.\n"
    print(pred)
    predictions_file.write(pred+'\n')

Email: subject perfect logo charset koi r thinking breathing new life business start revamping front end logo visuai identity loqodentity offer creative custom design logo stationery web site careful hand powerfui marketinq toois wiii bring breath fresh air business make stand among competitor click away future success click see sample artwork check price hot offer.
Prediction: This is a Spam email.

Email: subject wall street micro news report homeland security investment terror attack united state september changed security landscape foreseeable future physical logical security become paramount industry segment especially banking national resource government sector according giga wholly owned subsidiary forrester research worldwide demand information security product service set eclipse b homeland security investment newsletter dedicated providing reader information pertaining investment opportunity lucrative sector know event related homeland security happen lightning speed investor