In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Load dataset
data = pd.read_csv('emails.csv')

In [None]:
# Explore the dataset
print(data.head())

  Email No.  the  to  ect  and  for  of    a  you  hou  ...  connevey  jay  \
0   Email 1    0   0    1    0    0   0    2    0    0  ...         0    0   
1   Email 2    8  13   24    6    6   2  102    1   27  ...         0    0   
2   Email 3    0   0    1    0    0   0    8    0    0  ...         0    0   
3   Email 4    0   5   22    0    5   1   51    2   10  ...         0    0   
4   Email 5    7   6   17    1    5   2   57    0    9  ...         0    0   

   valued  lay  infrastructure  military  allowing  ff  dry  Prediction  
0       0    0               0         0         0   0    0           0  
1       0    0               0         0         0   1    0           0  
2       0    0               0         0         0   0    0           0  
3       0    0               0         0         0   0    0           0  
4       0    0               0         0         0   1    0           0  

[5 rows x 3002 columns]


In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5172 entries, 0 to 5171
Columns: 3002 entries, Email No. to Prediction
dtypes: int64(3001), object(1)
memory usage: 118.5+ MB


In [None]:
data.isnull().any()

Unnamed: 0,0
Email No.,False
the,False
to,False
ect,False
and,False
...,...
military,False
allowing,False
ff,False
dry,False


In [None]:
# Combine relevant columns into a single text column
data['email_content'] = (
data['the'].astype(str) + ' ' +
data['to'].astype(str) + ' ' +
data['ect'].astype(str) + ' ' +
data['and'].astype(str) + ' ' +
data['military'].astype(str) + ' ' +
data['allowing'].astype(str) + ' ' +
data['ff'].astype(str) + ' ' +
data['dry'].astype(str)
)

In [None]:
# Define features and target
X = data['email_content']
y = data['Prediction'] # Make sure this is the correct target column

In [None]:
# Vectorize the text data
vectorizer = TfidfVectorizer()
X_vectorized = vectorizer.fit_transform(X)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42)

In [None]:
# K-Nearest Neighbors
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

In [None]:
# Evaluate KNN
print("KNN Classification Report:\n", classification_report(y_test, y_pred_knn))
print("KNN Confusion Matrix:\n", confusion_matrix(y_test, y_pred_knn))

KNN Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.96      0.82       739
           1       0.32      0.05      0.09       296

    accuracy                           0.70      1035
   macro avg       0.52      0.50      0.45      1035
weighted avg       0.60      0.70      0.61      1035

KNN Confusion Matrix:
 [[707  32]
 [281  15]]


In [None]:
# SVM
svm = SVC()
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)

In [None]:
# Evaluate SVM
print("SVM Classification Report:\n", classification_report(y_test, y_pred_svm))
print("SVM Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svm))

SVM Classification Report:
               precision    recall  f1-score   support

           0       0.72      1.00      0.83       739
           1       0.50      0.01      0.02       296

    accuracy                           0.71      1035
   macro avg       0.61      0.50      0.43      1035
weighted avg       0.65      0.71      0.60      1035

SVM Confusion Matrix:
 [[736   3]
 [293   3]]
