In [1]:
import pandas as pd  
import numpy as np  
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split  
from sklearn.svm import SVC, LinearSVC  
from sklearn.neighbors import KNeighborsClassifier  
from sklearn import metrics  
from sklearn.preprocessing import StandardScaler  
from sklearn.feature_extraction.text import TfidfVectorizer  


In [2]:
df = pd.read_csv('emails.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5172 entries, 0 to 5171
Columns: 3002 entries, Email No. to Prediction
dtypes: int64(3001), object(1)
memory usage: 118.5+ MB


In [4]:
df.head()

Unnamed: 0,Email No.,the,to,ect,and,for,of,a,you,hou,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,Email 1,0,0,1,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Email 2,8,13,24,6,6,2,102,1,27,...,0,0,0,0,0,0,0,1,0,0
2,Email 3,0,0,1,0,0,0,8,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Email 4,0,5,22,0,5,1,51,2,10,...,0,0,0,0,0,0,0,0,0,0
4,Email 5,7,6,17,1,5,2,57,0,9,...,0,0,0,0,0,0,0,1,0,0


In [5]:
df.drop(columns=['Email No.'], inplace=True)

In [6]:
df.isna().sum()

the           0
to            0
ect           0
and           0
for           0
             ..
military      0
allowing      0
ff            0
dry           0
Prediction    0
Length: 3001, dtype: int64

In [7]:
df.describe()

Unnamed: 0,the,to,ect,and,for,of,a,you,hou,in,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
count,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,...,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0
mean,6.640565,6.188128,5.143852,3.075599,3.12471,2.62703,55.517401,2.466551,2.024362,10.600155,...,0.005027,0.012568,0.010634,0.098028,0.004254,0.006574,0.00406,0.914733,0.006961,0.290023
std,11.745009,9.534576,14.101142,6.04597,4.680522,6.229845,87.574172,4.314444,6.967878,19.281892,...,0.105788,0.199682,0.116693,0.569532,0.096252,0.138908,0.072145,2.780203,0.098086,0.453817
min,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,1.0,0.0,1.0,0.0,12.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3.0,3.0,1.0,1.0,2.0,1.0,28.0,1.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,8.0,7.0,4.0,3.0,4.0,2.0,62.25,3.0,1.0,12.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
max,210.0,132.0,344.0,89.0,47.0,77.0,1898.0,70.0,167.0,223.0,...,4.0,7.0,2.0,12.0,3.0,4.0,3.0,114.0,4.0,1.0


In [8]:
# Extracting the independent variables (features) and dependent variable (target)
X = df.iloc[:, :-1]  
y = df.iloc[:, -1]   

In [9]:
print(X.shape, y.shape)


(5172, 3000) (5172,)


In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=8)


In [11]:
# Text vectorization using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
X_train = vectorizer.fit_transform(X_train.astype('str'))  # Convert training data into TF-IDF features
X_test = vectorizer.transform(X_test.astype('str'))  # Convert test data into TF-IDF features


In [12]:
# Convert the sparse matrix from TF-IDF to dense arrays (necessary for scaling)
X_train_dense = X_train.toarray()
X_test_dense = X_test.toarray()

In [13]:
# Feature scaling using StandardScaler (standardize the data)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_dense)  # Scaling the training features
X_test_scaled = scaler.transform(X_test_dense)  # Scaling the test features


In [14]:
print("Scaled X_train shape:", X_train_scaled.shape)
print("Scaled X_test shape:", X_test_scaled.shape)

Scaled X_train shape: (3000, 1000)
Scaled X_test shape: (3000, 1000)


In [15]:
models = {
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=2),
    "Linear SVM": LinearSVC(random_state=8, max_iter=900000),
    "Polynomial SVM": SVC(kernel="poly", degree=2, random_state=8),
    "RBF SVM": SVC(kernel="rbf", random_state=8),
    "Sigmoid SVM": SVC(kernel="sigmoid", random_state=8)
}

In [16]:
# Evaluate each model
for model_name, model in models.items():
    # Train the model
    model.fit(X_train_scaled, y_train)
    
    # Predict on the test set
    y_pred = model.predict(X_test_scaled)
    
    # Calculate evaluation metrics
    accuracy = metrics.accuracy_score(y_test, y_pred)  # Accuracy score
    precision, recall, f1, _ = metrics.precision_recall_fscore_support(y_test, y_pred, average='binary')  # Precision, Recall, F1-score
    conf_matrix = metrics.confusion_matrix(y_test, y_pred)  # Confusion Matrix

ValueError: Found input variables with inconsistent numbers of samples: [3000, 4396]

In [None]:
# Print the performance results for each model
    print(f"Performance for {model_name}:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")
    print("Confusion Matrix:")
    print(conf_matrix)
    print("-" * 50)  # Separator for clarity

In [18]:
# Import required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

# Load dataset
df = pd.read_csv('emails.csv')

# Display basic information about the dataset
df.info()

# Drop the 'Email No.' column as it's not useful for the model
df.drop(columns=['Email No.'], inplace=True)

# Check for missing values
df.isna().sum()

# Define feature matrix (X) and target vector (y)
X = df.iloc[:, :-1]  # Independent variables (all columns except the last one)
y = df.iloc[:, -1]   # Dependent variable (last column, the target)

# Split the dataset into training and testing sets (85% training, 15% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=8)

# Normalize the feature data (standardization)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the models to evaluate
models = {
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=2),
    "Linear SVM": LinearSVC(random_state=8, max_iter=900000),
    "Polynomial SVM": SVC(kernel="poly", degree=2, random_state=8),
    "RBF SVM": SVC(kernel="rbf", random_state=8),
    "Sigmoid SVM": SVC(kernel="sigmoid", random_state=8)
}

# Evaluate each model
for model_name, model in models.items():
    # Train the model
    model.fit(X_train_scaled, y_train)
    
    # Predict on the test set
    y_pred = model.predict(X_test_scaled)
    
    # Calculate evaluation metrics
    accuracy = metrics.accuracy_score(y_test, y_pred)  # Accuracy score
    precision, recall, f1, _ = metrics.precision_recall_fscore_support(y_test, y_pred, average='binary')  # Precision, Recall, F1-score
    conf_matrix = metrics.confusion_matrix(y_test, y_pred)  # Confusion Matrix
    
    # Print the performance results for each model
    print(f"Performance for {model_name}:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")
    print("Confusion Matrix:")
    print(conf_matrix)
    print("-" * 50)  # Separator for clarity


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5172 entries, 0 to 5171
Columns: 3002 entries, Email No. to Prediction
dtypes: int64(3001), object(1)
memory usage: 118.5+ MB


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Performance for K-Nearest Neighbors:
Accuracy: 0.8814
Precision: 0.8131, Recall: 0.7699, F1-Score: 0.7909
Confusion Matrix:
[[510  40]
 [ 52 174]]
--------------------------------------------------
Performance for Linear SVM:
Accuracy: 0.9369
Precision: 0.8391, Recall: 0.9690, F1-Score: 0.8994
Confusion Matrix:
[[508  42]
 [  7 219]]
--------------------------------------------------
Performance for Polynomial SVM:
Accuracy: 0.7680
Precision: 0.9423, Recall: 0.2168, F1-Score: 0.3525
Confusion Matrix:
[[547   3]
 [177  49]]
--------------------------------------------------
Performance for RBF SVM:
Accuracy: 0.9381
Precision: 0.9944, Recall: 0.7920, F1-Score: 0.8818
Confusion Matrix:
[[549   1]
 [ 47 179]]
--------------------------------------------------
Performance for Sigmoid SVM:
Accuracy: 0.9021
Precision: 0.9167, Recall: 0.7301, F1-Score: 0.8128
Confusion Matrix:
[[535  15]
 [ 61 165]]
--------------------------------------------------


In [None]:
# True Negatives (TN): 510 — These are the emails that are correctly identified as not spam (non-spam).
# False Positives (FP): 40 — These are the emails incorrectly identified as spam when they are not (non-spam emails predicted as spam).
# False Negatives (FN): 52 — These are the emails that are actually spam but incorrectly identified as non-spam (spam emails predicted as non-spam).
# True Positives (TP): 174 — These are the emails that are correctly identified as spam.