In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
import re
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV

In [6]:
# Read the email table from the CSV file in the current working directory.
data = pd.read_csv(filepath_or_buffer='emails.csv')

In [7]:
# Preview the first five rows to see the column layout.
print(data.head(n=5))

# Count missing entries per column (isna is the same check as isnull).
missing_per_column = data.isna().sum()
print(missing_per_column)

  Email No.  the  to  ect  and  for  of    a  you  hou  ...  connevey  jay  \
0   Email 1    0   0    1    0    0   0    2    0    0  ...         0    0   
1   Email 2    8  13   24    6    6   2  102    1   27  ...         0    0   
2   Email 3    0   0    1    0    0   0    8    0    0  ...         0    0   
3   Email 4    0   5   22    0    5   1   51    2   10  ...         0    0   
4   Email 5    7   6   17    1    5   2   57    0    9  ...         0    0   

   valued  lay  infrastructure  military  allowing  ff  dry  Prediction  
0       0    0               0         0         0   0    0           0  
1       0    0               0         0         0   1    0           0  
2       0    0               0         0         0   0    0           0  
3       0    0               0         0         0   0    0           0  
4       0    0               0         0         0   1    0           0  

[5 rows x 3002 columns]
Email No.     0
the           0
to            0
ect           

In [8]:
# Discard every row containing at least one missing value
# (axis=0 / how='any' are the defaults, spelled out here for the reader).
data = data.dropna(axis=0, how='any')

In [9]:
# List the dtype of every column to check which ones are numeric.
column_types = data.dtypes
print(column_types)

Email No.     object
the            int64
to             int64
ect            int64
and            int64
               ...  
military       int64
allowing       int64
ff             int64
dry            int64
Prediction     int64
Length: 3002, dtype: object


In [10]:
# Target: the 'Prediction' column.
y = data['Prediction']

# Features: every remaining column except the 'Email No.' row identifier.
X = data.drop(columns=['Email No.', 'Prediction'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Baseline classifier: multinomial Naive Bayes with its default settings.
model = MultinomialNB()

# Fit on the training split only; kept as the last expression so the cell
# still displays the fitted estimator.
model.fit(X=X_train, y=y_train)

In [13]:
# Predict labels for the held-out test rows with the baseline model.
y_pred = model.predict(X=X_test)

In [14]:
# Print accuracy, the per-class report, and the confusion matrix for the
# baseline predictions, in that order.
for label, metric in (
    ("Accuracy:", accuracy_score),
    ("Classification Report:\n", classification_report),
    ("Confusion Matrix:\n", confusion_matrix),
):
    print(label, metric(y_test, y_pred))

Accuracy: 0.9545893719806763
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.95      0.97       739
           1       0.89      0.96      0.92       296

    accuracy                           0.95      1035
   macro avg       0.94      0.96      0.95      1035
weighted avg       0.96      0.95      0.96      1035

Confusion Matrix:
 [[704  35]
 [ 12 284]]


In [15]:
# Candidate values of the Naive Bayes `alpha` hyperparameter to compare.
param_grid = dict(alpha=[0.1, 0.5, 1.0])

# 5-fold cross-validated search that ranks candidates by accuracy.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)

# Search on the training split only, so the test split is not used for tuning.
grid_search.fit(X_train, y_train)

In [16]:
# Show the parameter setting the grid search selected.
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

Best Parameters: {'alpha': 0.1}


In [17]:
# Estimator selected by the grid search.
best_model = grid_search.best_estimator_

# Predict the held-out test rows with the tuned model.
y_pred_best = best_model.predict(X_test)

# Report the same accuracy and per-class metrics as for the baseline model.
for label, metric in (
    ("Accuracy with Best Model:", accuracy_score),
    ("Classification Report with Best Model:\n", classification_report),
):
    print(label, metric(y_test, y_pred_best))

Accuracy with Best Model: 0.957487922705314
Classification Report with Best Model:
               precision    recall  f1-score   support

           0       0.98      0.96      0.97       739
           1       0.90      0.96      0.93       296

    accuracy                           0.96      1035
   macro avg       0.94      0.96      0.95      1035
weighted avg       0.96      0.96      0.96      1035

