In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# Read the e-mail dataset from 'emails.csv' in the working directory.
data = pd.read_csv('emails.csv')

# Overwrite the 'Prediction' column with the integer codes produced by
# LabelEncoder; the fitted encoder stays available as `le`.
le = LabelEncoder()
data['Prediction'] = le.fit_transform(data['Prediction'])

# Target: the encoded 'Prediction' column.
# Features: every other column except 'Email No.', which is dropped together
# with the target.
y = data['Prediction']
X = data.drop(['Email No.', 'Prediction'], axis=1)

# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Base estimator shared by the hyperparameter searches; seeded so that tree
# construction is reproducible.
rf_classifier = RandomForestClassifier(random_state=42)

# Exhaustive search space: 3 values for each of 4 hyperparameters,
# i.e. 81 candidate combinations.
param_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Score every combination with 5-fold cross-validated accuracy on the
# training split, using all available CPU cores (n_jobs=-1).
grid_search = GridSearchCV(
    estimator=rf_classifier,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

print("Grid Search Best Parameters:", grid_search.best_params_, sep="\n")

# Predict on the held-out test split with the fitted search object and
# report accuracy plus the per-class classification report.
y_pred_grid = grid_search.predict(X_test)
print("\nGrid Search Accuracy:", accuracy_score(y_test, y_pred_grid))
print(
    "\nGrid Search Classification Report:",
    classification_report(y_test, y_pred_grid),
    sep="\n",
)

# Candidate values for the randomized search:
#   n_estimators      -> 50, 60, ..., 200
#   max_depth         -> None (unlimited) or 10, 15, 20, 25, 30
#   min_samples_split -> 2, 5, 10, 15, 20
#   min_samples_leaf  -> 1, 2, 4, 6, 8
random_param_grid = dict(
    n_estimators=list(range(50, 201, 10)),
    max_depth=[None, *range(10, 31, 5)],
    min_samples_split=[2, 5, 10, 15, 20],
    min_samples_leaf=[1, 2, 4, 6, 8],
)

# Sample 20 combinations (seeded for reproducibility) and score each with
# 5-fold cross-validated accuracy on the training split, using all available
# CPU cores (n_jobs=-1).
random_search = RandomizedSearchCV(
    estimator=rf_classifier,
    param_distributions=random_param_grid,
    n_iter=20,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    random_state=42,
)
random_search.fit(X_train, y_train)

print("\nRandom Search Best Parameters:", random_search.best_params_, sep="\n")

# Predict on the held-out test split with the fitted search object and
# report accuracy plus the per-class classification report.
y_pred_random = random_search.predict(X_test)
print("\nRandom Search Accuracy:", accuracy_score(y_test, y_pred_random))
print(
    "\nRandom Search Classification Report:",
    classification_report(y_test, y_pred_random),
    sep="\n",
)


Grid Search Best Parameters:
{'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}

Grid Search Accuracy: 0.9748792270531401

Grid Search Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.98      0.98       739
           1       0.95      0.97      0.96       296

    accuracy                           0.97      1035
   macro avg       0.97      0.97      0.97      1035
weighted avg       0.98      0.97      0.97      1035


Random Search Best Parameters:
{'n_estimators': 110, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_depth': 25}

Random Search Accuracy: 0.9739130434782609

Random Search Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       739
           1       0.95      0.96      0.95       296

    accuracy                           0.97      1035
   macro avg       0.97      0.97      0.97      1035
weigh