In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline

# Configuration: every value a reader might tune lives here rather than
# being scattered through the cell as literals.
DATA_PATH = 'emails.csv'
ID_COLUMN = 'Email No.'      # dropped from the features (presumably a row identifier -- confirm)
TARGET_COLUMN = 'Prediction'
TEST_SIZE = 0.3
N_ESTIMATORS = 100
CV_FOLDS = 5
RANDOM_STATE = 42            # shared by the split and the forest so reruns reproduce the same numbers

# Load the dataset. read_csv already returns a DataFrame, so no extra
# pd.DataFrame(...) wrap is needed; `df` is kept as an alias of `data`
# in case other cells refer to either name.
data = pd.read_csv(DATA_PATH)
df = data

# Separate features and target: the target column is the label, and every
# column other than the id column and the label is used as a feature.
X = df.drop(columns=[ID_COLUMN, TARGET_COLUMN])
y = df[TARGET_COLUMN]

# Hold out a test split; stratify=y keeps the class proportions the same
# in the training and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y,
)

# Model pipeline: scaling followed by a random forest. Wrapping both steps in
# a Pipeline means the scaler is fitted only on whatever data `fit` receives.
# NOTE(review): tree ensembles are generally insensitive to feature scaling,
# so the scaler is likely unnecessary -- kept to preserve the recorded results.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(n_estimators=N_ESTIMATORS, random_state=RANDOM_STATE)),
])

# Train on the training split only.
pipeline.fit(X_train, y_train)

# Predict on the held-out test split.
y_pred = pipeline.predict(X_test)

# Evaluate the hold-out predictions.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Cross-validation. This runs on the full X, y (not just the training split),
# so these scores are a separate estimate from the hold-out accuracy above
# and the test rows do take part in the folds.
cv_scores = cross_val_score(pipeline, X, y, cv=CV_FOLDS)
print("Cross-Validation Scores:", cv_scores)
print("Mean Cross-Validation Score:", cv_scores.mean())


Accuracy: 0.9735824742268041
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.98      0.98      1102
           1       0.95      0.96      0.95       450

    accuracy                           0.97      1552
   macro avg       0.97      0.97      0.97      1552
weighted avg       0.97      0.97      0.97      1552

Cross-Validation Scores: [0.96618357 0.96135266 0.95357834 0.96711799 0.94390716]
Mean Cross-Validation Score: 0.9584279427017632
