In [31]:
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, roc_auc_score

In [32]:
# Read the prepared email dataset (path is relative to the working directory)
# into the DataFrame the later cells operate on.
df = pd.read_csv(filepath_or_buffer="emails_prepared.csv")

In [33]:
# Print a structural summary of the loaded frame (columns, non-null counts,
# dtypes) to see which columns are numeric and which are text before modelling.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9353 entries, 0 to 9352
Data columns (total 32 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   path                      9353 non-null   object 
 1   subject                   9327 non-null   object 
 2   body                      9347 non-null   object 
 3   text_all                  9353 non-null   object 
 4   clean_text                9353 non-null   object 
 5   label                     9353 non-null   object 
 6   label_encoded             9353 non-null   int64  
 7   url_count                 9353 non-null   int64  
 8   email_count               9353 non-null   int64  
 9   money_count               9353 non-null   int64  
 10  num_count                 9353 non-null   int64  
 11  subject_len               9353 non-null   int64  
 12  subject_upper_ratio       9353 non-null   float64
 13  subject_exclaim           9353 non-null   int64  
 14  subject_

In [34]:
# Keep only the model inputs and the encoded target: drop the raw text / string
# columns (object dtype per df.info()) and the unencoded `label`.
# NOTE(review): from_domain / from_tld are presumably string columns as well -
# confirm against the full df.info() listing.
#
# `df` is rebound instead of mutated with inplace=True, and errors="ignore" is
# passed, so this cell is safe to re-run: a second execution is a no-op rather
# than a KeyError on columns that are already gone. Trade-off: a misspelled
# column name in the list is skipped silently.
df = df.drop(
    columns=[
        'path',
        'subject',
        'body',
        'text_all',
        'clean_text',
        'label',
        'from_domain',
        'from_tld',
    ],
    errors="ignore",
)

In [35]:
# Name of the column used as the prediction target (int64 per df.info()).
target = "label_encoded"

In [36]:
# Separate the frame into the target vector and the feature matrix
# (every remaining column except the target).
y = df[target]
X = df.drop(columns=[target])

In [37]:
# Random forest with a fixed seed so repeated runs build the same trees;
# class_weight="balanced" asks scikit-learn to weight classes inversely to
# their frequency in the training data.
model = RandomForestClassifier(
    class_weight="balanced",
    random_state=42,
)

In [45]:
# 7-fold stratified splitter; rows are shuffled before splitting with a fixed
# seed so the fold assignment is reproducible.
skf = StratifiedKFold(
    n_splits=7,
    shuffle=True,
    random_state=53,
)

In [46]:
# Per-fold metric accumulators filled by the cross-validation loop.
f1_scores, auc_scores = [], []

In [47]:
# Stratified k-fold evaluation of `model` on (X, y), reporting F1 and ROC AUC
# per fold and their means.
#
# The score lists are (re)created in this cell so that re-running it starts
# from a clean slate; otherwise scores from an earlier execution would remain
# in the lists and skew the reported means.
f1_scores = []
auc_scores = []

for fold, (tr_idx, val_idx) in enumerate(skf.split(X, y), start=1):
    # The splitter yields positional indices, hence .iloc on both X and y.
    X_train, X_val = X.iloc[tr_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[tr_idx], y.iloc[val_idx]

    # fit is called again for every fold, on that fold's training split only.
    model.fit(X_train, y_train)

    # Hard labels feed F1; the probability column feeds ROC AUC.
    y_pred = model.predict(X_val)
    # Column 1 is taken as the positive-class score
    # (assumes a binary target whose second class is the positive one - TODO confirm).
    y_prob = model.predict_proba(X_val)[:, 1]

    f1 = f1_score(y_val, y_pred)
    auc = roc_auc_score(y_val, y_prob)

    f1_scores.append(f1)
    auc_scores.append(auc)

    print(f"Fold {fold} - F1: {f1:.4f}, AUC: {auc:.4f}")

# Summary across all folds of this run.
print("\nmean")
print(f"F1 mean score : {np.mean(f1_scores):.4f}")
print(f"AUC mean score: {np.mean(auc_scores):.4f}")


Fold 1 - F1: 0.9748, AUC: 0.9950
Fold 2 - F1: 0.9690, AUC: 0.9992
Fold 3 - F1: 0.9838, AUC: 0.9993
Fold 4 - F1: 0.9510, AUC: 0.9963
Fold 5 - F1: 0.9738, AUC: 0.9971
Fold 6 - F1: 0.9596, AUC: 0.9973
Fold 7 - F1: 0.9765, AUC: 0.9988

mean
F1 mean score : 0.9698
AUC mean score: 0.9976
