### **Cross Validation Analysis**

In [None]:
import os
import sys
from pathlib import Path
import matplotlib.pyplot as plt

# Locate the project root so the `src` package becomes importable:
#   * as a script, `__file__` exists and the root is two levels above it;
#   * where `__file__` is undefined (NameError), fall back to the parent of
#     the current working directory.
try:
    project_root = Path(__file__).parent.parent
except NameError:
    project_root = Path().resolve().parent

sys.path.append(str(project_root))

# Must come after the sys.path tweak above, otherwise `src` is not found.
from src.utils import config_loader, get_logger, read_processed_data

log = get_logger()
config = config_loader()

# Every model's cross-validation results sit in the directory named by
# config['cv_result'], resolved relative to the project root.
cv_result_dir = os.path.join(project_root, config['cv_result'])
path_svc = os.path.join(cv_result_dir, 'svc_results.csv')
path_knn = os.path.join(cv_result_dir, 'knn_results.csv')
path_rf = os.path.join(cv_result_dir, 'rf_results.csv')
path_xgb = os.path.join(cv_result_dir, 'xgb_results.csv')

# The SVC results are loaded first; the other models are read in their own cells.
cv_result = read_processed_data(path=path_svc, log=log)

In [None]:
# Quick textual overview of the cv_result DataFrame: its shape, then the
# leading rows, summary statistics and per-column missing-value counts.
print("Shape of cv_result:", cv_result.shape)

# (heading, zero-argument callable producing the value to print).
# Callables keep each computation lazy, so it runs only after its heading
# has been printed.
overview_sections = (
    ("\nFirst 5 rows:", lambda: cv_result.head()),
    ("\nSummary statistics:", lambda: cv_result.describe(include='all')),
    ("\nMissing values per column:", lambda: cv_result.isnull().sum()),
)

for heading, compute in overview_sections:
    print(heading)
    print(compute())

In [None]:
# SVC: mean cross-validation score as a function of the regularization
# parameter C -- one figure per metric, each saved as a PNG in the current
# working directory and then displayed.
#
# Each entry: (score column in cv_result,
#              display name used in the title and y-label,
#              tag used in the output file name).
svc_metrics = [
    ('mean_test_accuracy', 'Accuracy', 'accuracy'),
    ('mean_test_f1', 'F1-Score', 'f1score'),
    ('mean_test_roc_auc', 'ROC-AUC', 'roc'),
]

for svc_column, svc_label, svc_tag in svc_metrics:
    plt.figure(figsize=(8, 5))
    plt.plot(cv_result['param_C'], cv_result[svc_column], marker='o')
    plt.xscale('log')  # C is drawn on a logarithmic x-axis
    # Spelling fixed: the label previously read "Regularizaton".
    plt.xlabel('C (Regularization Parameter)')
    plt.title(f"SVC: {svc_label} vs. C")
    plt.ylabel(f'Mean CV {svc_label}')
    plt.grid(True)
    plt.savefig(f'svc_{svc_tag}_vs_C.png', dpi=500)
    plt.show()

In [None]:
# KNN: one figure per cross-validation metric, plotted against the number
# of neighbors; every figure is saved as a PNG and then displayed.
cv_result_knn = read_processed_data(path=path_knn, log=log)

# Score columns and their human-readable names, paired up position by position.
score_columns = (
    'mean_test_accuracy',
    'mean_test_f1',
    'mean_test_recall',
    'mean_test_precision',
    'mean_test_roc_auc',
)
display_names = ('Accuracy', 'F1 Score', 'Recall', 'Precision', 'ROC-AUC')
metrics = list(zip(score_columns, display_names))

for score_col, pretty_name in metrics:
    plt.figure(figsize=(8, 5))
    plt.plot(
        cv_result_knn['param_n_neighbors'],
        cv_result_knn[score_col],
        marker='o',
    )
    plt.title(f'KNN: {pretty_name} vs. Number of Neighbors')
    plt.xlabel('Number of Neighbors (k)')
    plt.ylabel(f'Mean CV {pretty_name}')
    plt.grid(True)
    plt.savefig(f'knn_{score_col}_vs_k.png', dpi=500)
    plt.show()



In [None]:
# XGBoost: train vs. test score curves. For every hyper-parameter column
# listed in `x_params`, one figure per metric is drawn, saved as a PNG in
# the current working directory and then displayed.
cv_result_xgb = read_processed_data(path=path_xgb, log=log)

# Each entry: (short metric name, test-score column, train-score column).
metrics = [
    ('accuracy', 'mean_test_accuracy', 'mean_train_accuracy'),
    ('f1', 'mean_test_f1', 'mean_train_f1'),
    ('recall', 'mean_test_recall', 'mean_train_recall'),
    ('precision', 'mean_test_precision', 'mean_train_precision'),
    ('roc_auc', 'mean_test_roc_auc', 'mean_train_roc_auc')
]

# Hyper-parameter columns used as the x-axis, in the order the figures appear.
x_params = ['param_n_estimators', 'param_max_depth', 'param_learning_rate']

# NOTE(review): rows are plotted exactly as stored in the CSV. If the search
# varied several hyper-parameters at once, multiple rows share the same x
# value and the line will zig-zag -- consider aggregating per x value. Confirm
# against how the results file was produced.
for x_param in x_params:
    # e.g. 'param_n_estimators' -> 'N Estimators'. Computed once so the
    # x-label and the title show the same text (the title used to keep the
    # underscore: 'N_Estimators').
    x_label = x_param.replace('param_', '').replace('_', ' ').title()

    for label, test_metric, train_metric in metrics:
        plt.figure(figsize=(8, 5))
        plt.plot(cv_result_xgb[x_param], cv_result_xgb[test_metric], marker='o', label='Mean Test')
        plt.plot(cv_result_xgb[x_param], cv_result_xgb[train_metric], marker='s', label='Mean Train')
        plt.xlabel(x_label)
        plt.ylabel(label.title())
        plt.title(f'XGBoost: Train vs. Test {label.title()} vs. {x_label}')
        plt.legend()
        plt.grid(True)
        # File name keeps the raw column name, so existing output paths are unchanged.
        plt.savefig(f'xgb_{label}_train_test_vs_{x_param}.png', dpi=500)
        plt.show()



In [None]:
# Random Forest: one figure per cross-validation metric, plotted against
# the number of trees; every figure is saved as a PNG in the current working
# directory and then displayed.
cv_result_rf = read_processed_data(path=path_rf, log=log)

# Each entry: (score column in cv_result_rf, display name for labels/titles).
metrics = [
    ('mean_test_accuracy', 'Accuracy'),
    ('mean_test_f1', 'F1 Score'),
    ('mean_test_recall', 'Recall'),
    ('mean_test_precision', 'Precision'),
    ('mean_test_roc_auc', 'ROC-AUC')
]

for metric, label in metrics:
    plt.figure(figsize=(8, 5))
    plt.plot(cv_result_rf['param_n_estimators'], cv_result_rf[metric], marker='o')
    plt.xlabel('Number of Trees (n_estimators)')
    plt.ylabel(f'Mean CV {label}')
    # Title and file prefix previously said "XGB"/"xgb_" (copy-paste slip),
    # mislabelling these Random Forest figures as XGBoost output.
    plt.title(f'Random Forest: {label} vs. Number of Trees')
    plt.grid(True)
    plt.savefig(f'rf_{metric}_vs_estimators.png', dpi=500)
    plt.show()