In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.compose import make_column_transformer
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import GroupKFold
from scipy.stats import ttest_rel
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import GroupShuffleSplit
from statsmodels.stats.contingency_tables import mcnemar

# Load the dataset
file_path = 'HR_data.csv'
df = pd.read_csv(file_path)

# Drop the unnecessary column
df.drop(columns=['Unnamed: 0'], inplace=True)
#print(df.head())
# Separate features and target
X = df.drop(columns=['Frustrated'])
y = df['Frustrated']


groups = df['Individual']  # Assume 'Individual' column identifies different individuals


## Preprocess categorical 
categorical_features = ['Round', 'Phase', 'Cohort', 'Puzzler', 'Individual']
numerical_features = ['HR_Mean', 'HR_Median', 'HR_std', 'HR_Min', 'HR_Max', 'HR_AUC']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])




# Define parameter grid for Logistic Regression
param_grid_lr = {
    'classifier__C': [0.1, 1.0, 10.0],
    'classifier__penalty': ['l2'],  # Only 'l2' penalty is supported by 'lbfgs' solver
    'classifier__solver': ['lbfgs']  # Use 'lbfgs' solver for Logistic Regression
}

pipeline_rf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

pipeline_lr = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42))
])

# Define parameter grid for Random Forest
param_grid_rf = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}



# Split the data using GroupShuffleSplit
group_split = GroupShuffleSplit(test_size=0.2, random_state=42)
train_idx, val_idx = next(group_split.split(X, y, groups))

X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

# Determine the smallest class size in the training set
class_counts = Counter(y_train)
smallest_class_size = min(class_counts.values())

# Variable to enable or disable SMOTE
use_smote = False

if use_smote:
    # Apply SMOTE to handle class imbalance with dynamically adjusted k_neighbors
    smote = SMOTE(k_neighbors=max(1, smallest_class_size - 1), random_state=42)
    X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
else:
    X_train_res, y_train_res = X_train, y_train

# Perform GridSearchCV for Random Forest
grid_search_rf = GridSearchCV(estimator=pipeline_rf, param_grid=param_grid_rf, cv=3)
grid_search_rf.fit(X_train_res, y_train_res)

# Perform GridSearchCV for Logistic Regression
grid_search_lr = GridSearchCV(estimator=pipeline_lr, param_grid=param_grid_lr, cv=3)
grid_search_lr.fit(X_train_res, y_train_res)

# Best parameters and best scores for Random Forest
print("Random Forest - Best Parameters:")
print(grid_search_rf.best_params_)
print("Random Forest - Best CV Score:")
print(grid_search_rf.best_score_)

# Best parameters and best scores for Logistic Regression
print("\nLogistic Regression - Best Parameters:")
print(grid_search_lr.best_params_)
print("Logistic Regression - Best CV Score:")
print(grid_search_lr.best_score_)

# Evaluate best Random Forest model
best_rf_model = grid_search_rf.best_estimator_
y_pred_rf = best_rf_model.predict(X_val)
rf_report = classification_report(y_val, y_pred_rf, zero_division=0)
print("\nRandom Forest Performance:")
print(rf_report)

# Evaluate best Logistic Regression model
best_lr_model = grid_search_lr.best_estimator_
y_pred_lr = best_lr_model.predict(X_val)
lr_report = classification_report(y_val, y_pred_lr, zero_division=0)
print("\nLogistic Regression Performance:")
print(lr_report)

# Evaluate best Random Forest model
best_rf_model = grid_search_rf.best_estimator_
y_pred_rf = best_rf_model.predict(X_val)
rf_accuracy = accuracy_score(y_val, y_pred_rf)
print(f"Random Forest Accuracy on Validation Set: {rf_accuracy * 100:.2f}%")


# Evaluate best Logistic Regression model
best_lr_model = grid_search_lr.best_estimator_
y_pred_lr = best_lr_model.predict(X_val)
lr_accuracy = accuracy_score(y_val, y_pred_lr)
print(f"Logistic Regression Accuracy on Validation Set: {lr_accuracy * 100:.2f}%")




Statistical evaluation

In [44]:
# Perform McNemar test

# Construct the contingency table
y_pred_rf_correct = y_pred_rf == y_val
y_pred_lr_correct = y_pred_lr == y_val

# Construct the contingency table
contingency_table = np.zeros((2, 2))
for rf_correct, lr_correct in zip(y_pred_rf_correct, y_pred_lr_correct):
    contingency_table[int(rf_correct), int(lr_correct)] += 1
print("\nContingency Table:")
print(contingency_table)

# Perform the McNemar test
result = mcnemar(contingency_table, exact=True)
print(f"\nMcNemar's Test p-value: {result.pvalue}")



Contingency Table:
[[22.  4.]
 [ 1.  9.]]

McNemar's Test p-value: 0.375
