Mini Project 02 [Ohid Reza 2212087642 | Group T]

In [None]:
#libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#ml libraries
from scipy.stats import skew, kurtosis, zscore
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
from imblearn.over_sampling import SMOTE

In [None]:
# Read the raw measurements from disk and preview the first rows.
csv_path = "data.csv"
data = pd.read_csv(csv_path)
data.head()

Manual Labeling

In [None]:
#[logic]
#label: 1 if accuracy >= ACCURACY_THRESHOLD and avg_reac_time <= REACTION_TIME_THRESHOLD, else 0
ACCURACY_THRESHOLD = 95        # inclusive lower bound on 'accuracy' for label 1
REACTION_TIME_THRESHOLD = 200  # inclusive upper bound on 'avg_reac_time' for label 1

# Vectorised column comparison instead of a row-wise apply(axis=1): it yields
# the same labels (rows with a missing value compare False and get 0) without
# running a Python lambda per row, and it also works on an empty frame.
is_high_performer = (
    (data['accuracy'] >= ACCURACY_THRESHOLD)
    & (data['avg_reac_time'] <= REACTION_TIME_THRESHOLD)
)
data['manual_label'] = is_high_performer.astype('int64')
data.head()

In [None]:
# Class balance check: number of rows carrying each manual label.
print("Label Distribution:")
label_counts = data['manual_label'].value_counts()
print(label_counts)

Statistical Features

In [None]:
# Descriptive statistics for the numeric columns of the frame.
summary_table = data.describe()
print(summary_table)

# Distribution shape of the two raw measurement columns
# (scipy's skew / kurtosis with their default settings).
accuracy_values = data['accuracy']
reaction_values = data['avg_reac_time']

print("\nSkewness of Accuracy:", skew(accuracy_values))
print("Skewness of Reaction Time:", skew(reaction_values))
print("\nKurtosis of Accuracy:", kurtosis(accuracy_values))
print("Kurtosis of Reaction Time:", kurtosis(reaction_values))


In [None]:
# Derived features. Everything below is computed from the two raw measurement
# columns; the assignment order is kept because it fixes the column order of
# the frame that is written to disk at the end of the notebook.
raw_cols = ['accuracy', 'avg_reac_time']

# z scores of each raw column (scipy)
data['accuracy_zscore'] = zscore(data['accuracy'])
data['reac_time_zscore'] = zscore(data['avg_reac_time'])

# min-max normalization of both raw columns
# NOTE(review): this scaler (and the StandardScaler below) is fitted on the
# whole frame, i.e. before the train/test split that happens later.
scaler_norm = MinMaxScaler()
normalized_values = scaler_norm.fit_transform(data[raw_cols])
data[['accuracy_normalized', 'avg_reac_time_normalized']] = normalized_values

# performance ratio: accuracy per unit of reaction time
# NOTE(review): an avg_reac_time of 0 would produce inf here -- confirm the
# data never contains zeros.
data['performance_ratio'] = data['accuracy'].div(data['avg_reac_time'])

# entropy of accuracy: the normalized accuracies are rescaled to sum to 1 and
# each row contributes -p * log2(p); epsilon keeps log2 finite when p == 0.
epsilon = 1e-10
normalized_total = data['accuracy_normalized'].sum()
prob_accuracy = data['accuracy_normalized'] / normalized_total
log_term = np.log2(prob_accuracy + epsilon)
data['accuracy_entropy'] = -prob_accuracy * log_term
print("Total Accuracy Entropy:", data['accuracy_entropy'].sum())

# standard scaling of both raw columns
# NOTE(review): presumably numerically the same as the z-score columns above
# (both standardise with the population std) -- confirm before using both sets
# as model inputs.
scaler_std = StandardScaler()
standardized_values = scaler_std.fit_transform(data[raw_cols])
data[['accuracy_scaled', 'avg_reac_time_scaled']] = standardized_values

data.head()

Train-Test Split and Imbalance Handling

In [None]:
# Two feature views of the same rows: the raw measurements and the
# standardised / z-scored columns. Both share the same target.
X_raw = data[['accuracy', 'avg_reac_time', 'performance_ratio']]
X_scaled = data[['accuracy_scaled', 'avg_reac_time_scaled', 'accuracy_zscore', 'reac_time_zscore', 'performance_ratio']]
y = data['manual_label']

# Split both views in ONE call so their train/test rows are guaranteed to be
# the same rows (previously this relied on two calls sharing a seed), and
# stratify on the label so train and test keep the same class ratio -- an
# unstratified split of an imbalanced target can leave the test set with very
# few (or no) minority samples.
(
    X_train_raw, X_test_raw,
    X_train_scaled, X_test_scaled,
    y_train, y_test,
) = train_test_split(
    X_raw, X_scaled, y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

#imbalance handling with SMOTE
# Only the training rows are oversampled; the test set stays untouched so the
# evaluation reflects the original class distribution.
sm = SMOTE(random_state=42)
X_train_raw, y_train_raw = sm.fit_resample(X_train_raw, y_train)
X_train_scaled, y_train_scaled = sm.fit_resample(X_train_scaled, y_train)

Neural Network

In [None]:
# Search space for the MLP: layer layout, activation and L2 penalty (alpha).
param_grid_nn = dict(
    hidden_layer_sizes=[(10,), (20,), (10, 10)],
    activation=['relu', 'tanh'],
    alpha=[0.0001, 0.001],
)

# 3-fold grid search over the SMOTE-resampled, scaled training data.
mlp_base = MLPClassifier(max_iter=1500, random_state=42)
grid_nn = GridSearchCV(estimator=mlp_base, param_grid=param_grid_nn, cv=3)
grid_nn.fit(X_train_scaled, y_train_scaled)
print("Best NN Parameters:", grid_nn.best_params_)

# Predictions of the selected model on the held-out test rows.
y_p_nn = grid_nn.predict(X_test_scaled)

In [None]:
# Confusion matrix of the tuned neural network on the test set; the figure is
# both saved to disk and shown inline.
cm_nn = confusion_matrix(y_test, y_p_nn)
nn_display = ConfusionMatrixDisplay(confusion_matrix=cm_nn)
nn_display.plot()
plt.title("Neural Network Confusion Matrix")
plt.savefig("nn_confusion_matrix.png")
plt.show()

In [None]:
# Per-class precision / recall / F1 for the neural network predictions.
report_nn = classification_report(y_test, y_p_nn)
print(report_nn)

Random Forest

In [None]:
# Search space for the random forest: number of trees and maximum tree depth.
param_rf = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20],
)

# 3-fold grid search over the SMOTE-resampled raw training features.
rf_base = RandomForestClassifier(random_state=42)
grid_rf = GridSearchCV(estimator=rf_base, param_grid=param_rf, cv=3)
grid_rf.fit(X_train_raw, y_train_raw)
print("Best RF Parameters:", grid_rf.best_params_)

# Predictions of the selected forest on the held-out test rows.
y_p_rf = grid_rf.predict(X_test_raw)


In [None]:
# Confusion matrix of the tuned random forest on the test set; the figure is
# both saved to disk and shown inline.
cm_rf = confusion_matrix(y_test, y_p_rf)
rf_display = ConfusionMatrixDisplay(confusion_matrix=cm_rf)
rf_display.plot()
plt.title("Random Forest Confusion Matrix")
plt.savefig("rf_confusion_matrix.png")
plt.show()

In [None]:
# Per-class precision / recall / F1 for the random forest predictions.
report_rf = classification_report(y_test, y_p_rf)
print(report_rf)

In [None]:
# Horizontal bar chart of the feature importances reported by the best forest,
# labelled with the raw feature column names.
best_rf = grid_rf.best_estimator_
importances_rf = best_rf.feature_importances_
plt.barh(y=X_raw.columns, width=importances_rf, color='teal')
plt.title("Random Forest Feature Importance")
plt.xlabel("Importance Score")
plt.show()

XGBoost

In [None]:
# Search space for XGBoost: number of boosting rounds, tree depth and learning rate.
param_xgb = dict(
    n_estimators=[50, 100],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
)

# 3-fold grid search over the SMOTE-resampled raw training features.
xgb_base = XGBClassifier(eval_metric='logloss', random_state=42)
grid_xgb = GridSearchCV(estimator=xgb_base, param_grid=param_xgb, cv=3)
grid_xgb.fit(X_train_raw, y_train_raw)
print("Best XGB Parameters:", grid_xgb.best_params_)

# Predictions of the selected booster on the held-out test rows.
y_p_xgb = grid_xgb.predict(X_test_raw)

In [None]:
# Confusion matrix of the tuned XGBoost model on the test set; the figure is
# both saved to disk and shown inline.
cm_xgb = confusion_matrix(y_test, y_p_xgb)
xgb_display = ConfusionMatrixDisplay(confusion_matrix=cm_xgb)
xgb_display.plot()
plt.title("XGBoost Confusion Matrix")
plt.savefig("xgb_confusion_matrix.png")
plt.show()

In [None]:

# Per-class precision / recall / F1 for the XGBoost predictions.
report_xgb = classification_report(y_test, y_p_xgb)
print(report_xgb)

In [None]:
# Horizontal bar chart of the feature importances reported by the best XGBoost
# model, labelled with the raw feature column names.
best_xgb = grid_xgb.best_estimator_
importances_xgb = best_xgb.feature_importances_
plt.barh(y=X_raw.columns, width=importances_xgb, color='darkorange')
plt.title("XGBoost Feature Importance")
plt.xlabel("Importance Score")
plt.show()

Final Data

In [None]:
# Persist the frame, including the label and all engineered columns, without
# writing the row index.
output_path = "data_final.csv"
data.to_csv(output_path, index=False)
print("saved as data_final.csv")