In [1]:
import pandas as pd
import numpy as np
from google.cloud import storage
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from dotenv import load_dotenv
import seaborn as sns
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE

import os

In [2]:
# Load variables from a .env file into the process environment
# (presumably where CREDENTIAL is defined - confirm against the deployment).
load_dotenv()

# Read the dataset straight from the bucket; the token handed to the storage
# backend is whatever the CREDENTIAL environment variable holds.
df = pd.read_csv(
    'gs://project-abd/notebook-data/fusion_data.csv',
    sep=',',
    storage_options={'token': os.getenv('CREDENTIAL')},
)

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,class_id,room,student_id,gender_code,age,timestamp,hr_mean,temp_mean,eda_mean,ibi_mean,bvp_mean,engagement_level
0,7,R3,21,1,15,10:55:00,96.891734,29.0758,0.240994,0.760156,0.147336,Highly Engaged
1,202,R1,20,1,16,14:15:00,84.538662,32.529366,0.15323,0.759181,0.024288,Highly Engaged
2,78,R3,1,1,16,11:25:00,71.108227,32.392486,0.161321,0.802537,-0.034383,Not Engaged
3,107,R2,7,1,15,10:50:00,83.337133,27.8148,0.169119,0.753507,0.004243,Not Engaged
4,161,R1,10,1,15,13:20:00,92.0142,32.5388,0.343952,0.767184,0.013166,Not Engaged


In [4]:
# Count how many rows fall into each engagement level.
df.engagement_level.value_counts()

engagement_level
Highly Engaged    118
Not Engaged        93
Engaged            37
Name: count, dtype: int64

In [5]:
# Work on an independent copy. The following cells overwrite columns of
# `data_prep` in place; a plain `data_prep = df` only aliases the loaded frame,
# so those edits would also change `df` and make the earlier inspection cells
# show different results when they are re-run.
data_prep = df.copy()

In [6]:
# Fold the 'Engaged' class into 'Not Engaged', leaving a two-class target.
# The result is assigned back to the column instead of calling
# `.replace(..., inplace=True)` on the selected Series: pandas warns that the
# chained in-place form stops updating the parent frame in pandas 3.0.
data_prep['engagement_level'] = data_prep['engagement_level'].replace('Engaged', 'Not Engaged')

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_prep['engagement_level'].replace('Engaged', 'Not Engaged', inplace=True)


In [7]:
# Check the relabelling: count the rows per remaining engagement level.
data_prep.engagement_level.value_counts()

engagement_level
Not Engaged       130
Highly Engaged    118
Name: count, dtype: int64

In [8]:
# Columns whose values are replaced by integer codes.
categorical_columns = data_prep[['room', 'engagement_level']]

# One LabelEncoder per column, stored in `label_encoders` under the column name.
label_encoders = {col: LabelEncoder() for col in categorical_columns.columns}

for col, le in label_encoders.items():
    # Fit the encoder on the column and overwrite the column with its codes.
    data_prep[col] = le.fit_transform(data_prep[col])

In [9]:
# Display the frame after encoding; 'room' and 'engagement_level' now hold integer codes.
data_prep

Unnamed: 0,class_id,room,student_id,gender_code,age,timestamp,hr_mean,temp_mean,eda_mean,ibi_mean,bvp_mean,engagement_level
0,7,2,21,1,15,10:55:00,96.891734,29.075800,0.240994,0.760156,0.147336,0
1,202,0,20,1,16,14:15:00,84.538662,32.529366,0.153230,0.759181,0.024288,0
2,78,2,1,1,16,11:25:00,71.108227,32.392486,0.161321,0.802537,-0.034383,1
3,107,1,7,1,15,10:50:00,83.337133,27.814800,0.169119,0.753507,0.004243,1
4,161,0,10,1,15,13:20:00,92.014200,32.538800,0.343952,0.767184,0.013166,1
...,...,...,...,...,...,...,...,...,...,...,...,...
243,237,3,3,1,16,10:00:00,86.797033,32.615667,0.053965,0.919313,0.177886,0
244,17,2,3,1,16,10:55:00,92.064000,35.704634,0.310514,0.760156,-0.199961,0
245,162,4,20,1,16,15:30:00,85.151701,34.430209,1.119918,0.760156,-0.285726,0
246,89,1,12,1,16,15:20:00,84.232400,34.618134,0.465747,0.760156,0.368063,1


In [10]:
# Model inputs: the selected columns only. The identifier columns
# (class_id, student_id) and the timestamp are left out.
features = data_prep[[
    'room', 'gender_code', 'age',
    'hr_mean', 'temp_mean', 'eda_mean', 'ibi_mean', 'bvp_mean',
]]

# Label to predict: the encoded engagement level.
target = data_prep['engagement_level']

## Hold-Out Train/Test Split

In [11]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# Fit the min-max scaler on the training rows only, then apply that same
# fitted scaling to the test rows.
scaler = MinMaxScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [12]:
# Oversample the scaled training split with SMOTE (fixed seed).
# NOTE(review): the model cells visible in this notebook fit on
# x_train_scaled / y_train, not on these resampled arrays - confirm whether
# that is intended.
smote = SMOTE(random_state=42)
x_train_sampled, y_train_sampled = smote.fit_resample(X=x_train_scaled, y=y_train)

In [13]:
def evaluate_model(model_name, model, x_test_scaled, y_test):
    """Print a short evaluation summary for a fitted model.

    Args:
        model_name: Label shown in the printed header.
        model: Object exposing ``predict``; it is called on ``x_test_scaled``.
        x_test_scaled: Feature rows passed to ``model.predict``.
        y_test: True labels the predictions are compared against.

    Returns:
        None. The header, the accuracy (four decimals) and the
        classification report are written to stdout.
    """
    predictions = model.predict(x_test_scaled)

    print(f"--- {model_name} ---")

    accuracy = accuracy_score(y_test, predictions)
    print(f"Accuracy: {accuracy:.4f}")

    print("Classification Report:")
    report = classification_report(y_test, predictions)
    print(report)

In [14]:
from sklearn.linear_model import LogisticRegression

# Baseline logistic regression (liblinear solver, balanced class weights),
# fitted on the scaled training split. `fit` returns the estimator itself.
logreg_model = LogisticRegression(
    solver='liblinear',
    random_state=42,
    max_iter=10000,
    class_weight='balanced',
).fit(x_train_scaled, y_train)

evaluate_model('Logistic Regression', logreg_model, x_test_scaled, y_test)

--- Logistic Regression ---
Accuracy: 0.6000
Classification Report:
              precision    recall  f1-score   support

           0       0.48      0.63      0.55        19
           1       0.72      0.58      0.64        31

    accuracy                           0.60        50
   macro avg       0.60      0.61      0.59        50
weighted avg       0.63      0.60      0.61        50



In [15]:
from sklearn.svm import SVC

# Baseline SVM with a sigmoid kernel and balanced class weights; the solver
# is capped at 1000 iterations. `fit` returns the estimator itself.
svm_model = SVC(
    kernel='sigmoid',
    max_iter=1000,
    random_state=42,
    gamma='scale',
    C=2.0,
    class_weight='balanced',
).fit(x_train_scaled, y_train)

evaluate_model('SVM', svm_model, x_test_scaled, y_test)

--- SVM ---
Accuracy: 0.4400
Classification Report:
              precision    recall  f1-score   support

           0       0.36      0.63      0.46        19
           1       0.59      0.32      0.42        31

    accuracy                           0.44        50
   macro avg       0.48      0.48      0.44        50
weighted avg       0.50      0.44      0.43        50



In [16]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline k-nearest-neighbours classifier using 5 neighbours, fitted on the
# scaled training split. `fit` returns the estimator itself.
knn_model = KNeighborsClassifier(n_neighbors=5).fit(x_train_scaled, y_train)

evaluate_model('KNN', knn_model, x_test_scaled, y_test)

--- KNN ---
Accuracy: 0.6400
Classification Report:
              precision    recall  f1-score   support

           0       0.52      0.63      0.57        19
           1       0.74      0.65      0.69        31

    accuracy                           0.64        50
   macro avg       0.63      0.64      0.63        50
weighted avg       0.66      0.64      0.64        50



In [17]:
from sklearn.naive_bayes import GaussianNB

# Baseline Gaussian Naive Bayes with default settings, fitted on the scaled
# training split. `fit` returns the estimator itself.
gnb_model = GaussianNB().fit(x_train_scaled, y_train)

evaluate_model("Gaussian Naive Bayes", gnb_model, x_test_scaled, y_test)

--- Gaussian Naive Bayes ---
Accuracy: 0.5800
Classification Report:
              precision    recall  f1-score   support

           0       0.47      0.79      0.59        19
           1       0.78      0.45      0.57        31

    accuracy                           0.58        50
   macro avg       0.62      0.62      0.58        50
weighted avg       0.66      0.58      0.58        50



In [18]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest: 1000 trees, depth capped at 10, fixed seed.
# `fit` returns the estimator itself.
rf_model = RandomForestClassifier(
    n_estimators=1000,
    max_depth=10,
    random_state=42,
).fit(x_train_scaled, y_train)

evaluate_model("Random Forest", rf_model, x_test_scaled, y_test)

--- Random Forest ---
Accuracy: 0.6200
Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.68      0.58        19
           1       0.75      0.58      0.65        31

    accuracy                           0.62        50
   macro avg       0.62      0.63      0.62        50
weighted avg       0.66      0.62      0.63        50



## Hyperparameter Tuning with Repeated K-Fold Cross-Validation

### Promising Models
1. Random Forest
2. KNN
3. Logistic Regression

In [19]:
from sklearn.model_selection import RepeatedKFold, GridSearchCV

In [25]:
# Random Forest: grid search over forest size, split criterion, depth limit
# and the number of features considered per split.
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500, 1000, 2000],
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_depth': [10, 50, 100],
    'max_features': ['sqrt', 'log2'],
}

tree_model = RandomForestClassifier(random_state=12)

# 10-fold cross-validation, repeated 10 times, with a fixed seed.
cv = RepeatedKFold(n_splits=10, n_repeats=10, random_state=11)

# Score every combination by accuracy and refit the best one on the full
# training split.
grid_cv = GridSearchCV(
    tree_model,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=5,
    cv=cv,
    refit=True,
)
grid_cv.fit(x_train_scaled, y_train)
print(grid_cv.best_params_)

{'criterion': 'entropy', 'max_depth': 50, 'max_features': 'sqrt', 'n_estimators': 500}


In [26]:
# KNN: grid search over neighbour count, vote weighting and the
# neighbour-search algorithm.
param_grid = {
    'n_neighbors': [2, 4, 5, 6, 8, 10],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
}

# Note: this rebinds `knn_model` to a fresh, unfitted estimator.
knn_model = KNeighborsClassifier()

# 10-fold cross-validation, repeated 10 times, with a fixed seed.
cv = RepeatedKFold(n_splits=10, n_repeats=10, random_state=11)

# Score every combination by accuracy and refit the best one on the full
# training split.
grid_cv = GridSearchCV(
    knn_model,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=5,
    cv=cv,
    refit=True,
)
grid_cv.fit(x_train_scaled, y_train)
print(grid_cv.best_params_)

{'algorithm': 'auto', 'n_neighbors': 4, 'weights': 'distance'}


In [28]:
# Logistic Regression: one sub-grid per solver, so only penalty/solver
# combinations listed together are tried.
param_grid = [
    {'solver': ['liblinear'], 'penalty': ['l1', 'l2'], 'C': [0.1, 1.0, 10]},
    {'solver': ['lbfgs'], 'penalty': ['l2'], 'C': [0.1, 1.0, 10]},
    {'solver': ['saga'], 'penalty': ['l1', 'l2'], 'C': [0.1, 1.0, 10]},
    {'solver': ['saga'], 'penalty': ['elasticnet'], 'l1_ratio': [0.0, 0.5, 1.0], 'C': [0.1, 1.0, 10]},
]

# Note: this rebinds `logreg_model` to a fresh, unfitted estimator.
logreg_model = LogisticRegression(random_state=12)

# 10-fold cross-validation, repeated 10 times, with a fixed seed.
cv = RepeatedKFold(n_splits=10, n_repeats=10, random_state=11)

# Score every combination by accuracy and refit the best one on the full
# training split.
grid_cv = GridSearchCV(
    logreg_model,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=5,
    cv=cv,
    refit=True,
)
grid_cv.fit(x_train_scaled, y_train)
print(grid_cv.best_params_)

{'C': 10, 'penalty': 'l1', 'solver': 'saga'}
