In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, precision_score, recall_score, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as snsv
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from scipy.stats import uniform, randint
from sklearn.model_selection import RandomizedSearchCV

### Create behavioural_analysis_tutorial_df for 9-point Paas Score in Tutorial

In [6]:
behavioural_analysis_df = pd.read_csv('behavioural_analysis.csv')

columns_to_select = ['participant_number', 'feedback_score_tutorial_1', 'feedback_score_tutorial_2', 'feedback_score_tutorial_3']
behavioural_analysis_df = behavioural_analysis_df[columns_to_select]

behavioural_analysis_df = behavioural_analysis_df.drop_duplicates(subset=['participant_number'])

data = []
for _, row in behavioural_analysis_df.iterrows():
    participant_number = row['participant_number']
    data.append({'participant_number': participant_number, 'tutorial_number': 0, 'feedback_score_tutorial': row['feedback_score_tutorial_1']})
    data.append({'participant_number': participant_number, 'tutorial_number': 1, 'feedback_score_tutorial': row['feedback_score_tutorial_2']})
    data.append({'participant_number': participant_number, 'tutorial_number': 2, 'feedback_score_tutorial': row['feedback_score_tutorial_3']})

behavioural_analysis_tutorial_df = pd.DataFrame(data)

print(behavioural_analysis_tutorial_df)

     participant_number  tutorial_number  feedback_score_tutorial
0                     2                0                        1
1                     2                1                        3
2                     2                2                        8
3                     3                0                        4
4                     3                1                        1
..                  ...              ...                      ...
175                  80                1                        4
176                  80                2                        7
177                  81                0                        3
178                  81                1                        1
179                  81                2                        7

[180 rows x 3 columns]


### Output master_tutorial.csv with Behavioural, fNIRS, and Eye-Tracking Features

In [7]:
fnirs_analysis_tutorial_df = pd.read_csv('fnirs_analysis_tutorial.csv')
eye_tracking_tutorial_df = pd.read_csv('eye_tracking_analysis_tutorial.csv')

merged_df = behavioural_analysis_tutorial_df.merge(fnirs_analysis_tutorial_df, on=['participant_number', 'tutorial_number'])
merged_df = merged_df.merge(eye_tracking_tutorial_df, on=['participant_number', 'tutorial_number'])

merged_df.to_csv('master_tutorial.csv', index=False)

print("master_tutorial.csv has been updated successfully.")

master_tutorial.csv has been updated successfully.


### Supervised Classification: Logistic Regression Model

In [8]:
master_tutorial_df = pd.read_csv('master_tutorial.csv')

master_tutorial_df = master_tutorial_df.drop(columns=['participant_number'])
master_tutorial_df = master_tutorial_df.drop(columns=['feedback_score_tutorial'])
master_tutorial_df = master_tutorial_df[master_tutorial_df['tutorial_number'] != 0]
master_tutorial_df['tutorial_number'] = master_tutorial_df['tutorial_number'].apply(lambda x: x - 1)

X = master_tutorial_df.drop(columns=['tutorial_number'])
y = master_tutorial_df['tutorial_number']
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

logistic_regression = LogisticRegression(max_iter=1000)
parameters = {
    'penalty': ['l1', 'l2', 'elasticnet'], 
    'C': [0.01, 0.05, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga'],  
    'l1_ratio': [0.05, 0.1, 0.3, 0.5, 0.7, 0.9] 
}

clf = GridSearchCV(logistic_regression, parameters, cv=5, scoring='f1_weighted')
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print("Accuracy Score: ", accuracy_score(y_test, y_pred))

print(classification_report(y_test, y_pred))
print(clf.best_params_)



Accuracy Score:  0.7916666666666666
              precision    recall  f1-score   support

           0       0.69      0.90      0.78        10
           1       0.91      0.71      0.80        14

    accuracy                           0.79        24
   macro avg       0.80      0.81      0.79        24
weighted avg       0.82      0.79      0.79        24

{'C': 1, 'l1_ratio': 0.05, 'penalty': 'l1', 'solver': 'liblinear'}


180 fits failed out of a total of 1080.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
180 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Mobile Workstation 3\AppData\Roaming\Python\Python38\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Mobile Workstation 3\AppData\Roaming\Python\Python38\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\Mobile Workstation 3\AppData\Roaming\Python\Python38\site-packages\sklearn\linear_model\_logistic.py", line 1169, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  

### Supervised Classification: Random Forest Model

In [None]:
rf_model = RandomForestClassifier(random_state=42)

param_distributions = {
    'n_estimators': randint(50, 300),  
    'max_depth': [None, 10, 20, 30, 40, 50], 
    'min_samples_split': randint(2, 20),  
    'min_samples_leaf': randint(1, 10),  
    'max_features': ['sqrt', 'log2', None],  
    'bootstrap': [True, False]  
}

random_search = RandomizedSearchCV(
    rf_model,
    param_distributions=param_distributions,
    n_iter=50, 
    scoring='f1_weighted', 
    cv=5,  
    random_state=42,
    n_jobs=-1 
)
random_search.fit(X_train, y_train)

y_pred = random_search.best_estimator_.predict(X_test)

print("Best Parameters: ", random_search.best_params_)
print("F1 Score: ", f1_score(y_test, y_pred, average='weighted'))
print(classification_report(y_test, y_pred))

feature_importances = random_search.best_estimator_.feature_importances_
features = X.columns
importances = pd.Series(feature_importances, index=features).sort_values(ascending=False)
print(importances)

plt.figure(figsize=(10, 6))
importances.plot(kind='bar')
plt.title('Feature Importances')
plt.ylabel('Importance')
plt.show()

Best Parameters:  {'bootstrap': True, 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 13, 'n_estimators': 104}
F1 Score:  0.8344988344988344
              precision    recall  f1-score   support

           0       0.75      0.90      0.82        10
           1       0.92      0.79      0.85        14

    accuracy                           0.83        24
   macro avg       0.83      0.84      0.83        24
weighted avg       0.85      0.83      0.83        24



### Supervised Classification: Linear Discriminant Analysis Model

In [None]:
lda_model = LinearDiscriminantAnalysis()

parameters = {
    'solver': ['svd', 'lsqr', 'eigen'], 
    'shrinkage': [None, 'auto', 0.1, 0.3, 0.5, 0.7, 0.9] 
}

clf = GridSearchCV(
    lda_model,
    parameters,
    cv=5,  
    scoring='f1_weighted',  
    n_jobs=-1  
)
clf.fit(X_train, y_train)

y_pred = clf.best_estimator_.predict(X_test)

print("Best Parameters: ", clf.best_params_)
print("F1 Score: ", f1_score(y_test, y_pred, average='weighted'))
print(classification_report(y_test, y_pred))

Best Parameters:  {'shrinkage': None, 'solver': 'svd'}
F1 Score:  0.75
              precision    recall  f1-score   support

           0       0.64      0.90      0.75        10
           1       0.90      0.64      0.75        14

    accuracy                           0.75        24
   macro avg       0.77      0.77      0.75        24
weighted avg       0.79      0.75      0.75        24



34 fits failed out of a total of 105.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
4 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Mobile Workstation 3\AppData\Roaming\Python\Python38\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Mobile Workstation 3\AppData\Roaming\Python\Python38\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\Mobile Workstation 3\AppData\Roaming\Python\Python38\site-packages\sklearn\discriminant_analysis.py", line 637, in fit
    self._solve_eigen(
  File "C:\Users\Mobile Workstation 3\AppData\Roam

### Supervised Classification: Support Vector Machine Model

In [12]:
svm_model = SVC()

param_distributions = {
    'C': uniform(0.1, 100), 
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],  
    'degree': [2, 3, 4], 
    'gamma': ['scale', 'auto'], 
    'coef0': uniform(0, 10) 
}

random_search = RandomizedSearchCV(
    svm_model,
    param_distributions=param_distributions,
    n_iter=100,  
    scoring='f1_weighted',
    cv=5,  
    random_state=42,
    n_jobs=-1 
)
random_search.fit(X_train, y_train)

y_pred = random_search.best_estimator_.predict(X_test)

print("Best Parameters: ", random_search.best_params_)
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.3f}")
print(classification_report(y_test, y_pred, digits=3)) 
print(f"Accuracy Score: {accuracy_score(y_test, y_pred):.3f}")

Best Parameters:  {'C': 4.622728891053806, 'coef0': 3.2533033076326436, 'degree': 2, 'gamma': 'auto', 'kernel': 'rbf'}
F1 Score: 0.834
              precision    recall  f1-score   support

           0      0.750     0.900     0.818        10
           1      0.917     0.786     0.846        14

    accuracy                          0.833        24
   macro avg      0.833     0.843     0.832        24
weighted avg      0.847     0.833     0.834        24

Accuracy Score: 0.833


### Add Supervised Classifier Label

In [None]:
master_df = pd.read_csv('master.csv')

features = [
    'O2Hb_average_peak', 'O2Hb_highest_peak', 'O2Hb_lowest_peak',
    'O2Hb_difference_peak', 'O2Hb_auc',
    'duration_fixation', 'duration_saccade',
    'count_fixations', 'count_saccades'
]
X = master_df[features]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

supervised_labels = random_search.best_estimator_.predict(X_scaled)

feedback_index = master_df.columns.get_loc('feedback_score_subexperiment')
master_df.insert(feedback_index + 1, 'supervised_classifier_label', supervised_labels)

master_df.to_csv('master.csv', index=False)
print("master.csv has been updated with the 'supervised_classifier_label' column next to 'feedback_score_subexperiment'.")

master.csv has been updated with the 'supervised_classifier_label' column next to 'feedback_score_subexperiment'.
