In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import LeaveOneOut

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.linear_model import LinearRegression
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor

In [3]:
'''
Read the input data from the CSV files.
Check whether the paths below need to be updated for your environment.
'''
# EEG feature tables, one per window. The file names carry the window bounds
# relative to "Tob" (-3..0, -2..0 and -3..-2) -- presumably seconds, matching
# the variable names; TODO confirm.
EEG_features_3_sec = pd.read_csv("output_b/df_EEG_b_features_Tob_-3_toTob_0.csv")
EEG_features_2_sec = pd.read_csv("output_b/df_EEG_b_features_Tob_-2_toTob_0.csv")
EEG_features_3_until_2_sec = pd.read_csv("output_b/df_EEG_b_features_Tob_-3_toTob_-2.csv")
# Tables that merge_all() joins onto the EEG features on the "subject" column.
paymentMethod = pd.read_csv("df_painOfPayment_method.csv")
painOfPayment_score = pd.read_csv("df_painOfPayment_score.csv")

In [4]:
def merge_all(EEG_features):
    '''
    Join an EEG feature table with the payment-method and pain-of-payment tables.

    Both joins are inner joins on the "subject" column, so a row survives only
    if its subject appears in all three tables. The right-hand tables are the
    notebook-level `paymentMethod` and `painOfPayment_score` DataFrames.

    Returns the merged DataFrame.
    '''
    with_method = EEG_features.merge(paymentMethod, on="subject", how="inner")
    return with_method.merge(painOfPayment_score, on="subject", how="inner")

In [5]:
'''
Initialize the notebook-level variables.
'''
# Names of the feature columns that the model cells select as inputs (X).
waves = ['delta', 'theta', 'alpha', 'beta', 'gamma']
# One merged table (EEG features + payment method + pain-of-payment score)
# per EEG feature file loaded above.
df_data_3_sec = merge_all(EEG_features_3_sec)
df_data_2_sec = merge_all(EEG_features_2_sec)
df_data_3_until_2_sec = merge_all(EEG_features_3_until_2_sec)

In [6]:
def get_RandomForest_hyperparameters(X_train, y_train, classification = True, cv = None, scoring = None):
    '''
    Grid-search Random Forest hyperparameters with cross-validation.

    GridSearchCV tries every combination in the parameter grid and scores each
    one by cross-validation on the training data.

    Parameters:
        X_train, y_train: training features and target, passed to GridSearchCV.fit.
        classification: True -> RandomForestClassifier,
                        False -> RandomForestRegressor.
        cv: cross-validation strategy handed to GridSearchCV. None (default)
            means LeaveOneOut(). An int such as 4 gives 4-fold CV; on an 80%
            training split each validation fold is 0.8 x 0.25 = 20% of all data.
        scoring: metric handed to GridSearchCV. None (default) selects
            'accuracy' for classification and 'neg_mean_absolute_error' for
            regression.

    Returns:
        The fitted GridSearchCV object (best_params_, best_estimator_, ...).
    '''
    param_grid = {
        # Number of trees in the forest
        'n_estimators': [50, 100, 200, 300],
        # Number of features to consider at every split
        'max_features': ['sqrt', 'log2', None],
        # Maximum number of levels in a tree: 5, 10, ..., 50, plus the default (None)
        'max_depth': list(range(5, 51, 5)) + [None],
        # Minimum number of samples required to split a node
        'min_samples_split': [2, 3, 4, 5],
    }

    if classification:
        rf_base = RandomForestClassifier(random_state = 42)
        # Mean absolute error cannot be computed on class labels (e.g. strings),
        # so the classifier search must be scored with a classification metric.
        default_scoring = 'accuracy'
    else:
        rf_base = RandomForestRegressor(random_state = 42)
        # With LeaveOneOut every validation fold holds a single sample, where the
        # regressor's default R^2 score is not well defined; MAE works per sample.
        default_scoring = 'neg_mean_absolute_error'

    if cv is None:
        cv = LeaveOneOut()
    if scoring is None:
        scoring = default_scoring

    rf_search = GridSearchCV(estimator = rf_base, param_grid = param_grid,
                             cv = cv, verbose = 2, n_jobs = -1, scoring = scoring)

    # Fit the grid search model
    rf_search.fit(X_train, y_train)

    return rf_search

In [7]:
def get_SVC_hyperparameters(X_train, y_train):
    '''
    Grid-search the SVC hyperparameters C and gamma with leave-one-out
    cross-validation.

    GridSearchCV evaluates every (C, gamma) combination from the lists below.
    No `scoring` argument is passed, so GridSearchCV uses the estimator's own
    score method.

    Returns the fitted GridSearchCV object.
    '''
    search_space = {
        'C': [0.1, 1, 10, 100, 1000],
        'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    }

    grid = GridSearchCV(
        estimator = SVC(random_state = 42),
        param_grid = search_space,
        cv = LeaveOneOut(),
        verbose = 2,
        n_jobs = -1,
    )

    # Run the search on the training data
    grid.fit(X_train, y_train)
    return grid

In [8]:
def get_classification_hyperParameters(X_train, y_train):
    '''
    Run the Random Forest search and then the SVC search on the training data,
    printing the best parameter combination found by each.
    '''
    for run_search in (get_RandomForest_hyperparameters, get_SVC_hyperparameters):
        print(run_search(X_train, y_train).best_params_)

In [15]:
def get_regression_hyperParameters(X_train, y_train):
    '''
    Run the Random Forest search in regression mode on the training data and
    print the best parameter combination found.
    '''
    regression_search = get_RandomForest_hyperparameters(X_train, y_train, classification = False)
    best_combination = regression_search.best_params_
    print(best_combination)

In [9]:
def RandomForest_classification_model(X_train, X_test, y_train, y_test, n_estimators, max_features, max_depth, min_samples_split):
    '''
    Fit a RandomForestClassifier (random_state 42) with the given
    hyperparameters on the training data and print its train/test
    classification metrics.
    '''
    print("################# RandomForest #################")

    forest_settings = dict(
        n_estimators = n_estimators,
        max_features = max_features,
        max_depth = max_depth,
        min_samples_split = min_samples_split,
        random_state = 42,
    )
    forest = RandomForestClassifier(**forest_settings)
    forest.fit(X_train, y_train)
    print_classification_matrics(forest, X_train, X_test, y_train, y_test)

In [10]:
def RandomForest_regression_model(X_train, X_test, y_train, y_test, n_estimators, max_features, max_depth, min_samples_split):
    '''
    Fit a RandomForestRegressor (random_state 42) with the given
    hyperparameters on the training data and print/plot its train/test
    regression metrics.
    '''
    print("################# RandomForest #################")

    forest_settings = dict(
        n_estimators = n_estimators,
        max_features = max_features,
        max_depth = max_depth,
        min_samples_split = min_samples_split,
        random_state = 42,
    )
    forest = RandomForestRegressor(**forest_settings)
    forest.fit(X_train, y_train)
    print_regression_matrics(forest, X_train, X_test, y_train, y_test)

In [11]:
def SVC_model(X_train, X_test, y_train, y_test, C = 0.1, gamma = 1):
    '''
    Fit an SVC (random_state 42) with the given C and gamma on the training
    data and print its train/test classification metrics.
    '''
    print("################# SVC #################")

    classifier = SVC(random_state = 42, gamma = gamma, C = C)
    classifier.fit(X_train, y_train)
    print_classification_matrics(classifier, X_train, X_test, y_train, y_test)

In [12]:
def calssification_rule_based_model(X_train, X_test, y_train, y_test):
    '''
    Fit the baseline classifiers one after another and print the train/test
    classification metrics of each, preceded by a banner naming the model.
    '''
    baselines = [
        ("################# DummyClassifier - most_frequent #################",
         DummyClassifier(strategy="most_frequent")),
        # "stratified" strategy - generates predictions by randomly selecting
        # labels according to the distribution of categories in the training data.
        ("################# DummyClassifier - stratified #################",
         DummyClassifier(strategy="stratified", random_state=42)),
        ("################# LogisticRegression #################",
         LogisticRegression(random_state=42)),
    ]

    for banner, baseline in baselines:
        print(banner)
        baseline.fit(X_train, y_train)
        print_classification_matrics(baseline, X_train, X_test, y_train, y_test)

In [16]:
def regression_rule_based_model(X_train, X_test, y_train, y_test):
    '''
    Fit the baseline regressors one after another and print/plot the
    train/test regression metrics of each, preceded by a banner naming the
    model.

    Baselines: DummyRegressor predicting the training median, DummyRegressor
    predicting the training mean, and an ordinary LinearRegression.
    '''
    # The banners name the estimator that is actually fitted (DummyRegressor,
    # not DummyClassifier) so the printed results are labelled correctly.
    print ("################# DummyRegressor - median #################")
    dummy_reg = DummyRegressor(strategy="median")
    dummy_reg.fit(X_train, y_train)
    print_regression_matrics(dummy_reg, X_train, X_test, y_train, y_test)

    print ("################# DummyRegressor - mean #################")
    dummy_reg = DummyRegressor(strategy="mean")
    dummy_reg.fit(X_train, y_train)
    print_regression_matrics(dummy_reg, X_train, X_test, y_train, y_test)

    print("################# LinearRegression #################")
    linear_reg = LinearRegression()
    linear_reg.fit(X_train, y_train)
    print_regression_matrics(linear_reg, X_train, X_test, y_train, y_test)

In [13]:
def print_classification_matrics(model, X_train, X_test, y_train, y_test):
    '''
    Print the classification metrics of a fitted model, first on the training
    data and then on the test data.
    '''
    splits = (
        ("####### train data results #######", X_train, y_train),
        ("####### test data results #######", X_test, y_test),
    )
    for heading, features, target in splits:
        print(heading)
        print_classification_matrics_helper(model, features, target)

In [14]:
def print_classification_matrics_helper(model, X, y):
    '''
    Print the confusion matrix, the classification report and the value of
    model.score(X, y) (labelled "Accuracy") for the model's predictions on X.
    '''
    y_hat = model.predict(X)

    matrix = confusion_matrix(y, y_hat)
    print("confusion_matrix: ", matrix, sep="\n")

    # zero_division=0: classes that are never predicted get a score of 0
    report = classification_report(y, y_hat, zero_division=0)
    print("classification_report: ", report, sep="\n")

    print("Accuracy: ", model.score(X, y), "\n")

In [17]:
def print_regression_matrics(model, X_train, X_test, y_train, y_test):
    '''
    Print and plot the regression metrics of a fitted model, first on the
    training data and then on the test data.
    '''
    splits = (
        ("####### train data results #######", X_train, y_train),
        ("####### test data results #######", X_test, y_test),
    )
    for heading, features, target in splits:
        print(heading)
        print_regression_matrics_helper(model, features, target)

In [18]:
def print_regression_matrics_helper(model, X, y):
    '''
    Show how well the model's predictions on X match the true values y:
    a true-vs-predicted scatter plot, a histogram of the residuals
    (y - prediction), and the printed MAE, MSE and RMSE.
    '''
    y_hat = model.predict(X)

    # True vs. predicted scatter; both axes are fixed to the same 3..29 window
    axis_window = (3, 29)
    plt.scatter(y, y_hat, alpha = 0.6)
    plt.xlabel('True score', fontsize=12)
    plt.ylabel('Predicted score')
    plt.xlim(*axis_window)
    plt.ylim(*axis_window)
    plt.show()

    # Distribution of the residuals
    residuals = y - y_hat
    sns.histplot(residuals, bins=50, kde=True)
    plt.title("distibution of True_Score - Predicted_Score ", fontsize=14)
    plt.show()

    mae = metrics.mean_absolute_error(y, y_hat)
    mse = metrics.mean_squared_error(y, y_hat)
    for label, value in (('MAE:', mae), ('MSE:', mse), ('RMSE:', np.sqrt(mse))):
        print(label, value)
    print("\n")

In [19]:
def get_model_1_predicrions(X_train, X_test, y_train, y_test):
    '''
    Print the train/test results for model 1: the baseline classifiers first,
    then a Random Forest classifier with fixed hyperparameters.
    '''
    # Baseline: rule-based models
    calssification_rule_based_model(X_train, X_test, y_train, y_test)

    # Random forest with the hyperparameters chosen for this model
    forest_params = {'n_estimators': 200, 'max_features': 'sqrt', 'max_depth': 5, 'min_samples_split': 5}
    RandomForest_classification_model(X_train, X_test, y_train, y_test, **forest_params)

    # SVM is currently disabled; it would run with {'C': 0.1, 'gamma': 1}:
    # SVC_model(X_train, X_test, y_train, y_test)


In [20]:
def get_model_2_predicrions(X_train, X_test, y_train, y_test):
    '''
    Print the train/test results for model 2: the baseline classifiers first,
    then a Random Forest classifier with fixed hyperparameters.
    '''
    # Baseline: rule-based models
    calssification_rule_based_model(X_train, X_test, y_train, y_test)

    # Random forest with the hyperparameters chosen for this model
    forest_params = {'n_estimators': 200, 'max_features': 'sqrt', 'max_depth': 5, 'min_samples_split': 4}
    RandomForest_classification_model(X_train, X_test, y_train, y_test, **forest_params)

    # SVM is currently disabled; it would run with {'C': 0.1, 'gamma': 1}:
    # SVC_model(X_train, X_test, y_train, y_test)


In [21]:
def get_model_3_predicrions(X_train, X_test, y_train, y_test):
    '''
    Print/plot the train/test results for model 3 (regression): the baseline
    regressors first, then a Random Forest regressor with fixed
    hyperparameters.
    '''
    # Baseline: rule-based models
    regression_rule_based_model(X_train, X_test, y_train, y_test)

    # Random forest
    # using: {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_split': 4, 'n_estimators': 50}
    # NOTE(review): these values equal the "cv = 5, 3 seconds before ToB" result
    # recorded in the model 3 cell -- confirm they are the intended ones for the
    # data this function is called with.
    RandomForest_regression_model(X_train, X_test, y_train, y_test, n_estimators = 50, max_features = 'sqrt', max_depth = 5, min_samples_split = 4)


In [22]:
'''
Main cell for model 1 (classification of the payment method).
'''
# Model 1 (first question): tries to predict the payment method ('Payment_method')
# from the EEG band features listed in `waves`, using df_data_3_sec.

X = df_data_3_sec[waves]
y = df_data_3_sec['Payment_method']

# test size is 20% of all data
# NOTE(review): the split is not stratified; the recorded output of this cell
# shows a different class mix in the train and test sets -- consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Hyperparameter search (disabled). The lines below record the best parameters
# per cv setting, presumably from earlier runs of this search.
# get_classification_hyperParameters(X_train, y_train)
# random forest (3 seconds before ToB):
# cv = 4: {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_split': 4, 'n_estimators': 200}
# cv = 5: {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_split': 5, 'n_estimators': 100}
# cv = LeaveOneOut(): {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_split': 5, 'n_estimators': 200}

# random forest - stratify (3 seconds before ToB):
# cv = 4: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_split': 3, 'n_estimators': 200}
# cv = 5: {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_split': 3, 'n_estimators': 50}
# cv = LeaveOneOut(): {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 50}

# SVC:
# cv = 4: {'C': 100, 'gamma': 1}
# cv = 5: {'C': 0.1, 'gamma': 1}
# cv = LeaveOneOut() : {'C': 0.1, 'gamma': 1}

# Fit the baselines and the random forest, and print their train/test metrics.
get_model_1_predicrions(X_train, X_test, y_train, y_test)

################# DummyClassifier - most_frequent #################
####### train data results #######
confusion_matrix: 
[[ 0 18  0]
 [ 0 29  0]
 [ 0 14  0]]
classification_report: 
              precision    recall  f1-score   support

        Cash       0.00      0.00      0.00        18
 Credit Card       0.48      1.00      0.64        29
  Smartphone       0.00      0.00      0.00        14

    accuracy                           0.48        61
   macro avg       0.16      0.33      0.21        61
weighted avg       0.23      0.48      0.31        61

Accuracy:  0.47540983606557374 

####### test data results #######
confusion_matrix: 
[[0 5 0]
 [0 3 0]
 [0 8 0]]
classification_report: 
              precision    recall  f1-score   support

        Cash       0.00      0.00      0.00         5
 Credit Card       0.19      1.00      0.32         3
  Smartphone       0.00      0.00      0.00         8

    accuracy                           0.19        16
   macro avg       0.06   

In [23]:
'''
Main cell for model 2 - classification
'''
# Model 2 (second question): tries to predict the categorical painOfPayment score
# ('PainOfPayment_categorical_score') from the EEG band features listed in `waves`.
# This cell uses df_data_2_sec.

X = df_data_2_sec[waves]
y = df_data_2_sec['PainOfPayment_categorical_score']

# test size is 20% of all data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameter search (disabled). The lines below record the best parameters
# per data window and cv setting, presumably from earlier runs of this search.
# get_classification_hyperParameters(X_train, y_train)

# random forest (3 seconds before ToB):
# cv = 4: {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_split': 3, 'n_estimators': 200}
# cv = 5: {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_split': 5, 'n_estimators': 50}
# cv = LeaveOneOut(): {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_split': 4, 'n_estimators': 50}

# random forest (2 seconds before ToB):
# cv = 4: {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_split': 3, 'n_estimators': 100}
# cv = 5: {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_split': 5, 'n_estimators': 50}
# cv = LeaveOneOut(): {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 100}

# SVC:
# cv = 4: {'C': 0.1, 'gamma': 1}
# cv = 5: {'C': 0.1, 'gamma': 1}
# cv = LeaveOneOut() : {'C': 0.1, 'gamma': 1}

# Fit the baselines and the random forest, and print their train/test metrics.
get_model_2_predicrions(X_train, X_test, y_train, y_test)

################# DummyClassifier - most_frequent #################
####### train data results #######
confusion_matrix: 
[[ 0  0 19]
 [ 0  0  7]
 [ 0  0 35]]
classification_report: 
              precision    recall  f1-score   support

        high       0.00      0.00      0.00        19
         low       0.00      0.00      0.00         7
      medium       0.57      1.00      0.73        35

    accuracy                           0.57        61
   macro avg       0.19      0.33      0.24        61
weighted avg       0.33      0.57      0.42        61

Accuracy:  0.5737704918032787 

####### test data results #######
confusion_matrix: 
[[0 0 8]
 [0 0 3]
 [0 0 5]]
classification_report: 
              precision    recall  f1-score   support

        high       0.00      0.00      0.00         8
         low       0.00      0.00      0.00         3
      medium       0.31      1.00      0.48         5

    accuracy                           0.31        16
   macro avg       0.10    

In [None]:
'''
Main cell for model 3 - regression
'''
# Model 3 (second question): tries to predict the numeric painOfPayment score
# ('PainOfPayment_score') from the EEG band features listed in `waves`.
# This cell uses df_data_2_sec.

X = df_data_2_sec[waves]
y = df_data_2_sec['PainOfPayment_score']

# test size is 20% of all data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# NOTE(review): unlike the classification cells, the hyperparameter search is
# active here, so the full grid search runs before the models are evaluated.
# Comment it out once the LeaveOneOut() results below have been recorded.
get_regression_hyperParameters(X_train, y_train)

# random forest (3 seconds before ToB):
# with min_samples_split and cv = 4: {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_split': 5, 'n_estimators': 100}
# with min_samples_split and cv = 5: {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_split': 4, 'n_estimators': 50}
# with min_samples_split and cv = LeaveOneOut(): 

# random forest (2 seconds before ToB):
# with min_samples_split and cv = 4: {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 100}
# with min_samples_split and cv = 5: {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_split': 5, 'n_estimators': 50}
# with min_samples_split and cv = LeaveOneOut():

# Fit the baselines and the random forest, and print/plot their train/test metrics.
get_model_3_predicrions(X_train, X_test, y_train, y_test)

Fitting 61 folds for each of 528 candidates, totalling 32208 fits
