In [1]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
import pandas as pd
import os
import numpy as np
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import LeaveOneOut
from imblearn.over_sampling import SMOTE, ADASYN
#from imblearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import random


In [2]:
class RemoveCorrelationTransformer2(BaseEstimator, TransformerMixin):
    """Drop features that are strongly correlated with an earlier feature.

    During ``fit`` the absolute Pearson correlation of every column pair is
    computed. A column is discarded when its correlation with any column that
    precedes it exceeds ``correlation_threshold``; the first column of each
    correlated group is therefore the one that survives.

    Parameters
    ----------
    correlation_threshold : float, default=0.7
        Absolute correlation strictly above which the later column of a pair
        is removed.
    """

    def __init__(self, correlation_threshold=0.7):
        self.correlation_threshold = correlation_threshold

    def fit(self, X, Y=None):
        """Record the labels of the columns to keep; ``Y`` is ignored."""
        corr = pd.DataFrame(X).corr(method='pearson', min_periods=1)

        # Blank out the diagonal and everything below it, so each column is
        # only compared against the columns that come before it.
        lower_and_diagonal = np.tril(np.ones([len(corr)] * 2, dtype=bool))
        upper_abs = corr.mask(lower_and_diagonal).abs()

        has_strong_partner = (upper_abs > self.correlation_threshold).any()
        keep = ~has_strong_partner
        self.un_corr_idx = keep[keep].index
        return self

    def transform(self, X, Y=None):
        """Return the retained columns of ``X`` as a NumPy array."""
        return pd.DataFrame(X)[self.un_corr_idx].values

In [3]:

class RemoveCorrelationTransformer(BaseEstimator, TransformerMixin):
    """Replace mutually correlated features with a smaller set of PCA components.

    ``fit`` finds every column whose absolute Pearson correlation with at least
    one other column exceeds ``correlation_threshold`` and fits a PCA on those
    columns. ``transform`` drops the same columns from its input and appends
    the PCA projection of *that input* as ``pca_0 .. pca_{n-1}``.

    Parameters
    ----------
    correlation_threshold : float, default=0.7
        Absolute correlation strictly above which a column counts as correlated.
    pca_components_ratio : int, default=3
        The number of PCA components is ``len(correlated columns) // ratio``.
        When that is 0 the correlated columns are dropped without replacement.

    Attributes set by ``fit``
    -------------------------
    outliares_corr : DataFrame
        Masked correlation matrix; its columns are the correlated features.
    correlated_df : DataFrame
        PCA components of the training rows (kept for inspection only).
    pca_ : PCA or None
        The fitted PCA, or None when there is nothing to project.
    pca_columns_ : list of str
        Names given to the appended component columns.
    """

    def __init__(self, correlation_threshold=0.7, pca_components_ratio=3):
        self.correlation_threshold = correlation_threshold
        self.pca_components_ratio = pca_components_ratio

    def fit(self, X, Y=None):
        """Identify the correlated columns and fit the PCA on them; ``Y`` is ignored."""
        df = pd.DataFrame(X)
        df_corr = df.corr(method='pearson')
        # Zero the diagonal so a column's correlation with itself never counts.
        df_corr = df_corr - np.eye(df.shape[1])
        outliares_corr = df_corr[np.abs(df_corr) > self.correlation_threshold]
        self.outliares_corr = outliares_corr.dropna(axis=1, how='all')

        correlated_cols = self.outliares_corr.columns
        n_components = len(correlated_cols) // self.pca_components_ratio
        self.pca_columns_ = ["pca_{}".format(i) for i in range(n_components)]

        if n_components < 1:
            # Nothing to project (no correlated columns, or too few for even one
            # component): skip PCA instead of fitting it on an empty matrix.
            self.pca_ = None
            self.correlated_df = pd.DataFrame(index=df.index)
            return self

        self.pca_ = PCA(n_components=n_components)
        components = self.pca_.fit_transform(df[correlated_cols])
        self.correlated_df = pd.DataFrame(
            components, columns=self.pca_columns_, index=df.index)
        return self

    def transform(self, X, Y=None):
        """Drop the correlated columns of ``X`` and append their PCA projection.

        Returns a DataFrame indexed like ``X``.
        """
        df = pd.DataFrame(X)
        correlated_cols = self.outliares_corr.columns
        remaining = df.drop(correlated_cols, axis=1)
        if self.pca_ is None:
            return remaining

        # Project the rows being transformed with the PCA learned in fit();
        # re-using the components cached from the training rows would attach
        # training values (or NaN on index mismatch) to unseen data.
        components = pd.DataFrame(
            self.pca_.transform(df[correlated_cols]),
            columns=self.pca_columns_,
            index=df.index,
        )
        return remaining.join(components)

In [4]:

class RemoveMissingFeaturesTransformer(BaseEstimator, TransformerMixin):
    """Drop every feature column that contains at least one missing value.

    Attributes set by ``fit``
    -------------------------
    is_missing : ndarray of bool, one entry per column
        True for columns that had a missing value in the data passed to ``fit``.
    """

    def fit(self, X, Y=None):
        """Record which columns of ``X`` contain missing values; ``Y`` is ignored."""
        # Wrap in a DataFrame so plain arrays work too (ndarray has no .isnull()).
        self.is_missing = pd.DataFrame(X).isnull().values.any(axis=0)
        return self

    def transform(self, X, Y=None):
        """Return ``X`` as an array without the columns flagged as missing.

        Columns flagged during ``fit`` are removed, together with any column
        that has a missing value in ``X`` itself.

        NOTE(review): because columns with gaps in ``X`` are also dropped, the
        output width can differ between calls — confirm downstream steps
        tolerate that.
        """
        copy_x = pd.DataFrame(X)
        # Combine into a local mask: updating self.is_missing here would make
        # every transform() call permanently change what later calls drop.
        drop_mask = self.is_missing | copy_x.isnull().values.any(axis=0)

        copy_x = copy_x.iloc[:, ~drop_mask]

        return copy_x.values


In [5]:
def refactor_labels(df):
    """Encode group labels as binary classes.

    'low' becomes 0; 'high' and 'clinical' both become 1. Values that match
    none of these are left as they are.
    """
    label_codes = {'low': 0, 'high': 1, 'clinical': 1}
    return df.replace(label_codes)


def get_data(file_name, LSAS_threshold=None):
    """Load features and binary labels from the 'Sheet1' sheet of an Excel file.

    Parameters
    ----------
    file_name : str
        Path of the workbook; it must contain the columns 'group',
        'Subject_Number' and 'LSAS'.
    LSAS_threshold : number, optional
        When None, labels come from the 'group' column via ``refactor_labels``.
        Otherwise a row is labelled 1 if its LSAS value is strictly greater
        than the threshold and 0 if not.

    Returns
    -------
    (X, Y) : (DataFrame, Series)
        ``X`` is the sheet without the 'group', 'Subject_Number' and 'LSAS'
        columns; ``Y`` holds the labels, one per row.
    """
    group_column = 'group'
    sub_num_col = 'Subject_Number'
    lsas_col = 'LSAS'
    df = pd.read_excel(file_name, sheet_name='Sheet1')

    # `columns=` instead of a positional axis: pandas 2.0 removed the
    # positional form `drop(labels, 1)`.
    X = df.drop(columns=[group_column, sub_num_col, lsas_col])

    if LSAS_threshold is None:
        Y = refactor_labels(df[group_column])
    else:
        Y = pd.Series(np.where(df[lsas_col] > LSAS_threshold, 1, 0), index=df.index)
    return X, Y


In [15]:
# Candidate feature subsets, taken from "random features copy 7"
# (presumably an earlier notebook or run — confirm the provenance).
# `feature_set` further down is the index into this list that selects the subset to train on.
# NOTE(review): the rows look like reorderings of one set of 14 column names, and some rows
# are exact duplicates (the 7th and 9th, the 8th and 10th). The chosen row is shuffled again
# before use, so confirm whether more than one row is actually needed.
features_lists_ = [
    ['STD_fixation_length_White_Space', 'STD_pupil_size_Neutral', 'STD_pupil_size_White_Space', 'average_fixation_length_Neutral', 'avg_of_amount_fixation_Disgusted', 'average_pupil_size_White_Space', 'STD_fixation_length_Neutral', 'Ratio D/D+N', 'avg_of_sum_fixation_length_Disgusted', 'avg_of_amount_fixation_Neutral', 'average_pupil_size_Disgusted', 'avg_of_sum_fixation_length_Neutral', 'var_ratio_D_DN', 'average_fixation_length_White_Space'],
    ['STD_fixation_length_Neutral', 'average_pupil_size_Disgusted', 'STD_fixation_length_White_Space', 'avg_of_amount_fixation_Disgusted', 'average_fixation_length_Neutral', 'Ratio D/D+N', 'STD_pupil_size_White_Space', 'average_fixation_length_White_Space', 'avg_of_amount_fixation_Neutral', 'STD_pupil_size_Neutral', 'avg_of_sum_fixation_length_Disgusted', 'average_pupil_size_White_Space', 'var_ratio_D_DN', 'avg_of_sum_fixation_length_Neutral'],
    ['average_pupil_size_White_Space', 'avg_of_sum_fixation_length_Disgusted', 'STD_fixation_length_Neutral', 'avg_of_amount_fixation_Neutral', 'STD_pupil_size_White_Space', 'average_fixation_length_Neutral', 'Ratio D/D+N', 'average_fixation_length_White_Space', 'avg_of_amount_fixation_Disgusted', 'avg_of_sum_fixation_length_Neutral', 'var_ratio_D_DN', 'STD_pupil_size_Neutral', 'STD_fixation_length_White_Space', 'average_pupil_size_Disgusted'],
    ['avg_of_amount_fixation_Disgusted', 'var_ratio_D_DN', 'STD_fixation_length_White_Space', 'average_pupil_size_White_Space', 'average_fixation_length_White_Space', 'STD_pupil_size_White_Space', 'STD_fixation_length_Neutral', 'STD_pupil_size_Neutral', 'avg_of_sum_fixation_length_Disgusted', 'avg_of_sum_fixation_length_Neutral', 'average_pupil_size_Disgusted', 'avg_of_amount_fixation_Neutral', 'Ratio D/D+N', 'average_fixation_length_Neutral'],
    ['avg_of_amount_fixation_Neutral', 'average_fixation_length_White_Space', 'avg_of_sum_fixation_length_Disgusted', 'average_pupil_size_White_Space', 'avg_of_sum_fixation_length_Neutral', 'Ratio D/D+N', 'avg_of_amount_fixation_Disgusted', 'STD_pupil_size_White_Space', 'STD_fixation_length_White_Space', 'var_ratio_D_DN', 'average_fixation_length_Neutral', 'STD_fixation_length_Neutral', 'STD_pupil_size_Neutral', 'average_pupil_size_Disgusted'],
    ['avg_of_amount_fixation_Disgusted', 'average_fixation_length_Neutral', 'STD_fixation_length_White_Space', 'STD_pupil_size_Neutral', 'average_pupil_size_Disgusted', 'var_ratio_D_DN', 'STD_pupil_size_White_Space', 'STD_fixation_length_Neutral', 'avg_of_amount_fixation_Neutral', 'Ratio D/D+N', 'avg_of_sum_fixation_length_Neutral', 'avg_of_sum_fixation_length_Disgusted', 'average_pupil_size_White_Space', 'average_fixation_length_White_Space'],
    ['avg_of_amount_fixation_Neutral', 'Ratio D/D+N', 'average_fixation_length_Neutral', 'STD_pupil_size_Neutral', 'var_ratio_D_DN', 'STD_fixation_length_White_Space', 'avg_of_sum_fixation_length_Neutral', 'average_pupil_size_White_Space', 'avg_of_amount_fixation_Disgusted', 'average_pupil_size_Disgusted', 'STD_pupil_size_White_Space', 'STD_fixation_length_Neutral', 'avg_of_sum_fixation_length_Disgusted', 'average_fixation_length_White_Space'],
    ['var_ratio_D_DN', 'avg_of_sum_fixation_length_Disgusted', 'STD_fixation_length_White_Space', 'Ratio D/D+N', 'average_fixation_length_Neutral', 'average_pupil_size_White_Space', 'STD_fixation_length_Neutral', 'avg_of_amount_fixation_Neutral', 'avg_of_amount_fixation_Disgusted', 'STD_pupil_size_White_Space', 'average_pupil_size_Disgusted', 'avg_of_sum_fixation_length_Neutral', 'average_fixation_length_White_Space', 'STD_pupil_size_Neutral'],
    ['avg_of_amount_fixation_Neutral', 'Ratio D/D+N', 'average_fixation_length_Neutral', 'STD_pupil_size_Neutral', 'var_ratio_D_DN', 'STD_fixation_length_White_Space', 'avg_of_sum_fixation_length_Neutral', 'average_pupil_size_White_Space', 'avg_of_amount_fixation_Disgusted', 'average_pupil_size_Disgusted', 'STD_pupil_size_White_Space', 'STD_fixation_length_Neutral', 'avg_of_sum_fixation_length_Disgusted', 'average_fixation_length_White_Space'],
    ['var_ratio_D_DN', 'avg_of_sum_fixation_length_Disgusted', 'STD_fixation_length_White_Space', 'Ratio D/D+N', 'average_fixation_length_Neutral', 'average_pupil_size_White_Space', 'STD_fixation_length_Neutral', 'avg_of_amount_fixation_Neutral', 'avg_of_amount_fixation_Disgusted', 'STD_pupil_size_White_Space', 'average_pupil_size_Disgusted', 'avg_of_sum_fixation_length_Neutral', 'average_fixation_length_White_Space', 'STD_pupil_size_Neutral'],
    ['Ratio D/D+N', 'STD_fixation_length_White_Space', 'avg_of_sum_fixation_length_Disgusted', 'avg_of_amount_fixation_Disgusted', 'average_pupil_size_White_Space', 'STD_pupil_size_Neutral', 'STD_fixation_length_Neutral', 'average_fixation_length_White_Space', 'var_ratio_D_DN', 'avg_of_amount_fixation_Neutral', 'average_fixation_length_Neutral', 'average_pupil_size_Disgusted', 'avg_of_sum_fixation_length_Neutral', 'STD_pupil_size_White_Space'],
    ['avg_of_amount_fixation_Disgusted', 'average_pupil_size_Disgusted', 'STD_pupil_size_Neutral', 'average_fixation_length_Neutral', 'Ratio D/D+N', 'avg_of_sum_fixation_length_Neutral', 'STD_fixation_length_Neutral', 'average_pupil_size_White_Space', 'avg_of_amount_fixation_Neutral', 'STD_fixation_length_White_Space', 'average_fixation_length_White_Space', 'avg_of_sum_fixation_length_Disgusted', 'var_ratio_D_DN', 'STD_pupil_size_White_Space']
    
]

## get training data

In [16]:
# Excel workbook with the training data; the path is resolved relative to the
# notebook's working directory when get_data() reads it.
file_name = "training_set_100.xlsx"

In [17]:
# With a threshold given, get_data() does not use the 'group' column for labelling:
# a row is labelled 1 when its LSAS is strictly above 50 and 0 otherwise.
# 'group', 'Subject_Number' and 'LSAS' are all removed from the feature frame.
X_train, y_train = get_data(file_name, LSAS_threshold = 50)

## shuffle

In [18]:
# Seed both global RNGs the notebook depends on. `random` drives random.shuffle()
# below; scikit-learn falls back to NumPy's global RNG wherever random_state is
# left unset (train_test_split, RandomForestClassifier), so seeding `random`
# alone leaves the split and the forests different on every run.
random.seed(217828)
np.random.seed(217828)

In [19]:
# Show every feature column returned by get_data() before narrowing to a subset.
X_train.columns

Index(['avg_of_sum_fixation_length_Disgusted',
       'avg_of_sum_fixation_length_Neutral',
       'avg_of_sum_fixation_length_White_Space',
       'average_fixation_length_Disgusted', 'average_fixation_length_Neutral',
       'average_fixation_length_White_Space',
       'avg_of_amount_fixation_Disgusted', 'avg_of_amount_fixation_Neutral',
       'avg_of_amount_fixation_White_Space', 'STD_fixation_length_Disgusted',
       'STD_fixation_length_Neutral', 'STD_fixation_length_White_Space',
       'STD_fixation_length_All', 'Ratio D/D+N', 'Ratio N/D+N',
       'var_ratio_D_DN', 'average_pupil_size_Disgusted',
       'average_pupil_size_Neutral', 'average_pupil_size_White_Space',
       'average_pupil_size_All', 'STD_pupil_size_Disgusted',
       'STD_pupil_size_Neutral', 'STD_pupil_size_White_Space',
       'STD_pupil_size_All', 'mean_different_AOI_per_trial'],
      dtype='object')

In [20]:
# Position in features_lists_ of the feature subset to train on.
feature_set = 0

In [21]:

# Copy the chosen list so that shuffling it in place leaves features_lists_ untouched.
columns_shuffled = list(features_lists_[feature_set])

In [22]:
# Put the selected columns in a random order (uses the `random` module's global
# state), then keep only those columns.
# NOTE(review): X_train is rebound here, and each run of this cell advances the RNG,
# so re-running the cell without re-seeding produces a different column order.
random.shuffle(columns_shuffled)
X_train = X_train[columns_shuffled]

In [23]:
# Sanity check: number of feature columns left after subsetting.
len(X_train.columns)

14

In [24]:
# Set aside a stratified 10% of the rows as a holdout for the final check.
# random_state is pinned so the same rows are held out after every kernel restart;
# without it the holdout (and every score computed on it) changes from run to run.
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(
    X_train, y_train, test_size=0.1, stratify=y_train, random_state=217828)


## EDA 

In [25]:
# (rows, feature columns) of the full training frame after feature subsetting.
X_train.shape

(100, 14)

## get test data

In [26]:
# Load the separate test workbook and strip the subject identifier column.
# Note: this rebinds `file_name` and `df`, which earlier cells also use.
file_name = "full_test_set.xlsx"
df = pd.read_excel(file_name, sheet_name='Sheet1')
X_test = df.drop(columns=['Subject_Number'])

## training pipeline
 


In [30]:
# Three-stage pipeline: correlation filter -> recursive feature elimination driven
# by a 100-tree random forest -> XGBoost classifier. The step names are referenced
# by the parameter grid, so they must not be renamed on their own.
# (A RemoveMissingFeaturesTransformer first step named "rnf" is currently disabled.)
pipe = Pipeline(steps=[
    ('correlation_threshold', RemoveCorrelationTransformer2()),
    ('rfc', RFE(estimator=RandomForestClassifier(n_estimators=100))),
    ('classifier', XGBClassifier()),
])

In [35]:
# Search space for the grid search. Each key is "<pipeline step>__<parameter>",
# matching the step names used in `pipe`. The value lists multiply out to
# 2 * 3 * 2 * 2 * 2 * 2 * 3 = 288 parameter combinations.
# A correlation threshold of 1 is effectively "no filtering": the filter only
# drops a column when |r| is strictly greater than the threshold.
params_grid = [
    dict(
        correlation_threshold__correlation_threshold=[0.8, 1],
        rfc__n_features_to_select=[10, 12, 14],
        classifier__min_child_weight=[1],
        classifier__gamma=np.arange(0.2, 1, 0.4),
        classifier__subsample=[0.99, 0.9],
        classifier__colsample_bytree=[0.7],
        classifier__max_depth=[3, 5],
        classifier__reg_alpha=[0.5],
        classifier__reg_lambda=[0.2],
        classifier__learning_rate=[0.05, 0.1],
        classifier__n_estimators=[70, 95, 150],
    )
]

## grid search

In [36]:
# Exhaustive search scored by leave-one-out accuracy. Every parameter combination
# is refitted once per training row, so the work is spread over all CPU cores
# (n_jobs=-1); run serially, this cell is slow enough that it had to be interrupted.
# NOTE(review): worker processes do not share the notebook's global RNG state; set
# random_state on the estimators if bit-for-bit repeatable search results are needed.
cv = LeaveOneOut()
gs = GridSearchCV(pipe, params_grid, cv=cv, scoring='accuracy', n_jobs=-1)
gs.fit(X_train_2, y_train_2)

KeyboardInterrupt: 

In [None]:
# Parameter combination with the best cross-validated score in the search.
gs.best_params_

In [None]:
# Mean leave-one-out accuracy achieved by that best combination.
gs.best_score_

## validate grid search score LOO

In [None]:
# Re-estimate the tuned pipeline's leave-one-out accuracy on the tuning split
# k times and report the average of the k mean accuracies. The leave-one-out
# splits are the same every time, so differences between repeats come only from
# randomness inside the estimators.
k = 30
loo = LeaveOneOut()
results = []
for i in range(k):
    score = cross_val_score(gs.best_estimator_, X_train_2, y_train_2, cv=loo)
    results.append(score.mean())
    print(i)  # progress indicator
sum(results)/k

## validate grid search score 10 folds

In [None]:
# Repeated stratified 10-fold CV of the tuned pipeline on the tuning split:
# k repeats, each with its own shuffled partition, averaged at the end.
results = []
k = 30
for i in range(k):
    # Shuffle with a per-repeat seed. Without shuffling, StratifiedKFold yields the
    # same folds on every repeat, so the repeats would not sample different partitions.
    cv = StratifiedKFold(10, shuffle=True, random_state=i)
    score = cross_val_score(gs.best_estimator_, X_train_2, y_train_2, cv=cv)
    results.append(score.mean())
    print(i)  # progress indicator
sum(results)/k

## holdout

In [None]:
# Fit the tuned pipeline on the tuning split before scoring the held-out rows.
# NOTE(review): fit() is called on gs.best_estimator_ itself, not on a copy, and
# the grid search has presumably already refitted it on the same data — confirm
# that this extra fit is intended.
model = gs.best_estimator_.fit(X_train_2, y_train_2)

In [None]:
# Predict labels for the 10% of rows held out by train_test_split.
y_pred = model.predict(X_test_2)

In [None]:
# Holdout accuracy. accuracy_score expects (y_true, y_pred): ground truth first.
accuracy_score(y_test_2, y_pred)

## scores on full set LOO

In [None]:
# Leave-one-out accuracy of the tuned pipeline on the full training frame,
# repeated k times and averaged.
# NOTE(review): X_train includes the rows the hyper-parameters were tuned on, so
# this figure is not an independent estimate.
k = 30
loo = LeaveOneOut()
results = []
for i in range(k):
    score = cross_val_score(gs.best_estimator_, X_train, y_train, cv=loo)
    results.append(score.mean())
    print(i)  # progress indicator
sum(results)/k

## scores on full set 10 folds

In [None]:
# Repeated stratified 10-fold CV of the tuned pipeline on the full training frame:
# k repeats, each with its own shuffled partition, averaged at the end.
# NOTE(review): X_train includes the rows the hyper-parameters were tuned on, so
# this figure is not an independent estimate.
results = []
k = 30
for i in range(k):
    # Shuffle with a per-repeat seed. Without shuffling, StratifiedKFold yields the
    # same folds on every repeat, so the repeats would not sample different partitions.
    cv = StratifiedKFold(10, shuffle=True, random_state=i)
    score = cross_val_score(gs.best_estimator_, X_train, y_train, cv=cv)
    results.append(score.mean())
    print(i)  # progress indicator
sum(results)/k