In [1]:
import pickle
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection import RFE
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
import random
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

## functions

In [2]:
class RemoveCorrelationTransformer(BaseEstimator, TransformerMixin):
    """Drop columns that are strongly correlated with an earlier column.

    During ``fit`` the pairwise Pearson correlation matrix of ``X`` is
    computed. A column is kept only if its absolute correlation with every
    column positioned *before* it is at most ``correlation_threshold``; the
    first column is therefore always kept. ``transform`` selects the kept
    columns by label and returns them as a NumPy array.

    Parameters
    ----------
    correlation_threshold : float, default=0.7
        Absolute Pearson correlation above which a later column is dropped.

    Attributes
    ----------
    un_corr_idx : pandas.Index
        Labels of the columns retained by ``fit``.
    """

    def __init__(self, correlation_threshold=0.7):
        self.correlation_threshold = correlation_threshold

    def fit(self, X, Y=None):
        """Determine which columns of ``X`` to keep; ``Y`` is ignored."""
        corr = pd.DataFrame(X).corr(method='pearson', min_periods=1)

        # Hide the diagonal and everything below it, so each column is only
        # compared against the columns that precede it.
        size = len(corr)
        at_or_below_diagonal = np.tri(size, dtype=bool)
        earlier_only = corr.mask(at_or_below_diagonal).abs()

        # True for a column with no earlier column above the threshold.
        exceeds = earlier_only > self.correlation_threshold
        keep = ~exceeds.any()

        # Explicit label lookup of the series by its own index, then a
        # boolean selection of the labels flagged True.
        keep_by_label = keep[keep.index]
        self.un_corr_idx = keep.loc[keep_by_label == True].index
        return self

    def transform(self, X, Y=None):
        """Return the columns selected in ``fit`` as a NumPy array."""
        return pd.DataFrame(X)[self.un_corr_idx].values

In [251]:
def refactor_labels(df):
    """Replace group names with binary codes.

    ``'low'`` becomes 0; ``'high'`` and ``'clinical'`` both become 1.
    Values not listed in the mapping are left as they are.
    """
    label_codes = {'low': 0, 'high': 1, 'clinical': 1}
    return df.replace(label_codes)


def get_data(file_name, LSAS_threshold=None):
    """Load features and binary labels from sheet ``'Sheet1'`` of a workbook.

    Parameters
    ----------
    file_name : str or path-like
        Excel file passed to ``pd.read_excel``. It must contain the columns
        ``'group'``, ``'Subject_Number'`` and ``'LSAS'``.
    LSAS_threshold : number, optional
        If ``None`` (default), labels come from the ``'group'`` column via
        ``refactor_labels``. Otherwise a row is labelled 1 when its
        ``'LSAS'`` value is strictly greater than the threshold, else 0.

    Returns
    -------
    X : pandas.DataFrame
        Every column except ``'group'``, ``'Subject_Number'`` and ``'LSAS'``.
    Y : pandas.Series
        The binary labels, one per row of ``X``.
    """
    group_column = 'group'
    sub_num_col = 'Subject_Number'
    lsas_col = 'LSAS'
    df = pd.read_excel(file_name, sheet_name='Sheet1')

    # Both labelling modes use the same features. `columns=` replaces the
    # positional `axis` argument (`drop(labels, 1)`), which pandas 2.0 rejects.
    X = df.drop(columns=[group_column, sub_num_col, lsas_col])

    if LSAS_threshold is None:
        Y = refactor_labels(df[group_column])
    else:
        Y = pd.Series(np.where(df[lsas_col] > LSAS_threshold, 1, 0))
    return X, Y


## the model
![image.png](attachment:image.png)

## LSAS threshold 50

In [252]:
# Held-out test sheet, and training data labelled 1 where LSAS > 50.
test = pd.read_excel("test_set_updated.xlsx")
file_name = "training_set_100_updated.xlsx"
X_full_training_set, y_full_training_set = get_data(file_name, LSAS_threshold=50)

# Put the feature columns into a fixed pseudo-random order (seeded, so the
# order is the same on every run) and apply that order to both sets.
columns_shuffled = list(X_full_training_set.columns)
random.seed(217828)
random.shuffle(columns_shuffled)

X_full_training_set = X_full_training_set.loc[:, columns_shuffled]
X_test = test.loc[:, columns_shuffled]

In [253]:
# Pipeline: drop correlated columns -> recursive feature elimination down to
# 11 features (ranked by a random forest) -> gradient-boosting classifier.
# Both tree ensembles are stochastic, so they get a fixed random_state (the
# same seed used for the column shuffle) to make predictions repeatable.
pipe = Pipeline([
    ('correlation_threshold', RemoveCorrelationTransformer(correlation_threshold=0.7)),
    ('rfc', RFE(RandomForestClassifier(n_estimators=100, random_state=217828),
                n_features_to_select=11)),
    ('classifier', GradientBoostingClassifier(max_depth=6, n_estimators=400,
                                              learning_rate=0.05, random_state=217828)),
])

In [None]:
# Fit on the full training set, then print one (subject number, prediction)
# tuple per row of the test set.
model = pipe.fit(X_full_training_set, y_full_training_set)
y_pred = model.predict(X_test)
for subject_and_prediction in zip(test['Subject_Number'], y_pred):
    print(subject_and_prediction)

## LSAS threshold 63

In [257]:
# Held-out test sheet, and training data labelled 1 where LSAS > 63.
test = pd.read_excel("test_set_updated.xlsx")
file_name = "training_set_100_updated.xlsx"
X_full_training_set, y_full_training_set = get_data(file_name, LSAS_threshold=63)

# Put the feature columns into a fixed pseudo-random order (seeded, so the
# order is the same on every run) and apply that order to both sets.
columns_shuffled = list(X_full_training_set.columns)
random.seed(217828)
random.shuffle(columns_shuffled)

X_full_training_set = X_full_training_set.loc[:, columns_shuffled]
X_test = test.loc[:, columns_shuffled]

11

In [223]:

# Pipeline: drop correlated columns -> recursive feature elimination down to
# 11 features (ranked by a random forest) -> gradient-boosting classifier.
# Both tree ensembles are stochastic, so they get a fixed random_state (the
# same seed used for the column shuffle) to make predictions repeatable.
pipe = Pipeline([
    ('correlation_threshold', RemoveCorrelationTransformer(correlation_threshold=0.7)),
    ('rfc', RFE(RandomForestClassifier(n_estimators=100, random_state=217828),
                n_features_to_select=11)),
    ('classifier', GradientBoostingClassifier(max_depth=6, n_estimators=400,
                                              learning_rate=0.05, random_state=217828)),
])


Unnamed: 0,Ratio N/D+N,STD_fixation_length_All,average_fixation_length_Neutral,avg_of_sum_fixation_length_Disgusted,avg_of_amount_fixation_Disgusted,STD_fixation_length_Disgusted,avg_of_amount_fixation_White_Space,STD_pupil_size_Neutral,average_fixation_length_Disgusted,avg_of_amount_fixation_Neutral,...,STD_pupil_size_White_Space,average_pupil_size_Disgusted,average_pupil_size_Neutral,avg_of_sum_fixation_length_Neutral,Ratio D/D+N,STD_fixation_length_Neutral,STD_pupil_size_All,var_ratio_D_DN,average_pupil_size_White_Space,avg_of_sum_fixation_length_White_Space
0,0.496015,75.055419,191.436098,2246.618333,11.550000,71.328962,1.733333,0.197914,194.512410,10.250000,...,0.252251,4.499711,4.509919,1962.220000,0.503985,82.196263,0.202007,0.000000,4.370192,305.108333
1,0.518871,210.075033,279.476804,1414.330508,5.457627,174.141364,0.423729,0.264999,259.147516,6.576271,...,0.271116,5.652174,5.676031,1837.915254,0.481129,239.976088,0.272918,0.000000,5.464000,82.738983
2,0.585817,221.586391,319.456314,1212.121667,5.366667,111.251170,1.900000,0.290417,225.861180,9.766667,...,0.290646,3.909317,3.923208,3120.023333,0.414183,269.392021,0.319898,0.000000,3.561404,440.491667
3,0.498730,215.471895,294.974834,2075.338333,7.000000,214.596071,1.683333,0.244663,296.476905,7.550000,...,0.272545,4.272381,4.266004,2227.060000,0.501270,227.290297,0.254802,0.000000,4.127723,403.295000
4,0.499881,188.072859,283.180610,2134.308333,7.533333,195.725395,1.883333,0.349628,283.315265,7.650000,...,0.376116,3.675442,3.659259,2166.331667,0.500119,191.703555,0.349578,0.000000,3.468142,398.418333
5,0.504969,75.396003,186.787857,1379.441667,7.533333,76.715333,0.883333,0.371299,183.111726,7.000000,...,0.501851,5.296018,5.245714,1307.515000,0.495031,75.793764,0.374754,0.000000,5.094340,134.830000
6,0.499836,170.541398,281.705765,2109.483333,7.483333,172.165181,1.116667,0.330235,281.890869,7.516667,...,0.308649,3.665256,3.632151,2117.488333,0.500164,172.008167,0.331413,0.000000,3.489552,238.038333
7,0.491849,190.004682,266.114824,1891.923729,6.881356,180.233932,1.389831,0.260338,274.934729,7.203390,...,0.281652,4.345567,4.363529,1916.928814,0.508151,203.960210,0.276320,0.000000,4.148780,358.854237
8,0.539677,143.076992,245.116134,1568.055000,7.500000,96.335885,2.050000,0.249919,209.074000,9.916667,...,0.297230,4.150222,4.172605,2430.735000,0.460323,164.932210,0.262495,0.000000,4.047154,455.035000
9,0.497630,162.561438,249.901082,1988.818333,7.883333,161.752047,1.833333,0.257651,252.281395,7.700000,...,0.288612,3.752220,3.753463,1924.238333,0.502370,173.574480,0.263631,0.000000,3.708182,418.115000


In [None]:
# Fit on the full training set, then print one (subject number, prediction)
# tuple per row of the test set.
model = pipe.fit(X_full_training_set, y_full_training_set)
y_pred = model.predict(X_test)
for subject_and_prediction in zip(test['Subject_Number'], y_pred):
    print(subject_and_prediction)