In [18]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
import pandas as pd
import os
import numpy as np
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import LeaveOneOut
from imblearn.over_sampling import SMOTE, ADASYN
#from imblearn.pipeline import Pipeline


In [19]:
class RemoveCorrelationTransformer2(BaseEstimator, TransformerMixin):
    """Drop features that are strongly correlated with an earlier feature.

    A column is discarded when the absolute Pearson correlation between it
    and any column positioned before it exceeds ``correlation_threshold``.
    The comparison is made against every earlier column, including earlier
    columns that are themselves discarded.

    Parameters
    ----------
    correlation_threshold : float, default=0.7
        Absolute correlation above which a column is considered redundant.
    """

    def __init__(self, correlation_threshold=0.7):
        self.correlation_threshold = correlation_threshold

    def fit(self, X, Y=None):
        """Learn which columns of ``X`` to keep; ``Y`` is ignored.

        Stores the labels of the retained columns in ``self.un_corr_idx``
        and returns ``self``.
        """
        frame = pd.DataFrame(X)
        corr = frame.corr(method='pearson', min_periods=1)

        # Keep only the strictly-upper triangle so that each column is compared
        # with the columns that precede it (and never with itself).
        n_features = len(corr)
        strictly_upper = np.triu(np.ones((n_features, n_features), dtype=bool), k=1)
        pairwise = corr.where(strictly_upper).abs()

        # NaN correlations compare as False, so such columns are kept.
        has_strong_predecessor = (pairwise > self.correlation_threshold).any()
        keep_flags = ~has_strong_predecessor
        self.un_corr_idx = keep_flags[keep_flags].index
        return self

    def transform(self, X, Y=None):
        """Return the retained columns of ``X`` as a NumPy array; ``Y`` is ignored."""
        return pd.DataFrame(X)[self.un_corr_idx].values

In [20]:

class RemoveMissingFeaturesTransformer(BaseEstimator, TransformerMixin):
    """Drop every feature column that has a missing value in the fitting data.

    The set of dropped columns is learned once in ``fit`` and applied
    unchanged by ``transform``, so the output has the same number of columns
    for every dataset that is transformed.
    """

    def fit(self, X, Y=None):
        """Record which columns of ``X`` contain at least one missing value.

        Parameters
        ----------
        X : DataFrame or 2-D array-like
            Data used to decide which columns are dropped.
        Y : ignored

        Returns
        -------
        self
        """
        # Wrapping in a DataFrame lets fit accept NumPy arrays as well as
        # DataFrames, matching what transform accepts.
        self.is_missing = pd.DataFrame(X).isnull().values.any(axis=0)
        return self

    def transform(self, X, Y=None):
        """Return ``X`` without the columns flagged during ``fit``.

        Missing values that appear only in ``X`` (in columns that were complete
        during ``fit``) are passed through unchanged.

        Parameters
        ----------
        X : DataFrame or 2-D array-like
            Must have the same number of columns as the data passed to ``fit``.
        Y : ignored

        Returns
        -------
        numpy.ndarray
            The retained columns of ``X``.

        Raises
        ------
        ValueError
            If ``X`` does not have the same number of columns as the fit data.
        """
        copy_x = pd.DataFrame(X)
        if copy_x.shape[1] != self.is_missing.shape[0]:
            raise ValueError(
                "X has %d columns but the transformer was fitted on %d"
                % (copy_x.shape[1], self.is_missing.shape[0])
            )

        # The fitted mask is applied as-is and never updated here: folding the
        # missingness of X into it would change the fitted state on every call
        # and could yield fewer columns for held-out data than the downstream
        # steps were fitted on.
        return copy_x.iloc[:, ~self.is_missing].values


In [21]:
def refactor_labels(df):
    """Return a copy of ``df`` with group names replaced by binary codes.

    ``'low'`` becomes 0; ``'high'`` and ``'clinical'`` both become 1.
    Any other value is left unchanged.
    """
    label_codes = {'low': 0, 'high': 1, 'clinical': 1}
    return df.replace(label_codes)


def get_data(file_name, LSAS_threshold=None):
    """Load features and binary labels from sheet ``Sheet1`` of an Excel file.

    Parameters
    ----------
    file_name : str
        Path of the workbook to read.
    LSAS_threshold : number, optional
        When ``None`` (default) the labels come from the ``group`` column,
        mapped through ``refactor_labels``. Otherwise the label is 1 where
        ``LSAS`` is strictly greater than the threshold and 0 elsewhere.

    Returns
    -------
    X : pandas.DataFrame
        All columns except ``group``, ``Subject_Number`` and ``LSAS``.
    Y : pandas.Series
        The labels, one per row of ``X``.
    """
    group_column = 'group'
    sub_num_col = 'Subject_Number'
    lsas_col = 'LSAS'
    df = pd.read_excel(file_name, sheet_name='Sheet1')

    # `columns=` is used instead of a positional axis (`drop(labels, 1)`),
    # which pandas 2.0 no longer accepts.
    X = df.drop(columns=[group_column, sub_num_col, lsas_col])

    if LSAS_threshold is None:
        Y = refactor_labels(df[group_column])
    else:
        Y = pd.Series(np.where(df[lsas_col] > LSAS_threshold, 1, 0))
    return X, Y


## get training data

In [22]:
# Workbook with the labelled training subjects; passed to get_data in the next cell.
file_name = "training_set.xlsx"

In [23]:
# With a threshold given, get_data labels a row 1 when its LSAS value is
# greater than 50 and 0 otherwise; the 'group' column is not used for labels.
X_train, y_train = get_data(file_name, LSAS_threshold = 50)

## get test data

In [24]:
# Read the test workbook and remove only the subject identifier column.
# Note that `file_name` is re-pointed from the training file to the test file.
file_name = "full_test_set.xlsx"
df = pd.read_excel(file_name, sheet_name='Sheet1')
X_test = df.drop(columns='Subject_Number')

## training pipeline

In [25]:
# Preprocessing -> feature selection -> classifier.
# Step names are the prefixes of the `<step>__<param>` keys in a parameter grid.
pipe = Pipeline(steps=[
    ('rnf', RemoveMissingFeaturesTransformer()),
    ('scaling', MinMaxScaler()),
    ('correlation_threshold', RemoveCorrelationTransformer2()),
    # The 'rfc' step is the RFE selector wrapped around a random forest,
    # not a classifier stage of its own.
    ('rfc', RFE(RandomForestClassifier(n_estimators=100))),
    ('classifier', GradientBoostingClassifier(learning_rate=0.05)),
])

In [26]:
# Single-point grid: one value per hyper-parameter, so the search evaluates
# exactly one configuration.
params_grid = [
    {
        'classifier__max_depth': [6],
        'classifier__n_estimators': [400],
        'rfc__n_features_to_select': [11],
    },
]

## train the model

In [27]:
# Run the same leave-one-out search 100 times and collect each run's accuracy.
# The random forest and gradient boosting steps of `pipe` are built without a
# random_state, so the score can differ from one repeat to the next.
results = []
loo = LeaveOneOut()  # takes no parameters, so one instance serves every repeat
for i in range(100):
    gs = GridSearchCV(pipe, params_grid, cv=loo, scoring='accuracy')
    gs.fit(X_train, y_train)
    results.append(gs.best_score_)

In [28]:
# `gs` is reassigned on every loop iteration, so this is the search object
# from the last repeat only.
gs.best_params_

{'classifier__max_depth': 6,
 'classifier__n_estimators': 400,
 'rfc__n_features_to_select': 11}

In [29]:
# Leave-one-out accuracy of the last repeat only (`gs` is reassigned on every
# loop iteration); the scores of all repeats are stored in `results`.
gs.best_score_

0.7088607594936709

## create predictions

In [31]:
# Mean accuracy over all repeats. Dividing by len(results) rather than a
# hard-coded 100 keeps the mean correct if the number of repeats changes.
sum(results) / len(results)

0.714303797468354

In [33]:
# Show the raw per-repeat accuracies as a single string.
repr(results)

'[0.7088607594936709, 0.7215189873417721, 0.7215189873417721, 0.7215189873417721, 0.7088607594936709, 0.6962025316455697, 0.7088607594936709, 0.7215189873417721, 0.7088607594936709, 0.7215189873417721, 0.7215189873417721, 0.6962025316455697, 0.6962025316455697, 0.7088607594936709, 0.7215189873417721, 0.7215189873417721, 0.7088607594936709, 0.7088607594936709, 0.7215189873417721, 0.7215189873417721, 0.6962025316455697, 0.7215189873417721, 0.7341772151898734, 0.7215189873417721, 0.7215189873417721, 0.7088607594936709, 0.7341772151898734, 0.7215189873417721, 0.7215189873417721, 0.7088607594936709, 0.7088607594936709, 0.7088607594936709, 0.7088607594936709, 0.7215189873417721, 0.7468354430379747, 0.6962025316455697, 0.7088607594936709, 0.6962025316455697, 0.7215189873417721, 0.7215189873417721, 0.7215189873417721, 0.6962025316455697, 0.6962025316455697, 0.7088607594936709, 0.6962025316455697, 0.7088607594936709, 0.7215189873417721, 0.6962025316455697, 0.7088607594936709, 0.7215189873417721

In [34]:
# Array form of the per-repeat accuracies, used for the summary statistics
# (std, mean, min, max) in the cells that follow.
r = np.array(results)

In [36]:
# Standard deviation of the accuracy across repeats (NumPy default, ddof=0).
r.std()

0.012174951762064106

In [37]:
# Mean accuracy across repeats.
r.mean()

0.7143037974683546

In [38]:
# Lowest accuracy observed in any repeat.
r.min()

0.6835443037974683

In [39]:
# Highest accuracy observed in any repeat.
r.max()

0.7468354430379747