In [1]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
import pandas as pd
import os
import numpy as np
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE, ADASYN
#from imblearn.pipeline import Pipeline
from xgboost import XGBClassifier

In [2]:
class RemoveCorrelationTransformer2(BaseEstimator, TransformerMixin):
    def __init__(self, correlation_threshold=0.7):
        self.correlation_threshold = correlation_threshold


    def fit(self, X, Y=None):
        df = pd.DataFrame(X)
        df_corr = df.corr(method='pearson', min_periods=1)
        df_not_correlated = ~(df_corr.mask(
            np.tril(np.ones([len(df_corr)] * 2, dtype=bool))).abs() > self.correlation_threshold).any()
        self.un_corr_idx = df_not_correlated.loc[df_not_correlated[df_not_correlated.index] == True].index
        return self

    def transform(self, X, Y=None):
        df = pd.DataFrame(X)
        df = df[self.un_corr_idx]
        return df.values

In [3]:

class RemoveMissingFeaturesTransformer(BaseEstimator, TransformerMixin):

    def fit(self, X, Y=None):
        self.is_missing = X.isnull().values.any(axis=0)
        return self

    def transform(self, X, Y=None):
        copy_x = pd.DataFrame(X)
        self.is_missing += copy_x.isnull().values.any(axis=0)

        copy_x = copy_x.iloc[:, ~self.is_missing]

        return copy_x.values


In [4]:
def refactor_labels(df):
    return df.replace({'low': 0 ,'high': 1, 'clinical': 1 })


def get_data(file_name, LSAS_threshold=None):
    group_column = 'group'
    sub_num_col = 'Subject_Number'
    lsas_col = 'LSAS'
    df = pd.read_excel(file_name, sheet_name='Sheet1')
    if LSAS_threshold is None:
        X = df.drop([group_column, sub_num_col, lsas_col], 1)
        Y = refactor_labels(df[group_column])
        return X, Y
    else:
        X = df.drop([group_column], 1)
        Y = pd.Series(np.where(X[lsas_col] > LSAS_threshold, 1, 0))
        X = X.drop([sub_num_col, lsas_col], 1)
        return X, Y


## get training data

In [5]:
file_name = "training set with sum, amount no reg no norm 2019-02-25.xlsx"

In [6]:
X_train, y_train = get_data(file_name, LSAS_threshold = 50)

## get test data

In [7]:
len(y_train)

81

In [8]:
file_name = "full_test_set_no_reg_features_no_norm.xlsx"
df = pd.read_excel(file_name, sheet_name='Sheet1')
X_test = df.drop('Subject_Number', axis = 1)

## training pipeline

In [35]:
pipe =  Pipeline([
    ("rnf", RemoveMissingFeaturesTransformer()), 
                ('scaling', MinMaxScaler()),
            ('correlation_threshold', RemoveCorrelationTransformer2(correlation_threshold=0.8)),
           # ('rfc', RFE(RandomForestClassifier(n_estimators = 100), n_features_to_select = 20)),
    ('pca', PCA(n_components=0.9)),
                ('classifier', XGBClassifier(
    learning_rate =0.15, 
    n_estimators=100,
    gamma=0.5,
    subsample=0.9,
    colsample_bytree=0.7,
    max_depth = 4,
    min_child_weight = 0.1,
                    reg_alpha =0, 
                    reg_lambda =0,
                    scale_pos_weight = 0.5
                    
))])

## train the model

In [36]:
pipe.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('rnf', RemoveMissingFeaturesTransformer()), ('scaling', MinMaxScaler(copy=True, feature_range=(0, 1))), ('correlation_threshold', RemoveCorrelationTransformer2(correlation_threshold=0.8)), ('pca', PCA(copy=True, iterated_power='auto', n_components=0.9, random_state=None,
  svd_solver='auto',..._alpha=0,
       reg_lambda=0, scale_pos_weight=0.5, seed=None, silent=True,
       subsample=0.9))])

## create predctions

In [37]:
yy = pipe.predict(X_train)


In [38]:
y_pred = pipe.predict(X_test)

In [39]:
y_pred

array([1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1])

In [40]:
[i for i in zip(df["Subject_Number"], y_pred)]

[(1, 1),
 (2, 1),
 (3, 1),
 (5002, 0),
 (5004, 1),
 (5005, 0),
 (5006, 0),
 (5007, 0),
 (5009, 0),
 (5010, 0),
 (5012, 0),
 (5013, 0),
 (5014, 0),
 (5015, 0),
 (5017, 0),
 (5018, 0),
 (5019, 0),
 (5020, 1),
 (5021, 0),
 (5022, 0),
 (5027, 1)]

In [15]:
cols = df.columns[1:]

In [16]:
len(cols)

25

In [17]:
for i, j in zip(cols, pipe.steps[3][1].ranking_):
    if j==1:
        print(i)

AttributeError: 'PCA' object has no attribute 'ranking_'

In [None]:
df["Subject_Number"]