<h1>IS4242 Group Project</h1>

<b>Import necessary libraries</b>

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import SelectKBest, f_classif, VarianceThreshold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import roc_auc_score, precision_score, recall_score, confusion_matrix, accuracy_score, mean_squared_error
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.pipeline import Pipeline as imPipeline
from imblearn.over_sampling import SMOTE

'''
Ensure that you are in the root folder of all the fold folders and target files
read_text(fold_name):
    fold_name: this is the name of the fold you want to read ALL patient files of. It will be read into a 2 dimensional
    list. If you would like to retrieve just the first patient instead, you will need to change the line 
    "txt_all.extend(txt[1:])" to "txt_all.append(txt[1:])" and you will be to use "read_text(fold1.txt)[0]" to retrieve
    the relevant patient's data
read_ans(file_name):
    file_name: this is the name of the file you want to read ALL targets of. It will be read into a 2 dimensional
    list. To retrieve the first patient's target: read_ans(ans.csv)[0]
put_single_into_dataframe(txt): This functions takes in 2 dimensional list ie the output of read_text(fold1.txt) 
put_multiple_into_dataframe(txt): Multiple is for using it with the output of read_text after you wanted to change it to append
'''

In [2]:
def read_text(fold_name):
    txt_all = list()
    for f in os.listdir(fold_name): # for each file in the directory
        with open(os.path.join(fold_name, f), 'r') as fp: # open each file
            txt = fp.readlines() # read inside the file
        recordid = txt[1].rstrip('\n').split(',')[-1] # get recordid
        txt = [[int(recordid)] + t.rstrip('\n').split(',') for t in txt] # preface each row with the recordid as all patients are 1 file
        txt_all.extend(txt[1:]) # skip the parameter list
    return txt_all

def read_ans(file_name):
    txt_all = list()
    with open(file_name, 'r') as fp: # opens the csv file
        txt = fp.readlines() 
    for i in range(1, len(txt)): # similar to above read_text
        record_id, length_of_stay, hospital_death = txt[i].rstrip('\n').split(',')
        txt_all.append([record_id, length_of_stay, hospital_death])
    return txt_all

def put_multiple_into_dataframe(txt_all):
    df = pd.DataFrame()
    for i in txt_all:
        df2 = pd.DataFrame(i, columns=['recordid', 'time', 'parameter', 'value'])
        df = df.append(df2, ignore_index=True)
    return df

def put_single_into_dataframe(txt_all):
    df = pd.DataFrame(txt_all, columns=['recordid', 'time', 'parameter', 'value'])
    return df

In [3]:
# Reading features
fold1 = put_single_into_dataframe(read_text("./Project_Data/Fold1"))
fold2 = put_single_into_dataframe(read_text("./Project_Data/Fold2"))
fold3 = put_single_into_dataframe(read_text("./Project_Data/Fold3"))
fold4 = put_single_into_dataframe(read_text("./Project_Data/Fold4"))
df_feat = fold1.copy()
df_feat = df_feat.append(fold2)
df_feat = df_feat.append(fold3)
df_feat = df_feat.append(fold4)
df_feat.head()

Unnamed: 0,recordid,time,parameter,value
0,132539,00:00,RecordID,132539
1,132539,00:00,Age,54
2,132539,00:00,Gender,0
3,132539,00:00,Height,-1
4,132539,00:00,ICUType,4


In [4]:
# Reading Target
df_target = pd.DataFrame(read_ans('./Project_Data/Fold1_Outcomes.csv'), columns=['recordid', 'days_in_hospital', 'mortality'])
df_target.head()

Unnamed: 0,recordid,days_in_hospital,mortality
0,132539,5,0
1,132540,8,0
2,132541,19,0
3,132543,9,0
4,132545,4,0


In [5]:

bin_feat = ['MechVent']
num_feat = ['Albumin', 'ALP', 'ALT', 'AST', 'Bilirubin', 'BUN', 'Cholesterol',
           'Creatinine', 'DiasABP', 'FiO2', 'GCS', 'Glucose', 'HCO3', 'HCT',
           'HR', 'K', 'Lactate', 'Mg', 'MAP', 'NA', 'NIDiasABP', 'NIMAP',
           'NISysABP', 'PaCO2', 'PaO2', 'pH', 'Platelets', 'RespRate', 'SaO2',
           'SysABP', 'Temp', 'Tropl', 'TropT', 'Urine', 'WBC', 'Weight']

print("Number of record ids:", len(df_feat['recordid'].unique()))
unique_count = df_feat['parameter'].value_counts()/4000
print(unique_count)

Number of record ids: 4000
HR             57.13450
MAP            36.44000
SysABP         36.41250
DiasABP        36.39175
Urine          34.22950
Weight         32.29125
NISysABP       24.58275
NIDiasABP      24.55250
NIMAP          24.21775
Temp           21.60125
GCS            15.39075
RespRate       13.76275
FiO2            8.09750
MechVent        7.78600
pH              6.08875
PaCO2           5.82325
PaO2            5.81700
HCT             4.56775
K               3.61000
Platelets       3.52600
Creatinine      3.49575
BUN             3.47900
HCO3            3.40325
Mg              3.39750
Na              3.39250
Glucose         3.25525
WBC             3.22750
SaO2            2.04625
Lactate         2.00600
Height          1.00000
ICUType         1.00000
RecordID        1.00000
Gender          1.00000
Age             1.00000
Bilirubin       0.79775
AST             0.79550
ALT             0.79425
ALP             0.77300
Albumin         0.58900
TroponinT       0.53150
TroponinI    

<h2>Analysis of Features</h2>
<p>The data above shows the average number of times a variable observed per patient. Based on the data above and the feature description we classify the features into these categories:
<ul>
    <li>General Descriptors (static data) that are collected when the patient is admitted to the ICU. Weight is not included as weight are measured multiple times as a time series data. Each of the descriptors will be included as a feature into the model.</li>
    <li>Rare features: measured on average less than one time per patient (less than 1.0). We use the <u>existence</u> of these measurements for each patient as a feature.</li>
    <li>Features that measured often or more that one time per patient (more than 1.0). Calculate the hourly average of each measurements and put them into 48 columns. <i>Example, average HR on the first hour to HR_1, average HR on the second hour to HR_2, and so on.</i></li>
</ul>
</p>


In [6]:
stat_feat = ['Age', 'Gender', 'Height', 'ICUType', 'RecordID'] #General Descriptors
rare_feat = []
nor_feat = []
for index, value in unique_count.items():
    if value < 1.0:
        rare_feat.append(index)
    elif index not in stat_feat:
        nor_feat.append(index)
print("Rare features", rare_feat)
print("Normal features", nor_feat)

Rare features ['Bilirubin', 'AST', 'ALT', 'ALP', 'Albumin', 'TroponinT', 'TroponinI', 'Cholesterol']
Normal features ['HR', 'MAP', 'SysABP', 'DiasABP', 'Urine', 'Weight', 'NISysABP', 'NIDiasABP', 'NIMAP', 'Temp', 'GCS', 'RespRate', 'FiO2', 'MechVent', 'pH', 'PaCO2', 'PaO2', 'HCT', 'K', 'Platelets', 'Creatinine', 'BUN', 'HCO3', 'Mg', 'Na', 'Glucose', 'WBC', 'SaO2', 'Lactate']


In [7]:
#Creating dataframe
columns = stat_feat.copy()
columns.extend(rare_feat)
for feat in nor_feat:
    for hour in range(1,49):
        columns.append(feat + '_' +  str(hour))
print(columns)

df = pd.DataFrame(columns=columns)
df = df.set_index('RecordID')
df.head()

['Age', 'Gender', 'Height', 'ICUType', 'RecordID', 'Bilirubin', 'AST', 'ALT', 'ALP', 'Albumin', 'TroponinT', 'TroponinI', 'Cholesterol', 'HR_1', 'HR_2', 'HR_3', 'HR_4', 'HR_5', 'HR_6', 'HR_7', 'HR_8', 'HR_9', 'HR_10', 'HR_11', 'HR_12', 'HR_13', 'HR_14', 'HR_15', 'HR_16', 'HR_17', 'HR_18', 'HR_19', 'HR_20', 'HR_21', 'HR_22', 'HR_23', 'HR_24', 'HR_25', 'HR_26', 'HR_27', 'HR_28', 'HR_29', 'HR_30', 'HR_31', 'HR_32', 'HR_33', 'HR_34', 'HR_35', 'HR_36', 'HR_37', 'HR_38', 'HR_39', 'HR_40', 'HR_41', 'HR_42', 'HR_43', 'HR_44', 'HR_45', 'HR_46', 'HR_47', 'HR_48', 'MAP_1', 'MAP_2', 'MAP_3', 'MAP_4', 'MAP_5', 'MAP_6', 'MAP_7', 'MAP_8', 'MAP_9', 'MAP_10', 'MAP_11', 'MAP_12', 'MAP_13', 'MAP_14', 'MAP_15', 'MAP_16', 'MAP_17', 'MAP_18', 'MAP_19', 'MAP_20', 'MAP_21', 'MAP_22', 'MAP_23', 'MAP_24', 'MAP_25', 'MAP_26', 'MAP_27', 'MAP_28', 'MAP_29', 'MAP_30', 'MAP_31', 'MAP_32', 'MAP_33', 'MAP_34', 'MAP_35', 'MAP_36', 'MAP_37', 'MAP_38', 'MAP_39', 'MAP_40', 'MAP_41', 'MAP_42', 'MAP_43', 'MAP_44', 'MAP_45',

Unnamed: 0_level_0,Age,Gender,Height,ICUType,Bilirubin,AST,ALT,ALP,Albumin,TroponinT,...,Lactate_39,Lactate_40,Lactate_41,Lactate_42,Lactate_43,Lactate_44,Lactate_45,Lactate_46,Lactate_47,Lactate_48
RecordID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [8]:
tot_values = {} # the sum of the values
count = {} # num of occurences of a measurement
for index,row in df_feat.iterrows():
    if row['parameter'] == 'RecordID' and index != 0:
        # count the average of the previous record
        for key,value in tot_values.items():
            tot_values[key] = float(tot_values[key])/count[key]
            
        df = df.append(tot_values, ignore_index=True)
        tot_values.clear()
        count.clear()
        for feat in rare_feat:
            tot_values[feat] = 0;
            count[feat] = 1;
        
    if row['parameter'] in stat_feat:
        tot_values[row['parameter']] = row['value']
        count[row['parameter']] = 1
    elif row['parameter'] in rare_feat:
        tot_values[row['parameter']] = 1
        count[row['parameter']] = 1
    elif row['parameter'] in nor_feat:
        hour = int(row['time'][0:2]) + 1
        if hour == 49: hour-=1
        col = row['parameter'] + '_' + str(hour)
        tot_values[col] = row['value']
        if col in count:
            count[col] = count[col] + 1
        else:
            count[col] = 1
    
# count the average of the previous record
for key,value in tot_values.items():
    tot_values[key] = float(tot_values[key])/count[key]
            
df = df.append(tot_values, ignore_index=True)

In [9]:
df

Unnamed: 0,Age,Gender,Height,ICUType,Bilirubin,AST,ALT,ALP,Albumin,TroponinT,...,Lactate_40,Lactate_41,Lactate_42,Lactate_43,Lactate_44,Lactate_45,Lactate_46,Lactate_47,Lactate_48,RecordID
0,54.0,0.0,-1.0,4.0,,,,,,,...,,,,,,,,,,132539.0
1,76.0,1.0,175.3,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,132540.0
2,44.0,0.0,-1.0,3.0,1.0,1.0,1.0,1.0,1.0,0.0,...,,,,,,,,,,132541.0
3,68.0,1.0,180.3,3.0,1.0,1.0,1.0,1.0,1.0,0.0,...,,,,,,,,,,132543.0
4,88.0,0.0,-1.0,3.0,0.0,0.0,0.0,0.0,1.0,0.0,...,,,,,,,,,,132545.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3992,70.0,0.0,-1.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,142665.0
3993,25.0,1.0,-1.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,142667.0
3994,44.0,1.0,-1.0,3.0,1.0,1.0,1.0,1.0,0.0,0.0,...,,,,,,,,,,142670.0
3995,37.0,1.0,-1.0,3.0,1.0,1.0,1.0,1.0,1.0,0.0,...,,,,,,,,,,142671.0


In [39]:
def preprocess_x_for_design_matrix_3(df_feat):
    df = pd.DataFrame()
    
    tot_values = {} # the sum of the values
    count = {} # num of occurences of a measurement
    for index,row in df_feat.iterrows():
        if row['parameter'] == 'RecordID' and index != 0:
            # count the average of the previous record
            for key,value in tot_values.items():
                tot_values[key] = float(tot_values[key])/count[key]

            df = df.append(tot_values, ignore_index=True)
            tot_values.clear()
            count.clear()
            for feat in rare_feat:
                tot_values[feat] = 0;
                count[feat] = 1;

        if row['parameter'] in stat_feat:
            tot_values[row['parameter']] = row['value']
            count[row['parameter']] = 1
        elif row['parameter'] in rare_feat:
            tot_values[row['parameter']] = 1
            count[row['parameter']] = 1
        elif row['parameter'] in nor_feat:
            hour = int(row['time'][0:2]) + 1
            if hour == 49: hour-=1
            col = row['parameter'] + '_' + str(hour)
            tot_values[col] = row['value']
            if col in count:
                count[col] = count[col] + 1
            else:
                count[col] = 1

    # count the average of the previous record
    for key,value in tot_values.items():
        tot_values[key] = float(tot_values[key])/count[key]

    df = df.append(tot_values, ignore_index=True)
    
    df = df.astype({'RecordID': 'int32', 'ICUType':'int32'})
    
    return df

In [40]:
# import all folds

x_fold = {}
y_fold = {}
for i in range (1,5):
    string = "./Project_Data/Fold"+str(i)
    y_file = "./Project_Data/Fold"+str(i)+"_Outcomes.csv"
    x_fold[i] = preprocess_x_for_design_matrix_3(put_single_into_dataframe(read_text(string)))
    y_fold[i] = pd.read_csv(y_file)

In [88]:
#For MODEL 3

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
num_feat = []
for feat in nor_feat:
    for hour in range(1,49):
        num_feat.append(feat + '_' +  str(hour))
cat_feat = ["ICUType"]

cat_transformer = Pipeline(steps=[('imputer', SimpleImputer(-1, strategy='most_frequent')),
                                 ('OneHotEncoder', OneHotEncoder(handle_unknown='ignore', sparse=False))])

stat_feat_cp = stat_feat.copy()
stat_feat_cp.remove("RecordID")
stat_feat_cp.remove("ICUType")
prepro = ColumnTransformer(
    remainder = 'passthrough',
    transformers=[
        ('cat', cat_transformer, cat_feat),
    ('num', SimpleImputer(-1, strategy='mean'), num_feat),
    ('stat', SimpleImputer(-1, strategy='mean'), stat_feat_cp)])

parameters = {'VarianceThreshold':[0,0.5], 'PCA':[60,70, 150, 200],
              'SelectKBest':[350,400],
              'RFDepth':[3,4,5],
              'RFEst': [100,125,150],
              'KNeighbours':[3,5,7],
              'DT_min_samples_split':[2,4,8],
              'MLPClassifier_hiddenLayer':[(45,45,), (100,), (30,30,30)],
              'DecisionTreeRegressor_min_samples_leaf':[1,3],
              'MLPRegressor_hiddenLayer':[(45,45,), (100,), (30,30,30)]
             }
    
all_list = [1,2,3,4]
for i in range(2,5):
    print("Testing on Fold", i)
    x_train_df = pd.DataFrame()
    y_train_df = pd.DataFrame()
    
    # Getting train data set up
    for j in [x for x in all_list if x != i]: 
#         string = "./Project_Data/Fold"+str(j)
#         y_file = "./Project_Data/Fold"+str(j)+"_Outcomes.csv"
#         x_train_df = x_train_df.append(put_single_into_dataframe(read_text(string)))
#         y_train_df = y_train_df.append(pd.read_csv(y_file))
        x_train_df = x_train_df.append(x_fold[j])
        y_train_df = y_train_df.append(y_fold[j])
#     x_train_df = preprocess_x_for_design_matrix_3(x_train_df)
    y_train_df = y_train_df.drop(['Length_of_stay'], axis=1)
    
    # Replace -1 with NaN
#     x_train_df = x_train_df.replace(-1, np.nan)
    # Replace not known length of stay to 2
    y_train_df = y_train_df.replace(-1, 2)
    
    train_df = x_train_df.merge(y_train_df, on="RecordID", how='outer')
    train_df = train_df.set_index("RecordID")
    train_df = train_df.replace(np.nan, -1)
    X_train = train_df.loc[:, train_df.columns != 'In-hospital_death']
    Y_train = train_df['In-hospital_death']
    
        
    x_test_df = x_train_df.iloc[0:0]
    y_test_df = y_train_df.iloc[0:0]
    # Getting test data set up
    x_test_df = x_test_df.append(x_fold[i])
    y_test_df = y_test_df.append(y_fold[i])
    y_test_df = y_test_df.drop(['Length_of_stay'], axis=1)
     # Replace -1 with NaN
#     x_test_df = x_test_df.replace(-1, np.nan)
    # Replace not known length of stay to 2
    y_test_df = y_test_df.replace(-1, 2)
    
    test_df = x_test_df.merge(y_test_df, on="RecordID", how='outer')
    test_df = test_df.set_index("RecordID")
    test_df = test_df.replace(np.nan, -1)
    X_test = test_df.loc[:, test_df.columns != 'In-hospital_death']
    Y_test = test_df['In-hospital_death']
    
    print(X_train.head())
    
    # Logistic Regression
    for k_val in parameters['SelectKBest']:
        for nN_components in parameters["PCA"]:
            model = imPipeline(steps=[('imputer', prepro),
                                      ('smote', SMOTE()),
                                      ('scaler', StandardScaler()),
                                    ('f_selecter', SelectKBest(k = k_val)),
                                    ('dim_reducer', PCA(n_components = nN_components)),
                                   ('lr', LogisticRegression())])
            model = model.fit(X_train, Y_train)
            y_pred = model.predict(X_test)
            print ( "Using LogisticRegression --> SelectKBest: {} and PCA: {} has a score of {}".format(k_val, nN_components, roc_auc_score(Y_test, y_pred)) )
            
    # Random Forest
    for k_val in parameters['SelectKBest']:
        for nN_components in parameters["PCA"]:
            for est in parameters['RFEst']:
                for depth in parameters['RFDepth']:
                    model = imPipeline(steps=[('imputer', prepro),
                                      ('smote', SMOTE()),
                                      ('scaler', StandardScaler()),
                                            ('f_selecter', SelectKBest(k = k_val)),
                                            ('dim_reducer', PCA(n_components = nN_components)),
                                           ('classifier', RandomForestClassifier(n_estimators=est, max_depth=depth))])
                    model = model.fit(X_train, Y_train)
                    y_pred = model.predict(X_test)
                    print ( "Using Random Forest --> SelectKBest: {}, PCA: {}, RF Estimators: {}, and RF depth: {} has a score of {}".format(k_val, nN_components, est, depth, roc_auc_score(Y_test, y_pred)) )
                    
    # K-Neighbors
    for k_val in parameters['SelectKBest']:
        for nN_components in parameters["PCA"]:
            for k_neigh in parameters['KNeighbours']:
                model = imPipeline(steps=[('imputer', prepro),
                                      ('smote', SMOTE()),
                                      ('scaler', StandardScaler()),
                                        ('f_selecter', SelectKBest(k = k_val)),
                                        ('dim_reducer', PCA(n_components = nN_components)),
                                       ('classifier', KNeighborsClassifier(n_neighbors=k_neigh))])
                model = model.fit(X_train, Y_train)
                y_pred = model.predict(X_test)
                print ( "Using KNeighbors --> SelectKBest: {}, PCA: {}, and K_Neighbours: {} has a score of {}".format(k_val, nN_components, k_neigh, roc_auc_score(Y_test, y_pred)) )
            
    # Decision Tree
    for k_val in parameters['SelectKBest']:
        for nN_components in parameters["PCA"]:
            for min_sample in parameters['DT_min_samples_split']:
                model = imPipeline(steps=[('imputer', prepro),
                                      ('smote', SMOTE()),
                                      ('scaler', StandardScaler()),
                                        ('f_selecter', SelectKBest(k = k_val)),
                                        ('dim_reducer', PCA(n_components = nN_components)),
                                       ('classifier', DecisionTreeClassifier(min_samples_split = min_sample))])
                model = model.fit(X_train, Y_train)
                y_pred = model.predict(X_test)
                print ( "Using Decision Tree --> SelectKBest: {}, PCA: {}, and DT Min_samples_split: {} has a score of {}".format(k_val, nN_components, min_sample, roc_auc_score(Y_test, y_pred)) )
                
    # Gaussian Process
    for k_val in parameters['SelectKBest']:
        for nN_components in parameters["PCA"]:
            model = imPipeline(steps=[('imputer', prepro),
                                      ('smote', SMOTE()),
                                      ('scaler', StandardScaler()),
                                        ('f_selecter', SelectKBest(k = k_val)),
                                        ('dim_reducer', PCA(n_components = nN_components)),
                                       ('classifier', GaussianProcessClassifier())])
            model = model.fit(X_train, Y_train)
            y_pred = model.predict(X_test)
            print ( "Using Gaussian Process --> SelectKBest: {} and PCA: {} has a score of {}".format(k_val, nN_components, roc_auc_score(Y_test, y_pred)) )
            
    # MLP Classifier
    for k_val in parameters['SelectKBest']:
        for nN_components in parameters["PCA"]:
            for hiddenLayer in parameters['MLPClassifier_hiddenLayer']:
                model = imPipeline(steps=[('imputer', prepro),
                                      ('smote', SMOTE()),
                                      ('scaler', StandardScaler()),
                                        ('f_selecter', SelectKBest(k = k_val)),
                                        ('dim_reducer', PCA(n_components = nN_components)),
                                       ('classifier', MLPClassifier(hidden_layer_sizes = hiddenLayer, learning_rate_init = 0.01))])
                model = model.fit(X_train, Y_train)
                y_pred = model.predict(X_test)
                print ( "Using MLP classifier --> SelectKBest: {}, PCA: {}, and Hidden Layer: {} has a score of {}".format(k_val, nN_components, hiddenLayer, roc_auc_score(Y_test, y_pred)) )
                
    # Creating regression model and parameters to try out
    for nThreshold in parameters["VarianceThreshold"]:
        for nN_components in parameters["PCA"]:
            est = Pipeline(steps=[
            ('scaler', scaler),
            ('f_selecter', VarianceThreshold(threshold = nThreshold)),
            ('dim_reducer', PCA(n_components = nN_components)),
            ('classifier', LinearRegression())])
            est.fit(x_train_df, y_linear_train_df)
            prediction = est.predict(x_test_df)
            print ( "Using LinearRegression --> VarianceThreshold: {} and PCA: {} has a score of {}".format(nThreshold, nN_components, mean_squared_error(y_linear_test_df, prediction)) )
    for nThreshold in parameters["SelectKBest"]:
        for nN_components in parameters["PCA"]:
            for min_samples_leaf in parameters["DecisionTreeRegressor_min_samples_leaf"]:
                est = Pipeline(steps=[
                ('scaler', scaler),
                ('f_selecter', SelectKBest(k = 90)),
                ('dim_reducer', PCA(n_components = nN_components)),
                ('classifier', DecisionTreeRegressor(min_samples_leaf = min_samples_leaf))])
                est.fit(x_train_df, y_linear_train_df)
                prediction = est.predict(x_test_df)
                print ( "Using DecisionTreeRegressor({}) --> SelectKBest: 90 and PCA: {} has a score of {}".format(min_samples_leaf, nN_components, mean_squared_error(y_linear_test_df, prediction)) )
    for nThreshold in parameters["VarianceThreshold"]:
        for nN_components in parameters["PCA"]:
            for hiddenLayer in parameters["MLPRegressor_hiddenLayer"]:
                est = Pipeline(steps=[
                ('scaler', scaler),
                ('f_selecter', VarianceThreshold(threshold = nThreshold)),
                ('dim_reducer', PCA(n_components = nN_components)),
                ('classifier', MLPRegressor(hidden_layer_sizes = hiddenLayer, learning_rate_init = 0.01))])
                est.fit(x_train_df, y_linear_train_df)
                prediction = est.predict(x_test_df)
                print ( "Using MLPRegressor({}) --> SelectKBest: 90 and PCA: {} has a score of {}".format(hiddenLayer, nN_components, mean_squared_error(y_linear_test_df, prediction)) )

Testing on Fold 2


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


          ALP  ALT  AST   Age  Albumin  BUN_1  BUN_10  BUN_11  BUN_12  BUN_13  \
RecordID                                                                        
132539   -1.0 -1.0 -1.0  54.0     -1.0   -1.0    -1.0    13.0    -1.0    -1.0   
132540    0.0  0.0  0.0  76.0      0.0   -1.0    -1.0    -1.0    -1.0    -1.0   
132541    1.0  1.0  1.0  44.0      1.0   -1.0    -1.0    -1.0    -1.0    -1.0   
132543    1.0  1.0  1.0  68.0      1.0   23.0    -1.0    -1.0    -1.0    -1.0   
132545    0.0  0.0  0.0  88.0      1.0   -1.0    -1.0    -1.0    -1.0    -1.0   

          ...  pH_44  pH_45  pH_46  pH_47  pH_48  pH_5  pH_6  pH_7  pH_8  pH_9  
RecordID  ...                                                                   
132539    ...   -1.0   -1.0  -1.00   -1.0   -1.0 -1.00  -1.0 -1.00  -1.0  -1.0  
132540    ...   -1.0   -1.0   7.37   -1.0   -1.0  7.34  -1.0  7.36  -1.0  -1.0  
132541    ...   -1.0   -1.0  -1.00   -1.0   -1.0 -1.00  -1.0 -1.00  -1.0  -1.0  
132543    ...   -1.0   -1.0



Using LogisticRegression --> SelectKBest: 350 and PCA: 60 has a score of 0.6691885547519351




Using LogisticRegression --> SelectKBest: 350 and PCA: 70 has a score of 0.6562618956985153




Using LogisticRegression --> SelectKBest: 350 and PCA: 150 has a score of 0.6957556147696993




Using LogisticRegression --> SelectKBest: 350 and PCA: 200 has a score of 0.7139639639639641




Using LogisticRegression --> SelectKBest: 400 and PCA: 60 has a score of 0.6643351097576449




Using LogisticRegression --> SelectKBest: 400 and PCA: 70 has a score of 0.6815124984139069




Using LogisticRegression --> SelectKBest: 400 and PCA: 150 has a score of 0.6901725669331302




Using LogisticRegression --> SelectKBest: 400 and PCA: 200 has a score of 0.691346275853318
Using Random Forest --> SelectKBest: 350, PCA: 60, RF Estimators: 100, and RF depth: 3 has a score of 0.6702353762212918
Using Random Forest --> SelectKBest: 350, PCA: 60, RF Estimators: 100, and RF depth: 4 has a score of 0.6787368354269763
Using Random Forest --> SelectKBest: 350, PCA: 60, RF Estimators: 100, and RF depth: 5 has a score of 0.637149473417079
Using Random Forest --> SelectKBest: 350, PCA: 60, RF Estimators: 125, and RF depth: 3 has a score of 0.6655088186778327
Using Random Forest --> SelectKBest: 350, PCA: 60, RF Estimators: 125, and RF depth: 4 has a score of 0.6769762720466946
Using Random Forest --> SelectKBest: 350, PCA: 60, RF Estimators: 125, and RF depth: 5 has a score of 0.6245241720593834
Using Random Forest --> SelectKBest: 350, PCA: 60, RF Estimators: 150, and RF depth: 3 has a score of 0.6270460601446517
Using Random Forest --> SelectKBest: 350, PCA: 60, RF Estimato

Using Random Forest --> SelectKBest: 400, PCA: 200, RF Estimators: 125, and RF depth: 4 has a score of 0.5310557035909148
Using Random Forest --> SelectKBest: 400, PCA: 200, RF Estimators: 125, and RF depth: 5 has a score of 0.5264877553609948
Using Random Forest --> SelectKBest: 400, PCA: 200, RF Estimators: 150, and RF depth: 3 has a score of 0.5564807765511991
Using Random Forest --> SelectKBest: 400, PCA: 200, RF Estimators: 150, and RF depth: 4 has a score of 0.552055576703464
Using Random Forest --> SelectKBest: 400, PCA: 200, RF Estimators: 150, and RF depth: 5 has a score of 0.5558780611597514
Using KNeighbors --> SelectKBest: 350, PCA: 60, and K_Neighbours: 3 has a score of 0.6263005963710189
Using KNeighbors --> SelectKBest: 350, PCA: 60, and K_Neighbours: 5 has a score of 0.6208761578479889
Using KNeighbors --> SelectKBest: 350, PCA: 60, and K_Neighbours: 7 has a score of 0.6145476462377871
Using KNeighbors --> SelectKBest: 350, PCA: 70, and K_Neighbours: 3 has a score of 0.

Using MLP classifier --> SelectKBest: 400, PCA: 70, and Hidden Layer: (45, 45) has a score of 0.6046504250729603
Using MLP classifier --> SelectKBest: 400, PCA: 70, and Hidden Layer: (100,) has a score of 0.6134532419743688
Using MLP classifier --> SelectKBest: 400, PCA: 70, and Hidden Layer: (30, 30, 30) has a score of 0.5942297931734551
Using MLP classifier --> SelectKBest: 400, PCA: 150, and Hidden Layer: (45, 45) has a score of 0.6050945311508692
Using MLP classifier --> SelectKBest: 400, PCA: 150, and Hidden Layer: (100,) has a score of 0.6099162542824514
Using MLP classifier --> SelectKBest: 400, PCA: 150, and Hidden Layer: (30, 30, 30) has a score of 0.5877585331810684
Using MLP classifier --> SelectKBest: 400, PCA: 200, and Hidden Layer: (45, 45) has a score of 0.5941980713107474
Using MLP classifier --> SelectKBest: 400, PCA: 200, and Hidden Layer: (100,) has a score of 0.5990515163050375
Using MLP classifier --> SelectKBest: 400, PCA: 200, and Hidden Layer: (30, 30, 30) has a

NameError: name 'scaler' is not defined

In [67]:
x_train_df.tail()

Unnamed: 0,ALP,ALT,AST,Age,Albumin,BUN_1,BUN_10,BUN_11,BUN_12,BUN_13,...,pH_44,pH_45,pH_46,pH_47,pH_48,pH_5,pH_6,pH_7,pH_8,pH_9
995,0.0,0.0,0.0,70.0,0.0,,,,,,...,,,,,,7.37,,,,
996,0.0,0.0,0.0,25.0,0.0,,,,,,...,,,,,,,,,,
997,1.0,1.0,1.0,44.0,0.0,,10.0,,,,...,,,7.41,,,7.39,,,,
998,1.0,1.0,1.0,37.0,1.0,65.0,,,,,...,,,,,,7.45,,,,
999,1.0,1.0,1.0,78.0,1.0,,,17.0,,,...,7.34,,7.31,,,7.3,7.26,,3.67,3.6
