In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from yellowbrick.regressor import AlphaSelection
from xgboost import XGBRegressor

from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.covariance import EllipticEnvelope
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoCV
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import ConstantKernel, DotProduct, WhiteKernel, RBF, RationalQuadratic, Matern, ExpSineSquared
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import f_regression
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor

import tensorflow as tf
from keras import backend as K
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LeakyReLU
from keras.optimizers import Adam
from keras.layers import Dropout
from keras import regularizers
from keras import initializers

Using TensorFlow backend.


# Data processing

In [2]:
def feature_selection_with_f_regressor_and_random_forest(n_features_fr, n_features_rf, X_train, X_val, y_train):
    """Keep the union of the best features according to an F-test and a random forest.

    Parameters
    ----------
    n_features_fr : int
        How many top-ranked features to keep according to the ``f_regression`` scores.
    n_features_rf : int
        How many top-ranked features to keep according to
        ``RandomForestRegressor.feature_importances_``.
    X_train, X_val : pandas.DataFrame
        Feature matrices; columns are selected by position with ``.iloc``.
    y_train : array-like
        Training targets handed to both scorers.

    Returns
    -------
    tuple
        ``(X_train, X_val)`` restricted to the selected columns, in ascending
        column position (the order produced by ``np.union1d``).
    """

    def top_indices(scores, n_top):
        # A slice such as [-0:] returns *everything*, so "zero features" needs its own branch.
        if n_top <= 0:
            return np.array([], dtype=int)
        scores = np.asarray(scores, dtype=float)
        # argsort places NaN last, which would rank undefined scores as the best ones;
        # demote them to -inf so they are only picked when nothing else is left.
        ranked = np.where(np.isnan(scores), -np.inf, scores)
        return ranked.argsort()[::-1][:n_top]

    # Using f_regression (first element of the result holds the per-feature scores)
    indices_fr = top_indices(f_regression(X_train, y_train)[0], n_features_fr)

    # Using Random Forest Regressor
    rf = RandomForestRegressor(n_jobs=-1, n_estimators=50)
    rf.fit(X_train, y_train)
    indices_rf = top_indices(rf.feature_importances_, n_features_rf)

    # Make the union of the two
    indices = list(np.union1d(indices_rf, indices_fr))

    return X_train.iloc[:, indices], X_val.iloc[:, indices]

In [3]:
def drop_outliers_samples_isolation_forest(X_train, y_train):
    """Remove the training samples that an IsolationForest flags as outliers.

    The forest is fitted on ``X_train`` and the rows it predicts as ``-1`` are
    removed from both ``X_train`` and ``y_train``.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features.
    y_train : pandas.DataFrame or pandas.Series
        Training targets, row-aligned with ``X_train``.

    Returns
    -------
    tuple
        ``(X_train_without_outliers, y_train_without_outliers)``.
    """

    # Use isolation forest for outlier detection, Computation heavy
    forest = IsolationForest()
    forest.fit(X_train)

    # predict() yields one flag per row; -1 marks an outlier
    is_inlier = np.asarray(forest.predict(X_train)) != -1

    # Filter by boolean mask (i.e. by position). Dropping by ``index=`` would
    # treat the positions as index *labels*, which is wrong whenever the frame
    # does not carry a fresh 0..n-1 index (e.g. a slice of a larger frame).
    X_train_without_outliers = X_train.loc[is_inlier]
    y_train_without_outliers = y_train.loc[is_inlier]

    return X_train_without_outliers, y_train_without_outliers

In [4]:
def feature_extraction_lasso(X_train , X_test, y_train):
    """Keep only the columns that a cross-validated Lasso gives a non-zero weight.

    A ``LassoCV`` (5 folds, 100 log-spaced alphas between 1e-1 and 1e10) is
    wrapped in an ``AlphaSelection`` visualizer and fitted on the training
    data. The chosen alpha and the number of coefficients before/after are
    printed.

    Returns
    -------
    tuple
        ``(X_train_extracted, X_test_extracted)`` with the zero-weight columns removed.
    """
    candidate_alphas = np.logspace(-1, 10, 100)
    selector = AlphaSelection(LassoCV(cv = 5, alphas= candidate_alphas))
    selector.fit(X_train , y_train)

    coefficients = selector.coef_
    print("Best Alpha = " , selector.alpha_)
    print('# of coef before = ' , len(coefficients))

    # Boolean column mask: True where the Lasso kept the feature
    keep_column = coefficients != 0
    print("# of coef after = " , np.sum(keep_column))

    return (X_train.loc[:, keep_column] , X_test.loc[:, keep_column])

In [5]:
def train_test_split(dataframe, test_train_ratio, random_state=None):
    """Shuffle a labelled frame and split it into train and test parts.

    NOTE: this definition shadows ``sklearn.model_selection.train_test_split``
    imported at the top of the notebook; after this cell runs, the name refers
    to this function.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns ``'Patient_ID'`` and ``'Age'``.
    test_train_ratio : float
        Fraction of the rows that goes to the test set (truncated with ``int``).
    random_state : int, numpy RandomState or None, optional
        Seed forwarded to ``DataFrame.sample``. The default ``None`` keeps the
        previous behaviour (a different shuffle on every call); pass an integer
        to make the split reproducible.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)``. The ``X_*`` frames are every
        column except ``'Age'`` (``'Patient_ID'`` is kept as a key); the
        ``y_*`` frames hold ``['Patient_ID', 'Age']``.
    """

    total_samples = len(dataframe.index)
    nsamples_test = int(test_train_ratio*total_samples)

    # Shuffle all rows, then renumber so the positional slices below are clean
    shuffled = dataframe.sample(frac=1, random_state=random_state).reset_index(drop=True)

    test = shuffled.iloc[:nsamples_test]
    X_test = test.drop(['Age'], axis=1)
    y_test = test[['Patient_ID', 'Age']]

    train = shuffled.iloc[nsamples_test:]
    X_train = train.drop(['Age'], axis=1)
    y_train = train[['Patient_ID','Age']]

    return X_train, y_train, X_test, y_test

In [6]:
def fill_with_median(dataframe_with_nan):
    """Replace missing values by the column median, column by column.

    Columns whose values are strings are left untouched, as are columns that
    contain no non-null value at all. The frame is modified in place and also
    returned.

    Parameters
    ----------
    dataframe_with_nan : pandas.DataFrame
        Frame that may contain missing values.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """

    for column in dataframe_with_nan.columns:
        series = dataframe_with_nan[column]

        # Look at the first *non-null* value: a string column may well start
        # with a missing entry, and an empty column has no first value at all.
        observed = series.dropna()
        if observed.empty or isinstance(observed.iloc[0], str):
            continue

        # Assign the filled column back instead of calling fillna(inplace=True)
        # on ``frame[column]``: that form acts on a temporary object and does
        # not update the frame under pandas' Copy-on-Write semantics.
        dataframe_with_nan[column] = series.fillna(series.median())

    return dataframe_with_nan

In [7]:
def feature_select_by_correlation(X_train, y_train, nb_features):
    """Rank the columns of ``X_train`` by absolute Pearson correlation with age.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Candidate features.
    y_train : pandas.DataFrame
        Must contain an ``'Age'`` column aligned with ``X_train``.
    nb_features : int
        Number of top-ranked features to return.

    Returns
    -------
    tuple
        ``(feature_select, best_feature_names)``: a one-column DataFrame with
        the signed correlations of the selected features (strongest first),
        and the array of their names.
    """
    corr = X_train.corrwith(y_train['Age'], axis=0, method='pearson')

    # Rank with NumPy rather than Series.argsort: NumPy sorts NaN (undefined
    # correlation, e.g. for a constant column) to the end, whereas
    # Series.argsort historically returned -1 placeholders for NaN entries,
    # which scrambles the positional lookup below.
    order = np.argsort(-corr.abs().values, kind='mergesort')
    feature_select = pd.DataFrame(corr.iloc[order[:nb_features]])
    best_feature_names = feature_select.index.values

    return feature_select, best_feature_names

In [8]:
def feature_selection_by_correlation(X_train, y_train, X_test, nb_features):
    """Reduce train and test features to the columns best correlated with age.

    The ranking is delegated to ``feature_select_by_correlation``. The
    ``'Patient_ID'`` key column is always kept and placed first.

    Returns
    -------
    tuple
        ``(X_train_selected, X_test_selected)`` with identical column lists.
    """
    _, best_feature_names = feature_select_by_correlation(X_train, y_train, nb_features)

    # 'Patient_ID' can itself show up in the ranking (it is a column of X_train);
    # skip it there so the key column is not selected twice.
    selected_columns = ['Patient_ID'] + [name for name in best_feature_names if name != 'Patient_ID']

    return X_train[selected_columns], X_test[selected_columns]

# Models

In [9]:
def fit_neural_network(dropout, X_train, y_train):
    """Build, compile and train a fully connected regression network.

    Architecture: eight hidden groups of ``Dense(30)`` (L2 penalty 1,
    ``RandomUniform`` initializer) -> ``LeakyReLU(0.1)`` -> ``Dropout``,
    followed by a single linear output unit. Trained with Adam on mean squared
    error for 80 epochs, holding out 10% of the data for validation.

    Relies on a metric function ``coeff_determination`` that is not defined in
    this cell and must exist in the notebook namespace.

    Parameters
    ----------
    dropout : float
        Dropout rate applied after every hidden layer.
    X_train : pandas.DataFrame or 2-D array
        Training features; the network input width is taken from ``X_train.shape[1]``.
    y_train : pandas.DataFrame
        Must contain the target in a column named ``'y'``.

    Returns
    -------
    keras.models.Sequential
        The fitted model.
    """
    n_hidden_layers = 8
    n_hidden_units = 30

    # Create model
    model = Sequential()
    for layer_index in range(n_hidden_layers):
        # Only the first layer declares an input size. It is read from the
        # data itself so the function does not depend on a notebook global.
        input_spec = {'input_dim': X_train.shape[1]} if layer_index == 0 else {}
        model.add(Dense(n_hidden_units, kernel_regularizer = regularizers.l2(1),
                        kernel_initializer='RandomUniform', **input_spec))
        model.add(LeakyReLU(alpha=0.1))
        model.add(Dropout(rate = dropout))

    model.add(Dense(1, kernel_initializer='RandomUniform'))

    # Compile model
    optimizer = Adam(lr=0.005, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
    model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=[coeff_determination])

    # Fit the model
    print("Start fitting ...")
    model.fit(x=X_train, y=y_train['y'], epochs=80, verbose=0, validation_split=0.1, shuffle=True, \
              steps_per_epoch=50, initial_epoch=0, validation_steps=5)

    return model

In [10]:
def fit_adaboost_with_grid_search(X_train, y_train):
    """Grid-search the base-tree depth of an AdaBoost regressor.

    Candidates are decision trees of depth 25, 30, 35 and 40, each boosted
    with 1000 estimators and the ``'square'`` loss. Models are compared by R2
    over 5 folds. The best parameter set is printed.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    # One candidate base learner per tree depth
    candidate_trees = [DecisionTreeRegressor(max_depth=depth) for depth in (25, 30, 35, 40)]
    search_space = {
        'base_estimator': candidate_trees,
        'n_estimators': [1000],
        'loss': ['square'],
    }

    # Create AdaBoost object and run the grid search
    clf = GridSearchCV(AdaBoostRegressor(), search_space, scoring='r2', n_jobs=4, iid=False, cv=5)
    clf.fit(X_train, y_train)

    print("Best parameters for AdaBoost: " + str(clf.best_params_))

    return clf

In [11]:
def fit_simple_linear_regression(X_train, y_train):
    """Fit an ordinary least-squares model of ``y_train['y']`` on ``X_train``.

    NOTE: a later cell of this notebook defines ``fit_simple_linear_regression``
    again with the signature ``(X_train_feature_extracted, y_train, nb_cv)``;
    once that cell has run, this two-argument version is shadowed.

    Returns
    -------
    sklearn.linear_model.LinearRegression
        The fitted estimator.
    """
    target = y_train['y']

    model = linear_model.LinearRegression()
    model.fit(X_train, target)

    return model

In [2]:
def fit_xgboost_with_grid_search(X_train, y_train):
    """Grid-search tree depth and regularisation of an XGBoost regressor.

    The target is read from ``y_train['y']``. Models are compared by R2 over
    5 folds and the best parameter set is printed.

    NOTE: the last cell of this notebook defines ``fit_xgboost_with_grid_search``
    again (smaller ``reg_lambda`` grid, whole ``y_train`` as target); whichever
    cell ran last is the one in effect.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    # Starting point; max_depth and reg_lambda are overridden by the grid below
    base_model = XGBRegressor(verbosity=1, max_depth=10, reg_lambda=1)

    search_space = {
        'max_depth': [1,2,4],
        'reg_lambda': [4, 6, 8, 10, 12],
        'min_child_weight': [6, 8, 12,16],
    }

    clf = GridSearchCV(base_model, search_space, scoring='r2', n_jobs=4, iid=False, cv=5)
    clf.fit(X_train, y_train['y'])

    print("Best parameters for XGBoost: " + str(clf.best_params_))

    return clf

In [13]:
def fit_svr_with_grid_search(X_train, y_train):
    """Grid-search kernel, C and gamma of a support vector regressor.

    Models are compared by R2 over 5 folds and the best parameter set is
    printed.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """

    # Create svr object
    svr = SVR()

    # Grid search
    # The values are the ones the previous literals evaluated to
    # (10e5 == 1e6, 10e-2 == 0.1, 10e-1 == 1.0). Because 10e-1 equals 1, the
    # old gamma list contained the value 1 twice and every such candidate was
    # cross-validated twice; the duplicate is removed here.
    # NOTE(review): if 0.01 and 0.1 were intended for gamma, extend the list.
    parameters = {'kernel': ['poly', 'rbf', 'sigmoid'],
                  'C': [1e6, 1e7, 1e8],
                  'gamma': [0.1, 1.0, 2]}
    clf = GridSearchCV(svr, parameters, scoring='r2', n_jobs=4, iid=False, cv=5)
    clf.fit(X_train, y_train)

    print("Best parameters for SVR: " + str(clf.best_params_))

    return clf

In [14]:
def fit_random_forest_with_grid_search(X_train, y_train):
    """Grid-search depth, split size and leaf count of a random forest regressor.

    Every candidate uses 1000 trees. Models are compared by R2 over 5 folds
    and the best parameter set is printed.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    search_space = {
        'max_depth': [10, 15, 20, 25, 30],
        'n_estimators': [1000],
        'min_samples_split': [2, 3, 5],
        'max_leaf_nodes': [100, 150, 200, 250],
    }

    # Default forest; everything of interest is set through the grid
    clf = GridSearchCV(RandomForestRegressor(), search_space, scoring='r2', n_jobs=4, iid=False, cv=5)
    clf.fit(X_train, y_train)

    print("Best parameters for Random Forest: " + str(clf.best_params_))

    return clf

In [15]:
# ConstantKernel, DotProduct, WhiteKernel, RBF, RationalQuadratic, Matern, ExpSineSquared
def fit_gaussian_process_with_grid_search(X_train, y_train):
    """Cross-validate a Gaussian process regressor with one composite kernel.

    The grid holds a single kernel,
    ``10 * RationalQuadratic * DotProduct(sigma_0=1) + 2 * ConstantKernel + WhiteKernel(0.5)``,
    so the search amounts to a 5-fold R2 evaluation of that kernel. The best
    (only) parameter set is printed.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    # Assemble the kernel term by term
    scaled_product = 10 * RationalQuadratic() * DotProduct(sigma_0 = 1)
    constant_offset = 2 * ConstantKernel()
    noise = WhiteKernel(noise_level=0.5)
    search_space = {'kernel': [scaled_product + constant_offset + noise]}

    clf = GridSearchCV(GaussianProcessRegressor(), search_space, scoring='r2', n_jobs=4, iid=False, cv=5)
    clf.fit(X_train, y_train)

    print("Best parameters for Gaussian Process: " + str(clf.best_params_))

    return clf

In [16]:
def fit_gradient_boosting_with_grid_search(X_train, y_train):
    """Grid-search learning rate and tree depth of a gradient boosting regressor.

    Every candidate uses 1000 estimators and ``min_samples_split=2``. Models
    are compared by R2 over 5 folds and the best parameter set is printed.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    search_space = {
        'learning_rate': [0.06, 0.05, 0.04],
        'n_estimators': [1000],
        'min_samples_split': [2],
        'max_depth': [2, 4, 6],
    }

    clf = GridSearchCV(GradientBoostingRegressor(), search_space, scoring='r2', n_jobs=4, iid=False, cv=5)
    clf.fit(X_train, y_train)

    print("Best parameters for Gradient Boosting: " + str(clf.best_params_))

    return clf

In [17]:
def fit_extra_trees_regressor_with_grid_search(X_train, y_train):
    """Grid-search the tree depth of an extra-trees regressor.

    Every candidate uses 1000 trees and ``min_samples_split=2``. The estimator
    itself runs single-threaded (``n_jobs=1``) while the search uses 4 jobs.
    Models are compared by R2 over 5 folds and the best parameter set is
    printed.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    # Create extra-trees object
    base_model = ExtraTreesRegressor(n_jobs=1)

    search_space = {
        'max_depth': [2, 3, 5, 10, 15],
        'n_estimators': [1000],
        'min_samples_split': [2],
    }

    clf = GridSearchCV(base_model, search_space, scoring='r2', n_jobs=4, iid=False, cv=5)
    clf.fit(X_train, y_train)

    print("Best parameters for Extra Trees Regressor: " + str(clf.best_params_))

    return clf

In [18]:
def fit_simple_linear_regression(X_train_feature_extracted, y_train, nb_cv):
    """Fit a linear regression and estimate its error by cross-validation.

    The model is fitted on the full training data; ``cross_val_score`` is then
    run on the same data to obtain a mean squared error averaged over
    ``nb_cv`` folds. Coefficients and the cross-validated MSE are printed.

    Returns
    -------
    tuple
        ``(regr, train_mse)``: the fitted estimator and the mean
        cross-validated MSE.
    """
    regr = linear_model.LinearRegression()
    regr.fit(X_train_feature_extracted, y_train)

    # The scorer returns *negated* MSE values, one per fold
    negated_fold_mse = cross_val_score(regr, X_train_feature_extracted, y_train,
                                       scoring="neg_mean_squared_error", cv=nb_cv)
    train_mse = np.mean(-negated_fold_mse)

    print("----- Train results -------")
    print('Coefficients: \n', regr.coef_)
    print('Mean squared error (cross-validation): %.2f' % train_mse)
    print()

    return regr, train_mse

In [19]:
def simple_linear_regression(X_train_feature_extracted, X_test_feature_extracted, y_train, y_test, nb_cv):
    """Train a linear regression and report its error on a held-out test set.

    Training and cross-validation are delegated to
    ``fit_simple_linear_regression``; the test MSE and the number of test
    samples are printed.

    Returns
    -------
    tuple
        ``(regr, train_mse, test_mse)``.
    """
    regr, train_mse = fit_simple_linear_regression(X_train_feature_extracted, y_train, nb_cv)

    # Score the fitted model on the testing set
    predictions = regr.predict(X_test_feature_extracted)
    test_mse = mean_squared_error(y_test, predictions)

    print("----- Test results --------")
    print("Number of test samples: ", len(predictions))
    print('Mean squared error: %.2f' % test_mse)

    return regr, train_mse, test_mse

In [20]:
def slr_with_correlation(aggregated_dataframe, test_ratio, nb_features_extracted, nb_cv):
    """Run split -> correlation-based feature selection -> linear regression.

    Progress and results are printed; nothing is returned.

    Parameters
    ----------
    aggregated_dataframe : pandas.DataFrame
        Labelled data handed to this notebook's ``train_test_split``.
    test_ratio : float
        Fraction of rows used for testing.
    nb_features_extracted : int
        Number of features kept by the correlation ranking.
    nb_cv : int
        Number of cross-validation folds for the training error.
    """

    def announce(title):
        # Section banner: the title framed by blank lines
        print()
        print(title)
        print()

    # Define Train and Test data
    announce("---- Split into Train and Test data ----")
    split = train_test_split(aggregated_dataframe, test_ratio)
    X_train_inter, y_train_inter, X_test_inter, y_test_inter = split

    print("Length of the train set:")
    print(len(y_train_inter))
    print("Length of the test set:")
    print(len(y_test_inter))

    # Feature selection by correlation: show the ranking first, then apply it
    announce("---- Feature selection by correlation ----")
    feature_select, best_feature_names = feature_select_by_correlation(
        X_train_inter, y_train_inter, nb_features_extracted)
    print(feature_select)

    X_train_selected, X_test_selected = feature_selection_by_correlation(
        X_train_inter, y_train_inter, X_test_inter, nb_features_extracted)

    # Simple Linear Regression
    announce("---- Simple Linear Regression ----")

    # The patient key is not a predictor or a target: strip it everywhere
    without_key = dict(axis=1, errors='ignore')
    X_train_selected = X_train_selected.drop(['Patient_ID'], **without_key)
    X_test_selected = X_test_selected.drop(['Patient_ID'], **without_key)
    y_train_inter = y_train_inter.drop(['Patient_ID'], **without_key)
    y_test_inter = y_test_inter.drop(['Patient_ID'], **without_key)

    simple_linear_regression(X_train_selected, X_test_selected,
                             y_train_inter, y_test_inter, nb_cv)

In [21]:
def ensemble_slr_with_correlation(df_label_psd_cluster, df_label_spectro_cluster, df_label_microstate, 
                                 test_ratio, nb_features_extracted_psd, nb_features_extracted_spe, 
                                nb_features_extracted_mic, nb_cv):
    """Evaluate a weighted ensemble of three linear regressions, one per feature set.

    Each of the three frames (power-spectrum clusters, spectrogram clusters,
    microstates) is split independently, reduced by correlation-based feature
    selection and fitted with ``fit_simple_linear_regression``. For every
    patient that appears in at least one test split, the predictions of the
    models that have this patient in *their* test split are averaged with
    weights proportional to ``1 / train_mse``. The mean squared error of the
    combined prediction is printed; nothing is returned.

    Parameters
    ----------
    df_label_psd_cluster, df_label_spectro_cluster, df_label_microstate : pandas.DataFrame
        Labelled frames containing ``'Patient_ID'`` and ``'Age'`` columns.
    test_ratio : float
        Fraction of rows of each frame used for testing.
    nb_features_extracted_psd, nb_features_extracted_spe, nb_features_extracted_mic : int
        Number of features kept for the respective frame.
    nb_cv : int
        Number of cross-validation folds used for the training MSE.
    """
    frames = (df_label_psd_cluster, df_label_spectro_cluster, df_label_microstate)
    nb_features = (nb_features_extracted_psd, nb_features_extracted_spe, nb_features_extracted_mic)
    n_models = len(frames)

    # Define Train and Test data 
    print()
    print("---- Split into Train and Test data ----")
    print()
    X_trains, y_trains, X_tests, y_tests = zip(*[train_test_split(frame, test_ratio) for frame in frames])

    print("Test lengths: " + str(len(y_tests[0])) + "(psd), " + \
         str(len(y_tests[1])) + "(spectro), " + str(len(y_tests[2])) + "(micro)")

    # Feature selection by correlation
    print()
    print("---- Feature selection by correlation ----")
    print()
    X_trains_selected, X_tests_selected = zip(*[
        feature_selection_by_correlation(X_trains[k], y_trains[k], X_tests[k], nb_features[k])
        for k in range(n_models)])

    # Simple Linear Regressions
    print()
    print("---- Simple Linear Regressions ----")
    print()

    # One model per feature set, trained without the patient key
    models, train_mses = zip(*[
        fit_simple_linear_regression(
            X_trains_selected[k].drop(['Patient_ID'], axis=1, errors='ignore'),
            y_trains[k].drop(['Patient_ID'], axis=1, errors='ignore'),
            nb_cv)
        for k in range(n_models)])

    # Make the union of the test data sets
    union_test_patient_ids = pd.merge(X_tests[0]['Patient_ID'], X_tests[1]['Patient_ID'], \
                                      on='Patient_ID', how='outer')
    union_test_patient_ids = pd.merge(union_test_patient_ids, X_tests[2]['Patient_ID'], \
                                      on='Patient_ID', how='outer')

    # Make an ensemble testing, weightened by train mse score
    sse = 0
    for test_patient_id in union_test_patient_ids["Patient_ID"]:

        # NumPy arrays so the normalisation below is plain array arithmetic
        # and does not depend on the MSE values being NumPy scalars.
        y_pred = np.zeros(n_models)
        weights = np.zeros(n_models)

        for k in range(n_models):
            # A model only votes for patients that are in its own test split
            if test_patient_id not in X_tests[k]['Patient_ID'].values:
                continue

            is_patient = X_tests_selected[k]['Patient_ID'] == test_patient_id
            test_sample = X_tests_selected[k][is_patient].drop(['Patient_ID'], axis=1)

            # The models were fitted on a one-column target frame, so
            # predict() returns a 2-D result: first row, first column.
            y_pred[k] = models[k].predict(test_sample)[0][0]
            weights[k] = 1.0/train_mses[k]
            y_test = y_tests[k][y_tests[k]['Patient_ID'] == test_patient_id]['Age'].values[0]

        weights = weights/weights.sum()
        sse = sse + (np.sum(y_pred*weights) - y_test)**2

    test_mse = sse/len(union_test_patient_ids)
    print("----- Test results --------")
    print('Mean squared error: %.2f'% test_mse)


In [22]:
def fit_gradient_boosting(X_train, y_train, nb_cv=10):
    """Fit an XGBoost regressor and estimate its error by cross-validation.

    The model (``max_depth=15``, ``min_child_weight=100``, ``subsample=1``,
    ``reg_lambda=4``) is fitted on the full training data; ``cross_val_score``
    is then run on the same data to obtain a mean squared error averaged over
    ``nb_cv`` folds, which is printed.

    Returns
    -------
    tuple
        ``(xgbr, train_mse)``: the fitted regressor and the mean
        cross-validated MSE.
    """
    booster_settings = dict(verbosity=1, max_depth=15, min_child_weight=100, subsample=1, reg_lambda=4)
    xgbr = XGBRegressor(**booster_settings)
    xgbr.fit(X_train, y_train)

    # The scorer returns *negated* MSE values, one per fold
    negated_fold_mse = cross_val_score(xgbr, X_train, y_train, scoring="neg_mean_squared_error", cv=nb_cv)
    train_mse = np.mean(-negated_fold_mse)

    print("----- Train results -------")
    print('Mean squared error (cross-validation): %.2f' % train_mse)
    print()

    return xgbr, train_mse

In [23]:
def ensemble_xgboost_with_correlation(df_label_psd_cluster, df_label_spectro_cluster, df_label_microstate, 
                                 test_ratio, nb_features_extracted_psd, nb_features_extracted_spe, 
                                nb_features_extracted_mic, nb_cv):
    """Evaluate a weighted ensemble of three XGBoost regressors, one per feature set.

    Each of the three frames (power-spectrum clusters, spectrogram clusters,
    microstates) is split independently and fitted with
    ``fit_gradient_boosting`` on all of its columns. For every patient that
    appears in at least one test split, the predictions of the models that
    have this patient in *their* test split are averaged with weights
    proportional to ``1 / train_mse``. The mean squared error of the combined
    prediction is printed; nothing is returned.

    Parameters
    ----------
    df_label_psd_cluster, df_label_spectro_cluster, df_label_microstate : pandas.DataFrame
        Labelled frames containing ``'Patient_ID'`` and ``'Age'`` columns.
    test_ratio : float
        Fraction of rows of each frame used for testing.
    nb_features_extracted_psd, nb_features_extracted_spe, nb_features_extracted_mic : int
        Currently unused: correlation-based selection is disabled in this
        function. Kept so the signature matches ``ensemble_slr_with_correlation``.
    nb_cv : int
        Number of cross-validation folds used for the training MSE.
    """
    frames = (df_label_psd_cluster, df_label_spectro_cluster, df_label_microstate)
    n_models = len(frames)

    # Define Train and Test data 
    print()
    print("---- Split into Train and Test data ----")
    print()
    X_trains, y_trains, X_tests, y_tests = zip(*[train_test_split(frame, test_ratio) for frame in frames])

    print("Test lengths: " + str(len(y_tests[0])) + "(psd), " + \
         str(len(y_tests[1])) + "(spectro), " + str(len(y_tests[2])) + "(micro)")

    # Feature selection by correlation
    print()
    print("---- Feature selection by correlation ----")
    print()
    # Selection is switched off here: every model is trained on all columns.
    X_trains_selected, X_tests_selected = X_trains, X_tests

    # XGBoost Regressions
    print()
    print("---- XGBoost Regressions ----")
    print()

    # One model per feature set, trained without the patient key
    models, train_mses = zip(*[
        fit_gradient_boosting(
            X_trains_selected[k].drop(['Patient_ID'], axis=1, errors='ignore'),
            y_trains[k].drop(['Patient_ID'], axis=1, errors='ignore'),
            nb_cv)
        for k in range(n_models)])

    # Make the union of the test data sets
    union_test_patient_ids = pd.merge(X_tests[0]['Patient_ID'], X_tests[1]['Patient_ID'], \
                                      on='Patient_ID', how='outer')
    union_test_patient_ids = pd.merge(union_test_patient_ids, X_tests[2]['Patient_ID'], \
                                      on='Patient_ID', how='outer')

    # Make an ensemble testing, weightened by train mse score
    sse = 0
    for test_patient_id in union_test_patient_ids["Patient_ID"]:

        # NumPy arrays so the normalisation below is plain array arithmetic
        # and does not depend on the MSE values being NumPy scalars.
        y_pred = np.zeros(n_models)
        weights = np.zeros(n_models)

        for k in range(n_models):
            # A model only votes for patients that are in its own test split
            if test_patient_id not in X_tests[k]['Patient_ID'].values:
                continue

            is_patient = X_tests_selected[k]['Patient_ID'] == test_patient_id
            test_sample = X_tests_selected[k][is_patient].drop(['Patient_ID'], axis=1)

            # First element of the prediction for the (single) matching row
            y_pred[k] = models[k].predict(test_sample)[0]
            weights[k] = 1.0/train_mses[k]
            y_test = y_tests[k][y_tests[k]['Patient_ID'] == test_patient_id]['Age'].values[0]

        weights = weights/weights.sum()
        sse = sse + (np.sum(y_pred*weights) - y_test)**2

    test_mse = sse/len(union_test_patient_ids)
    print("----- Test results --------")
    print('Mean squared error: %.2f'% test_mse)

In [24]:
def fit_xgboost_with_grid_search(X_train, y_train):
    """Grid-search tree depth and regularisation of an XGBoost regressor.

    ``y_train`` is passed to the search as is. Models are compared by R2 over
    5 folds and the best parameter set is printed.

    NOTE: this is the second definition of ``fit_xgboost_with_grid_search`` in
    the notebook; when cells are run top to bottom it replaces the earlier one.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    search_space = {
        'max_depth': [1,2,4],
        'reg_lambda': [8, 10, 12],
        'min_child_weight': [6, 8, 12,16],
    }

    clf = GridSearchCV(XGBRegressor(verbosity=1), search_space, scoring='r2', n_jobs=4, iid=False, cv=5)
    clf.fit(X_train, y_train)

    print("Best parameters for XGBoost: " + str(clf.best_params_))

    return clf