In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score

# Utility functions for all approaches in Task 1

## Approach 1

In [6]:
def train_test_split(dataframe, test_train_ratio, random_state=None):
    """Shuffle ``dataframe`` and split it into train and test feature/label frames.

    Note: this is a local helper; its signature and return order differ from
    scikit-learn's function of the same name.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns ``'Patient_ID'`` and ``'Age'``.
    test_train_ratio : float
        Fraction of the rows assigned to the test set; the count is truncated
        with ``int()``.
    random_state : int, numpy.random.RandomState or None, optional
        Forwarded to ``DataFrame.sample`` so that a split can be reproduced.
        ``None`` (the default) keeps the previous unseeded behaviour.

    Returns
    -------
    X_train, y_train, X_test, y_test : pandas.DataFrame
        The ``X_*`` frames hold every column except ``'Age'`` (``'Patient_ID'``
        is kept as a key); the ``y_*`` frames hold ``['Patient_ID', 'Age']``.
    """
    total_samples = len(dataframe.index)
    nsamples_test = int(test_train_ratio * total_samples)

    # Shuffle all rows, then renumber them so the positional slices below are
    # taken from the shuffled order.
    shuffled = dataframe.sample(frac=1, random_state=random_state).reset_index(drop=True)

    test = shuffled.iloc[:nsamples_test]
    train = shuffled.iloc[nsamples_test:]

    X_test = test.drop(['Age'], axis=1)
    y_test = test[['Patient_ID', 'Age']]

    X_train = train.drop(['Age'], axis=1)
    y_train = train[['Patient_ID', 'Age']]

    return X_train, y_train, X_test, y_test

In [7]:
def feature_select_by_correlation(X_train, y_train, nb_features):
    """Rank features by absolute Pearson correlation with ``y_train['Age']``.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Candidate features. A ``'Patient_ID'`` column, if present, is treated
        as a key and never ranked; non-numeric columns are ignored.
    y_train : pandas.DataFrame
        Must contain an ``'Age'`` column aligned with ``X_train`` by index.
    nb_features : int
        Maximum number of features to keep.

    Returns
    -------
    feature_select : pandas.DataFrame
        One column holding the signed correlation of each kept feature, sorted
        by decreasing absolute value and indexed by feature name.
    best_feature_names : numpy.ndarray
        Names of the kept features, in the same order.
    """
    # The identifier column is not a predictor, and non-numeric columns cannot
    # be correlated, so both are removed before ranking.
    candidates = (
        X_train
        .drop(columns=['Patient_ID'], errors='ignore')
        .select_dtypes(include=['number', 'bool'])
    )

    # Constant or all-missing columns yield NaN correlations; drop them so they
    # can neither be selected nor disturb the ordering.
    corr = candidates.corrwith(y_train['Age'], axis=0, method='pearson').dropna()

    # Positional, stable ordering by decreasing |correlation|.
    order = np.argsort(-corr.abs().to_numpy(), kind='stable')[:nb_features]
    feature_select = pd.DataFrame(corr.iloc[order])
    best_feature_names = feature_select.index.values

    return feature_select, best_feature_names

In [8]:
def feature_selection_by_correlation(X_train, y_train, X_test, nb_features):
    """Restrict train and test features to the columns best correlated with age.

    The ranking is computed on the training data only by
    ``feature_select_by_correlation`` and then applied to both frames.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature frames; both must contain ``'Patient_ID'`` and the selected
        feature columns.
    y_train : pandas.DataFrame
        Training labels, forwarded to ``feature_select_by_correlation``.
    nb_features : int
        Number of features requested from the ranking.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``X_train`` and ``X_test`` reduced to ``'Patient_ID'`` (first column)
        followed by the selected features.
    """
    _, best_feature_names = feature_select_by_correlation(X_train, y_train, nb_features)

    # Keep the key as the first column exactly once: if the ranking itself
    # returned 'Patient_ID', prepending it again would create a duplicated
    # column and make later `frame['Patient_ID']` lookups return a DataFrame.
    selected_columns = ['Patient_ID'] + [
        name for name in best_feature_names if name != 'Patient_ID'
    ]
    return X_train[selected_columns], X_test[selected_columns]

In [9]:
def fit_simple_linear_regression(X_train_feature_extracted, y_train, nb_cv):
    """Fit an ordinary least-squares model and report its cross-validated MSE.

    Parameters
    ----------
    X_train_feature_extracted : training features passed to ``fit``.
    y_train : training targets passed to ``fit``.
    nb_cv : value forwarded as ``cv`` to ``cross_val_score``.

    Returns
    -------
    (model, train_mse)
        The model fitted on the full training data, and the mean of the
        per-fold mean squared errors.
    """
    model = linear_model.LinearRegression()
    model.fit(X_train_feature_extracted, y_train)

    # The scorer returns *negated* MSE values (higher is better), hence the
    # sign flip before averaging.
    neg_mse_per_fold = cross_val_score(
        model,
        X_train_feature_extracted,
        y_train,
        scoring="neg_mean_squared_error",
        cv=nb_cv,
    )
    train_mse = (-neg_mse_per_fold).mean()

    print("----- Train results -------")
    print('Coefficients: \n', model.coef_)
    print('Mean squared error (cross-validation): %.2f' % train_mse)
    print()

    return model, train_mse

In [10]:
def simple_linear_regression(X_train_feature_extracted, X_test_feature_extracted, y_train, y_test, nb_cv):
    """Fit a linear regression on the training data and score it on the test data.

    Training (including the printed cross-validation report) is delegated to
    ``fit_simple_linear_regression``; this function adds the test-set
    evaluation and prints it.

    Returns
    -------
    (model, train_mse, test_mse)
        The fitted model, its cross-validated training MSE, and the MSE of its
        predictions on ``X_test_feature_extracted`` against ``y_test``.
    """
    fitted_model, cv_mse = fit_simple_linear_regression(
        X_train_feature_extracted, y_train, nb_cv
    )

    # Evaluate on the held-out samples.
    predictions = fitted_model.predict(X_test_feature_extracted)
    holdout_mse = mean_squared_error(y_test, predictions)

    print("----- Test results --------")
    print("Number of test samples: ", len(predictions))
    print('Mean squared error: %.2f' % holdout_mse)

    return fitted_model, cv_mse, holdout_mse

In [11]:
def slr_with_correlation(aggregated_dataframe, test_ratio, nb_features_extracted, nb_cv):
    """Approach 1: split, select features by correlation, fit one linear regression.

    Runs the whole pipeline on a single aggregated frame and prints a report
    for every stage; nothing is returned.

    Parameters
    ----------
    aggregated_dataframe : frame handed to ``train_test_split``.
    test_ratio : fraction of samples used for testing.
    nb_features_extracted : number of features kept by the correlation ranking.
    nb_cv : ``cv`` value used for the cross-validated training error.
    """
    def announce(title):
        # Section banner: the title framed by one blank line above and below.
        print()
        print(title)
        print()

    def without_key(frame):
        # The regression must not see the patient identifier.
        return frame.drop(['Patient_ID'], axis=1, errors='ignore')

    # Stage 1: train / test split
    announce("---- Split into Train and Test data ----")
    X_train, y_train, X_test, y_test = train_test_split(aggregated_dataframe, test_ratio)

    print("Length of the train set:")
    print(len(y_train))
    print("Length of the test set:")
    print(len(y_test))

    # Stage 2: correlation-based feature selection
    announce("---- Feature selection by correlation ----")
    ranking, _ = feature_select_by_correlation(X_train, y_train, nb_features_extracted)
    print(ranking)

    X_train_selected, X_test_selected = feature_selection_by_correlation(
        X_train, y_train, X_test, nb_features_extracted
    )

    # Stage 3: linear regression on the selected features
    announce("---- Simple Linear Regression ----")
    simple_linear_regression(
        without_key(X_train_selected),
        without_key(X_test_selected),
        without_key(y_train),
        without_key(y_test),
        nb_cv,
    )

## Approach 2

In [12]:
def _modality_test_table(regr, X_test_feature_extracted, y_test):
    """Map each test Patient_ID of one modality to its (prediction, true age) pair."""
    if len(X_test_feature_extracted) == 0:
        return {}

    features = X_test_feature_extracted.drop(['Patient_ID'], axis=1)
    # One batched predict call instead of one call per patient. The reshape
    # accepts both (n,) and (n, 1) outputs and keeps the first target column.
    predictions = np.asarray(regr.predict(features)).reshape(len(features), -1)[:, 0]

    table = {}
    # X_test_feature_extracted and y_test come from the same split, so their
    # rows line up by position.
    for patient_id, prediction, age in zip(
            X_test_feature_extracted['Patient_ID'], predictions, y_test['Age']):
        # First row wins if a patient appears more than once.
        table.setdefault(patient_id, (prediction, age))
    return table


def ensemble_slr_with_correlation(df_label_psd_cluster, df_label_spectro_cluster, df_label_microstate,
                                  test_ratio, nb_features_extracted_psd, nb_features_extracted_spe,
                                  nb_features_extracted_mic, nb_cv):
    """Approach 2: one linear regression per modality, combined by inverse-MSE weights.

    Each of the three frames (power-spectrum clusters, spectrogram clusters,
    microstates) is split, feature-selected and fitted independently. Every
    patient that is in the test set of at least one modality is then predicted
    as the weighted average of the models that hold that patient as a test
    sample, each model weighted by ``1 / train_mse``. The resulting mean
    squared error is printed; nothing is returned.

    Parameters
    ----------
    df_label_psd_cluster, df_label_spectro_cluster, df_label_microstate :
        Frames handed to ``train_test_split`` (one per modality).
    test_ratio : fraction of samples used for testing in every modality.
    nb_features_extracted_psd, nb_features_extracted_spe, nb_features_extracted_mic :
        Number of features kept by the correlation ranking, per modality.
    nb_cv : ``cv`` value used for the cross-validated training error.
    """
    modalities = (
        ("psd", df_label_psd_cluster, nb_features_extracted_psd),
        ("spectro", df_label_spectro_cluster, nb_features_extracted_spe),
        ("micro", df_label_microstate, nb_features_extracted_mic),
    )

    # Define Train and Test data
    print()
    print("---- Split into Train and Test data ----")
    print()
    # Each modality is shuffled and split on its own, so the same patient can
    # be a test sample in one modality and a training sample in another.
    # Every entry is (X_train, y_train, X_test, y_test).
    splits = [train_test_split(frame, test_ratio) for _, frame, _ in modalities]

    print("Test lengths: " + ", ".join(
        str(len(split[3])) + "(" + label + ")"
        for (label, _, _), split in zip(modalities, splits)))

    # Feature selection by correlation
    print()
    print("---- Feature selection by correlation ----")
    print()
    selections = [
        feature_selection_by_correlation(X_train, y_train, X_test, nb_features)
        for (X_train, y_train, X_test, _), (_, _, nb_features) in zip(splits, modalities)
    ]

    # Simple Linear Regressions
    print()
    print("---- Simple Linear Regressions ----")
    print()
    fits = [
        fit_simple_linear_regression(
            X_train_selected.drop(['Patient_ID'], axis=1, errors='ignore'),
            y_train.drop(['Patient_ID'], axis=1, errors='ignore'),
            nb_cv)
        for (X_train_selected, _), (_, y_train, _, _) in zip(selections, splits)
    ]

    # Union of the patients that are test samples in at least one modality
    union_test_patient_ids = splits[0][2]['Patient_ID']
    for _, _, X_test, _ in splits[1:]:
        union_test_patient_ids = pd.merge(union_test_patient_ids, X_test['Patient_ID'],
                                          on='Patient_ID', how='outer')

    # Per-modality lookup tables and model weights (inverse training MSE)
    tables = [
        _modality_test_table(regr, X_test_selected, y_test)
        for (regr, _), (_, X_test_selected), (_, _, _, y_test) in zip(fits, selections, splits)
    ]
    model_weights = [1.0 / train_mse for _, train_mse in fits]

    # Ensemble evaluation, weighted by training MSE
    sse = 0
    for test_patient_id in union_test_patient_ids["Patient_ID"]:
        predictions = []
        weights = []
        for table, weight in zip(tables, model_weights):
            entry = table.get(test_patient_id)
            if entry is None:
                # This patient is not a test sample for this modality.
                continue
            # When several modalities hold the patient, the age read from the
            # last one is used as the reference value.
            prediction, y_test = entry
            predictions.append(prediction)
            weights.append(weight)

        weights = np.asarray(weights) / np.sum(weights)
        y_pred = np.sum(np.multiply(predictions, weights))

        sse = sse + (y_pred - y_test)**2

    test_mse = sse/len(union_test_patient_ids)
    print("----- Test results --------")
    print('Mean squared error: %.2f'% test_mse)


## Approach 3

In [13]:
def fill_with_median(dataframe_with_nan):
    """Replace missing values in every non-text column by that column's median.

    The frame is modified in place and also returned. Columns whose first
    non-missing value is a ``str`` are treated as text and left untouched, as
    are columns with no non-missing value at all.

    Parameters
    ----------
    dataframe_with_nan : pandas.DataFrame
        Frame possibly containing missing values.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    for column in dataframe_with_nan:
        series = dataframe_with_nan[column]

        # Inspect the first *observed* value: a leading NaN must not make a
        # text column look numeric, and an empty column has nothing to fill.
        observed = series.dropna()
        if observed.empty or isinstance(observed.iloc[0], str):
            continue

        # Assign the filled column back to the frame. Calling
        # `fillna(..., inplace=True)` on `frame[column]` is chained assignment
        # and does not update the frame under pandas Copy-on-Write.
        dataframe_with_nan[column] = series.fillna(series.median())

    return dataframe_with_nan