# Random

In [5]:
def random_its(num_its, df_random_ages, subject_ages):
    """Append columns of randomly drawn ages to ``df_random_ages``.

    For each of ``num_its`` iterations, one value is drawn with replacement
    (via ``random.randrange``) from ``subject_ages`` for every row of
    ``df_random_ages``.  The draws of iteration ``i`` (0-based) become a new
    column named ``str(i + 1)``.

    Parameters
    ----------
    num_its : int
        Number of random columns to generate.
    df_random_ages : pandas.DataFrame
        Frame the random columns are appended to; its row count fixes the
        number of draws per iteration.  It is not modified in place.
    subject_ages : sequence
        Pool of ages to sample from; must support ``len`` and indexing with
        the integers ``0 .. len(subject_ages) - 1``.

    Returns
    -------
    pandas.DataFrame
        ``df_random_ages`` followed by the ``num_its`` new columns.  When
        ``num_its`` is not positive the input frame itself is returned.
    """
    import random
    import pandas as pd

    if num_its <= 0:
        return df_random_ages

    n_rows = len(df_random_ages)
    pool_size = len(subject_ages)

    frames = [df_random_ages]
    for i in range(num_its):
        draws = [subject_ages[random.randrange(0, pool_size)]
                 for _ in range(n_rows)]
        # Naming the column at construction also works for a zero-row frame
        frames.append(pd.DataFrame(draws, columns=[str(i + 1)]))

    # A single concat avoids re-copying the growing frame on every iteration
    return pd.concat(frames, axis=1)

In [6]:
def get_arr_mae(df_random_ages, df_subject_ages):
    """Return the MAE of each random-prediction column against the true ages.

    Column 0 of ``df_random_ages`` is skipped; every remaining column is
    scored against ``df_subject_ages`` with scikit-learn's
    ``mean_absolute_error``.  The result is a list with one entry per scored
    column, in column order.
    """
    from sklearn.metrics import mean_absolute_error

    n_columns = df_random_ages.shape[1]
    return [
        mean_absolute_error(df_subject_ages, df_random_ages.iloc[:, col])
        for col in range(1, n_columns)
    ]

# Plots

In [5]:
def plot_rand_hist(arr_mae, lines=None, xlim_min=3, x_max = None, title="", set_context='notebook', fig_tuple=(5,5), legend=True):
    """Plot a density histogram of ``arr_mae`` with a fitted Gaussian overlay.

    Parameters
    ----------
    arr_mae : sequence of float
        Mean-absolute-error values to histogram.
    lines : mapping, optional
        ``{label: x_position}``; one vertical line is drawn per entry.
        Line colours cycle through a fixed five-colour palette.
    xlim_min : float
        Lower x-axis limit.
    x_max : float, optional
        Upper x-axis limit; any falsy value (``None``, ``0``) falls back to
        ``max(arr_mae)``.
    title : str
        Figure title.
    set_context : str
        Passed to ``seaborn.set_context`` (with ``font_scale=1.2``).
    fig_tuple : tuple
        Figure size passed to ``plt.figure(figsize=...)``.
    legend : bool
        Whether to draw the legend.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    import matplotlib.pyplot as plt
    import numpy as np
    import seaborn as sns
    from scipy.stats import norm

    if not x_max:
        x_max = max(arr_mae)

    sns.set_context(set_context, font_scale=1.2)

    # A single figure: an extra plt.figure(1) call would leave an empty one
    plt.figure(figsize=fig_tuple)
    # `density` replaces the `normed` keyword removed from Matplotlib
    plt.hist(arr_mae, density=True, label='Random MAE Distribution')
    plt.xlim(xlim_min, x_max)
    plt.xlabel("Mean Absolute Error")

    # Gaussian with the sample mean / standard deviation of the MAEs;
    # scipy's norm.pdf replaces the removed matplotlib.mlab.normpdf
    mean = np.mean(arr_mae)
    sigma = np.std(arr_mae)
    grid = np.linspace(min(arr_mae), max(arr_mae), 100)
    plt.plot(grid, norm.pdf(grid, mean, sigma), color='y', label='Gaussian')

    colors = ['g', 'k', 'c', 'm', 'y']
    if lines:
        for idx, (key, position) in enumerate(lines.items()):
            # Wrap around the palette instead of raising IndexError
            plt.axvline(position, ymax=1, label=key,
                        color=colors[idx % len(colors)])

    plt.title(title)

    if legend:
        plt.legend(loc='best')

    plt.show()

In [22]:
def plot_simple_v_complex(y_one, y_two, title=None, label='Simple', fig_tuple=(5,5), fig_title=''):
    """Scatter two sets of predictions against each other with an identity line.

    Parameters
    ----------
    y_one : sequence of float
        Values plotted on the x axis (labelled ``"<label> Model Prediction"``).
    y_two : sequence of float
        Values plotted on the y axis (labelled ``"Complex Model Prediction"``).
    title : str, optional
        Text appended to both axis labels; ``None`` appends nothing.
    label : str
        Name used in the x-axis label.
    fig_tuple : tuple
        Figure size passed to ``plt.figure(figsize=...)``.
    fig_title : str
        Currently unused; kept for interface compatibility.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    # Imported locally, like the other plotting helpers, so the function does
    # not depend on names the notebook may or may not have defined globally.
    import matplotlib.pyplot as plt

    suffix = "" if title is None else title

    x_upper = max(y_one) + 5
    y_upper = max(y_two) + 5
    # The identity line reaches at least 80 and is extended when the axes go
    # further, so it is never cut short inside the visible area.
    diag_end = max(80, x_upper, y_upper)

    # A single figure: an extra plt.figure(1) call would leave an empty one
    plt.figure(figsize=fig_tuple)
    plt.plot([0, diag_end], [0, diag_end], c='black', linewidth=3)
    plt.scatter(y_one, y_two, alpha=0.5)
    plt.ylim(0, y_upper)
    plt.xlim(0, x_upper)
    plt.xlabel(label + " Model Prediction " + suffix)
    plt.ylabel("Complex Model Prediction " + suffix)

    plt.show()

In [6]:
def plot_ae_hist(plot_list, title="", set_context='notebook', fig_tuple=(5,5), file_title=''):
    """Overlay one seaborn ``distplot`` per entry of ``plot_list``.

    ``plot_list`` maps a legend label to the values to plot.  All
    distributions share one axes of size ``fig_tuple``; the x axis is labelled
    "Error" and the figure is shown with ``plt.show()``.  ``file_title`` is
    not used.
    """
    import matplotlib.pyplot as plt
    import seaborn as sns

    sns.set_context(set_context, font_scale=1.2)
    _, axis = plt.subplots(1, figsize=fig_tuple)

    for series_label, errors in plot_list.items():
        sns.distplot(errors, ax=axis, label=series_label)

    axis.legend(loc="best")
    axis.set_xlabel("Error")
    axis.set_title(title)
    plt.show()

In [None]:
def poster_plot(y_one, y_two, xl='', yl='', t='', fsize_x=5, fsize_y=5, set_context='notebook', color='g'):
    """Draw a seaborn regression plot of ``y_two`` against ``y_one``.

    Parameters
    ----------
    y_one, y_two : sequence of float
        x and y values handed to ``seaborn.regplot``.
    xl, yl, t : str
        x-axis label, y-axis label and title.
    fsize_x, fsize_y : float
        Figure width and height passed to ``plt.figure(figsize=...)``.
    set_context : str
        Passed to ``seaborn.set_context``.
    color : str
        Colour passed to ``seaborn.regplot``.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    import matplotlib.pyplot as plt
    import seaborn as sns

    sns.set_context(set_context)
    # A single figure: an extra plt.figure(1) call would leave an empty one
    plt.figure(figsize=(fsize_x, fsize_y))
    # Keyword x/y works on every seaborn release; newer ones no longer accept
    # the data vectors positionally.
    sns.regplot(x=y_one, y=y_two, color=color)
    plt.title(t)
    plt.xlabel(xl)
    plt.ylabel(yl)
    plt.show()

# Statistical Tests

## Percentile

In [12]:
def percentile(arr_mae, x):
    """Return the percentile rank of the score ``x`` within ``arr_mae``.

    Thin wrapper around ``scipy.stats.percentileofscore`` using its default
    settings.
    """
    from scipy import stats

    return stats.percentileofscore(arr_mae, x)

## Wilcoxon

In [20]:
def wilcoxon_sum(x, y):
    """Run ``scipy.stats.wilcoxon`` on the paired samples ``x`` and ``y``.

    Returns a 3-tuple ``(statistic, pvalue, rank_total)`` where
    ``rank_total`` is ``n * (n + 1) / 2`` for ``n = len(x)``, i.e. the sum of
    the integers ``1 .. n``.
    """
    from scipy import stats

    statistic, pvalue = stats.wilcoxon(x, y)

    n_pairs = len(x)
    rank_total = n_pairs * (n_pairs + 1) / 2

    return statistic, pvalue, rank_total

# Util

In [23]:
def get_wil_helper(df_atf, y_pred_atf, y_pred_ss, y_pred_stack, df_age, subj='Subject'):
    """Collect three sets of predictions per subject and their signed errors.

    A frame is built with the ``subj`` column of ``df_atf`` followed by the
    columns ``atf_pred``, ``ss_pred`` and ``stack``; it is then merged with
    ``df_age`` on ``subj``.  The column at position 4 of the merged frame
    (the first column contributed by ``df_age`` besides the key) is
    subtracted from each prediction column to give ``Atf_Err``, ``SS_Err``
    and ``Complx_Err``.

    Returns the merged frame including the three error columns.
    """
    frame = pd.DataFrame([])
    frame[subj] = df_atf[subj]

    prediction_columns = (
        ('atf_pred', y_pred_atf),
        ('ss_pred', y_pred_ss),
        ('stack', y_pred_stack),
    )
    for column, predictions in prediction_columns:
        frame[column] = predictions

    frame = frame.merge(df_age, on=subj)

    # Positions 1-3 hold the predictions; position 4 is the reference column
    reference = frame.iloc[:, 4]
    for position, error_name in enumerate(('Atf_Err', 'SS_Err', 'Complx_Err'), start=1):
        frame[error_name] = np.subtract(frame.iloc[:, position], reference)

    return frame

# Iterations

In [12]:
def mult_iterations(num_iter, df_data_ct, df_data_sv, df_data_ca, df_data_hcp_ct, df_data_hcp_sv, df_data_hcp_ca, df_data_nki_ct, df_data_nki_sv, df_data_nki_ca,
                   df_atf, df_hcp_atf, df_nki_atf):
    """Repeatedly refit the stacked and simple models and compare their errors.

    Each of the ``num_iter`` iterations:

    1. calls ``train_test_pipeline`` (``test_size=0.5``) for the ``ct``, ``sv``
       and ``ca`` tables and fits the returned pipelines;
    2. combines them through ``get_stacked_ages`` / ``stacking``;
    3. predicts with the stacked model on the ``hcp`` and ``nki`` tables;
    4. fits the simple model on ``df_atf`` and predicts on its test split,
       ``df_hcp_atf`` and ``df_nki_atf``;
    5. compares the signed errors (prediction minus age) of the simple and
       the stacked model with ``wilcoxon_sum`` and ``scipy.stats.ks_2samp``;
    6. prints the iteration number and appends it to the iteration log file.

    Parameters
    ----------
    num_iter : int
        Number of iterations.
    df_data_ct, df_data_ca : pandas.DataFrame
        Training tables; columns ``[:5124]`` are used as features and column
        ``5125`` as target.
    df_data_sv : pandas.DataFrame
        Training table; columns ``[:66]`` are features, column ``67`` target.
    df_data_hcp_ct, df_data_hcp_sv, df_data_hcp_ca : pandas.DataFrame
        Tables for the ``hcp`` set, sliced like the training tables;
        ``df_data_hcp_ct`` must have a ``Subject`` column and column ``5125``
        of ``df_data_hcp_ca`` is used as the true age.
    df_data_nki_ct, df_data_nki_sv, df_data_nki_ca : pandas.DataFrame
        Same for the ``nki`` set; ``df_data_nki_ct`` must have a
        ``participant_id`` column.
    df_atf : pandas.DataFrame
        Simple-model table; columns ``[2:]`` are features, column ``1`` target.
    df_hcp_atf : pandas.DataFrame
        Columns ``[2:]`` are features; ``Age_in_Yrs`` is the true age.
    df_nki_atf : pandas.DataFrame
        Columns ``[5:]`` are features; ``age`` is the true age.

    Returns
    -------
    pandas.DataFrame
        One row per iteration with the columns ``wstat_*`` / ``wpval_*``
        (Wilcoxon) and ``kstat_*`` / ``kpval_*`` (two-sample KS) for the
        suffixes ``ts`` (held-out test split), ``hcp`` and ``nki``.  With
        ``num_iter == 0`` the frame has these columns and no rows.
    """
    import numpy as np
    import pandas as pd
    from scipy import stats

    # One list per output column, appended to once per iteration
    results = {
        'wstat_ts': [], 'wpval_ts': [],
        'wstat_hcp': [], 'wpval_hcp': [],
        'wstat_nki': [], 'wpval_nki': [],
        'kstat_ts': [], 'kpval_ts': [],
        'kstat_hcp': [], 'kpval_hcp': [],
        'kstat_nki': [], 'kpval_nki': [],
    }

    for i in range(num_iter):
        # for ct
        X_train_ct, X_test_ct, y_train_ct, y_test_ct, pipe_ct = train_test_pipeline(
            df_data_ct.iloc[:, :5124],
            df_data_ct.iloc[:, 5125],
            test_size=0.5)
        pipe_ct.fit(X=X_train_ct, y=y_train_ct)

        # for sv -- the ct split is handed over via model_train/model_test
        # (presumably so the same subjects are used; confirm in
        # train_test_pipeline)
        X_train_sv, X_test_sv, y_train_sv, y_test_sv, pipe_sv = train_test_pipeline(
            df_data_sv.iloc[:, :66],
            df_data_sv.iloc[:, 67],
            test_size=0.5,
            model='yes', model_train=X_train_ct,
            model_test=X_test_ct)
        # .values replaces .as_matrix(), which no longer exists in pandas >= 1.0
        pipe_sv.fit(X=X_train_sv, y=y_train_sv.values.ravel())

        # for ca
        X_train_ca, X_test_ca, y_train_ca, y_test_ca, pipe_ca = train_test_pipeline(
            df_data_ca.iloc[:, :5124],
            df_data_ca.iloc[:, 5125],
            test_size=0.5,
            model='yes', model_train=X_train_ct,
            model_test=X_test_ct)
        pipe_ca.fit(X=X_train_ca, y=y_train_ca.values.ravel())

        # for complex stacked
        source_dict = {
            'aseg': get_stacked_ages(y_train_sv, pipe_sv.predict(X_train_sv),
                                     cross_val_predict(pipe_sv, X_train_sv, y_train_sv.values.ravel()),
                                     y_test_sv, pipe_sv.predict(X_test_sv)),
            'ct': get_stacked_ages(y_train_ct, pipe_ct.predict(X_train_ct),
                                   cross_val_predict(pipe_ct, X_train_ct, y_train_ct),
                                   y_test_ct, pipe_ct.predict(X_test_ct), first=True),
            'ca': get_stacked_ages(y_train_ca, pipe_ca.predict(X_train_ca),
                                   cross_val_predict(pipe_ca, X_train_ca, y_train_ca.values.ravel()),
                                   y_test_ca, pipe_ca.predict(X_test_ca)),
        }
        source_selection_dict = {'fs': ['aseg', 'ct', 'ca'],}
        scores_test, dd_train, dd_test, pipe_stack = stacking(source_dict, source_selection_dict, 'age', show=False)

        # stacked predictions for hcp (only the first three columns feed the stack)
        df_stack_hcp = pd.DataFrame([])
        df_stack_hcp['aseg'] = pipe_sv.predict(df_data_hcp_sv.iloc[:, :66])
        df_stack_hcp['ct'] = pipe_ct.predict(df_data_hcp_ct.iloc[:, :5124])
        df_stack_hcp['ca'] = pipe_ca.predict(df_data_hcp_ca.iloc[:, :5124])
        df_stack_hcp['Subject'] = df_data_hcp_ct.Subject.values

        y_predicted_stack_hcp = pipe_stack.predict(df_stack_hcp.iloc[:, :3])

        # stacked predictions for nki
        df_stack_nki = pd.DataFrame([])
        df_stack_nki['aseg'] = pipe_sv.predict(df_data_nki_sv.iloc[:, :66])
        df_stack_nki['ct'] = pipe_ct.predict(df_data_nki_ct.iloc[:, :5124])
        df_stack_nki['ca'] = pipe_ca.predict(df_data_nki_ca.iloc[:, :5124])
        df_stack_nki['Subject'] = df_data_nki_ct.participant_id.values

        y_predicted_stack_nki = pipe_stack.predict(df_stack_nki.iloc[:, :3])

        # for simple model
        X_train_atf, X_test_atf, y_train_atf, y_test_atf, pipe_atf = train_test_pipeline(
            df_atf.iloc[:, 2:],
            df_atf.iloc[:, 1],
            test_size=0.5,
            model='yes',
            model_train=X_train_ct,
            model_test=X_test_ct)
        pipe_atf.fit(X=X_train_atf, y=y_train_atf.values.ravel())

        # get prediction after fitting
        y_predicted_test_atf = pipe_atf.predict(X_test_atf)
        y_predicted_hcp_atf = pipe_atf.predict(df_hcp_atf.iloc[:, 2:])
        y_predicted_nki_atf = pipe_atf.predict(df_nki_atf.iloc[:, 5:])

        temp_ts = y_test_atf.assign(y_pred=y_predicted_test_atf).sort_index()

        # (suffix, simple-model error, stacked-model error) per evaluation set
        error_pairs = (
            ('ts',
             np.subtract(temp_ts.y_pred.values, temp_ts.age_at_scan.values),
             np.subtract(dd_test.pred_age_test, dd_test.age_at_scan)),
            ('hcp',
             np.subtract(y_predicted_hcp_atf, df_hcp_atf.Age_in_Yrs),
             np.subtract(y_predicted_stack_hcp, df_data_hcp_ca.iloc[:, 5125])),
            ('nki',
             np.subtract(y_predicted_nki_atf, df_nki_atf.age),
             np.subtract(y_predicted_stack_nki, df_data_nki_ca.iloc[:, 5125])),
        )
        for suffix, simple_err, stack_err in error_pairs:
            w_stat, w_pval, _ = wilcoxon_sum(simple_err, stack_err)
            results['wstat_' + suffix].append(w_stat)
            results['wpval_' + suffix].append(w_pval)

            k_stat, k_pval = stats.ks_2samp(simple_err, stack_err)
            results['kstat_' + suffix].append(k_stat)
            results['kpval_' + suffix].append(k_pval)

        print(i)

        # The with-block closes the log file; no explicit close() is needed
        with open('/data/NNDSP/anal/analysis_notebooks/other_files/iter_log.txt','a') as f:
            f.write("Iteration %d\n" % (i))

    # Built once after the loop so num_iter == 0 yields an empty frame
    # instead of a NameError
    return pd.DataFrame(results)

In [None]:
def subsample_iterations(num_iter, 
                         df_data_ct, df_data_sv, df_data_ca, 
                         df_data_hcp_ct, df_data_hcp_sv, df_data_hcp_ca, 
                         df_data_nki_ct, df_data_nki_sv, df_data_nki_ca,
                         df_atf, df_hcp_atf, df_nki_atf):
    """Repeatedly refit the stacked and simple models and compare their errors.

    Each of the ``num_iter`` iterations:

    1. calls ``train_test_pipeline`` (``test_size=0.5``) for the ``ct``, ``sv``
       and ``ca`` tables and fits the returned pipelines;
    2. combines them through ``get_stacked_ages`` / ``stacking``;
    3. predicts with the stacked model on the ``hcp`` and ``nki`` tables;
    4. fits the simple model on ``df_atf`` and predicts on its test split,
       ``df_hcp_atf`` and ``df_nki_atf``;
    5. compares the signed errors (prediction minus age) of the simple and
       the stacked model with ``wilcoxon_sum`` and ``scipy.stats.ks_2samp``;
    6. prints the iteration number and appends it to the iteration log file.

    Parameters
    ----------
    num_iter : int
        Number of iterations.
    df_data_ct, df_data_ca : pandas.DataFrame
        Training tables; columns ``[:5124]`` are features and the ``age``
        column is the target.
    df_data_sv : pandas.DataFrame
        Training table; columns ``[:66]`` are features, ``age`` is the target.
    df_data_hcp_ct, df_data_hcp_sv, df_data_hcp_ca : pandas.DataFrame
        Tables for the ``hcp`` set, sliced like the training tables;
        ``df_data_hcp_ct`` must have a ``subject`` column and column ``5125``
        of ``df_data_hcp_ca`` is used as the true age.
    df_data_nki_ct, df_data_nki_sv, df_data_nki_ca : pandas.DataFrame
        Same for the ``nki`` set; ``df_data_nki_ct`` must have a ``subject``
        column.
    df_atf : pandas.DataFrame
        Simple-model table; columns ``[3:]`` are features, ``age`` the target.
    df_hcp_atf : pandas.DataFrame
        Columns ``[2:]`` are features; ``Age_in_Yrs`` is the true age.
    df_nki_atf : pandas.DataFrame
        Columns ``[5:]`` are features; ``age`` is the true age.

    Returns
    -------
    pandas.DataFrame
        One row per iteration with the columns ``wstat_*`` / ``wpval_*``
        (Wilcoxon) and ``kstat_*`` / ``kpval_*`` (two-sample KS) for the
        suffixes ``ts`` (held-out test split), ``hcp`` and ``nki``.  With
        ``num_iter == 0`` the frame has these columns and no rows.
    """
    import numpy as np
    import pandas as pd
    from scipy import stats

    # One list per output column, appended to once per iteration
    results = {
        'wstat_ts': [], 'wpval_ts': [],
        'wstat_hcp': [], 'wpval_hcp': [],
        'wstat_nki': [], 'wpval_nki': [],
        'kstat_ts': [], 'kpval_ts': [],
        'kstat_hcp': [], 'kpval_hcp': [],
        'kstat_nki': [], 'kpval_nki': [],
    }

    for i in range(num_iter):
        # for ct
        X_train_ct, X_test_ct, y_train_ct, y_test_ct, pipe_ct = train_test_pipeline(
            df_data_ct.iloc[:, :5124],
            df_data_ct.age,
            test_size=0.5)
        pipe_ct.fit(X=X_train_ct, y=y_train_ct)

        # for sv -- the ct split is handed over via model_train/model_test
        # (presumably so the same subjects are used; confirm in
        # train_test_pipeline)
        X_train_sv, X_test_sv, y_train_sv, y_test_sv, pipe_sv = train_test_pipeline(
            df_data_sv.iloc[:, :66],
            df_data_sv.age,
            test_size=0.5,
            model='yes', model_train=X_train_ct,
            model_test=X_test_ct)
        # .values replaces .as_matrix(), which no longer exists in pandas >= 1.0
        pipe_sv.fit(X=X_train_sv, y=y_train_sv.values.ravel())

        # for ca
        X_train_ca, X_test_ca, y_train_ca, y_test_ca, pipe_ca = train_test_pipeline(
            df_data_ca.iloc[:, :5124],
            df_data_ca.age,
            test_size=0.5,
            model='yes', model_train=X_train_ct,
            model_test=X_test_ct)
        pipe_ca.fit(X=X_train_ca, y=y_train_ca.values.ravel())

        # for complex stacked
        source_dict = {
            'aseg': get_stacked_ages(y_train_sv, pipe_sv.predict(X_train_sv),
                                     cross_val_predict(pipe_sv, X_train_sv, y_train_sv.values.ravel()),
                                     y_test_sv, pipe_sv.predict(X_test_sv)),
            'ct': get_stacked_ages(y_train_ct, pipe_ct.predict(X_train_ct),
                                   cross_val_predict(pipe_ct, X_train_ct, y_train_ct),
                                   y_test_ct, pipe_ct.predict(X_test_ct), first=True),
            'ca': get_stacked_ages(y_train_ca, pipe_ca.predict(X_train_ca),
                                   cross_val_predict(pipe_ca, X_train_ca, y_train_ca.values.ravel()),
                                   y_test_ca, pipe_ca.predict(X_test_ca)),
        }
        source_selection_dict = {'fs': ['aseg', 'ct', 'ca'],}
        scores_test, dd_train, dd_test, pipe_stack = stacking(source_dict, source_selection_dict, 'age', show=False)

        # stacked predictions for hcp (only the first three columns feed the stack)
        df_stack_hcp = pd.DataFrame([])
        df_stack_hcp['aseg'] = pipe_sv.predict(df_data_hcp_sv.iloc[:, :66])
        df_stack_hcp['ct'] = pipe_ct.predict(df_data_hcp_ct.iloc[:, :5124])
        df_stack_hcp['ca'] = pipe_ca.predict(df_data_hcp_ca.iloc[:, :5124])
        df_stack_hcp['Subject'] = df_data_hcp_ct.subject.values

        y_predicted_stack_hcp = pipe_stack.predict(df_stack_hcp.iloc[:, :3])

        # stacked predictions for nki
        df_stack_nki = pd.DataFrame([])
        df_stack_nki['aseg'] = pipe_sv.predict(df_data_nki_sv.iloc[:, :66])
        df_stack_nki['ct'] = pipe_ct.predict(df_data_nki_ct.iloc[:, :5124])
        df_stack_nki['ca'] = pipe_ca.predict(df_data_nki_ca.iloc[:, :5124])
        df_stack_nki['Subject'] = df_data_nki_ct.subject.values

        y_predicted_stack_nki = pipe_stack.predict(df_stack_nki.iloc[:, :3])

        # for simple model
        X_train_atf, X_test_atf, y_train_atf, y_test_atf, pipe_atf = train_test_pipeline(
            df_atf.iloc[:, 3:],
            df_atf.age,
            test_size=0.5,
            model='yes',
            model_train=X_train_ct,
            model_test=X_test_ct)
        pipe_atf.fit(X=X_train_atf, y=y_train_atf.values.ravel())

        # get prediction after fitting
        y_predicted_test_atf = pipe_atf.predict(X_test_atf)
        y_predicted_hcp_atf = pipe_atf.predict(df_hcp_atf.iloc[:, 2:])
        y_predicted_nki_atf = pipe_atf.predict(df_nki_atf.iloc[:, 5:])

        # NOTE(review): this needs y_test_atf to be a DataFrame exposing an
        # 'age_at_scan' column even though the target passed in above is
        # df_atf.age -- confirm what train_test_pipeline returns.
        temp_ts = y_test_atf.assign(y_pred=y_predicted_test_atf).sort_index()

        # (suffix, simple-model error, stacked-model error) per evaluation set
        error_pairs = (
            ('ts',
             np.subtract(temp_ts.y_pred.values, temp_ts.age_at_scan.values),
             np.subtract(dd_test.pred_age_test, dd_test.age_at_scan)),
            ('hcp',
             np.subtract(y_predicted_hcp_atf, df_hcp_atf.Age_in_Yrs),
             np.subtract(y_predicted_stack_hcp, df_data_hcp_ca.iloc[:, 5125])),
            ('nki',
             np.subtract(y_predicted_nki_atf, df_nki_atf.age),
             np.subtract(y_predicted_stack_nki, df_data_nki_ca.iloc[:, 5125])),
        )
        for suffix, simple_err, stack_err in error_pairs:
            w_stat, w_pval, _ = wilcoxon_sum(simple_err, stack_err)
            results['wstat_' + suffix].append(w_stat)
            results['wpval_' + suffix].append(w_pval)

            k_stat, k_pval = stats.ks_2samp(simple_err, stack_err)
            results['kstat_' + suffix].append(k_stat)
            results['kpval_' + suffix].append(k_pval)

        print(i)

        # The with-block closes the log file; no explicit close() is needed
        with open('/data/NNDSP/anal/analysis_notebooks/other_files/iter_log.txt','a') as f:
            f.write("Iteration %d\n" % (i))

    # Built once after the loop so num_iter == 0 yields an empty frame
    # instead of a NameError
    return pd.DataFrame(results)