In [25]:
%run /data/NNDSP/anal/analysis_notebooks/util.ipynb

## Setup

In [2]:
from pathlib import Path
import pandas as pd
import numpy as np
import subprocess
from datetime import date
import re

In [3]:
analysis_version = "2017_07_07"
project_dir_absolute = Path('/gpfs/gsfs6/users/NNDSP')

# Project Directory
project_dir = Path(project_dir_absolute) # needs to be pathlib.Path object

# Bids Directory
bids_fs = project_dir.joinpath('derived/fs5.3_subj') # freesurfer bids
bids_bar = project_dir.joinpath('derived/bar_subj') # baracus bids

# Brain Age File
brain_age_file = project_dir.joinpath('anal/analysis_notebooks/NNDSP_famid.csv')

print(brain_age_file.absolute())

/gpfs/gsfs6/users/NNDSP/anal/analysis_notebooks/NNDSP_famid.csv


## Load Subjects

In [4]:
import os
from glob import glob

# get the subjects in the freesurfer directory
subj_dirs = [os.path.basename(s) for s in sorted(glob(os.path.join(bids_fs, "sub-*")))]

In [5]:
subjects_to_analyze = subject_to_anal(subj_dirs, bids_bar)

In [7]:
# Getting Brain Age
df_target = pd.read_csv(brain_age_file, sep=',')
del df_target['nuclear_fam_id']
del df_target['MRN']

## Itarate through

In [31]:
df_pred_test = pd.DataFrame([])

In [44]:
from sklearn.cross_validation import cross_val_predict

for i in range(3):
    features = extract_features(subjects_to_analyze, bids_bar, num_feat = i)
    
    df_data_ct = get_source_data('thickness', features, df_target) # extract thickness info from features
    df_data_ct = df_data_ct.sort_values(by='MASKID')

    df_data_sv = get_source_data('aseg', features, df_target)
    df_data_sv = df_data_sv.sort_values(by='MASKID')
    
    df_data_ca = get_source_data('area', features, df_target)
    df_data_ca = df_data_ca.sort_values(by='MASKID')
    
    #split test and train data into equal parts
    X_train_ct, X_test_ct, y_train_ct, y_test_ct, pipe_ct = train_test_pipeline(df_data_ct.iloc[:, :df_data_ct.shape[1]-3], df_data_ct.iloc[:,df_data_ct.shape[1]-1], test_size = 0.5, random_state=None)
    # fit model
    pipe_ct.fit(X=X_train_ct, y=y_train_ct)
    
    # prediction and errors
    y_predicted_train_ct = pipe_ct.predict(X_train_ct)
    y_predicted_test_ct = pipe_ct.predict(X_test_ct)
    y_predicted_ct_cv = cross_val_predict(pipe_ct, X_train_ct, y_train_ct)
    
    # Train Test Split
    X_train_sv, X_test_sv, y_train_sv, y_test_sv, pipe_sv = train_test_pipeline(df_data_sv.iloc[:, :df_data_sv.shape[1]-3], df_data_sv.iloc[:,df_data_sv.shape[1]-1], test_size = 0.5, random_state=666, model='yes', model_train=X_train_ct, model_test=X_test_ct)

    # fit model
    pipe_sv.fit(X=X_train_sv, y= y_train_sv.as_matrix().ravel())

    # prediction and errors
    y_predicted_train_sv = pipe_sv.predict(X_train_sv)
    y_predicted_test_sv = pipe_sv.predict(X_test_sv)
    y_predicted_sv_cv = cross_val_predict(pipe_sv, X_train_sv, y_train_sv.as_matrix().ravel())
    
    # Train Test Split
    #split test and train data into equal parts
    X_train_ca, X_test_ca, y_train_ca, y_test_ca, pipe_ca = train_test_pipeline(df_data_ca.iloc[:, :df_data_ca.shape[1]-3], df_data_ca.iloc[:,df_data_ca.shape[1]-1], 
                                                                                                test_size = 0.5, random_state=666, model='yes', model_train=X_train_ct, model_test=X_test_ct)
    
    # fit model
    pipe_ca.fit(X=X_train_ca, y=y_train_ca.as_matrix().ravel())
    
    # prediction and errors
    y_predicted_train_ca = pipe_ca.predict(X_train_ca)
    y_predicted_test_ca = pipe_ca.predict(X_test_ca)
    y_predicted_ca_cv = cross_val_predict(pipe_ca, X_train_ca, y_train_ca.as_matrix().ravel())
    
    # stack predicted values
    # get stacked age predictions (test and train) for ct
    df_y_ct = get_stacked_ages(y_train_ct, y_predicted_train_ct, y_predicted_ct_cv, y_test_ct, y_predicted_test_ct, first=True)
    # get stacked age prediction (test and train) for ca
    df_y_ca = get_stacked_ages(y_train_ca, y_predicted_train_ca, y_predicted_ca_cv, y_test_ca, y_predicted_test_ca)
    # get stacked age prediction (test and train) for sv
    df_y_sv = get_stacked_ages(y_train_sv, y_predicted_train_sv, y_predicted_sv_cv, y_test_sv, y_predicted_test_sv)

    # stacking function inputes
    target = 'age'
    source_dict = {'aseg': df_y_sv, 'ct': df_y_ct,'ca': df_y_ca}
    source_selection_dict = {'fs': ['aseg', 'ct', 'ca'],}
    
    scores_test, dd_train, dd_test, pipe_stack = stacking(source_dict, source_selection_dict, target, show=False)
    
    df_pred_test = pd.concat([df_pred_test, pd.DataFrame(dd_test.pred_age_test.values)], axis = 1)

Fitting stacking model
best max_depth: 4
Fitting stacking model
best max_depth: 3
Fitting stacking model
best max_depth: 3


In [45]:
df_pred_test

Unnamed: 0,i,0,0.1,0.2
0,,24.832333,27.564655,27.733398
1,36.389033,29.933926,27.500569,30.290329
2,,9.708646,25.430372,12.540339
3,13.797961,17.125499,11.425952,18.391074
4,,9.540635,24.996964,25.154847
5,,23.119943,11.198553,10.578218
6,,9.477300,37.207158,29.246278
7,,19.165989,22.850103,19.848275
8,,17.471553,32.703766,21.591304
9,,36.360034,21.429380,31.581713
