In [104]:
# Run the shared setup notebook before anything else in this notebook.
# NOTE(review): mriqc_df and the *_subjectdata helpers called further down are not
# defined anywhere in this notebook, so they presumably come from util_setup.ipynb — confirm.
%run /data/NNDSP/anal/analysis_notebooks/follow_up_analysis/util_setup.ipynb

# Hypothesis 2.3

If there are data quality differences between datasets, then the dataset identity will predict quality metrics.

## Setup

In [2]:
from pathlib import Path
import pandas as pd
import numpy as np
import subprocess
from datetime import date
import re
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob

In [36]:
# Filesystem layout for every dataset used in this notebook.
# Each dataset gets a project root plus its BIDS, FreeSurfer, BARACUS, MRIQC and
# phenotype locations, all built relative to that root.

# Project root
project_dir = Path('/data/NNDSP')

# ---- NNDSP ----
nndsp_project_dir = Path('/data/NNDSP')
nndsp_bids_dir = nndsp_project_dir / 'bids_2017_07_14_generic'
nndsp_fs_dir = nndsp_project_dir / 'derivatives/fs5.3_subj'
nndsp_bar_dir = nndsp_project_dir / 'derivatives/bar_subj'
nndsp_mriqc_dir = nndsp_project_dir / 'derivatives/mriqc_2018_06_08'
nndsp_pheno_file = nndsp_project_dir / 'anal/analysis_notebooks/phenotype_files/NNDSP_famid.csv'

# ---- HCP ----
# HCP derivatives and phenotype files sit under the NNDSP project root; only the
# BIDS data lives in its own tree.
hcp_project_dir = nndsp_project_dir
hcp_bids_dir = Path('/data/HCP/HCP_900/s3/hcp')
hcp_fs_dir = hcp_project_dir / 'derivatives/fs_hcp_subj'
hcp_bar_dir = hcp_project_dir / 'derivatives/bar_hcp_subj'
# Disabled: no HCP MRIQC directory is configured.
# hcp_mriqc_dir = hcp_project_dir.joinpath('derivatives/mriqc_hcp')
hcp_pheno_file = hcp_project_dir / 'nino/HCP_ages.csv'
hcp_pheno2_file = hcp_project_dir / 'anal/analysis_notebooks/phenotype_files/hcp_unrestricted.csv'

# ---- NKI ----
# NKI also shares the NNDSP project root.
nki_project_dir = nndsp_project_dir
nki_bids_dir = nki_project_dir / 'anal/NKI'
nki_fs_dir = nki_project_dir / 'derivatives/fs_nki_subj'
nki_bar_dir = nki_project_dir / 'derivatives/bar_nki_subj'
nki_mriqc_dir = nki_project_dir / 'derivatives/mriqc_nki_2018_07_16'
nki_pheno_file = nki_project_dir / 'anal/analysis_notebooks/phenotype_files/participants.tsv'

# ---- CoRR ----
corr_project_dir = Path('/data/DSST/CoRR')
corr_bids_dir = corr_project_dir / 'bids_corr'
corr_fs_dir = corr_project_dir / 'derivatives/fs_corr'
corr_bar_dir = corr_project_dir / 'derivatives/bar_corr/baracus'
corr_mriqc_dir = corr_project_dir / 'derivatives/mriqc_corr'
corr_pheno_file = corr_project_dir / 'phenotype_files/corr_ages.csv'

# ---- SALD ----
sald_project_dir = Path('/data/DSST/SALD')
sald_bids_dir = sald_project_dir / 'bids_sald'
sald_fs_dir = sald_project_dir / 'derivatives/fs_sald'
sald_bar_dir = sald_project_dir / 'derivatives/bar_sald/baracus'
sald_mriqc_dir = sald_project_dir / 'derivatives/mriqc_sald'
sald_pheno_file = sald_project_dir / 'phenotype_files/sub_information.xlsx'

## Load MRIQC JSON Files

In [37]:
import json
import re

In [110]:
# Build one MRIQC table per dataset. For each dataset the ['subject', 'sex'] columns
# of its subject table are passed to mriqc_df as merge_df, and the dataset is tagged
# with a name (data) and a numeric code (data_class).

# NNDSP -> data_class 1
nndsp_sex = nndsp_subjectdata(nndsp_fs_dir, nndsp_pheno_file)[['subject', 'sex']]
df_nndsp = mriqc_df(
    nndsp_mriqc_dir,
    nndsp_pheno_file,
    data='NNDSP',
    data_class=1,
    merge_df=nndsp_sex,
)

# HCP -> data_class 2 (disabled)
# df_hcp = mriqc_df(hcp_mriqc_dir, hcp_pheno_file, data='HCP', data_class = 2,
#                  merge_df = (hcp_subjectdata(hcp_fs_dir, hcp_pheno_file, hcp_pheno2_file)[['subject', 'sex']]))

# NKI -> data_class 3
nki_sex = nki_subjectdata(nki_fs_dir, nki_pheno_file)[['subject', 'sex']]
df_nki = mriqc_df(
    nki_mriqc_dir,
    nki_pheno_file,
    data='NKI',
    data_class=3,
    merge_df=nki_sex,
)

# CoRR -> data_class 4 (disabled)
# df_corr = mriqc_df(corr_mriqc_dir, corr_pheno_file, data='CoRR', data_class = 4,
#                   merge_df = (corr_subjectdata(corr_fs_dir, corr_pheno_file)[['subject','sex']]))

# SALD -> data_class 5
sald_sex = sald_subjectdata(sald_fs_dir, sald_pheno_file)[['subject', 'sex']]
df_sald = mriqc_df(
    sald_mriqc_dir,
    sald_pheno_file,
    data='SALD',
    data_class=5,
    merge_df=sald_sex,
)

In [114]:
# Stack the per-dataset MRIQC tables row-wise. pd.concat's defaults (axis=0,
# join='outer') keep the union of columns across the three frames.
mriqc_frames = [df_nndsp, df_nki, df_sald]
df_all = pd.concat(mriqc_frames)
df_all.head()

Unnamed: 0,acq_id,cjv,cnr,data_class,dataset,efc,fber,fwhm_avg,fwhm_x,fwhm_y,...,summary_wm_mean,summary_wm_median,summary_wm_n,summary_wm_p05,summary_wm_p95,summary_wm_stdv,tpm_overlap_csf,tpm_overlap_gm,tpm_overlap_wm,wm2max
0,0.0,0.450745,2.966345,1,NNDSP,0.73165,26904.433594,3.339224,3.68108,3.748546,...,1000.574402,1000.011963,197615.0,921.248688,1081.648413,49.010384,0.222196,0.50536,0.562915,0.424826
1,0.0,0.597186,2.285747,1,NNDSP,0.738215,25883.199219,3.400049,3.794461,3.804922,...,1001.480408,1000.007324,197970.0,924.199457,1084.025879,49.015297,0.21316,0.493546,0.555361,0.39811
2,0.0,0.325163,4.105503,1,NNDSP,0.715936,22670.064453,3.143957,3.30256,3.607913,...,1000.676392,1000.012146,161564.0,906.751019,1096.440839,57.998871,0.202775,0.545467,0.515358,0.378795
3,0.0,0.331401,4.022244,1,NNDSP,0.731355,30989.095703,3.051045,3.257622,3.420898,...,1000.7724,1000.005615,170270.0,908.286295,1095.220068,57.538391,0.199213,0.536563,0.520068,0.380536
4,0.0,0.374933,3.617528,1,NNDSP,0.634474,8574.626953,3.196221,3.525169,3.623307,...,999.297913,999.996094,155860.0,927.353989,1068.836938,43.263741,0.192762,0.516845,0.550855,0.458206


In [None]:
import statsmodels as sm
import statsmodels.regression.mixed_linear_model as smf

# Hypothesis 2.3 model: does dataset identity predict an MRIQC quality metric?
#
# NOTE(review): the original cell called `smf.MixedLM()` with no arguments (endog,
# exog and groups are required) and printed `mdf`, which was never assigned, so it
# could not run. The specification below is a minimal runnable version; the outcome
# metric and the grouping column are assumptions to confirm against the analysis plan.
quality_metric = 'cjv'       # any numeric MRIQC column of df_all can be substituted
grouping_column = 'subject'  # TODO confirm: assumes df_all kept the 'subject' column from merge_df

# Fail early with a readable message if an assumed column is absent.
required_columns = [quality_metric, 'dataset', grouping_column]
absent_columns = [col for col in required_columns if col not in df_all.columns]
if absent_columns:
    raise KeyError(
        'df_all is missing column(s) needed for the mixed model: {}'.format(absent_columns)
    )

# Use complete cases only, and give the rows a fresh positional index: df_all is a
# plain concat of per-dataset frames, so its row labels may repeat.
model_data = df_all.dropna(subset=required_columns).reset_index(drop=True)

# Dataset enters as a categorical fixed effect; each group gets a random intercept.
temp = smf.MixedLM.from_formula(
    '{} ~ C(dataset)'.format(quality_metric),
    data=model_data,
    groups=grouping_column,
)
mdf = temp.fit()

print(mdf.summary())