## Driver code comparing different QC / OD lists 
- QC: manual / visual quality checks
- OD: automatic outlier detection 

### Steps
- Import the CSVs produced by the QC / OD procedures
- Compare the overlap of flagged subjects across the QC / OD lists

In [1]:
import sys
import numpy as np
import pandas as pd
import itertools
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib_venn import venn2, venn3

sys.path.append('../')
from lib.data_handling import *
from lib.data_stats import *

### Data paths

In [31]:
# --- Directory roots (absolute, machine-specific locations) ---
proj_dir = '/home/nikhil/projects/CT_reproduce/code/compare-surf-tools/'
qc_dir = '/home/nikhil/projects/CT_reproduce/data/QC/'
data_dir = proj_dir + 'data/'
outlier_dir = data_dir + 'outliers/'

# --- Manual (visual) QC table; holds the QC_maarten and QC_gleb ratings ---
manual_outlier_file = qc_dir + 'master_QC_table.csv'

# --- Automated outlier detection: per pipeline ---
pipeline_outlier_file = outlier_dir + 'pipeline_outlier_data.csv'

# --- Automated outlier detection: per atlas ---
aparc_outlier_file = outlier_dir + 'aparc_outlier_data.csv'
destr_outlier_file = outlier_dir + 'des_outlier_data.csv'
glass_outlier_file = outlier_dir + 'glas_outlier_data.csv'

# --- Automated outlier detection: high-dimensional data, one file per hemisphere ---
highdim_lh_ctrl = outlier_dir + 'outlier_lh_con.csv'
highdim_rh_ctrl = outlier_dir + 'outlier_rh_con.csv'

# Lookup from a short source label to its CSV path. The labels are reused as
# dictionary keys when the tables are loaded, so they must stay exactly as
# spelled here (including 'auto_pipline').
outlier_file_dict = dict([
    ('manual', manual_outlier_file),
    ('auto_pipline', pipeline_outlier_file),
    ('auto_aparc', aparc_outlier_file),
    ('auto_destr', destr_outlier_file),
    ('auto_glass', glass_outlier_file),
    ('auto_highdim_lh_ctrl', highdim_lh_ctrl),
    ('auto_highdim_rh_ctrl', highdim_rh_ctrl),
])

# Candidate (column, values-to-keep) criteria, left commented out:
# qc_criterion = ('QC_maarten',[0]) # keep subs
# qc_criterion = ('QC_gleb',['1','-+1']) # keep subs
# qc_criterion = ('outlier_all_data',[0])

### Global variables

In [32]:
# Name of the subject-identifier column; used as the join key when the QC / OD
# tables are merged. The high-dim outlier files name this column 'SUB_ID'
# instead and are renamed to this value before merging.
subject_ID_col = 'SubjID'

In [54]:
# Load every QC / OD table listed in outlier_file_dict into a dict of
# DataFrames that uses the same source labels as keys.
outlier_data_dict = {}
for key, csv_path in outlier_file_dict.items():
    # Read the subject ID as an integer so the tables share one key dtype.
    # The high-dim files carry 'SUB_ID' rather than this column; the original
    # run loaded them with the same dtype mapping, so the entry is harmless there.
    df = pd.read_csv(csv_path, dtype={subject_ID_col: int})

    if key == 'manual':
        # Recode both raters to one binary flag: 0.0 = keep, 1.0 = flagged.
        # A missing rating stays NaN instead of being counted as flagged
        # (NaN != 0.0 is True, so a plain "!= 0.0" recode would turn every
        # unrated subject into a QC failure).
        maarten_raw = df['QC_maarten']
        gleb_raw = df['QC_gleb']

        # QC_maarten: 0 means keep, any other rating means flagged.
        maarten_flag = (maarten_raw != 0.0).astype(float)
        df['QC_maarten'] = maarten_flag.where(maarten_raw.notnull())

        # QC_gleb: the strings '1' and '-+1' mean keep; a value already equal
        # to 0 is also left as keep. Everything else that is rated is flagged.
        # The result is a float column rather than a mixed object column.
        gleb_keep = gleb_raw.isin(['1', '-+1']) | (gleb_raw == 0.0)
        gleb_flag = (~gleb_keep).astype(float)
        df['QC_gleb'] = gleb_flag.where(gleb_raw.notnull())

    outlier_data_dict[key] = df
    print('shape of QC file {}: {}'.format(key,df.shape))

shape of QC file auto_glass: (1047, 4)
shape of QC file auto_destr: (1047, 4)
shape of QC file auto_highdim_rh_ctrl: (542, 3)
shape of QC file manual: (1031, 6)
shape of QC file auto_highdim_lh_ctrl: (542, 3)
shape of QC file auto_pipline: (3890, 4)
shape of QC file auto_aparc: (1047, 4)


In [73]:
# Build one master table: the subject list comes from the aparc outlier file,
# and every other QC / OD source is left-joined onto it by subject ID, so
# subjects missing from a source get NaN in that source's columns.
master_QC_OD_df = outlier_data_dict['auto_aparc'][[subject_ID_col]].copy()

# Right-hand tables, collected in the order their columns should appear.
tables_to_join = []

# Manual: the two visual-QC rating columns
manual_df = outlier_data_dict['manual'][[subject_ID_col,'QC_maarten','QC_gleb']]
tables_to_join.append(manual_df)

# Pipelines: the file is in long format, so split it into one flag column per pipeline
pipe_df = outlier_data_dict['auto_pipline']
for pipeline_name in pipe_df['pipeline'].unique():
    is_this_pipeline = pipe_df['pipeline'] == pipeline_name
    per_pipeline = pipe_df.loc[is_this_pipeline, [subject_ID_col,'outlier_ind']]
    flag_name = '{}_outlier_ind'.format(pipeline_name)
    tables_to_join.append(per_pipeline.rename(columns={'outlier_ind': flag_name}))

# Atlases: one 'outlier_all_data' column per atlas, prefixed with the atlas name
for atlas_name in ['aparc','destr','glass']:
    per_atlas = outlier_data_dict['auto_{}'.format(atlas_name)]
    per_atlas = per_atlas[[subject_ID_col,'outlier_all_data']]
    flag_name = '{}_outlier_all_data'.format(atlas_name)
    tables_to_join.append(per_atlas.rename(columns={'outlier_all_data': flag_name}))

# High-dim (outlier_ind): these files key subjects by 'SUB_ID', so rename it to the shared key
for hemi_group in ['lh_ctrl','rh_ctrl']:
    per_highdim = outlier_data_dict['auto_highdim_{}'.format(hemi_group)]
    per_highdim = per_highdim[['SUB_ID','outlier_ind']]
    new_names = {'SUB_ID': subject_ID_col,
                 'outlier_ind': '{}_outlier_ind'.format(hemi_group)}
    tables_to_join.append(per_highdim.rename(columns=new_names))

# Apply the left joins one after another, in collection order.
for right_table in tables_to_join:
    master_QC_OD_df = pd.merge(master_QC_OD_df, right_table, on=subject_ID_col, how='left')

In [74]:
# Preview the first rows of the merged QC / OD table instead of rendering the whole frame.
master_QC_OD_df.head(10)

Unnamed: 0,SubjID,QC_maarten,QC_gleb,ants_outlier_ind,civet_outlier_ind,fs51_outlier_ind,fs53_outlier_ind,fs60_outlier_ind,aparc_outlier_all_data,destr_outlier_all_data,glass_outlier_all_data,lh_ctrl_outlier_ind,rh_ctrl_outlier_ind
0,50002,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0,0,0,,
1,50003,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0,0,0,,
2,50004,0.0,1.0,,,,,,0,0,0,,
3,50006,0.0,1.0,,,,,,0,0,0,,
4,50007,0.0,1.0,,,,,,0,0,0,,
5,50008,0.0,1.0,,,,,,1,1,1,,
6,50009,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1,1,1,,
7,50010,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1,1,1,,
8,50011,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0,0,0,,
9,50012,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0,0,0,,


In [67]:
# Preview the first rows of the left-hemisphere high-dim outlier table
# (columns as loaded from the CSV: SUB_ID, age, outlier_ind).
outlier_data_dict['auto_highdim_lh_ctrl'].head(10)

Unnamed: 0,SUB_ID,age,outlier_ind
0,50822,-0.574714,0
1,50785,-0.840852,0
2,50788,-0.856355,0
3,50777,-1.096655,0
4,50818,-0.657398,0
5,50773,-0.780131,0
6,50812,-0.892529,0
7,50779,-0.964878,0
8,50820,-1.034642,0
9,50817,-0.892529,0
