In [1]:
import numpy as np
import pandas as pd
import os
from scipy.spatial.distance import *
import matplotlib.pyplot as plt
from scipy.sparse.csgraph import dijkstra, shortest_path, connected_components
%matplotlib inline

In [2]:
def check_matrix(file_id, data_dir=''):
    filename = file_id + '/' + [file for file in os.listdir(data_dir+file_id) if 'FULL' in file][0]
    matrix = np.loadtxt(data_dir + filename)
    matrix = np.delete(matrix, [3,38], axis = 1)
    matrix = np.delete(matrix, [3,38], axis = 0)
    np.fill_diagonal(matrix, 0)
    return np.min(matrix.sum(axis=1)) != 0

def process_name(x):
    if 'Normal' in x:
        return 'Normal'
    elif 'EMCI' in x:
        return 'EMCI'
    else:
        return x

## Getting ADNI pairs (we are removing pairs with zero degrees)

In [3]:
target_col = 'DX Group'
data_dir = 'data/adni/'
subjects_df = pd.read_excel(data_dir + 'ADNI2_Master_Subject_List.xls', sheetname='Subject List', index_col=None)
subjects_df = subjects_df[~subjects_df['Subject ID'].isnull()]
subjects_df = subjects_df[~subjects_df[target_col].isnull()]
groups = subjects_df.groupby(by = 'Subject ID File')
meta_files = set(groups.groups.keys())
actual_files = set(os.listdir(data_dir + 'matrices/'))
#actual_subjects = set([item[:-2] for item in actual_files])
subject_files = list(actual_files & meta_files)

In [4]:
clean_subject_files = sorted([item for item in subject_files if check_matrix(item, data_dir + 'matrices/')])

In [5]:
print(len(subject_files), len(clean_subject_files))

680 675


In [6]:
pairs_list = []
#counter = 0
counter = 0
for ind1, file1 in enumerate(clean_subject_files):
    print(file1)
    for ind2 in range(ind1, len(clean_subject_files)):
        print(counter) if counter % 10000 == 0 else 1
        file2 = clean_subject_files[ind2]
        is_same = (file1[:-2] == file2[:-2])*1
        file1_target = groups.get_group(file1)[target_col].values[0]
        file2_target = groups.get_group(file2)[target_col].values[0]
        pairs_list.append([file1, file1_target, file2, file2_target, is_same])
        counter +=1

003_S_2374_1
0
003_S_2374_4
003_S_4081_1
003_S_4081_5
003_S_4119_1
003_S_4119_3
003_S_4119_4
003_S_4136_1
003_S_4136_2
003_S_4136_3
003_S_4136_4
003_S_4142_1
003_S_4152_1
003_S_4152_3
003_S_4288_1
10000
003_S_4288_2
003_S_4288_3
003_S_4350_1
003_S_4350_3
003_S_4350_4
003_S_4354_1
003_S_4354_2
003_S_4354_3
003_S_4354_4
003_S_4373_1
003_S_4441_1
003_S_4441_3
003_S_4441_4
003_S_4524_1
003_S_4555_1
003_S_4555_3
20000
003_S_4644_1
003_S_4644_3
003_S_4839_1
003_S_4840_1
003_S_4872_1
003_S_4872_2
003_S_4892_1
003_S_4892_2
003_S_4892_4
003_S_4900_1
003_S_4900_2
003_S_4900_4
003_S_5130_1
003_S_5130_2
003_S_5150_1
30000
003_S_5154_1
003_S_5165_1
003_S_5165_2
003_S_5209_1
005_S_2390_1
005_S_2390_2
005_S_2390_3
005_S_2390_4
005_S_2390_5
005_S_4168_1
005_S_4168_3
005_S_4168_4
005_S_4168_5
005_S_4185_1
005_S_4185_2
005_S_4185_3
005_S_4185_4
40000
005_S_4185_5
005_S_4707_1
005_S_4707_2
005_S_4707_3
005_S_4707_4
005_S_4910_1
005_S_4910_2
005_S_5038_1
005_S_5119_1
007_S_2058_1
007_S_2058_2
007_S_2058_3

In [7]:
#pairs_data.query('subject1_target == "SMC" & subject2_target == "SMC" & are_same == 1')

In [8]:
cols = ['subject1_id', 'subject1_target', 'subject2_id', 'subject2_target', 'are_same']
pairs_data = pd.DataFrame(pairs_list, columns=cols, index = range(len(pairs_list)))
pairs_data = pairs_data[pairs_data.subject1_id != pairs_data.subject2_id]
pairs_data = pairs_data.query('subject1_target != "SMC" & subject2_target != "SMC"')
pairs_data.subject1_target = pairs_data.subject1_target.apply(process_name)
pairs_data.subject2_target = pairs_data.subject2_target.apply(process_name)
pairs_data.to_csv('data/adni_pairs_data_with_dx_group_without_isolated_nodes.csv', index = False)

In [9]:
pairs_data.are_same.sum()

764

In [10]:
tst = subjects_df[subjects_df['Subject ID File'].isin(clean_subject_files)]['Subject ID'].drop_duplicates().index

In [11]:
all_files = set(pairs_data.subject1_id) | set(pairs_data.subject2_id)

In [12]:
len(all_files)

639

In [13]:
all_subjects = np.unique([item[:-2] for item in all_files])
len(all_subjects)

227

In [14]:
set(subjects_df.loc[tst]['Subject ID'].unique()) - set([item[:-2] for item in clean_subject_files])

{'003_S_5187'}

In [15]:
tst = subjects_df[subjects_df['Subject ID'].isin(all_subjects)].groupby(by = 'Subject ID')

In [16]:
meta_isbi = tst.first()

In [17]:
meta_isbi['DX Group'].value_counts().sum()

227

In [18]:
meta_isbi.columns

Index(['Image Data ID', 'Scan #', 'Visit ', 'Subject ID File', 'Study Date',
       'Month Difference', 'Age @ Study Date', 'Sex', 'Race (C /N)',
       'DX Group', 'APOE A1', 'APOE A2',
       'DXCHANGE [1=Stable: NL; 2=Stable: MCI; 3=Stable: Dementia; 4=Conversion: NL to MCI; 5=Conversion: MCI to Dementia; 6=Conversion: NL to Dementia; 7=Reversion: MCI to NL; 8=Reversion: Dementia to MCI; 9=Reversion: Dementia to NL]',
       'DxSUM', 'MMSE', 'EDUC (PTEDUCAT)', 'ADAS 11 (TOTSCORE)',
       'ADAS 13 (TOTAL13)', 'CDR SOB', 'CDR (CDGLOBAL)', 'Visit Description',
       'QC (Y/N)', 'Scan Repeats (Y/N; if Y explain)',
       'Mislabeling of Visit Description', 'Significant Month Difference ',
       'Comments on T1', 'T1 Mask (checked; Y or N)', 'Comments on T2',
       'EPI_Corr_FibersDensityMap', 'Unnamed: 30'],
      dtype='object')

In [19]:
meta_isbi['Age @ Study Date'].std()

7.4214191497587301

In [20]:
len(meta_isbi[meta_isbi['DX Group'] != 'SMC'])

227

In [21]:
subs = pairs_data.subject1_id.apply(lambda x: x[:-2])

In [22]:
pairs_data.groupby(by = subs).first().subject1_target.value_counts().sum()-26

200

In [23]:
pairs_data.subject1_target.unique()

array(['EMCI', 'Normal', 'AD', 'LMCI'], dtype=object)

## Getting Parkinson pairs (we are removing pairs with zero degrees)

In [44]:
def convert(data, size=68, mode = 'vec2mat'): #diag=0,

    if mode == 'mat2vec':
        
        mat = data.copy()
        rows, cols = np.triu_indices(data.shape[0],k = 0)
        vec = mat[rows,cols]
        
        return vec

    elif mode == 'vec2mat':
        
        vec = data.copy()        
        rows, cols = np.triu_indices(size,k = 0)
        mat = csr_matrix((vec, (rows, cols)), shape=(size, size)).todense()
        mat = mat + mat.T # symmetric matrix
        #np.fill_diagonal(mat, diag)
        np.fill_diagonal(mat, np.diag(mat)/2)
        return mat

In [45]:
path = 'data/parkinson/'

all_matrices = pd.DataFrame(columns = ['subject_id_file','subject_id',
                                       #'subject_id','scan_id',
                                       'matrix','centers', 'target'])

folders = [item for item in sorted(os.listdir(path)) if '.csv' not in item and '.txt' not in item]
for foldername in folders:
    for filename in sorted(os.listdir(path+foldername)):
        if 'FULL' in filename:
            mat = np.genfromtxt(path+foldername+'/'+filename)
            subject_id_file = foldername
            subject_id = subject_id_file[:8]
            mat = mat[:70][:,:70]
            mat = np.delete(mat, [3,38], 1)
            mat = np.delete(mat, [3,38], 0)
            subject_data = convert(mat, mode = 'mat2vec')
            
        elif 'connect_grav' in filename:
            centers = pd.read_csv(path+foldername+'/'+filename)
            centers.drop([3,38], inplace=True)
            subject_center = np.array(centers[['mm_cordX', 'mm_cordY', 'mm_cordZ']])
                  
            
    single_subject = pd.DataFrame(data = [[subject_id_file, subject_id, subject_data, subject_center, np.nan]],
                                  columns = ['subject_id_file','subject_id', 'matrix','centers', 'target'])
    all_matrices = all_matrices.append(single_subject)
all_matrices.index = all_matrices.subject_id_file


meta_data = pd.read_csv(path + 'demo_info.txt',header=None)
meta_data.columns = ['subject_id_file', 'id', 'date','age','sex', 'target']
meta_data = meta_data.query('target == "PD" | target == "Control"')
meta_data.index = meta_data.subject_id_file

all_matrices.target = meta_data.target

In [46]:
meta_data.age.mean()

61.531057017543851

In [47]:
subs = meta_data.subject_id_file.apply(lambda x: x[:-9])
meta_data.groupby(by = subs).first()['target'].value_counts().sum()

226

In [48]:
meta_data

Unnamed: 0_level_0,subject_id_file,id,date,age,sex,target
subject_id_file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3102_007_Baseline,3102_007_Baseline,3102,2010-11-23,63.6658,M,PD
3102_007_Month-24,3102_007_Month-24,3102,2013-02-06,65.8740,M,PD
3104_007_Baseline,3104_007_Baseline,3104,2011-02-14,72.0658,M,Control
3104_007_Month-12,3104_007_Month-12,3104,2012-04-20,73.2466,M,Control
3105_007_Month-12,3105_007_Month-12,3105,2012-04-18,69.6411,M,PD
3106_007_Month-12,3106_007_Month-12,3106,2012-04-18,71.2959,F,Control
3107_007_Baseline,3107_007_Baseline,3107,2011-04-13,69.6575,M,PD
3107_007_Month-12,3107_007_Month-12,3107,2012-03-28,70.6164,M,PD
3107_007_Month-24,3107_007_Month-24,3107,2013-05-15,71.7479,M,PD
3108_007_Baseline,3108_007_Baseline,3108,2011-04-20,49.8000,F,PD


In [49]:
len(meta_data.groupby(by = subs))

226

In [50]:
len(meta_data)

456

In [51]:
meta_data.groupby(by = subs)['age'].first().std()

9.841487919913849

In [52]:
all_matrices

Unnamed: 0_level_0,subject_id_file,subject_id,matrix,centers,target
subject_id_file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
14426_120_Baseline,14426_120_Baseline,14426_12,"[5127.0, 0.0, 126.0, 0.0, 0.0, 37.0, 1054.0, 2...","[[-47.7532, -44.1677, 4.68671], [-4.03934, 21....",
15761_012_Baseline,15761_012_Baseline,15761_01,"[2356.0, 0.0, 864.0, 0.0, 0.0, 0.0, 1925.0, 28...","[[-51.3843, -41.4437, 6.96815], [-3.88298, 17....",
16644_088_Baseline,16644_088_Baseline,16644_08,"[2275.0, 0.0, 0.0, 0.0, 0.0, 3.0, 512.0, 1220....","[[-48.7847, -53.0503, 8.70825], [-5.18801, 16....",
17608_073_Baseline,17608_073_Baseline,17608_07,"[3001.0, 0.0, 102.0, 0.0, 0.0, 0.0, 1159.0, 12...","[[-49.1788, -42.5815, 4.34578], [-4.63141, 24....",
3101_007_Baseline,3101_007_Baseline,3101_007,"[4923.0, 0.0, 674.0, 0.0, 1.0, 149.0, 1357.0, ...","[[-48.8778, -49.7964, 2.16742], [-3.9476, 17.1...",
3101_007_Month-24,3101_007_Month-24,3101_007,"[6558.0, 0.0, 142.0, 13.0, 29.0, 254.0, 2582.0...","[[-49.291, -49.0041, 0.672131], [-3.78689, 17....",
3102_007_Baseline,3102_007_Baseline,3102_007,"[1810.0, 0.0, 1.0, 6.0, 0.0, 24.0, 1070.0, 103...","[[-49.7971, -41.8696, 4.00966], [-3.4036, 22.7...",PD
3102_007_Month-24,3102_007_Month-24,3102_007,"[1857.0, 3.0, 0.0, 12.0, 0.0, 0.0, 912.0, 231....","[[-48.5109, -43.5864, 6.42336], [-3.55125, 21....",PD
3104_007_Baseline,3104_007_Baseline,3104_007,"[2336.0, 0.0, 32.0, 4.0, 0.0, 16.0, 561.0, 209...","[[-53.6923, -35.0928, 0.615385], [-4.51765, 16...",Control
3104_007_Month-12,3104_007_Month-12,3104_007,"[1721.0, 0.0, 0.0, 0.0, 0.0, 29.0, 384.0, 135....","[[-52.6954, -33.1207, 1.4023], [-4.32558, 15.0...",Control


In [53]:
#meta_data = meta_data.query('target == "PD" | target == "Control"')
subject_files = meta_data.subject_id_file.values
target_col = 'target'

In [57]:
len(subject_files)

456

In [64]:
len(set([item[:-9] for item in subject_files]))

226

In [54]:
clean_subject_files = [item for item in subject_files if check_matrix(item, path)]
print(len(subject_files), len(clean_subject_files))

456 456


In [55]:
meta_data.target.value_counts()

PD         344
Control    112
Name: target, dtype: int64

In [56]:
pairs_list = []
#counter = 0
counter = 0
for ind1, file1 in enumerate(clean_subject_files):
    for ind2 in range(ind1+1, len(clean_subject_files)):
        print(counter) if counter % 10000 == 0 else 1
        file2 = subject_files[ind2]
        name1 = '_'.join(file1.split('_')[:-1])
        name2 = '_'.join(file2.split('_')[:-1])
        is_same = (name1 == name2)*1
        file1_target = meta_data.loc[file1, target_col]
        file2_target = meta_data.loc[file1, target_col]
        pairs_list.append([file1, file1_target, file2, file2_target, is_same])
        counter +=1

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000


In [66]:
pairs_data = pd.DataFrame(pairs_list, columns = ['subject1_id', 'subject1_target', 'subject2_id', 'subject2_target', 'are_same'])
pairs_data = pairs_data[pairs_data.subject1_id != pairs_data.subject2_id]
pairs_data.to_csv('data/parkinson_pairs_data.csv', index = False)

In [68]:
pairs_data.are_same.sum()

301

In [40]:
tst = pd.read_csv('data/parkinson_pairs_data.csv')

In [65]:
pairs_data.are_same.sum()

96