## Contains helper methods for fetching demographic information about subjects


In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
from demographics import get_age_and_education, create_merged_table

In [4]:
# CSV inputs for this notebook. Every constant below is handed to
# pd.read_csv in a later cell; the constants are grouped by the column that
# cell parses as a date, and the trailing note names the DataFrame it fills.

# Parsed on 'image_date'
PET_METADATA_PATH = 'All_Preprocessed_PET.csv'  # -> pet_meta

# Parsed on 'VISDATE'
DEMOGRAPHIC_PATH = 'All_Subjects_Demographic.csv'  # -> demog_df
CDRSB_PATH = 'All_Subjects_CDR.csv'  # -> cdr
MMSE_PATH = 'MMSE.csv'  # -> mmse
MOCA_PATH = 'MOCA.csv'  # -> moca

# Parsed on 'EXAMDATE'
DIAGNOSIS_PATH = 'All_Subjects_DXSUM.csv'  # -> dxsum
CSF_PATH = 'All_Subjects_UPENNBIOMK_ROCHE_ELECSYS.csv'  # -> csf

# Parsed on 'SCANDATE'
AV45_PATH = 'All_Subjects_UCBERKELEY_AMY_6MM.csv'  # -> av45

In [4]:
# def create_merged_table(df1, df2, df1_id_col, df2_id_col, df1_date_col, df2_date_col):
#     '''
#     Merge two DataFrames based on subject ID and closest visit date.
#     '''
#     def get_closest_date(row):
#         # Get the closest date in df2 given a row in df1
#         deltas = (df2[df2[df2_id_col] == row[df1_id_col]][df2_date_col] - row[df1_date_col]).abs()
#         return deltas.idxmin() if not deltas.isna().all() else pd.NA
    
#     deltas = df1.apply(get_closest_date, axis=1)
#     data = df1.copy()
#     data['closest_ind'] = deltas
#     data = pd.merge(data, df2, left_on='closest_ind', right_index=True).dropna(subset=['closest_ind'])
#     return data
    
    #  --------------------------------------  
#     all_prep_pet = pd.read_csv(all_prep_pet_name, parse_dates=['image_date'])
#     dxsum = pd.read_csv(dxsum_name, parse_dates=['EXAMDATE'])
#     cdr = pd.read_csv(cdr_name, parse_dates=['VISDATE'])
    
#     # Merge diagnosis table with PET info table by checking for closest diagnosis exam date to each image date
#     def get_closest_dx_date(row):
#         deltas = (dxsum[dxsum['PTID'] == row['subject_id']]['EXAMDATE'] - row['image_date']).abs()
#         return deltas.idxmin() if not deltas.isna().all() else pd.NA

#     deltas = all_prep_pet.apply(get_closest_dx_date, axis=1)
#     data_init = all_prep_pet.copy()
#     data_init['closest_dx_ind'] = deltas
#     data_init = pd.merge(data_init, dxsum, left_on='closest_dx_ind', right_index=True).dropna(subset=['closest_dx_ind'])

#     # Merge again with CDR table by searching for CDR visit date closest to the image date
#     def get_closest_cdr_date(row):
#         deltas = (cdr[cdr['PTID'] == row['PTID']]['VISDATE'] - row['image_date']).abs()
#         return deltas.idxmin() if not deltas.isna().all() else pd.NA

#     deltas = data_init.apply(get_closest_cdr_date, axis=1)
#     data_init['closest_cdr_ind'] = deltas
#     data = pd.merge(data_init, cdr, left_on='closest_cdr_ind', right_index=True).dropna(subset=['closest_cdr_ind', 'CDRSB']) 
#     return data

In [5]:
# Load the PET metadata CSV, parsing 'image_date' as datetimes.
pet_meta = pd.read_csv(
    PET_METADATA_PATH,
    parse_dates=['image_date'],
)  # .sort_values(by='image_date')
pet_meta.shape  # last expression: shown as the cell output

(3760, 7)

In [6]:
# Load the demographics CSV with 'VISDATE' parsed as datetimes, then discard
# every row that has no visit date.
demog_df = pd.read_csv(DEMOGRAPHIC_PATH, parse_dates=['VISDATE'])
demog_df = demog_df.dropna(subset=['VISDATE'])  # .sort_values(by='VISDATE')

# 'PTDOB' is converted separately because it needs an explicit
# month/year format ('%m/%Y').
demog_df['PTDOB'] = pd.to_datetime(demog_df['PTDOB'], format='%m/%Y')
demog_df.shape  # last expression: shown as the cell output

(6210, 84)

In [7]:
# Load the diagnosis summary CSV, parsing 'EXAMDATE' as datetimes.
dxsum = pd.read_csv(
    DIAGNOSIS_PATH,
    parse_dates=['EXAMDATE'],
)
dxsum.shape  # last expression: shown as the cell output

(15781, 41)

In [19]:
# Load the CDR CSV, parsing 'VISDATE' as datetimes. Its 'CDRSB' column is
# selected after the merge in a later cell.
cdr = pd.read_csv(
    CDRSB_PATH,
    parse_dates=['VISDATE'],
)
cdr.shape  # last expression: shown as the cell output

(14576, 25)

In [45]:
# Load the MMSE CSV, parsing 'VISDATE' as datetimes. Its 'MMSCORE' column is
# selected after the merge in a later cell.
mmse = pd.read_csv(
    MMSE_PATH,
    parse_dates=['VISDATE'],
)
mmse.shape  # last expression: shown as the cell output

(14625, 58)

In [51]:
# Load the MoCA CSV, parsing 'VISDATE' as datetimes. Its 'MOCA' column is
# selected after the merge in a later cell.
moca = pd.read_csv(
    MOCA_PATH,
    parse_dates=['VISDATE'],
)
moca.shape  # last expression: shown as the cell output

(8990, 58)

In [64]:
# Load the AV45 CSV, parsing 'SCANDATE' as datetimes. Its 'SUMMARY_SUVR'
# column is selected after the merge in a later cell.
av45 = pd.read_csv(
    AV45_PATH,
    parse_dates=['SCANDATE'],
)
av45.shape  # last expression: shown as the cell output

(4581, 344)

In [5]:
# Load the CSF CSV, parsing 'EXAMDATE' as datetimes.
csf = pd.read_csv(
    CSF_PATH,
    parse_dates=['EXAMDATE'],
)
csf.shape  # last expression: shown as the cell output

(3174, 13)

In [33]:
# Image IDs passed to get_age_and_education (imported from `demographics`).
# NOTE(review): the result is later merged on its 'subject_id' and
# 'image_date' columns, so the helper presumably returns one row per image
# with those fields -- confirm against demographics.py.
img_ids = [
    27628, 27933, 27937, 25392, 26555, 27080, 28280, 26419, 27990,
    26671, 27097, 58692, 62048, 27656, 27167, 27242, 57562, 58705,
    57879, 58027, 27481, 26249, 28732, 27274, 27110, 26490, 27165,
]
df = get_age_and_education(img_ids)

In [44]:
# Join `df` with the CDR table (df keys: 'subject_id' / 'image_date';
# cdr keys: 'PTID' / 'VISDATE'), then keep only the columns of interest.
# NOTE(review): create_merged_table presumably pairs each row with the same
# subject's record closest in date -- confirm against demographics.py.
cols = ['image_id', 'image_date', 'PTEDUCAT', 'age', 'subject_id', 'CDRSB']
df2 = create_merged_table(
    df, cdr,
    'subject_id', 'PTID',
    'image_date', 'VISDATE',
)[cols]
df2.head()  # last expression: shown as the cell output

Unnamed: 0,image_id,image_date,PTEDUCAT,age,subject_id,CDRSB
0,27937,2005-12-22,20.0,81.223819,099_S_0054,2.5
1,27628,2005-12-09,20.0,83.523614,035_S_0033,2.0
2,27933,2005-12-22,18.0,66.644764,099_S_0051,1.0
4,26555,2006-01-06,20.0,77.516769,018_S_0057,1.5
5,26419,2006-02-09,20.0,64.19165,007_S_0128,0.5


In [56]:
# Join the CDR-augmented table `df2` with the MMSE table (df2 keys:
# 'subject_id' / 'image_date'; mmse keys: 'PTID' / 'VISDATE'), then keep the
# previous columns plus 'MMSCORE'.
cols = ['image_id', 'image_date', 'PTEDUCAT', 'age', 'subject_id', 'CDRSB', 'MMSCORE']
df3 = create_merged_table(
    df2, mmse,
    'subject_id', 'PTID',
    'image_date', 'VISDATE',
)[cols]
df3.head()  # last expression: shown as the cell output

Unnamed: 0,image_id,image_date,PTEDUCAT,age,subject_id,CDRSB,MMSCORE
0,27937,2005-12-22,20.0,81.223819,099_S_0054,2.5,27.0
1,27628,2005-12-09,20.0,83.523614,035_S_0033,2.0,29.0
2,27933,2005-12-22,18.0,66.644764,099_S_0051,1.0,27.0
4,26555,2006-01-06,20.0,77.516769,018_S_0057,1.5,27.0
5,26419,2006-02-09,20.0,64.19165,007_S_0128,0.5,29.0


In [65]:
# Join `df3` with the MoCA table (df3 keys: 'subject_id' / 'image_date';
# moca keys: 'PTID' / 'VISDATE'), then keep the previous columns plus 'MOCA'.
cols = ['image_id', 'image_date', 'PTEDUCAT', 'age', 'subject_id', 'CDRSB', 'MMSCORE', 'MOCA']
df4 = create_merged_table(
    df3, moca,
    'subject_id', 'PTID',
    'image_date', 'VISDATE',
)[cols]
df4.head()  # last expression: shown as the cell output

Unnamed: 0,image_id,image_date,PTEDUCAT,age,subject_id,CDRSB,MMSCORE,MOCA
2,27933,2005-12-22,18.0,66.644764,099_S_0051,1.0,27.0,
5,26419,2006-02-09,20.0,64.19165,007_S_0128,0.5,29.0,
8,25392,2006-01-11,18.0,73.697467,007_S_0101,0.5,27.0,
11,26671,2006-02-16,20.0,79.211499,010_S_0161,0.5,28.0,
13,27097,2006-02-17,14.0,79.296372,018_S_0142,0.5,29.0,


In [67]:
# Join `df4` with the AV45 table (df4 keys: 'subject_id' / 'image_date';
# av45 keys: 'PTID' / 'SCANDATE'), then keep the previous columns plus
# 'SUMMARY_SUVR'.
cols = ['image_id', 'image_date', 'PTEDUCAT', 'age', 'subject_id', 'CDRSB', 'MMSCORE', 'MOCA', 'SUMMARY_SUVR']
df5 = create_merged_table(
    df4, av45,
    'subject_id', 'PTID',
    'image_date', 'SCANDATE',
)[cols]
df5.head()  # last expression: shown as the cell output

Unnamed: 0,image_id,image_date,PTEDUCAT,age,subject_id,CDRSB,MMSCORE,MOCA,SUMMARY_SUVR
2,27933,2005-12-22,18.0,66.644764,099_S_0051,1.0,27.0,,1.416
8,25392,2006-01-11,18.0,73.697467,007_S_0101,0.5,27.0,,1.437
13,27097,2006-02-17,14.0,79.296372,018_S_0142,0.5,29.0,,1.582
18,28280,2006-05-12,18.0,71.030801,127_S_0112,0.5,29.0,,1.352
22,58692,2006-02-16,18.0,82.461328,128_S_0135,1.0,29.0,,0.968


In [11]:
# x = pd.merge_asof(pet_meta, demog_df, right_on='VISDATE', left_on='image_date', right_by='PTID', left_by='subject_id', direction='nearest')
# x = pd.merge_asof(pet_meta, dxsum, right_on='EXAMDATE', left_on='image_date', right_by='PTID', left_by='subject_id', direction='nearest')
# x = x.dropna(subset=['EXAMDATE'])
# x.shape

In [None]:
# def blah(row):
#     return abs(row['EXAMDATE'] - row['image_date'])


# x['delta'] = x.apply(blah, axis='columns')#.dt.days.hist()
# x.apply(blah, axis='columns').dt.days.hist()

In [None]:
# y = x[['PTID', 'delta', 'EXAMDATE', 'image_date']].sort_values(by='delta')
# # y[y['PTID'] == '128_S_2036']
# y