In [1]:
## This code is attempting to count the numbers of participants
## who have data for each SAMS modality at baseline based on a 
## redcap reports from both the SAMS contact database as well as the 
## de-identified SAMS database

## Note: at the time of this code (Sept 2020) not all participants 
## have wave 1 7T info in redcap, so this data needs to be obtained
## from header information on Oak

In [2]:
## Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os.path
import glob
import string
import statistics
from os.path import exists
from pydicom import dcmread
import datetime

In [3]:
# read in data for the W2 7T session
w2_7t_session = pd.DataFrame()
curr_path = '/Users/madisonhunt/Desktop/W2_7T_SessionInfo.csv'
w2_7t_session = pd.read_csv(curr_path)

In [4]:
w2_7t_session.head()

Unnamed: 0,pidn,fse_ses1,ufse_ses1,wmn_ses1,mtrage_ses1,flair_ses1,csfn_ses1,mepi_ses1,gmn_ses1,qti_ses1,fse_ses2,ufse_ses2,wmn_ses2,mtrage_ses2,flair_ses2,csfn_ses2,mepi_ses2,gmn_ses2,qti_ses2
0,203,1,0,1,1,0,1,1,1,1,0,0,0,0,0,0,0,0,0
1,301,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,303,1,0,1,1,0,1,1,1,1,0,0,0,1,0,1,0,1,1
3,457,1,0,1,1,0,1,1,1,1,0,0,0,1,0,1,0,1,1
4,472,1,0,1,1,0,1,1,1,1,0,0,0,0,0,0,0,0,0


In [5]:
# read in data for the W1 Visit Summary
w1_visit_summ = pd.DataFrame
curr_path = '/Users/madisonhunt/Desktop/W1_Visit_Summary.csv'
w1_visit_summ = pd.read_csv(curr_path)

In [6]:
w1_visit_summ.head()

Unnamed: 0,pidn,w1_blood,w1_csf,w1_cni,w1_7t
0,203,1,1,0.0,1
1,301,1,1,1.0,1
2,303,1,1,1.0,1
3,405,1,1,1.0,0
4,415,1,1,0.0,1


In [7]:
# read in data for the W1 7T session that is entered in redcap. 
#Later we will check the files that are not documented in redcap

w1_7t_session = pd.DataFrame
curr_path = '/Users/madisonhunt/Desktop/w1_7t_session.csv'
w1_7t_session = pd.read_csv(curr_path)

In [8]:
## read in data for the W2 visit summary

w2_visit_summ = pd.DataFrame()
curr_path = '/Users/madisonhunt/Desktop/w2_visit_summary.csv'
w2_visit_summ= pd.read_csv(curr_path)

In [9]:
## read in participant visit dates (will be used later to determine 
#if data was collected within the 2 year window)

visit_dates = pd.DataFrame()
curr_path = '/Users/madisonhunt/Desktop/SAMS_visit_dates.csv'
visit_dates = pd.read_csv(curr_path)

In [10]:

PET_data = pd.DataFrame()
curr_path = '/Users/madisonhunt/Desktop/SAMS_bio_status.csv'
PET_data = pd.read_csv(curr_path)

print(PET_data)

     pidn      wave ab_group ptau_group   lp_date
0     203     Wave2      ab-        NaN       NaN
1     301  Baseline      ab-         T-       NaN
2     303  Baseline      ab-         T-       NaN
3     405  Baseline      ab+         T+       NaN
4     415  Baseline      ab-         T+       NaN
5     423  Baseline      ab-         T+       NaN
6     447  Baseline      ab+         T-       NaN
7     449  Baseline      ab-         T+       NaN
8     456  Baseline      ab-         T+       NaN
9     457  Baseline      ab-         T-   9/10/19
10    468  Baseline      ab-         T-       NaN
11    470  Baseline      ab-         T-       NaN
12    472     Wave2      ab-        NaN   12/3/19
13    474  Baseline      ab+         T+       NaN
14    475  Baseline      ab-         T-       NaN
15    476     Wave2      ab-        NaN   9/17/19
16    477  Baseline      ab-         T-       NaN
17    478  Baseline      ab-         T-       NaN
18    479  Baseline      ab+         T-       NaN


In [11]:
ab_pos = len(PET_data[PET_data['ab_group'] == 'ab+'])
tau_pos = len(PET_data[PET_data['ptau_group'] == 'T+'])
both_pos = len(PET_data[(PET_data['ab_group'] == 'ab+') & (PET_data['ptau_group'] == 'T+')])

total_pos = ab_pos + tau_pos - both_pos

print(ab_pos)
print(total_pos)

##June 25, 2019

46
61


In [12]:
print(PET_data.loc[29:148,'lp_date'])

29      9/10/19
30          NaN
31          NaN
32          NaN
33          NaN
34          NaN
35          NaN
36          NaN
37          NaN
38          NaN
39      6/25/19
40          NaN
41          NaN
42     10/15/19
43          NaN
44          NaN
45      1/30/20
46          NaN
47          NaN
48          NaN
49          NaN
50          NaN
51          NaN
52          NaN
53          NaN
54          NaN
55     11/21/19
56          NaN
57     11/12/19
58          NaN
         ...   
119         NaN
120         NaN
121         NaN
122         NaN
123         NaN
124         NaN
125         NaN
126         NaN
127         NaN
128         NaN
129         NaN
130         NaN
131         NaN
132         NaN
133         NaN
134         NaN
135         NaN
136         NaN
137         NaN
138         NaN
139         NaN
140         NaN
141         NaN
142         NaN
143         NaN
144         NaN
145         NaN
146         NaN
147         NaN
148         NaN
Name: lp_date, Length: 1

In [13]:
# create data frame for a summary of the available data 
# that was collected within the last two years
avail_data_summ = pd.DataFrame()

#add in all the pidns 
avail_data_summ['pidn'] = w1_visit_summ['pidn']

In [14]:
print(avail_data_summ['pidn'])

0       203
1       301
2       303
3       405
4       415
5       416
6       423
7       447
8       448
9       449
10      456
11      457
12      460
13      463
14      468
15      470
16      472
17      474
18      475
19      476
20      477
21      478
22      479
23      481
24      482
25      488
26      489
27      500
28      504
29      507
       ... 
187    1083
188    1094
189    1095
190    1097
191    1099
192    1101
193    1104
194    1116
195    1119
196    1128
197    1135
198    1136
199    1137
200    1138
201    1141
202    1142
203    1163
204    1183
205    1184
206    1185
207    1190
208    1192
209    1193
210    1195
211    1196
212    1201
213    1227
214    1230
215    1231
216    1234
Name: pidn, Length: 217, dtype: int64


In [15]:
## concatenate the data into one data frame

avail_data_summ = avail_data_summ.merge(w2_visit_summ, on = 'pidn', how='left')

avail_data_summ = avail_data_summ.merge(w1_visit_summ, on = 'pidn', how='left')

avail_data_summ = avail_data_summ.merge(w1_7t_session, on='pidn', how='left')

avail_data_summ = avail_data_summ.merge(w2_7t_session, on = 'pidn', how='left')

avail_data_summ = avail_data_summ.merge(PET_data, on = 'pidn', how='left')

avail_data_summ = avail_data_summ.merge(visit_dates, on = 'pidn', how='left')

In [16]:
## check that the data frame looks like we expect it to
print(avail_data_summ)

     pidn  w2_blood  w2_csf  w2_7T  w1_blood  w1_csf  w1_cni  w1_7t  w1_fse  \
0     203       1.0     1.0    1.0         1       1     0.0      1       0   
1     301       1.0     1.0    1.0         1       1     1.0      1       0   
2     303       1.0     1.0    1.0         1       1     1.0      1       0   
3     405       NaN     NaN    NaN         1       1     1.0      0       0   
4     415       NaN     NaN    NaN         1       1     0.0      1       0   
5     416       NaN     NaN    NaN         1       0     NaN      0       0   
6     423       NaN     NaN    NaN         1       1     1.0      1       0   
7     447       NaN     NaN    NaN         1       1     0.0      1       0   
8     448       NaN     NaN    NaN         1       1     0.0      1       0   
9     449       NaN     NaN    NaN         1       1     0.0      1       0   
10    456       NaN     NaN    NaN         1       1     0.0      1       0   
11    457       1.0     0.0    1.0         1       1

In [17]:
## if a participant doesnt have an FSE recorded in redcap, check to see if they have a file on oak
## this is due to an error that was made at the beginning of the study with documenting data collection

#set current path to Oak folder
curr_path = '/Users/madisonhunt/Desktop/Oak/7T_Data'

## loop through the data that does not exist on redcap
# used fse to determine because all participants should have an fse if they completed a scan
for pid in avail_data_summ[avail_data_summ['w1_fse']==0]['pidn']:
    #find all of the folders in the current participants path
    pt_path = glob.glob(curr_path + '/0' + str(pid) + '_****/00*')
    #set iterator "folder" to zero
    folder = 0
    #for the folders in the participant's data
    for n in pt_path:
        #calculate the number of files in each folder, this is a cheat way to determine which scan 
        #it is without looking at header information
        num_files = len([f for f in os.listdir(pt_path[folder])])
        # if the folder has 16 files, it is either an fse or an ungated fse. for the purpose of this
        #code, we don't care which it is, so mark that they have an fse at w1
        if num_files ==16:
            avail_data_summ.loc[(avail_data_summ['pidn']==pid), 'w1_fse'] = 1
        # if the folder has 46 files, it is the localizer and we do not care about this scan, so move on
        elif num_files == 46:
            continue
        # if the folder has 216 files it a flair. mark that the participant has this scan    
        elif num_files == 216:
             avail_data_summ.loc[(avail_data_summ['pidn']==pid), 'w1_flair'] = 1
        #if the folder has 316 files, mark that the participant has a cube
        elif num_files == 312: 
             avail_data_summ.loc[(avail_data_summ['pidn']==pid), 'w1_cube'] = 1
        # if the folder has 440 files, mark that the participant has a wmnMPRAGE
        elif num_files == 440: 
             avail_data_summ.loc[(avail_data_summ['pidn']==pid), 'w1_wmnmprage'] = 1
        # if the folder has another number of files, mark that there is an error so we can 
        # manually check what scan it is (first time run --> there are only 3 errors, so its not difficult to
        # manually check)
        else:
            print('error on folder' + pt_path[folder])
        ## increase the folder to re-loop
        folder+=1

In [18]:
avail_data_summ.head()

Unnamed: 0,pidn,w2_blood,w2_csf,w2_7T,w1_blood,w1_csf,w1_cni,w1_7t,w1_fse,w1_ufse,...,qti_ses2,wave,ab_group,ptau_group,lp_date,intake_date_w1,cni_w1_date,lp_w1_date,intake_date_w2,lp_w2_date
0,203,1.0,1.0,1.0,1,1,0.0,1,0,0,...,0.0,Wave2,ab-,,,2014-10-16,2014-08-22,2014-10-09,2018-12-10,2019-03-05
1,301,1.0,1.0,1.0,1,1,1.0,1,0,0,...,0.0,Baseline,ab-,T-,,2014-10-09,2015-02-19,2014-10-09,2018-10-24,2019-01-23
2,303,1.0,1.0,1.0,1,1,1.0,1,0,0,...,1.0,Baseline,ab-,T-,,2015-02-17,2015-02-17,2014-11-13,2019-02-12,2019-02-13
3,405,,,,1,1,1.0,0,0,0,...,,Baseline,ab+,T+,,2014-07-08,2014-08-20,2014-07-08,,
4,415,,,,1,1,0.0,1,0,0,...,,Baseline,ab-,T+,,2014-09-29,2014-11-24,2014-09-18,,


In [19]:
## check folders with errors and manually enter which scans they are

avail_data_summ.loc[(avail_data_summ['pidn']==562), 'w1_fse'] = 1
avail_data_summ.loc[(avail_data_summ['pidn']==598), 'w1_fse'] = 1
avail_data_summ.loc[(avail_data_summ['pidn']==600), 'w1_cube'] = 1

In [20]:
## count the wave 2 data that we have. From the date this is run,
# all wave 2 data will be within two years so we don't need to worry about that
w2_blood_np = len(avail_data_summ[avail_data_summ['w2_blood'] == 1])
w2_blood_apos = len(avail_data_summ[(avail_data_summ['w2_blood'] == 1) & (avail_data_summ['ab_group'] == 'ab+')])
w2_blood_aneg = len(avail_data_summ[(avail_data_summ['w2_blood'] == 1) & (avail_data_summ['ab_group'] == 'ab-')])

w2_csf = len(avail_data_summ[avail_data_summ['w2_csf'] == 1])
w2_csf_apos = len(avail_data_summ[(avail_data_summ['w2_csf'] == 1) & (avail_data_summ['ab_group'] == 'ab+')])
w2_csf_aneg = len(avail_data_summ[(avail_data_summ['w2_csf'] == 1) & (avail_data_summ['ab_group'] == 'ab-')])

w2_7t = len(avail_data_summ[avail_data_summ['w2_7T'] == 1])
w2_7t_apos = len(avail_data_summ[(avail_data_summ['w2_7T'] == 1) & (avail_data_summ['ab_group'] == 'ab+')])
w2_7t_aneg = len(avail_data_summ[(avail_data_summ['w2_7T'] == 1) & (avail_data_summ['ab_group'] == 'ab-')])

print(w2_blood_np)
print(w2_csf)
print(w2_7t)

print(w2_blood_apos)
print(w2_blood_aneg)
print(w2_csf_apos)
print(w2_csf_aneg)
print(w2_7t_apos)
print(w2_7t_aneg)

46
21
42
8
38
4
17
8
34


In [21]:
## calculate the number of each w2 scan 
##this almost works, but need to iterate and repeat for each scan type

num_w2_fse = sum(1 for x in (range(len(avail_data_summ.index))) if((avail_data_summ.loc[x, 'fse_ses1'] == 1) or (avail_data_summ.loc[x, 'fse_ses2'] == 1) or (avail_data_summ.loc[x, 'ufse_ses1'] == 1) or (avail_data_summ.loc[x, 'ufse_ses2'] == 1)))
w2_fse_apos = sum(1 for x in (range(len(avail_data_summ.index))) if(((avail_data_summ.loc[x, 'fse_ses1'] == 1) or (avail_data_summ.loc[x, 'fse_ses2'] == 1) or (avail_data_summ.loc[x, 'ufse_ses1'] == 1) or (avail_data_summ.loc[x, 'ufse_ses2'] == 1)) and (avail_data_summ.loc[x, 'ab_group'] == 'ab+')))
w2_fse_aneg = sum(1 for x in (range(len(avail_data_summ.index))) if(((avail_data_summ.loc[x, 'fse_ses1'] == 1) or (avail_data_summ.loc[x, 'fse_ses2'] == 1) or (avail_data_summ.loc[x, 'ufse_ses1'] == 1) or (avail_data_summ.loc[x, 'ufse_ses2'] == 1)) and (avail_data_summ.loc[x, 'ab_group'] == 'ab-')))
  

num_w2_wmn = sum(1 for x in (range(len(avail_data_summ.index))) if ((avail_data_summ.loc[x, 'wmn_ses1'] == 1) or (avail_data_summ.loc[x, 'wmn_ses2'] == 1)))
w2_wmn_apos = sum(1 for x in (range(len(avail_data_summ.index))) if (((avail_data_summ.loc[x, 'wmn_ses1'] == 1) or (avail_data_summ.loc[x, 'wmn_ses2'] == 1)) and (avail_data_summ.loc[x, 'ab_group'] == 'ab+')))
w2_wmn_aneg = sum(1 for x in (range(len(avail_data_summ.index))) if (((avail_data_summ.loc[x, 'wmn_ses1'] == 1) or (avail_data_summ.loc[x, 'wmn_ses2'] == 1)) and (avail_data_summ.loc[x, 'ab_group'] == 'ab-')))


num_w2_mtrage = sum(1 for x in (range(len(avail_data_summ.index))) if ((avail_data_summ.loc[x, 'mtrage_ses1'] == 1) or (avail_data_summ.loc[x, 'mtrage_ses2'] == 1)))
w2_mt_apos = sum(1 for x in (range(len(avail_data_summ.index))) if (((avail_data_summ.loc[x, 'mtrage_ses1'] == 1) or (avail_data_summ.loc[x, 'mtrage_ses2'] == 1)) and (avail_data_summ.loc[x, 'ab_group'] == 'ab+')))
w2_mt_aneg = sum(1 for x in (range(len(avail_data_summ.index))) if (((avail_data_summ.loc[x, 'mtrage_ses1'] == 1) or (avail_data_summ.loc[x, 'mtrage_ses2'] == 1)) and (avail_data_summ.loc[x, 'ab_group'] == 'ab-')))

num_w2_flair = sum(1 for x in (range(len(avail_data_summ.index))) if ((avail_data_summ.loc[x, 'flair_ses1'] == 1) or (avail_data_summ.loc[x, 'flair_ses2'] == 1)))
w2_flair_apos = sum(1 for x in (range(len(avail_data_summ.index))) if (((avail_data_summ.loc[x, 'flair_ses1'] == 1) or (avail_data_summ.loc[x, 'flair_ses2'] == 1)) and (avail_data_summ.loc[x, 'ab_group'] == 'ab+')))
w2_flair_aneg = sum(1 for x in (range(len(avail_data_summ.index))) if (((avail_data_summ.loc[x, 'flair_ses1'] == 1) or (avail_data_summ.loc[x, 'flair_ses2'] == 1)) and (avail_data_summ.loc[x, 'ab_group'] == 'ab-')))

num_w2_csfn = sum(1 for x in (range(len(avail_data_summ.index))) if ((avail_data_summ.loc[x, 'csfn_ses1'] == 1) or (avail_data_summ.loc[x, 'csfn_ses2'] == 1)))
w2_csfn_apos = sum(1 for x in (range(len(avail_data_summ.index))) if (((avail_data_summ.loc[x, 'csfn_ses1'] == 1) or (avail_data_summ.loc[x, 'csfn_ses2'] == 1)) and (avail_data_summ.loc[x, 'ab_group'] == 'ab+')))
w2_csfn_aneg = sum(1 for x in (range(len(avail_data_summ.index))) if (((avail_data_summ.loc[x, 'csfn_ses1'] == 1) or (avail_data_summ.loc[x, 'csfn_ses2'] == 1)) and (avail_data_summ.loc[x, 'ab_group'] == 'ab-')))

num_w2_mepi = sum(1 for x in (range(len(avail_data_summ.index))) if ((avail_data_summ.loc[x, 'mepi_ses1'] == 1) or (avail_data_summ.loc[x, 'mepi_ses2'] == 1)))
w2_mepi_apos = sum(1 for x in (range(len(avail_data_summ.index))) if (((avail_data_summ.loc[x, 'mepi_ses1'] == 1) or (avail_data_summ.loc[x, 'mepi_ses2'] == 1)) and (avail_data_summ.loc[x, 'ab_group'] == 'ab+')))
w2_mepi_aneg = sum(1 for x in (range(len(avail_data_summ.index))) if (((avail_data_summ.loc[x, 'mepi_ses1'] == 1) or (avail_data_summ.loc[x, 'mepi_ses2'] == 1)) and (avail_data_summ.loc[x, 'ab_group'] == 'ab-')))

num_w2_gmn = sum(1 for x in (range(len(avail_data_summ.index))) if ((avail_data_summ.loc[x, 'gmn_ses1'] == 1) or (avail_data_summ.loc[x, 'gmn_ses2'] == 1)))
w2_gmn_apos = sum(1 for x in (range(len(avail_data_summ.index))) if (((avail_data_summ.loc[x, 'gmn_ses1'] == 1) or (avail_data_summ.loc[x, 'gmn_ses2'] == 1)) and (avail_data_summ.loc[x, 'ab_group'] == 'ab+')))
w2_gmn_aneg = sum(1 for x in (range(len(avail_data_summ.index))) if (((avail_data_summ.loc[x, 'gmn_ses1'] == 1) or (avail_data_summ.loc[x, 'gmn_ses2'] == 1)) and (avail_data_summ.loc[x, 'ab_group'] == 'ab-')))

num_w2_qti = sum(1 for x in (range(len(avail_data_summ.index))) if ((avail_data_summ.loc[x, 'qti_ses1'] == 1) or (avail_data_summ.loc[x, 'qti_ses2'] == 1)))
w2_qti_apos = sum(1 for x in (range(len(avail_data_summ.index))) if (((avail_data_summ.loc[x, 'qti_ses1'] == 1) or (avail_data_summ.loc[x, 'qti_ses2'] == 1)) and (avail_data_summ.loc[x, 'ab_group'] == 'ab+')))
w2_qti_aneg = sum(1 for x in (range(len(avail_data_summ.index))) if (((avail_data_summ.loc[x, 'qti_ses1'] == 1) or (avail_data_summ.loc[x, 'qti_ses2'] == 1)) and (avail_data_summ.loc[x, 'ab_group'] == 'ab-')))


print(num_w2_fse)
print(num_w2_wmn)
print(num_w2_mtrage)
print(num_w2_flair)
print(num_w2_csfn)
print(num_w2_mepi)
print(num_w2_gmn)
print(num_w2_qti)
print('/n') 
print(w2_fse_apos)
print(w2_fse_aneg)
print(w2_wmn_apos)
print(w2_wmn_aneg)
print(w2_mt_apos)
print(w2_mt_aneg)
print(w2_flair_apos)
print(w2_flair_aneg) 
print(w2_csfn_apos)
print(w2_csfn_aneg)
print(w2_mepi_apos)
print(w2_mepi_aneg)
print(w2_gmn_apos)
print(w2_gmn_aneg)
print(w2_qti_apos)
print(w2_qti_aneg)

43
42
31
14
41
38
40
26
/n
8
34
8
33
5
26
3
10
7
33
7
30
6
33
5
20


In [22]:
## for those who have a W2 7T, which W1 scans do they also have?

pidn1 = set(avail_data_summ[(avail_data_summ['fse_ses1'] == 1)]['pidn'])
pidn2 = set(avail_data_summ[(avail_data_summ['ufse_ses1'] == 1)]['pidn'])
pidn3 = set(avail_data_summ[(avail_data_summ['fse_ses2'] == 1)]['pidn'])
pidn4 = set(avail_data_summ[(avail_data_summ['ufse_ses2'] == 1)]['pidn'])

temp = list(pidn2-pidn1)
temp2 = list(pidn3-pidn1)
temp3 = list(pidn4-pidn1)

pidn_to_check = list(pidn1) + temp+temp2+temp3
print(len(pidn_to_check))

num_w1_fse = sum(1 for x in (range(len(avail_data_summ.index))) if((avail_data_summ.loc[x, 'w1_fse'] == 1) and (avail_data_summ.loc[x, 'pidn'] in pidn_to_check)))
w1_fse_pos = sum(1 for x in (range(len(avail_data_summ.index))) if(((avail_data_summ.loc[x, 'w1_fse'] == 1) and (avail_data_summ.loc[x, 'pidn'] in pidn_to_check)) and (avail_data_summ.loc[x, 'ab_group'] == 'ab+')))
w1_fse_neg = sum(1 for x in (range(len(avail_data_summ.index))) if(((avail_data_summ.loc[x, 'w1_fse'] == 1) and (avail_data_summ.loc[x, 'pidn'] in pidn_to_check)) and (avail_data_summ.loc[x, 'ab_group'] == 'ab-')))

num_w1_wmn = sum(1 for x in (range(len(avail_data_summ.index))) if((avail_data_summ.loc[x, 'w1_wmnmprage'] == 1) and (avail_data_summ.loc[x, 'pidn'] in pidn_to_check)))
w1_wmn_pos = sum(1 for x in (range(len(avail_data_summ.index))) if(((avail_data_summ.loc[x, 'w1_wmnmprage'] == 1) and (avail_data_summ.loc[x, 'pidn'] in pidn_to_check)) and (avail_data_summ.loc[x, 'ab_group'] == 'ab+')))
w1_wmn_neg = sum(1 for x in (range(len(avail_data_summ.index))) if(((avail_data_summ.loc[x, 'w1_wmnmprage'] == 1) and (avail_data_summ.loc[x, 'pidn'] in pidn_to_check)) and (avail_data_summ.loc[x, 'ab_group'] == 'ab-')))

num_w1_flair = sum(1 for x in (range(len(avail_data_summ.index))) if((avail_data_summ.loc[x, 'w1_flair'] == 1) and (avail_data_summ.loc[x, 'pidn'] in pidn_to_check)))
w1_flair_pos = sum(1 for x in (range(len(avail_data_summ.index))) if(((avail_data_summ.loc[x, 'w1_flair'] == 1) and (avail_data_summ.loc[x, 'pidn'] in pidn_to_check)) and (avail_data_summ.loc[x, 'ab_group'] == 'ab+')))
w1_flair_neg = sum(1 for x in (range(len(avail_data_summ.index))) if(((avail_data_summ.loc[x, 'w1_flair'] == 1) and (avail_data_summ.loc[x, 'pidn'] in pidn_to_check)) and (avail_data_summ.loc[x, 'ab_group'] == 'ab-')))

num_w1_cube = sum(1 for x in (range(len(avail_data_summ.index))) if((avail_data_summ.loc[x, 'w1_cube'] == 1) and (avail_data_summ.loc[x, 'pidn'] in pidn_to_check)))
w1_cube_pos = sum(1 for x in (range(len(avail_data_summ.index))) if(((avail_data_summ.loc[x, 'w1_cube'] == 1) and (avail_data_summ.loc[x, 'pidn'] in pidn_to_check)) and (avail_data_summ.loc[x, 'ab_group'] == 'ab+')))
w1_cube_neg = sum(1 for x in (range(len(avail_data_summ.index))) if(((avail_data_summ.loc[x, 'w1_cube'] == 1) and (avail_data_summ.loc[x, 'pidn'] in pidn_to_check)) and (avail_data_summ.loc[x, 'ab_group'] == 'ab-')))


        
print(num_w1_fse)   
print(num_w1_wmn)
print(num_w1_flair)
print(num_w1_cube)
print(w1_fse_pos)
print(w1_wmn_pos)
print(w1_flair_pos)
print(w1_cube_pos)
print(w1_fse_neg)
print(w1_wmn_neg)
print(w1_flair_neg)
print(w1_cube_neg)



43
12
4
3
9
3
1
2
3
8
3
1
5


In [23]:
## What W1 data was collected in the past two years
w1_blood_np = 0
w1_lp = 0
w1_7t = 0
w1_cni = 0

pt = 0 
for pid in avail_data_summ['pidn']:
    if avail_data_summ.loc[pt, 'w1_blood'] == 1:
        date1 = datetime.date.fromisoformat(avail_data_summ.loc[pt, 'intake_date_w1'])
        time_diff = datetime.date.today() - date1
        if time_diff.days <= 730:
            w1_blood_np +=1
    if avail_data_summ.loc[pt, 'w1_cni'] == 1:
        date2 = datetime.date.fromisoformat(avail_data_summ.loc[pt, 'cni_w1_date'])
        time_diff = datetime.date.today() - date2
        if time_diff.days <= 730:
            w1_cni +=1
    if avail_data_summ.loc[pt, 'w1_csf'] == 1:
        date3 = datetime.date.fromisoformat(avail_data_summ.loc[pt, 'lp_w1_date'])
        time_diff = datetime.date.today() - date3
        if time_diff.days <= 730:
            w1_lp +=1
    if avail_data_summ.loc[pt, 'w1_7t'] == 1:
        date3 = datetime.date.fromisoformat(avail_data_summ.loc[pt, 'lp_w1_date'])
        time_diff = datetime.date.today() - date3
#         print(time_diff)
        if time_diff.days <= 730:
            w1_7t +=1
            
    pt+=1
    
    
pt = 0 
w1_np_apos = 0
w1_cni_apos =0
w1_lp_apos=0
w1_7t_apos=0
for pid in avail_data_summ['pidn']:
    if avail_data_summ.loc[pt, 'w1_blood'] == 1 and avail_data_summ.loc[pt, 'ab_group'] == 'ab+':
        date1 = datetime.date.fromisoformat(avail_data_summ.loc[pt, 'intake_date_w1'])
        time_diff = datetime.date.today() - date1
        if time_diff.days <= 730:
            w1_np_apos +=1
    if avail_data_summ.loc[pt, 'w1_cni'] == 1 and avail_data_summ.loc[pt, 'ab_group'] == 'ab+':
        date2 = datetime.date.fromisoformat(avail_data_summ.loc[pt, 'cni_w1_date'])
        time_diff = datetime.date.today() - date2
        if time_diff.days <= 730:
            w1_cni_apos +=1
    if avail_data_summ.loc[pt, 'w1_csf'] == 1 and avail_data_summ.loc[pt, 'ab_group'] == 'ab+':
        date3 = datetime.date.fromisoformat(avail_data_summ.loc[pt, 'lp_w1_date'])
        time_diff = datetime.date.today() - date3
        if time_diff.days <= 730:
            w1_lp_apos +=1
    if avail_data_summ.loc[pt, 'w1_7t'] == 1 and avail_data_summ.loc[pt, 'ab_group'] == 'ab+':
        date3 = datetime.date.fromisoformat(avail_data_summ.loc[pt, 'lp_w1_date'])
        time_diff = datetime.date.today() - date3
#         print(time_diff)
        if time_diff.days <= 730:
            w1_7t_apos +=1
            
    pt+=1
    
pt = 0    
w1_np_aneg = 0
w1_cni_aneg =0
w1_lp_aneg=0
w1_7t_aneg=0
for pid in avail_data_summ['pidn']:
    if avail_data_summ.loc[pt, 'w1_blood'] == 1 and avail_data_summ.loc[pt, 'ab_group'] == 'ab-':
        date1 = datetime.date.fromisoformat(avail_data_summ.loc[pt, 'intake_date_w1'])
        time_diff = datetime.date.today() - date1
        if time_diff.days <= 730:
            w1_np_aneg +=1
    if avail_data_summ.loc[pt, 'w1_cni'] == 1 and avail_data_summ.loc[pt, 'ab_group'] == 'ab-':
        date2 = datetime.date.fromisoformat(avail_data_summ.loc[pt, 'cni_w1_date'])
        time_diff = datetime.date.today() - date2
        if time_diff.days <= 730:
            w1_cni_aneg +=1
    if avail_data_summ.loc[pt, 'w1_csf'] == 1 and avail_data_summ.loc[pt, 'ab_group'] == 'ab-':
        date3 = datetime.date.fromisoformat(avail_data_summ.loc[pt, 'lp_w1_date'])
        time_diff = datetime.date.today() - date3
        if time_diff.days <= 730:
            w1_lp_aneg +=1
    if avail_data_summ.loc[pt, 'w1_7t'] == 1 and avail_data_summ.loc[pt, 'ab_group'] == 'ab-':
        date3 = datetime.date.fromisoformat(avail_data_summ.loc[pt, 'lp_w1_date'])
        time_diff = datetime.date.today() - date3
#         print(time_diff)
        if time_diff.days <= 730:
            w1_7t_aneg +=1
            
    pt+=1
    
print(w1_blood_np)
print(w1_lp)
print(w1_7t)
print(w1_cni)

print(w1_np_apos)
print(w1_cni_apos)
print(w1_lp_apos)
print(w1_7t_apos)

print(w1_np_aneg)
print(w1_cni_aneg)
print(w1_lp_aneg)
print(w1_7t_aneg)

print(len(pidn_to_check))

19
12
26
11
7
4
5
9
10
5
7
15
43


In [24]:
pt = 0

fse = 0
wmn = 0
flair=0
cube=0
for pid in avail_data_summ['pidn']:
    if avail_data_summ.loc[pt, 'w1_7t'] == 1:
        date3 = datetime.date.fromisoformat(avail_data_summ.loc[pt, 'lp_w1_date'])
        time_diff = datetime.date.today() - date3
        if time_diff.days <= 730:
            if avail_data_summ.loc[pt, 'w1_fse'] == 1:
                fse +=1
            elif avail_data_summ.loc[pt, 'w1_ufse'] == 1:
                fse +=1
            if avail_data_summ.loc[pt, 'w1_wmnmprage'] == 1:
                wmn+=1
            if avail_data_summ.loc[pt, 'w1_flair'] == 1:
                flair +=1
            if avail_data_summ.loc[pt, 'w1_cube'] ==1:
                cube+=1

    pt+=1
    
pt = 0

fse_apos = 0
wmn_apos = 0
flair_apos=0
cube_apos=0
for pid in avail_data_summ['pidn']:
    if avail_data_summ.loc[pt, 'w1_7t'] == 1:
        date3 = datetime.date.fromisoformat(avail_data_summ.loc[pt, 'lp_w1_date'])
        time_diff = datetime.date.today() - date3
        if time_diff.days <= 730:
            if avail_data_summ.loc[pt, 'w1_fse'] == 1 and avail_data_summ.loc[pt, 'ab_group'] == 'ab+':
                fse_apos +=1
            elif avail_data_summ.loc[pt, 'w1_ufse'] == 1 and avail_data_summ.loc[pt, 'ab_group'] == 'ab+':
                fse_apos +=1
            if avail_data_summ.loc[pt, 'w1_wmnmprage'] == 1 and avail_data_summ.loc[pt, 'ab_group'] == 'ab+':
                wmn_apos+=1
            if avail_data_summ.loc[pt, 'w1_flair'] == 1 and avail_data_summ.loc[pt, 'ab_group'] == 'ab+':
                flair_apos +=1
            if avail_data_summ.loc[pt, 'w1_cube'] ==1 and avail_data_summ.loc[pt, 'ab_group'] == 'ab+':
                cube_apos+=1

    pt+=1
    
pt = 0

fse_aneg = 0
wmn_aneg = 0
flair_aneg=0
cube_aneg=0
for pid in avail_data_summ['pidn']:
    if avail_data_summ.loc[pt, 'w1_7t'] == 1:
        date3 = datetime.date.fromisoformat(avail_data_summ.loc[pt, 'lp_w1_date'])
        time_diff = datetime.date.today() - date3
        if time_diff.days <= 730:
            if avail_data_summ.loc[pt, 'w1_fse'] == 1 and avail_data_summ.loc[pt, 'ab_group'] == 'ab-':
                fse_aneg +=1
            elif avail_data_summ.loc[pt, 'w1_ufse'] == 1 and avail_data_summ.loc[pt, 'ab_group'] == 'ab-':
                fse_aneg +=1
            if avail_data_summ.loc[pt, 'w1_wmnmprage'] == 1 and avail_data_summ.loc[pt, 'ab_group'] == 'ab-':
                wmn_aneg+=1
            if avail_data_summ.loc[pt, 'w1_flair'] == 1 and avail_data_summ.loc[pt, 'ab_group'] == 'ab-':
                flair_aneg +=1
            if avail_data_summ.loc[pt, 'w1_cube'] ==1 and avail_data_summ.loc[pt, 'ab_group'] == 'ab-':
                cube_aneg+=1

    pt+=1
    
print(fse)
print(wmn)
print(flair)
print(cube)

print(fse_apos)
print(wmn_apos)
print(flair_apos)
print(cube_apos)

print(fse_aneg)
print(wmn_aneg)
print(flair_aneg)
print(cube_aneg)


26
24
20
1
9
8
5
1
15
14
13
0


In [25]:
pet_data = pd.DataFrame()
curr_path = '/Users/madisonhunt/Desktop/SAMS_PET.csv'
pet_data = pd.read_csv(curr_path)

pet_data.head()

Unnamed: 0,PIDN,Scan,wave,ab_group
0,203,Tau PET,Wave2,ab-
1,301,Tau PET,Baseline,ab-
2,303,Tau PET,Baseline,ab-
3,457,Amyloid PET,Baseline,ab-
4,472,Amyloid PET,Wave2,ab-


In [26]:
base_T_pos = sum(1 for x in (range(len(pet_data.index))) if((pet_data.loc[x, 'wave'] == 'Baseline') and (pet_data.loc[x, 'ab_group'] == 'ab+') and (pet_data.loc[x, 'Scan'] == 'Tau PET')))
base_T_neg = sum(1 for x in (range(len(pet_data.index))) if((pet_data.loc[x, 'wave'] == 'Baseline') and (pet_data.loc[x, 'ab_group'] == 'ab-')and (pet_data.loc[x, 'Scan'] == 'Tau PET')))
w2_T_pos = sum(1 for x in (range(len(pet_data.index))) if((pet_data.loc[x, 'wave'] == 'Wave2') and (pet_data.loc[x, 'ab_group'] == 'ab+')and (pet_data.loc[x, 'Scan'] == 'Tau PET')))
w2_T_neg = sum(1 for x in (range(len(pet_data.index))) if((pet_data.loc[x, 'wave'] == 'Wave2') and (pet_data.loc[x, 'ab_group'] == 'ab-')and (pet_data.loc[x, 'Scan'] == 'Tau PET')))


base_AB_pos = sum(1 for x in (range(len(pet_data.index))) if((pet_data.loc[x, 'wave'] == 'Baseline') and (pet_data.loc[x, 'ab_group'] == 'ab+') and (pet_data.loc[x, 'Scan'] == 'Amyloid PET')))
base_AB_neg = sum(1 for x in (range(len(pet_data.index))) if((pet_data.loc[x, 'wave'] == 'Baseline') and (pet_data.loc[x, 'ab_group'] == 'ab-')and (pet_data.loc[x, 'Scan'] == 'Amyloid PET')))
w2_AB_pos = sum(1 for x in (range(len(pet_data.index))) if((pet_data.loc[x, 'wave'] == 'Wave2') and (pet_data.loc[x, 'ab_group'] == 'ab+')and (pet_data.loc[x, 'Scan'] == 'Amyloid PET')))
w2_AB_neg = sum(1 for x in (range(len(pet_data.index))) if((pet_data.loc[x, 'wave'] == 'Wave2') and (pet_data.loc[x, 'ab_group'] == 'ab-')and (pet_data.loc[x, 'Scan'] == 'Amyloid PET')))

In [27]:
print(base_T_pos)
print(base_T_neg)
print(w2_T_pos)
print(w2_T_neg)

print(base_AB_pos)
print(base_AB_neg)
print(w2_AB_pos)
print(w2_AB_neg)

6
27
1
2
4
14
1
8


In [28]:
print(avail_data_summ)

     pidn  w2_blood  w2_csf  w2_7T  w1_blood  w1_csf  w1_cni  w1_7t  w1_fse  \
0     203       1.0     1.0    1.0         1       1     0.0      1       0   
1     301       1.0     1.0    1.0         1       1     1.0      1       0   
2     303       1.0     1.0    1.0         1       1     1.0      1       0   
3     405       NaN     NaN    NaN         1       1     1.0      0       0   
4     415       NaN     NaN    NaN         1       1     0.0      1       0   
5     416       NaN     NaN    NaN         1       0     NaN      0       0   
6     423       NaN     NaN    NaN         1       1     1.0      1       0   
7     447       NaN     NaN    NaN         1       1     0.0      1       0   
8     448       NaN     NaN    NaN         1       1     0.0      1       0   
9     449       NaN     NaN    NaN         1       1     0.0      1       0   
10    456       NaN     NaN    NaN         1       1     0.0      1       0   
11    457       1.0     0.0    1.0         1       1

[217 rows x 40 columns]


In [29]:
dates_data = pd.DataFrame()
x = 0
y=0
temp1_count = 0
temp2_count=0
temp3_count = 0
for pid in avail_data_summ['pidn']:
    dates_data.loc[x, 'time'] = 24
    if avail_data_summ.loc[y, 'w1_blood'] ==1:
        date1 = datetime.date.fromisoformat(avail_data_summ.loc[y, 'intake_date_w1'])
        time_diff = datetime.date.today()-date1
        if time_diff.days <= 730:
            temp1_count +=1
    if avail_data_summ.loc[y, 'w1_cni'] ==1:
        date2 = datetime.date.fromisoformat(avail_data_summ.loc[y, 'cni_w1_date'])
        if (datetime.date.today()-date2).days < 730:
            temp2_count +=1
    if avail_data_summ.loc[y, 'w1_7t'] ==1:
        date3 = datetime.date.fromisoformat(avail_data_summ.loc[y, 'lp_w1_date'])
        if (datetime.date.today()-date3).days < 730:
            temp3_count +=1
    y+=1
    
dates_data.loc[x, 'intake_w1'] = temp1_count
dates_data.loc[x, 'cni'] = temp2_count
dates_data.loc[x, 'lp_w1'] = temp3_count
    
x = 1
y=0
temp1_count = 0
temp2_count=0
temp3_count = 0
for pid in avail_data_summ['pidn']:
    dates_data.loc[x, 'time'] = 18
    if avail_data_summ.loc[y, 'w1_blood'] ==1:
        date1 = datetime.date.fromisoformat(avail_data_summ.loc[y, 'intake_date_w1'])
        time_diff = datetime.date.today()-date1
        if time_diff.days <= 548:
            temp1_count +=1
    if avail_data_summ.loc[y, 'w1_cni'] ==1:
        date2 = datetime.date.fromisoformat(avail_data_summ.loc[y, 'cni_w1_date'])
        if (datetime.date.today()-date2).days < 548:
            temp2_count +=1
    if avail_data_summ.loc[y, 'w1_7t'] ==1:
        date3 = datetime.date.fromisoformat(avail_data_summ.loc[y, 'lp_w1_date'])
        if (datetime.date.today()-date3).days < 548:
            temp3_count +=1
    y+=1
    
dates_data.loc[x, 'intake_w1'] = temp1_count
dates_data.loc[x, 'cni'] = temp2_count
dates_data.loc[x, 'lp_w1'] = temp3_count

x = 2
y=0
temp1_count = 0
temp2_count=0
temp3_count = 0
for pid in avail_data_summ['pidn']:
    dates_data.loc[x, 'time'] = 30
    if avail_data_summ.loc[y, 'w1_blood'] ==1:
        date1 = datetime.date.fromisoformat(avail_data_summ.loc[y, 'intake_date_w1'])
        time_diff = datetime.date.today()-date1
        if time_diff.days <= 913:
            temp1_count +=1
    if avail_data_summ.loc[y, 'w1_cni'] ==1:
        date2 = datetime.date.fromisoformat(avail_data_summ.loc[y, 'cni_w1_date'])
        if (datetime.date.today()-date2).days < 913:
            temp2_count +=1
    if avail_data_summ.loc[y, 'w1_7t'] ==1:
        date3 = datetime.date.fromisoformat(avail_data_summ.loc[y, 'lp_w1_date'])
        if (datetime.date.today()-date3).days < 913:
            temp3_count +=1
    y+=1
    
dates_data.loc[x, 'intake_w1'] = temp1_count
dates_data.loc[x, 'cni'] = temp2_count
dates_data.loc[x, 'lp_w1'] = temp3_count

x = 3
y=0
temp1_count = 0
temp2_count=0
temp3_count = 0
for pid in avail_data_summ['pidn']:
    dates_data.loc[x, 'time'] = 36
    if avail_data_summ.loc[y, 'w1_blood'] ==1:
        date1 = datetime.date.fromisoformat(avail_data_summ.loc[y, 'intake_date_w1'])
        time_diff = datetime.date.today()-date1
        if time_diff.days <= 1095:
            temp1_count +=1
    if avail_data_summ.loc[y, 'w1_cni'] ==1:
        date2 = datetime.date.fromisoformat(avail_data_summ.loc[y, 'cni_w1_date'])
        if (datetime.date.today()-date2).days < 1095:
            temp2_count +=1
    if avail_data_summ.loc[y, 'w1_7t'] ==1:
        date3 = datetime.date.fromisoformat(avail_data_summ.loc[y, 'lp_w1_date'])
        if (datetime.date.today()-date3).days < 1095:
            temp3_count +=1
    y+=1
    
dates_data.loc[x, 'intake_w1'] = temp1_count
dates_data.loc[x, 'cni'] = temp2_count
dates_data.loc[x, 'lp_w1'] = temp3_count
    
dates_data.head()

Unnamed: 0,time,intake_w1,cni,lp_w1
0,24.0,19.0,11.0,25.0
1,18.0,5.0,2.0,10.0
2,30.0,32.0,19.0,38.0
3,36.0,59.0,46.0,61.0


In [30]:
dates_data = pd.DataFrame()
x = 0
y=0
temp1_count = 0
temp2_count=0
temp3_count = 0
for pid in avail_data_summ['pidn']:
    dates_data.loc[x, 'time'] = 24
    if avail_data_summ.loc[y, 'w2_blood'] ==1:
        date1 = datetime.date.fromisoformat(avail_data_summ.loc[y, 'intake_date_w2'])
        time_diff = datetime.date.today()-date1
        if time_diff.days <= 730:
            temp1_count +=1
    if avail_data_summ.loc[y, 'w2_7T'] ==1:
        date2 = datetime.date.fromisoformat(avail_data_summ.loc[y, 'lp_w2_date'])
        if (datetime.date.today()-date2).days < 730:
            temp2_count +=1
    if avail_data_summ.loc[y, 'w2_csf'] ==1:
        date3 = datetime.date.fromisoformat(avail_data_summ.loc[y, 'lp_w2_date'])
        if (datetime.date.today()-date3).days < 730:
            temp3_count +=1
    y+=1
    
dates_data.loc[x, 'intake_w2'] = temp1_count
dates_data.loc[x, '7t'] = temp2_count
dates_data.loc[x, 'w2_csf'] = temp3_count
    
x = 1
y=0
temp1_count = 0
temp2_count=0
temp3_count = 0
for pid in avail_data_summ['pidn']:
    dates_data.loc[x, 'time'] = 18
    if avail_data_summ.loc[y, 'w2_blood'] ==1:
        date1 = datetime.date.fromisoformat(avail_data_summ.loc[y, 'intake_date_w2'])
        time_diff = datetime.date.today()-date1
        if time_diff.days <= 548:
            temp1_count +=1
    if avail_data_summ.loc[y, 'w2_7T'] ==1:
        date2 = datetime.date.fromisoformat(avail_data_summ.loc[y, 'lp_w2_date'])
        if (datetime.date.today()-date2).days < 548:
            temp2_count +=1
    if avail_data_summ.loc[y, 'w2_csf'] ==1:
        date3 = datetime.date.fromisoformat(avail_data_summ.loc[y, 'lp_w2_date'])
        if (datetime.date.today()-date3).days < 548:
            temp3_count +=1
    y+=1
    
dates_data.loc[x, 'intake_w2'] = temp1_count
dates_data.loc[x, '7t'] = temp2_count
dates_data.loc[x, 'w2_csf'] = temp3_count

x = 2
y=0
temp1_count = 0
temp2_count=0
temp3_count = 0
for pid in avail_data_summ['pidn']:
    dates_data.loc[x, 'time'] = 30
    if avail_data_summ.loc[y, 'w2_blood'] ==1:
        date1 = datetime.date.fromisoformat(avail_data_summ.loc[y, 'intake_date_w2'])
        time_diff = datetime.date.today()-date1
        if time_diff.days <= 913:
            temp1_count +=1
    if avail_data_summ.loc[y, 'w2_7T'] ==1:
        date2 = datetime.date.fromisoformat(avail_data_summ.loc[y, 'lp_w2_date'])
        if (datetime.date.today()-date2).days < 913:
            temp2_count +=1
    if avail_data_summ.loc[y, 'w2_csf'] ==1:
        date3 = datetime.date.fromisoformat(avail_data_summ.loc[y, 'lp_w2_date'])
        if (datetime.date.today()-date3).days < 913:
            temp3_count +=1
    y+=1
    
dates_data.loc[x, 'intake_w2'] = temp1_count
dates_data.loc[x, '7t'] = temp2_count
dates_data.loc[x, 'w2_csf'] = temp3_count

x = 3
y=0
temp1_count = 0
temp2_count=0
temp3_count = 0
for pid in avail_data_summ['pidn']:
    dates_data.loc[x, 'time'] = 36
    if avail_data_summ.loc[y, 'w2_blood'] ==1:
        date1 = datetime.date.fromisoformat(avail_data_summ.loc[y, 'intake_date_w2'])
        time_diff = datetime.date.today()-date1
        if time_diff.days <= 1095:
            temp1_count +=1
    if avail_data_summ.loc[y, 'w2_7T'] ==1:
        date2 = datetime.date.fromisoformat(avail_data_summ.loc[y, 'lp_w2_date'])
        if (datetime.date.today()-date2).days < 1095:
            temp2_count +=1
    if avail_data_summ.loc[y, 'w2_csf'] ==1:
        date3 = datetime.date.fromisoformat(avail_data_summ.loc[y, 'lp_w2_date'])
        if (datetime.date.today()-date3).days < 1095:
            temp3_count +=1
    y+=1
    
dates_data.loc[x, 'intake_w2'] = temp1_count
dates_data.loc[x, '7t'] = temp2_count
dates_data.loc[x, 'w2_csf'] = temp3_count
    
dates_data.head()

TypeError: fromisoformat: argument must be str