In [71]:
import os, glob
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from zipfile import ZipFile
import argparse
from datetime import datetime
import pdb
import HEI

# Load in the data

In [72]:
demo_df=pd.read_csv('/Users/gracer/Google Drive/BCP/data/20191008_BCP_candidate_info.csv', sep=',')
print(demo_df.shape)

(765, 8)


In [73]:
infant_df=pd.read_csv('/Users/gracer/Google Drive/BCP/data/infantfeeding1.csv', sep=',', encoding='latin1')
print(infant_df.shape)

(57, 36)


In [74]:
diet_df=pd.read_csv('/Users/gracer/Google Drive/BCP/data/Chil_BCP_datasetTOTAL.csv', sep=',')
print(diet_df.shape)

(659, 382)


## Refactor the data to make sense

In [75]:
infant_df['breastfed']=infant_df['breastfed'].replace({'no': 0, 'yes': 1})

In [76]:
infant_df['any_formula']=infant_df['any_formula'].replace({'no': 0, 'yes': 1,'NaN':'NA'})

In [77]:
infant_df['regular_formula']=infant_df['regular_formula'].replace({'no': 0, 'yes': 1,'NaN':'NA','not_answered':'NA'})

In [78]:
infant_df['age_fed_dropdown']=infant_df['age_fed_dropdown'].replace({'no': 0, 'yes': 1,'NaN':'NA','not_answered':'NA','never_not_yet':0})

In [218]:
infant_df.columns

Index(['CandID', 'PSCID', 'Gender', 'Visit_label', 'sid', 'CommentID',
       'UserID', 'Examiner', 'Data_entry_completion_status', 'Date_taken',
       'Candidate_Age', 'Window_Difference', 'breastfed', 'age_stop',
       'age_stop_status', 'age_any_formula', 'age_any_formula_status',
       'age_fed', 'age_fed_status', 'any_formula', 'age_stop_formula',
       'age_stop_formula_status', 'completed_by', 'completed_by_other',
       'completed_by_other_status', 'age_stop_dropdown', 'age_fed_dropdown',
       'age_stop_formula_dropdown', 'birth_order', 'regular_formula',
       'donor_milk', 'age_regular_formula', 'age_regular_formula_status',
       'four_month_ratio', 'donor_milk_percent', 'donor_milk_percent_status'],
      dtype='object')

# Outline variables of interest
This will allow us to drop people who are missing data in columns of interest

In [79]:
imp=['breastfed', 'age_stop_dropdown', 'age_stop', 'any_formula','age_any_formula','regular_formula','age_regular_formula',
          'age_regular_formula_status','age_stop_formula_dropdown','age_fed_dropdown','age_fed']

In [219]:
for index, row in infant_df.iterrows():
    print(row['CandID'], row['four_month_ratio'])
    if row['four_month_ratio'] == 'exclusively_breastfed'
    

981854 nan
758603 nan
758603 nan
758603 nan
683402 nan
494244 nan
675674 nan
385434 nan
530066 nan
189896 nan
488291 nan
683764 nan
266394 exclusively_breastfed
266394 nan
137012 nan
417855 nan
841279 nan
607764 nan
607764 exclusively_breastfed
518139 nan
525955 nan
36831 nan
421109 nan
382287 exclusively_breastfed
382287 nan
912823 nan
52977 nan
106436 nan
197622 exclusively_formula
197622 nan
2081 nan
868724 nan
774387 nan
945369 nan
354404 nan
354404 exclusively_breastfed
576112 nan
314462 nan
395568 exclusively_breastfed
200474 exclusively_breastfed
200474 exclusively_breastfed
960758 exclusively_breastfed
664313 not_answered
664313 not_answered
664313 not_answered
664313 not_answered
788132 exclusively_formula
505525 nan
505525 exclusively_breastfed
505525 exclusively_breastfed
340476 exclusively_breastfed
418793 nan
911829 exclusively_formula
911829 exclusively_formula
713347 exclusively_breastfed
146505 exclusively_formula
523054 more_formula


# Find what common elements (participants) we have between datasets

## compare demographics with infant feeding

In [80]:
b=list(demo_df['PSCID'])
a=list(infant_df['PSCID'])

common=list(set(a) & set(b))
missmatch = list(set(a)-set(b))
miss= list(set(b)-set(a))

In [81]:
len(common)

43

In [82]:
len(miss)

667

In [83]:
demo_df=demo_df[demo_df['PSCID'].isin(common)]
infant_df=infant_df[infant_df['PSCID'].isin(common)]

In [84]:
demo_df.shape

(52, 8)

In [85]:
demo_df_dup=demo_df[demo_df['PSCID'].duplicated()]

In [86]:
demo_df_un=demo_df.drop_duplicates(['PSCID'])

In [87]:
print(demo_df.shape)
print(demo_df_dup.shape)
print(demo_df_un.shape)

(52, 8)
(9, 8)
(43, 8)


In [88]:
demo_df_un=demo_df_un.set_index('CandID')

In [89]:
demo_dict=demo_df_un.to_dict('index')

In [90]:
infant_df_dup=infant_df[infant_df['PSCID'].duplicated()]
infant_df_un=infant_df.drop_duplicates(['PSCID'])
print(infant_df.shape)
print(infant_df_dup.shape)
print(infant_df_un.shape)

(57, 36)
(14, 36)
(43, 36)


In [91]:
infant_df_un=infant_df_un.set_index('CandID')
infant_dict=infant_df_un.to_dict('index')

In [92]:
alldiet_dict=diet_df.to_dict('index')

In [93]:
diet_df_dup=diet_df[diet_df['Participant ID'].duplicated()]
diet_df_un=diet_df.drop_duplicates(['Participant ID'])
print(diet_df.shape)
print(diet_df_dup.shape)
print(diet_df_un.shape)

(659, 382)
(492, 382)
(167, 382)


## Getting the age in months at each recall

In [94]:
for key, item in alldiet_dict.items():
    print('this is the key %s'%key)
    print(item['Date of Intake'])
    print(item['Participant ID'])
    ID = item['Participant ID']
    date=datetime.strptime(item['Date of Intake'], '%m/%d/%Y')
    if ID in demo_dict:
        print('present!')
        # datetime_object = datetime.strptime('Jun 1 2005  1:33PM', '%b %d %Y %I:%M%p')
        birthday=datetime.strptime(demo_dict[ID]['DoB'], '%m/%d/%y')
        age = (date-birthday)
        print('this is the number of days %s'%age.days)
        alldiet_dict[key]['age']=float(age.days)/12
    else:
        print('NOPE')

this is the key 0
09/18/2018
105040
NOPE
this is the key 1
06/08/2017
105040
NOPE
this is the key 2
09/07/2017
105040
NOPE
this is the key 3
06/08/2017
105040
NOPE
this is the key 4
03/01/2017
105040
NOPE
this is the key 5
09/07/2017
105040
NOPE
this is the key 6
07/26/2017
106436
present!
this is the number of days 26
this is the key 7
07/21/2017
106436
present!
this is the number of days 21
this is the key 8
07/21/2017
106436
present!
this is the number of days 21
this is the key 9
07/26/2017
106436
present!
this is the number of days 26
this is the key 10
10/14/2017
107008
NOPE
this is the key 11
06/24/2017
107008
NOPE
this is the key 12
10/14/2017
107008
NOPE
this is the key 13
10/10/2017
107008
NOPE
this is the key 14
06/24/2017
107008
NOPE
this is the key 15
04/22/2017
107008
NOPE
this is the key 16
04/16/2018
107008
NOPE
this is the key 17
02/05/2018
107008
NOPE
this is the key 18
04/15/2018
107008
NOPE
this is the key 19
02/04/2018
107008
NOPE
this is the key 20
04/22/2017
1070

423548
NOPE
this is the key 305
09/23/2018
439037
NOPE
this is the key 306
07/10/2018
439083
NOPE
this is the key 307
01/13/2018
439999
NOPE
this is the key 308
01/10/2018
439999
NOPE
this is the key 309
08/02/2018
439999
NOPE
this is the key 310
10/10/2015
447077
NOPE
this is the key 311
08/11/2016
447077
NOPE
this is the key 312
01/21/2016
447077
NOPE
this is the key 313
04/27/2016
447077
NOPE
this is the key 314
08/11/2016
447077
NOPE
this is the key 315
11/21/2017
452200
NOPE
this is the key 316
11/25/2017
452200
NOPE
this is the key 317
05/08/2017
456907
NOPE
this is the key 318
09/02/2017
456907
NOPE
this is the key 319
09/02/2017
456907
NOPE
this is the key 320
05/08/2017
456907
NOPE
this is the key 321
11/18/2017
456907
NOPE
this is the key 322
11/12/2017
456907
NOPE
this is the key 323
02/09/2018
456907
NOPE
this is the key 324
02/16/2018
456907
NOPE
this is the key 325
10/07/2018
458050
NOPE
this is the key 326
09/13/2018
458050
NOPE
this is the key 327
09/06/2017
468570
NOPE

## Getting the data sorted by age at input (within one year of the diet recall)

In [95]:
data = {}
count=0
for key, item in alldiet_dict.items():
    ID = item['Participant ID']
    if ID in demo_dict and infant_dict:
        print('GOT IT!')
        if (abs(item['age'] - infant_dict[ID]['Candidate_Age'])) < 12:
            print('SAME YEAR %s'%ID)
            count=count+1
            data[count]={'ID':ID,'demo':demo_dict[ID], 'infant':infant_dict[ID], 'diet':item}
    else:
        print('NO DICE')

NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
GOT IT!
SAME YEAR 106436
GOT IT!
SAME YEAR 106436
GOT IT!
SAME YEAR 106436
GOT IT!
SAME YEAR 106436
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
GOT IT!
GOT IT!
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
NO DICE
GOT IT!
GOT 

Now have data in the form of a dictonary with index (arbitrary) as key and the data has the following inner keys ID, demo, infant (breastfeeding variables), and diet (04 and 09)

In [96]:
data

{1: {'ID': 106436,
  'demo': {'CenterID': 3,
   'DoB': '6/30/17',
   'EDC': nan,
   'Gender': 'Female',
   'PSCID': 'NCBCP0073_03',
   'ProjectID': 1,
   'SubprojectID': 2.0},
  'diet': {'% Calories from Alcohol': 0.0,
   '% Calories from Carbohydrate': 38.092,
   '% Calories from Fat': 55.0,
   '% Calories from MUFA': 20.82,
   '% Calories from PUFA': 6.2410000000000005,
   '% Calories from Protein': 6.283,
   '% Calories from SFA': 25.227,
   '3-Methylhistidine (mg)': 0.0,
   'Acesulfame Potassium (mg)': 0.0,
   'Added Sugars (by Available Carbohydrate) (g)': 0.0,
   'Added Sugars (by Total Sugars) (g)': 0.0,
   'Alanine (g)': 0.003,
   'Alcohol (g)': 0.0,
   'Alpha-Carotene (provitamin A carotenoid) (mcg)': 0.0,
   'Animal Protein (g)': 0.093,
   'Arginine (g)': 0.004,
   'Ash (g)': 0.018000000000000002,
   'Aspartame (mg)': 0.0,
   'Aspartic Acid (g)': 0.006999999999999999,
   'Available Carbohydrate (g)': 0.62,
   'BVA0100': 0,
   'BVA0200': 0,
   'BVA0300': 0,
   'BVA0400': 0,
  

In [97]:
# data=pd.DataFrame.from_dict(data, orient='index')


In [98]:
data

{1: {'ID': 106436,
  'demo': {'CenterID': 3,
   'DoB': '6/30/17',
   'EDC': nan,
   'Gender': 'Female',
   'PSCID': 'NCBCP0073_03',
   'ProjectID': 1,
   'SubprojectID': 2.0},
  'diet': {'% Calories from Alcohol': 0.0,
   '% Calories from Carbohydrate': 38.092,
   '% Calories from Fat': 55.0,
   '% Calories from MUFA': 20.82,
   '% Calories from PUFA': 6.2410000000000005,
   '% Calories from Protein': 6.283,
   '% Calories from SFA': 25.227,
   '3-Methylhistidine (mg)': 0.0,
   'Acesulfame Potassium (mg)': 0.0,
   'Added Sugars (by Available Carbohydrate) (g)': 0.0,
   'Added Sugars (by Total Sugars) (g)': 0.0,
   'Alanine (g)': 0.003,
   'Alcohol (g)': 0.0,
   'Alpha-Carotene (provitamin A carotenoid) (mcg)': 0.0,
   'Animal Protein (g)': 0.093,
   'Arginine (g)': 0.004,
   'Ash (g)': 0.018000000000000002,
   'Aspartame (mg)': 0.0,
   'Aspartic Acid (g)': 0.006999999999999999,
   'Available Carbohydrate (g)': 0.62,
   'BVA0100': 0,
   'BVA0200': 0,
   'BVA0300': 0,
   'BVA0400': 0,
  

In [126]:
data[1]['demo'].keys()

dict_keys(['PSCID', 'Gender', 'DoB', 'EDC', 'SubprojectID', 'ProjectID', 'CenterID'])

In [137]:
data.keys()

dict_keys([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43])

In [147]:
indie = [v['ID'] for k,v in data.items()]

In [148]:
indie

[106436,
 106436,
 106436,
 106436,
 200474,
 2081,
 2081,
 266394,
 266394,
 266394,
 266394,
 340476,
 340476,
 382287,
 382287,
 382287,
 382287,
 382287,
 395568,
 395568,
 395568,
 395568,
 395568,
 418793,
 418793,
 418793,
 418793,
 421109,
 421109,
 421109,
 505525,
 664313,
 664313,
 788132,
 788132,
 788132,
 911829,
 945369,
 945369,
 981854,
 981854,
 981854,
 981854]

In [169]:
columns = list(data[1]['demo'].keys())
index = indie
df_ = pd.DataFrame(index=[0], columns=columns)


In [175]:
cols=list(data[1].keys())

In [176]:
DATA_dict={}
for i in cols[1:]:
    print(i)
    df_ = pd.DataFrame(index=[0], columns=columns)
    for k,v in data.items():
        tmp=pd.DataFrame(data[k][i], index=[data[k]['ID']])
        df_=pd.concat([df_,tmp], axis=0)
    DATA_dict[i]=df_


demo
infant
diet


In [181]:
DATA_dict['diet']

Unnamed: 0,% Calories from Alcohol,% Calories from Carbohydrate,% Calories from Fat,% Calories from MUFA,% Calories from PUFA,% Calories from Protein,% Calories from SFA,3-Methylhistidine (mg),Acesulfame Potassium (mg),Added Sugars (by Available Carbohydrate) (g),...,Vitamin D2 (ergocalciferol) (mcg),Vitamin D3 (cholecalciferol) (mcg),Vitamin E (International Units) (IU),Vitamin E (Total Alpha-Tocopherol) (mg),Vitamin K (phylloquinone) (mcg),Water (g),Whole Grains (ounce equivalents),Xylitol (g),Zinc (mg),age
0,,,,,,,,,,,...,,,,,,,,,,
106436,0.0,38.092,55.0,20.82,6.241,6.283,25.227,0.0,0.0,0.0,...,0.0,0.007,0.011,0.007,0.027,7.875,0.0,0.0,0.015,2.166667
106436,0.0,38.092,55.0,20.82,6.241,6.283,25.227,0.0,0.0,0.0,...,0.0,0.007,0.011,0.007,0.027,7.875,0.0,0.0,0.015,1.75
106436,0.0,38.092,55.0,20.82,6.241,6.283,25.227,0.0,0.0,0.0,...,0.0,0.007,0.011,0.007,0.027,7.875,0.0,0.0,0.015,1.75
106436,0.0,38.092,55.0,20.82,6.241,6.283,25.227,0.0,0.0,0.0,...,0.0,0.007,0.011,0.007,0.027,7.875,0.0,0.0,0.015,2.166667
200474,0.0,58.917,28.477,8.108,6.679,12.479,11.0,0.351,0.0,11.465,...,0.0,3.045,4.252,2.592,138.722,394.728,0.924,0.006,4.382,31.75
2081,0.0,43.137,47.647,16.602,8.469,9.176,21.028,0.0,0.0,47.815,...,0.0,8.091,10.649,6.172,48.024,714.997,0.0,0.0,5.396,2.0
2081,0.0,43.137,47.647,16.602,8.469,9.176,21.028,0.0,0.0,47.815,...,0.0,8.091,10.649,6.172,48.024,714.997,0.0,0.0,5.396,2.0
266394,0.058,42.237,39.276,14.67,7.253,18.473,14.061,3.337,0.0,15.617,...,0.0,6.724,4.896,3.293,22.551,1151.822,2.351,0.031,4.817,38.083333
266394,0.0,54.933,29.879,10.255,4.849,15.161,11.923,0.493,0.0,33.683,...,0.0,4.248,4.778,3.208,37.798,1282.65,3.526,0.006,5.9,41.0


In [None]:
total_df = common4.merge(common9.drop_duplicates(subset=['Project Abbreviation','Date of Intake']), how='left')

In [192]:
DF=DATA_dict['diet'].merge(DATA_dict['infant'].drop_duplicates(), left_index=True, right_index=True)

In [193]:
DF

Unnamed: 0,% Calories from Alcohol,% Calories from Carbohydrate,% Calories from Fat,% Calories from MUFA,% Calories from PUFA,% Calories from Protein,% Calories from SFA,3-Methylhistidine (mg),Acesulfame Potassium (mg),Added Sugars (by Available Carbohydrate) (g),...,breastfed,completed_by,completed_by_other,completed_by_other_status,donor_milk,donor_milk_percent,donor_milk_percent_status,four_month_ratio,regular_formula,sid
0,,,,,,,,,,,...,,,,,,,,,,
2081,0.0,43.137,47.647,16.602,8.469,9.176,21.028,0.0,0.0,47.815,...,1.0,,,,,,,,,52.0
2081,0.0,43.137,47.647,16.602,8.469,9.176,21.028,0.0,0.0,47.815,...,1.0,,,,,,,,,52.0
106436,0.0,38.092,55.0,20.82,6.241,6.283,25.227,0.0,0.0,0.0,...,1.0,,,,,,,,,60.0
106436,0.0,38.092,55.0,20.82,6.241,6.283,25.227,0.0,0.0,0.0,...,1.0,,,,,,,,,60.0
106436,0.0,38.092,55.0,20.82,6.241,6.283,25.227,0.0,0.0,0.0,...,1.0,,,,,,,,,60.0
106436,0.0,38.092,55.0,20.82,6.241,6.283,25.227,0.0,0.0,0.0,...,1.0,,,,,,,,,60.0
200474,0.0,58.917,28.477,8.108,6.679,12.479,11.0,0.351,0.0,11.465,...,1.0,mother,,,no,,,exclusively_breastfed,1.0,2376.0
266394,0.058,42.237,39.276,14.67,7.253,18.473,14.061,3.337,0.0,15.617,...,1.0,mother,,,no,,,exclusively_breastfed,0.0,2374.0
266394,0.0,54.933,29.879,10.255,4.849,15.161,11.923,0.493,0.0,33.683,...,1.0,mother,,,no,,,exclusively_breastfed,0.0,2374.0


In [194]:
DF.shape

(44, 429)

In [195]:
DF=DF.merge(DATA_dict['demo'].drop_duplicates(), left_index=True, right_index=True)


In [196]:
DF.shape


(44, 436)

In [200]:
DF['age']

0               NaN
2081       2.000000
2081       2.000000
106436     2.166667
106436     1.750000
106436     1.750000
106436     2.166667
200474    31.750000
266394    38.083333
266394    41.000000
266394    41.000000
266394    38.083333
340476    26.333333
340476    26.583333
382287    24.250000
382287    10.166667
382287    24.750000
382287    15.166667
382287    10.750000
395568    24.916667
395568    24.750000
395568    24.916667
395568    32.333333
395568    32.500000
418793    24.666667
418793    24.416667
418793    30.333333
418793    30.500000
421109     8.416667
421109     8.666667
421109     8.416667
505525     5.583333
664313     4.916667
664313     4.666667
788132    12.916667
788132     5.083333
788132     5.416667
911829    20.833333
945369     1.166667
945369     1.166667
981854    18.250000
981854    18.500000
981854     4.750000
981854     4.750000
Name: age, dtype: float64

In [209]:
DF_child=DF.query('age >= 12')

In [210]:
DF_child.shape

(23, 436)

In [211]:
DF_young=DF.query('age < 12 and age >= 8')

In [212]:
DF_young.shape

(5, 436)

In [213]:
DF_infant=DF.query(' age < 8')

In [215]:
DF_infant.shape

(15, 436)

# Diet quality index scoring

## Working on the Milk component
BF exclusive = 15  
BF partial = 10  
Formula = 5  

In [None]:
list(data.columns)[0:]


In [None]:
for index, row in data.iterrows():
    print(row['ID'], row['demo'])
