In [44]:
# Get pandas and postgres to work together
import psycopg2 as pg
import pandas as pd
import psycopg2.extras as extras

# We are also going to do some basic viz
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import xport.v56 #for working with XPT files (SAS outputs favored by the US government)
import pickle #for saving data extracts

In [40]:
# Connection settings for the Postgres server that holds the survey tables
connection_args = dict(
    host='localhost',   # the server runs on this machine
    dbname='fitness',   # database to connect to
    port=5432,          # port to connect on (original note: "port we opened on AWS")
)

# Open the connection that the query and insert cells below pass around
connection = pg.connect(**connection_args)

### Download Data

Downloads data from the NHANES website. Note that DXX is only used from the 2011 cycle onward; earlier cycles had data issues and would require more data massaging.

Initial database was set up using psql and appropriate tables made for the 2017-2018 table, then used the code below to insert prior years' survey data into the table.


Database includes non-NHANES data:
    - "Fat": a table that lists the fitness category for each gender and body fat range.
        - Classes: "Essential Fat", "Athlete", "Fitness", "Average" (note: originally called "Acceptable"), "Obese"
        - Source: American Council on Exercise
    - "bmi_cat": a table that lists the weight category based on BMI and gender.
        - Classes: "Underweight", "Normal", "Overweight", "Obese"
        - Source: CDC

In [None]:
# Deliberate stop: `break` outside a loop is a SyntaxError, so this cell fails
# when executed. Presumably this is meant to halt a top-to-bottom run before it
# reaches the download / database-load cells that follow.
break
# no need to run the following cells.. skip to EDA section for code

In [None]:
# Survey cycle -> letter suffix used in that cycle's file names
years = {
    '2011-2012': 'G',
    '2013-2014': 'H',
    '2015-2016': 'I',
    '2017-2018': 'J',
}
# File-name prefixes of the data sets fetched for every cycle
data_type = ['DEMO', 'DR1TOT', 'PAQ', 'DBQ', 'WHQ', 'BMX', 'DXX']

Download the data files from CDC website

In [None]:
import os

import requests

# Make sure the target folder exists before the first file is written
os.makedirs('Data/Raw', exist_ok=True)

# Fetch one .XPT file per (survey cycle, data set) pair
for key in years:
    for data in data_type:
        url = "https://wwwn.cdc.gov/Nchs/Nhanes/"+key+"/"+data+"_"+years[key]+".XPT"
        # A timeout keeps a stalled request from hanging the cell indefinitely
        r = requests.get(url, timeout=60)
        # Fail loudly on a 4xx/5xx answer instead of saving an error page as an .XPT file
        r.raise_for_status()
        filename = 'Data/Raw/'+data+'_'+years[key]+'.XPT'
        with open(filename,'wb') as out_file:
            out_file.write(r.content)

Update database tables with additional data.

Because some fields in the 2017-2018 survey tables are new, the code below adds those columns to the earlier years' data, filled with 0s (initially NULLs, but that caused issues partway through, so I switched to 0s; this should not impact the analysis). Therefore, before using any field, check its distribution by survey year to make sure it is valid across all years.


In [46]:
#helper function to be used below
def execute_values(conn, df, table,columns):
    """
    Insert the rows of ``df`` into ``table`` using psycopg2.extras.execute_values().

    The frame is first reshaped to exactly ``columns``: columns of ``df`` that are
    not listed are dropped, listed columns that ``df`` lacks are added and filled
    with 0, and the result is ordered like ``columns``.

    Parameters
    ----------
    conn : open database connection; must provide cursor(), commit() and rollback()
    df : pandas.DataFrame holding the rows to insert
    table : name of the target table (formatted into the SQL text as-is)
    columns : list of the target table's column names, in insert order

    Returns
    -------
    None when the insert was committed, 1 when it failed and was rolled back.
    """
    # reindex() drops the extra columns, adds the missing ones filled with 0 and
    # orders everything like `columns` in one step. Unlike concatenating a fresh
    # 0..n-1 Series, it cannot misalign rows when df has a non-default index.
    fin_df = df.reindex(columns=columns, fill_value=0)

    # Create a list of tuples from the dataframe values
    tuples = [tuple(row) for row in fin_df.to_numpy()]
    # Comma-separated dataframe columns
    cols = ','.join(fin_df.columns)
    # SQL query to execute. Table and column names are formatted into the text,
    # so they must come from trusted code, never from user input.
    query  = "INSERT INTO %s(%s) VALUES %%s" % (table, cols)
    cursor = conn.cursor()
    try:
        extras.execute_values(cursor, query, tuples)
        conn.commit()
    except Exception as error:
        print("Error: %s" % error)
        conn.rollback()
        return 1
    finally:
        # Runs on success and on failure, so the cursor is always released
        cursor.close()
    print("execute_values() done")

In [51]:
data_type = ['BMX','DBQ','DEMO', 'DR1TOT', 'DXX', 'PAQ', 'WHQ']  #names of files to pull
survey = ['G','H','I'] #surveys correspond to certain years of surveys

# NOTE: every run inserts the rows again; run this cell once per database.
for data in data_type:
    # The "J" file supplies the column layout that the other surveys are reshaped to.
    # The extension is upper-case because the download cell saves the files as ".XPT";
    # the lower-case spelling only resolves on case-insensitive filesystems.
    with open('Data/Raw/'+data+'_J.XPT', 'rb') as f:
        library = xport.v56.load(f)
    columns = list(library[data+'_J'].columns)

    for letter in survey:
        with open('Data/Raw/'+data+'_'+letter+'.XPT', 'rb') as f:
            library = xport.v56.load(f)
        # execute_values() returns 1 after a rolled-back insert; report which file failed
        if execute_values(connection,library[data+'_'+letter],data,columns) == 1:
            print("insert failed for "+data+'_'+letter)

execute_values() done
execute_values() done
execute_values() done


In [53]:
#identify the missing columns

data_type = ['BMX','DBQ','DEMO', 'DR1TOT', 'DXX', 'PAQ', 'WHQ']
survey = ['G','H','I']

for data in data_type:
    # Columns of the "J" file are the reference set.
    # The extension is upper-case because the download cell saves the files as ".XPT".
    with open('Data/Raw/'+data+'_J.XPT', 'rb') as f:
        library = xport.v56.load(f)
    columns = list(library[data+'_J'].columns)

    for letter in survey:
        with open('Data/Raw/'+data+'_'+letter+'.XPT', 'rb') as f:
            library = xport.v56.load(f)
        df = library[data+'_'+letter]
        # sorted() gives a stable listing; plain set order varies between runs
        missing_cols = sorted(set(columns) - set(df.columns))
        # Placeholders only: the earlier format string also printed the literal
        # +" "+ concatenation syntax
        print(f'{data} {letter}: {missing_cols}')
    

BMX+" "+G+": "+['BMIHIP', 'BMXHIP']
BMX+" "+H+": "+['BMIHIP', 'BMXHIP']
BMX+" "+I+": "+['BMIHIP', 'BMXHIP']
DBQ+" "+G+": "+['CBQ596', 'CBQ611', 'DBQ940', 'CBQ606', 'DBQ935', 'DBQ945', 'DBQ930']
DBQ+" "+H+": "+['DBQ935', 'DBQ945', 'DBQ940', 'DBQ930']
DBQ+" "+I+": "+['DBQ935', 'DBQ945', 'DBQ940', 'DBQ930']
DEMO+" "+G+": "+['DMDHSEDZ', 'DMDHRAGZ', 'DMDHREDZ', 'DMDHRMAZ']
DEMO+" "+H+": "+['DMDHSEDZ', 'DMDHRAGZ', 'DMDHREDZ', 'DMDHRMAZ']
DEMO+" "+I+": "+['DMDHSEDZ', 'DMDHRAGZ', 'DMDHREDZ', 'DMDHRMAZ']
DR1TOT+" "+G+": "+['DR1MRESP', 'DR1TWSZ', 'DR1SKY', 'DR1STY', 'DR1HELP']
DR1TOT+" "+H+": "+['DR1TWSZ', 'DR1HELP', 'DR1MRESP']
DR1TOT+" "+I+": "+['DR1TWSZ']
DXX+" "+G+": "+[]
DXX+" "+H+": "+[]
DXX+" "+I+": "+[]
PAQ+" "+G+": "+[]
PAQ+" "+H+": "+[]
PAQ+" "+I+": "+[]
WHQ+" "+G+": "+['WHQ200', 'WHD080U', 'WHQ190', 'WHQ225']
WHQ+" "+H+": "+['WHQ200', 'WHQ190', 'WHQ225']
WHQ+" "+I+": "+[]


## EDA

Exploring the categories - comparing using BMI vs Body Fat to determine someone's fitness level

In [11]:
# One row per respondent aged 18 or over: gender (recoded to M/F), age, dxdtopf,
# bmxbmi, and the category each of those two measures falls into.
# `fat` and `bmi_cat` are joined with inner joins on [low, high) ranges, so
# respondents whose value matches no range in either lookup table are dropped.
query = '''
SELECT DISTINCT dxx.seqn, 
	CASE demo.riagendr WHEN '1.0' THEN 'M' WHEN '2.0' THEN 'F' ELSE demo.riagendr END, 
	demo.ridageyr, dxx.dxdtopf,  bmx.bmxbmi, fat.category, bmi_cat.bmi_category
FROM dxx
LEFT JOIN demo
	ON dxx.seqn = demo.seqn
LEFT JOIN bmx
    ON dxx.seqn = bmx.seqn
JOIN fat
	ON demo.riagendr = fat.gender AND dxdtopf >= fat.low AND dxdtopf < fat.high
JOIN bmi_cat
    ON bmx.bmxbmi >= bmi_cat.low AND bmx.bmxbmi < bmi_cat.high
WHERE demo.ridageyr >= 18
ORDER BY dxx.seqn
'''

# Run the query on the open connection and load the result into a DataFrame
cat_comp = pd.read_sql(query,connection)

In [12]:
# Cross-tabulate the two categorisations: rows are BMI categories, columns are
# body-fat categories, cells count the non-null 'seqn' entries in each pair
table = (
    cat_comp
    .pivot_table(index='bmi_category', columns='category', aggfunc='count')
    ['seqn']
    .copy()
)

# Explicit display order for both axes
index_order = ['Underweight', 'Normal', 'Overweight', 'Obese']
column_order = ['Athlete', 'Fitness', 'Average', 'Obese']

table.reindex(columns=column_order, index=index_order)

category,Athlete,Fitness,Average,Obese
bmi_category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Underweight,14.0,101.0,115.0,27.0
Normal,81.0,447.0,1552.0,1604.0
Overweight,5.0,53.0,658.0,2908.0
Obese,,1.0,124.0,4101.0


Note that the BMI category is different than the (body fat) category. 

There is strong overlap. However, the BMI_category does not identify Athletes or those who are fit. Otherwise there are strong correlations, as one might expect.

The 53 who are "fit" yet "Overweight" highlight a known issue with BMI: those who lift weights and have more muscle mass tend to be heavier. So based on weight they are overweight, even though having muscle instead of fat is healthy.

One significant drawback of using the categories defined by body fat % is that the vast majority of respondents are considered Obese, so there will be significant class imbalance to deal with.

[Body Fat Norms by American Council on Exercise](https://www.acefitness.org/education-and-resources/lifestyle/tools-calculators/percent-body-fat-calculator/)

| Category | Women | Men |
| ----------- | ----------- | ----------- |
| Essential Fat | 10-13% | 2-5% |
| Athletes | 14-20% | 6-13% |
| Fitness | 21-24% | 14-17% |
| Acceptable | 25-31% | 18-24% |
| Obesity | ≥32% | ≥25% |


[BMI categories per CDC](https://www.cdc.gov/healthyweight/assessing/bmi/adult_bmi/index.html)

| BMI | Weight Status |
| ----------- | ----------- |
| Below 18.5 | Underweight |
| 18.5 – 24.9 | Normal or Healthy Weight |
| 25.0 – 29.9 | Overweight |
| 30.0 and Above | Obese |

**For the analysis, I will focus on the Body Fat Norms by ACE.
The goal is to determine which factors impact fitness level, and specifically whether diet or exercise is more important.**

## Data Extraction

Use SQL to query the database and get the dataframe ready for modeling purposes

In [26]:
# Full modelling extract: demographics, dxdtopf, the whq / dr / paq survey answers
# and the body-fat category for every respondent aged 18 or over.
# NOTE(review): the loader cell inserts into tables named after the file prefixes
# (e.g. DR1TOT); confirm that a table or view called `dr` exists for the join below.
full_query = '''
SELECT DISTINCT dxx.seqn, 
	demo.sddsrvyr, demo.ridageyr, CASE demo.riagendr WHEN '1.0' THEN 'M' WHEN '2.0' THEN 'F' ELSE demo.riagendr END, 
    dxx.dxdtopf,
    whq.whq030, whq.whd080a, whq.whd080b, whq.whd080c, whq.whd080d, whq.whd080e, whq.whd080f,
		whq.whd080g, whq.whd080h, whq.whd080i, whq.whd080j, whq.whd080k, whq.whd080m, whq.whd080n,
		whq.whd080o, whq.whd080p, whq.whd080q,  whq.whd080r, whq.whd080s, whq.whd080t, whq.whq190,
	dr.drqsdiet, dr.drqsdt1, dr.drqsdt2, dr.drqsdt3, dr.drqsdt4, dr.drqsdt5, dr.drqsdt6, dr.drqsdt7, 
		dr.drqsdt8, dr.drqsdt9, dr.drqsdt10, dr.drqsdt11, dr.drqsdt12, dr.drqsdt91,
	paq.paq610, paq.paq625, paq.paq640, paq.paq655, paq.paq670, 
    fat.category
FROM dxx
LEFT JOIN demo
	ON dxx.seqn = demo.seqn
LEFT JOIN whq
	ON dxx.seqn = whq.seqn
LEFT JOIN dr
    ON dxx.seqn = dr.seqn
LEFT JOIN paq
    ON dxx.seqn = paq.seqn
JOIN fat
	ON demo.riagendr = fat.gender AND dxdtopf >= fat.low AND dxdtopf < fat.high
WHERE demo.ridageyr >= 18
ORDER BY dxx.seqn
'''
# Run full_query itself; passing `query` here would re-run the earlier EDA query
full_df = pd.read_sql(full_query, connection)


In [44]:
# Modelling extract without the dr columns: demographics, dxdtopf, the whq and
# paq survey answers and the body-fat category for respondents aged 18 or over.
# NOTE: this rebinds `query`, the name the EDA cell used for its own SQL text.
query = '''
SELECT DISTINCT dxx.seqn, 
	demo.sddsrvyr, demo.ridageyr, CASE demo.riagendr WHEN '1.0' THEN 'M' WHEN '2.0' THEN 'F' ELSE demo.riagendr END, 
    dxx.dxdtopf,
    whq.whq030, whq.whd080a, whq.whd080b, whq.whd080c, whq.whd080d, whq.whd080e, whq.whd080f,
		whq.whd080g, whq.whd080h, whq.whd080i, whq.whd080j, whq.whd080k, whq.whd080m, whq.whd080n,
		whq.whd080o, whq.whd080p, whq.whd080q,  whq.whd080r, whq.whd080s, whq.whd080t, whq.whq190,
	paq.paq610, paq.paq625, paq.paq640, paq.paq655, paq.paq670, 
    fat.category
FROM dxx
LEFT JOIN demo
	ON dxx.seqn = demo.seqn
LEFT JOIN whq
	ON dxx.seqn = whq.seqn
LEFT JOIN paq
    ON dxx.seqn = paq.seqn
JOIN fat
	ON demo.riagendr = fat.gender AND dxdtopf >= fat.low AND dxdtopf < fat.high
WHERE demo.ridageyr >= 18
ORDER BY dxx.seqn
'''
# Load the query result into the working DataFrame used by the cells below
df = pd.read_sql(query, connection)

In [45]:
len(df)

11826

In [46]:
df

Unnamed: 0,seqn,sddsrvyr,ridageyr,riagendr,dxdtopf,whq030,whd080a,whd080b,whd080c,whd080d,...,whd080r,whd080s,whd080t,whq190,paq610,paq625,paq640,paq655,paq670,category
0,62161.0,7.0,22.0,M,24.3,3.0,10.0,,,,...,,,,,,,,,,Average
1,62164.0,7.0,44.0,F,25.9,3.0,10.0,11.0,12.0,13.0,...,,,,,5.0,,,5.0,1.0,Average
2,62169.0,7.0,21.0,M,19.8,3.0,,,,,...,,,,,,,,,,Average
3,62172.0,7.0,43.0,F,42.2,1.0,10.0,,12.0,,...,,,,,,,,,,Obese
4,62179.0,7.0,55.0,M,27.6,1.0,,,,,...,,,,,,,5.0,,,Obese
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11821,102935.0,10.0,27.0,F,33.7,3.0,10.0,11.0,,13.0,...,44.0,45.0,46.0,2.0,3.0,,4.0,1.0,1.0,Obese
11822,102944.0,10.0,55.0,M,33.4,1.0,,11.0,12.0,13.0,...,,,,2.0,,5.0,,,,Obese
11823,102948.0,10.0,31.0,F,27.7,3.0,,,,,...,,,,2.0,,,,5.0,,Average
11824,102949.0,10.0,33.0,M,15.5,3.0,,,,,...,,,,2.0,3.0,7.0,7.0,,,Fitness


In [69]:
df.category.unique()

array(['Average', 'Obese', 'Fitness', 'Athlete'], dtype=object)

In [48]:
# Survey column code -> readable column name, passed to DataFrame.rename().
# Every target name must be unique: two codes mapping to the same name would
# produce duplicate column labels in a frame that contains both.
colnames = {
    "seqn": "id",
    "sddsrvyr": "data_cycle",
    "riagendr": "gender",
    "ridageyr": "age",
    "dxdtopf": "body_fat",
    "paq610": "vig_work_days",
    "paq625": "mod_work_days",
    "paq640": "walk_bike_days",
    "paq655": "vig_rec_days",
    "paq670": "mod_rec_days",
    "whq030": "self_image",
    "whd080a": "ate_less",
    "whd080b": "low_cal_foods",
    "whd080c": "ate_less_fat",
    "whd080d": "exercise",
    "whd080e": "skip_meals",
    "whd080f": "'diet'_foods",
    "whd080g": "liquid_diet",
    "whd080h": "wt_loss_program",
    "whd080i": "rx_diet_pills",
    "whd080j": "non-rx_diet_pills",
    "whd080k": "lax_vomit",
    "whd080m": "water",
    "whd080n": "wh_special_diet",
    "whd080o": "low_carb",
    "whd080p": "smoke",
    "whd080q": "fruit_veg_salad",
    "whd080r": "change_eat_habits",
    "whd080s": "less_sugar",
    "whd080t": "less_junk_fast",
    "whq190": "surgery",
    "drqsdiet": "dr_special_diet",
    "drqsdt1": "low_cal",
    "drqsdt2": "low_fat",
    "drqsdt3": "low_salt",
    "drqsdt4": "low_sugar",
    "drqsdt5": "low_fiber",
    "drqsdt6": "high_fiber",
    "drqsdt7": "diabetic_diet",
    "drqsdt8": "bulking",
    # "dr_" prefix keeps this distinct from whd080o, which already maps to "low_carb"
    "drqsdt9": "dr_low_carb",
    "drqsdt10": "high_protein",
    "drqsdt11": "gluten_free",
    "drqsdt12": "renal_kidney",
    "drqsdt91": "other_diet",
}

In [49]:
df = df.rename(columns = colnames)

In [50]:
df

Unnamed: 0,id,data_cycle,age,gender,body_fat,self_image,ate_less,low_cal_foods,ate_less_fat,exercise,...,change_eat_habits,less_sugar,less_junk_fast,surgery,vig_work_days,mod_work_days,walk_bike_days,vig_rec_days,mod_rec_days,category
0,62161.0,7.0,22.0,M,24.3,3.0,10.0,,,,...,,,,,,,,,,Average
1,62164.0,7.0,44.0,F,25.9,3.0,10.0,11.0,12.0,13.0,...,,,,,5.0,,,5.0,1.0,Average
2,62169.0,7.0,21.0,M,19.8,3.0,,,,,...,,,,,,,,,,Average
3,62172.0,7.0,43.0,F,42.2,1.0,10.0,,12.0,,...,,,,,,,,,,Obese
4,62179.0,7.0,55.0,M,27.6,1.0,,,,,...,,,,,,,5.0,,,Obese
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11821,102935.0,10.0,27.0,F,33.7,3.0,10.0,11.0,,13.0,...,44.0,45.0,46.0,2.0,3.0,,4.0,1.0,1.0,Obese
11822,102944.0,10.0,55.0,M,33.4,1.0,,11.0,12.0,13.0,...,,,,2.0,,5.0,,,,Obese
11823,102948.0,10.0,31.0,F,27.7,3.0,,,,,...,,,,2.0,,,,5.0,,Average
11824,102949.0,10.0,33.0,M,15.5,3.0,,,,,...,,,,2.0,3.0,7.0,7.0,,,Fitness


In [51]:
df.describe()

Unnamed: 0,id,age,body_fat,vig_work_days,mod_work_days,walk_bike_days,vig_rec_days,mod_rec_days
count,11826.0,11826.0,11826.0,2789.0,4721.0,3518.0,3854.0,5268.0
mean,82384.1621,37.655082,32.734086,4.207601,4.472358,4.926094,3.336274,3.428056
std,11536.227604,12.407943,8.672151,2.486318,3.66224,4.019599,2.227765,2.59601
min,62161.0,18.0,11.7,1.0,1.0,1.0,1.0,1.0
25%,71881.5,27.0,26.3,3.0,3.0,3.0,2.0,2.0
50%,82356.5,38.0,32.5,5.0,5.0,5.0,3.0,3.0
75%,91975.75,48.0,39.7,5.0,5.0,7.0,4.0,5.0
max,102954.0,59.0,56.1,99.0,99.0,99.0,99.0,99.0


In [52]:
df.isnull().sum()

id                      0
data_cycle              0
age                     0
gender                  0
body_fat                0
self_image              0
ate_less             1733
low_cal_foods        2019
ate_less_fat         2046
exercise             1592
skip_meals           2175
'diet'_foods         2344
liquid_diet          2362
wt_loss_program      2409
rx_diet_pills        2420
non-rx_diet_pills    2376
lax_vomit            2433
water                1713
wh_special_diet      2368
low_carb             2073
smoke                2435
fruit_veg_salad      1777
change_eat_habits    1929
less_sugar           1879
less_junk_fast       1847
surgery              6362
vig_work_days        9037
mod_work_days        7105
walk_bike_days       8308
vig_rec_days         7972
mod_rec_days         6558
category                0
dtype: int64

In [53]:
# Replace every missing value with 0.
# NOTE(review): df.describe() shows a max of 99 in the *_days columns, which looks
# like a sentinel code rather than a day count - confirm before modelling.
df = df.fillna(value=0)

In [58]:
# Also replace the literal string 'NaN' with 0; presumably the text-typed (object)
# columns carry it for unanswered questions, and fillna() does not treat a string
# as missing - confirm against the raw query result.
df = df.replace(to_replace='NaN', value = 0)

In [67]:
df.dtypes

id                   float64
data_cycle            object
age                  float64
gender                object
body_fat             float64
self_image            object
ate_less              object
low_cal_foods         object
ate_less_fat          object
exercise              object
skip_meals            object
'diet'_foods          object
liquid_diet           object
wt_loss_program       object
rx_diet_pills         object
non-rx_diet_pills     object
lax_vomit             object
water                 object
wh_special_diet       object
low_carb              object
smoke                 object
fruit_veg_salad       object
change_eat_habits     object
less_sugar            object
less_junk_fast        object
surgery               object
vig_work_days        float64
mod_work_days        float64
walk_bike_days       float64
vig_rec_days         float64
mod_rec_days         float64
category              object
dtype: object

In [68]:
# Save the data extract to disk as a pickle
with open('data/df_all.pickle', 'wb') as pickle_file:
    pickle.dump(df, pickle_file)

## Discussion on Imbalanced Data


- Three options here:
    - Pre-processing - resampling:
        - Undersampling: Not a great approach due to minimal data (getting the ratios reasonable would reduce the overall data size by a considerable amount, because the smallest classes have so few samples)
        - Oversampling: One concern is that oversampling before the validation loop (rather than inside it) leads to overfitting, because the same sample can end up in both the train set and the validation set. Unfortunately, this does not work well with sklearn's cross-validation, which has no oversampling parameter. Therefore any CV or GridSearch (for best parameters) would need to be implemented manually.
        - Synthetic Options
            - SMOTE: Similar concern to the oversampling technique, even though it would not be as drastic
            - ADASYN:
        - **My approach: Oversampling + SMOTE + ADASYN**
            - Created a function (painstakingly so) that can allow for oversampling, CV, and gridsearch all in one. 
            - Unfortunately, it only allows for one oversampling technique and one estimator (ML technique) at a time. Would need a separate loop to test out multiple sampling techniques or ML techniques (e.g. RandomForest vs Naive Bayes)
           
    - During Model - Class Weights:
        - This one is easy to implement and makes sense for the multi-class model, by putting additional weight on the classifications that are of more interest but infrequent "athlete" and "fitness"
        - This is only available for some models
    - After Model - Threshold Adjustment:
        - This approach depends on the model and is not easy to implement for a multi-class model, where the classifier selects the class with the highest probability. A threshold adjustment would require building the model logic from scratch and overriding the class selection, so that some classes (given certain thresholds) are selected in favor of others with higher probabilities. Due to this complexity (in building and explaining), the approach was not attempted.