In [4]:
# Get pandas and postgres to work together
import psycopg2 as pg
import pandas as pd
import psycopg2.extras as extras

# We are also going to do some basic viz
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import xport.v56 #for working with XPT files (SAS outputs favored by the US government)
import pickle #for saving data extracts

In [5]:
# Postgres connection settings. No user or password is given here, so
# authentication is left to whatever the client library defaults to.
connection_args = {
    'host': 'localhost',  # connect to the Postgres server running on this machine
    'dbname': 'fitness',    # database that the queries below read from
    'port': 5432          # NOTE(review): earlier comment said "port we opened on AWS", yet host is localhost -- confirm which server is meant
}

# Open the connection that later cells hand to pd.read_sql() and execute_values().
connection = pg.connect(**connection_args)

OperationalError: could not connect to server: Connection refused
	Is the server running on host "localhost" (::1) and accepting
	TCP/IP connections on port 5432?
could not connect to server: Connection refused
	Is the server running on host "localhost" (127.0.0.1) and accepting
	TCP/IP connections on port 5432?


### Download Data

Downloads data from the NHANES website. Note that the DXX files are only available from 2011 onward; data prior to that had issues and would require more data massaging.

Initial database was set up using psql and appropriate tables made for the 2017-2018 table, then used the code below to insert prior years' survey data into the table.


Database includes non-NHANES data:
    - "Fat": was created that lists fitness category by gender and body fat range. 
        - Classes: "Essential Fat", "Athlete", "Fitness", "Average" (note originally called Acceptable), "Obese"
        - Source: American Council on Exercise
    - "bmi_cat": lists fitness category based on BMI based on gender.
        - Classes: "Underweight", "Normal", "Overweight", "Obese"
        - Source: CDC

In [None]:
# Deliberate stop: a bare `break` outside a loop is a SyntaxError, so this cell
# errors out and "Run All" does not continue into the download/insert cells below.
break
# no need to run the following cells.. skip to EDA section for code

In [None]:
# Survey cycle -> suffix letter used in that cycle's file names.
years = {
    '2011-2012': 'G',
    '2013-2014': 'H',
    '2015-2016': 'I',
    '2017-2018': 'J',
}
# File-name prefixes of the components fetched for every cycle.
data_type = [
    'DEMO',
    'DR1TOT',
    'PAQ',
    'DBQ',
    'WHQ',
    'BMX',
    'DXX',
]

Download the data files from CDC website

In [None]:
import os

import requests

# Folder that receives the raw transport files; created if it does not exist yet
# so the first run does not fail on open().
raw_dir = 'Data/Raw'
os.makedirs(raw_dir, exist_ok=True)

for key in years:
    for data in data_type:
        # e.g. https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/DEMO_J.XPT
        url = "https://wwwn.cdc.gov/Nchs/Nhanes/"+key+"/"+data+"_"+years[key]+".XPT"
        # A timeout keeps one stalled request from hanging the whole cell.
        r = requests.get(url, timeout=60)
        # Fail loudly on an HTTP error instead of saving the error page as a data file.
        r.raise_for_status()
        # Lower-case extension on disk: the cells that read these files open
        # 'Data/Raw/<name>.xpt', which would not match '.XPT' on a
        # case-sensitive filesystem.
        filename = raw_dir+'/'+data+'_'+years[key]+'.xpt'
        with open(filename,'wb') as out_file:
            out_file.write(r.content)

Update database tables with additional data.

Because some fields in the 2017-2018 survey tables are newer, the code below adds those columns to the earlier cycles' data filled with 0s. (They were initially nulls, but that caused issues partway through, so I switched to 0s; this should not impact the analysis.) Therefore, before using any field, check its distribution by survey year to make sure that it is valid across all years.


In [46]:
#helper function to be used below
def execute_values(conn, df, table,columns):
    """
    Insert the rows of a dataframe into a table with
    psycopg2.extras.execute_values().

    The dataframe is first reshaped to exactly ``columns``, in that order:
    columns of ``df`` that are not listed are dropped, and listed columns
    that ``df`` lacks are added and filled with 0.

    Parameters
    ----------
    conn : open database connection; must provide cursor(), commit() and
        rollback().
    df : pandas.DataFrame holding the rows to insert.
    table : str, name of the target table. Interpolated into the SQL text
        as-is, so only pass trusted names.
    columns : list of str, column names of the target table in insert order.
        Also interpolated into the SQL text as-is.

    Returns
    -------
    None on success; 1 if the insert raised (the error is printed and the
    transaction is rolled back).
    """
    # reindex() drops extra columns, adds the missing ones filled with 0 and
    # orders them in a single step. Unlike concatenating a freshly built Series,
    # it keeps df's own row index, so a non-default index cannot misalign rows
    # or introduce extra all-NaN rows.
    fin_df = df.reindex(columns=columns, fill_value=0)

    # Create a list of tuples from the dataframe values
    tuples = [tuple(x) for x in fin_df.to_numpy()]
    # Comma-separated target columns
    cols = ','.join(columns)
    # SQL query to execute; the doubled %% leaves a literal %s placeholder
    # for execute_values() to expand with the rows.
    query  = "INSERT INTO %s(%s) VALUES %%s" % (table, cols)
    cursor = conn.cursor()
    try:
        extras.execute_values(cursor, query, tuples)
        conn.commit()
    except Exception as error:
        # Catching Exception keeps the best-effort behaviour of the loader:
        # report, undo the partial insert, and signal failure with 1.
        print("Error: %s" % error)
        conn.rollback()
        return 1
    finally:
        # Close the cursor on both the success and the failure path.
        cursor.close()
    print("execute_values() done")

In [51]:
# Append the earlier survey cycles to the tables, using the "J" file of each
# component as the reference column layout.
data_type = ['BMX','DBQ','DEMO', 'DR1TOT', 'DXX', 'PAQ', 'WHQ']  #names of files to pull
survey = ['G','H','I'] #surveys correspond to certain years of surveys

for data in data_type:
    # Column list of the reference ("J") file for this component.
    with open(f'Data/Raw/{data}_J.xpt', 'rb') as f:
        library = xport.v56.load(f)
    columns = list(library[f'{data}_J'].columns)

    for letter in survey:
        member = f'{data}_{letter}'
        with open(f'Data/Raw/{member}.xpt', 'rb') as f:
            library = xport.v56.load(f)
        # The target table is named after the component (e.g. BMX).
        execute_values(connection, library[member], data, columns)

execute_values() done
execute_values() done
execute_values() done


In [53]:
#identify the missing columns
# For every component, list the columns of the reference ("J") file that each
# earlier cycle's file does not contain.

data_type = ['BMX','DBQ','DEMO', 'DR1TOT', 'DXX', 'PAQ', 'WHQ'] 
survey = ['G','H','I'] 

for data in data_type:
    with open('Data/Raw/'+data+'_J.xpt', 'rb') as f:
        library = xport.v56.load(f)
    columns = list(library[data+'_J'].columns)

    for letter in survey:
        with open('Data/Raw/'+data+'_'+letter+'.xpt', 'rb') as f:
            library = xport.v56.load(f)
        df = library[data+'_'+letter]
        # sorted() gives a stable order; a raw set difference can print the
        # same columns in a different order from run to run.
        missing_cols = sorted(set(columns) - set(df.columns))
        # Plain f-string fields; the earlier version embedded literal +" "+
        # concatenation text inside the f-string, which was printed verbatim.
        print(f'{data} {letter}: {missing_cols}')
    

BMX+" "+G+": "+['BMIHIP', 'BMXHIP']
BMX+" "+H+": "+['BMIHIP', 'BMXHIP']
BMX+" "+I+": "+['BMIHIP', 'BMXHIP']
DBQ+" "+G+": "+['CBQ596', 'CBQ611', 'DBQ940', 'CBQ606', 'DBQ935', 'DBQ945', 'DBQ930']
DBQ+" "+H+": "+['DBQ935', 'DBQ945', 'DBQ940', 'DBQ930']
DBQ+" "+I+": "+['DBQ935', 'DBQ945', 'DBQ940', 'DBQ930']
DEMO+" "+G+": "+['DMDHSEDZ', 'DMDHRAGZ', 'DMDHREDZ', 'DMDHRMAZ']
DEMO+" "+H+": "+['DMDHSEDZ', 'DMDHRAGZ', 'DMDHREDZ', 'DMDHRMAZ']
DEMO+" "+I+": "+['DMDHSEDZ', 'DMDHRAGZ', 'DMDHREDZ', 'DMDHRMAZ']
DR1TOT+" "+G+": "+['DR1MRESP', 'DR1TWSZ', 'DR1SKY', 'DR1STY', 'DR1HELP']
DR1TOT+" "+H+": "+['DR1TWSZ', 'DR1HELP', 'DR1MRESP']
DR1TOT+" "+I+": "+['DR1TWSZ']
DXX+" "+G+": "+[]
DXX+" "+H+": "+[]
DXX+" "+I+": "+[]
PAQ+" "+G+": "+[]
PAQ+" "+H+": "+[]
PAQ+" "+I+": "+[]
WHQ+" "+G+": "+['WHQ200', 'WHD080U', 'WHQ190', 'WHQ225']
WHQ+" "+H+": "+['WHQ200', 'WHQ190', 'WHQ225']
WHQ+" "+I+": "+[]


Note: the missing columns are not a huge issue.
BMX - not using any body mass index measurements, in favor of body fat % to categorize fitness levels.\
DBQ - Diet Behavior data here is mostly about a specific program called "My Plate" and whether the respondent is the primary food shopper and meal planner/prepper. Not going to use this table at all because much of it is about milk consumption patterns.\
DEMO - this includes spouse education level, age; reference person (proxy head of household) education level, age.\
DR1TOT - a few fields changed or changed their names; these are not consequential (DR1TWSZ used to be DR1TWS for drinking water, HELP and RESP refer to help with responding to the survey, SKY and STY relate to using salt). During data cleanup, these columns were simply filled with 0s (per the note in the section above) when adding prior cycle data. These fields should not be used in analysis. Given that they should not be consequential, I did not bother remapping prior cycle data to the new field names.\
WHQ - cycle G and H did not capture weight loss surgery, while this may be interesting, there are not many cases with these strategies being taken. Not consequential for this analysis on diet vs exercise.

## Comparing Potential Target Variables

Exploring the categories - comparing using BMI vs Body Fat to determine someone's fitness level

In [4]:
# Compare the two candidate targets side by side: for each adult (age >= 18)
# row of dxx, fetch dxdtopf and bmxbmi together with the category each value
# falls into according to the `fat` and `bmi_cat` lookup tables.
# `fat` is matched on gender plus a [low, high) range; `bmi_cat` on a [low, high)
# range only. Both are inner JOINs, so rows without a matching range are dropped.
query = '''
SELECT DISTINCT dxx.seqn, 
	CASE demo.riagendr WHEN '1.0' THEN 'M' WHEN '2.0' THEN 'F' ELSE demo.riagendr END, 
	demo.ridageyr, dxx.dxdtopf,  bmx.bmxbmi, fat.category, bmi_cat.bmi_category
FROM dxx
LEFT JOIN demo
	ON dxx.seqn = demo.seqn
LEFT JOIN bmx
    ON dxx.seqn = bmx.seqn
JOIN fat
	ON demo.riagendr = fat.gender AND dxdtopf >= fat.low AND dxdtopf < fat.high
JOIN bmi_cat
    ON bmx.bmxbmi >= bmi_cat.low AND bmx.bmxbmi < bmi_cat.high
WHERE demo.ridageyr >= 18
ORDER BY dxx.seqn
'''

# Run the query on the open connection and load the result into a DataFrame.
cat_comp = pd.read_sql(query,connection)

In [5]:
# Display order for the cross-tab: rows are BMI categories, columns are
# body-fat categories.
index_order = ['Underweight','Normal','Overweight','Obese']
column_order = ['Athlete','Fitness','Average','Obese']

# Count rows per (bmi_category, category) pair; the 'seqn' slice keeps one
# count per pair.
counts_by_pair = cat_comp.pivot_table(
    index='bmi_category',
    columns='category',
    aggfunc='count',
)
table = counts_by_pair['seqn'].copy()

# Displayed only (not assigned): the counts arranged in the order above.
table.reindex(index=index_order, columns=column_order)

category,Athlete,Fitness,Average,Obese
bmi_category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Underweight,14.0,101.0,115.0,27.0
Normal,81.0,447.0,1552.0,1604.0
Overweight,5.0,53.0,658.0,2908.0
Obese,,1.0,124.0,4101.0


Note: that the BMI category is different than the (body fat) category. 
Also note: there is a category for Essential Fat, however, no one in the survey had that low of body fat. =)

There is strong overlap. However, the BMI_category does not identify Athletes or those who are fit. Otherwise there are strong correlations, as one might expect.

The 53 who are "fit" yet "Overweight" highlight a known issue with BMI: those who lift weights and have more muscle mass tend to be heavier. Based on weight alone they are overweight, even though having muscle instead of fat is healthy.

One significant drawback to using the Categories defined by body fat % is that the vast majority are considered Obese. Will definitely be working with some significant class imbalance here

[Body Fat Norms by American Council on Exercise](https://www.acefitness.org/education-and-resources/lifestyle/tools-calculators/percent-body-fat-calculator/)

| Category | Women | Men |
| ----------- | ----------- | ----------- |
| Essential Fat | 10-13% | 2-5% |
| Athletes | 14-20% | 6-13% |
| Fitness | 21-24% | 14-17% |
| Acceptable | 25-31% | 18-24% |
| Obesity | >32% | >25% |


[BMI categories per CDC](https://www.cdc.gov/healthyweight/assessing/bmi/adult_bmi/index.html)

| BMI | Weight Status |
| ----------- | ----------- |
| Below 18.5 | Underweight |
| 18.5 – 24.9 | Normal or Healthy Weight |
| 25.0 – 29.9 | Overweight |
| 30.0 and Above | Obese |

**For the analysis, will focus on using the Body Fat Norms by ACE.
Would like to determine what factors impact fitness level, and specifically is diet or exercise more important**

## Data Pre-processing

Use SQL to query the database and get the dataframe ready for modeling purposes

In [3]:
# Modeling extract: one row per adult (age >= 18) row of dxx, with the survey
# cycle, age, gender, dxdtopf, the selected whq / dr1tot / paq answers, and the
# body-fat category from the `fat` lookup table (matched on gender and a
# [low, high) range of dxdtopf).
# The un-aliased CASE column is mapped to "gender" by the rename dict further
# down, via its "riagendr" key.
# whq, dr1tot and paq are LEFT JOINed, so respondents without those records are
# kept with nulls in the corresponding columns.
full_query = '''
SELECT DISTINCT dxx.seqn, 
	demo.sddsrvyr, demo.ridageyr, CASE demo.riagendr WHEN '1.0' THEN 'M' WHEN '2.0' THEN 'F' ELSE demo.riagendr END, 
    dxx.dxdtopf,
    whq.whq030, whq.whq070, whq.whd080a, whq.whd080b, whq.whd080c, whq.whd080d, whq.whd080e, whq.whd080f,
		whq.whd080g, whq.whd080h, whq.whd080i, whq.whd080j, whq.whd080k, whq.whd080m, whq.whd080n,
		whq.whd080o, whq.whd080p, whq.whd080q,  whq.whd080r, whq.whd080s, whq.whd080t,
	dr1tot.drqsdiet, dr1tot.drqsdt1, dr1tot.drqsdt2, dr1tot.drqsdt3, dr1tot.drqsdt4, dr1tot.drqsdt5, 
        dr1tot.drqsdt6, dr1tot.drqsdt7,dr1tot.drqsdt8, dr1tot.drqsdt9, dr1tot.drqsdt10, dr1tot.drqsdt11,
        dr1tot.drqsdt12, dr1tot.drqsdt91,
	paq.paq610, paq.paq625, paq.paq640, paq.paq655, paq.paq670, 
    fat.category
FROM dxx
LEFT JOIN demo
	ON dxx.seqn = demo.seqn
LEFT JOIN whq
	ON dxx.seqn = whq.seqn
LEFT JOIN dr1tot
    ON dxx.seqn = dr1tot.seqn
LEFT JOIN paq
    ON dxx.seqn = paq.seqn
JOIN fat
	ON demo.riagendr = fat.gender AND dxdtopf >= fat.low AND dxdtopf < fat.high
WHERE demo.ridageyr >= 18
ORDER BY dxx.seqn
'''
# Needs the `connection` object created in the connection cell near the top.
full_df = pd.read_sql(full_query, connection)


NameError: name 'connection' is not defined

In [230]:
len(full_df)

11826

In [231]:
full_df.category.unique()

array(['Average', 'Obese', 'Fitness', 'Athlete'], dtype=object)

### Rename columns into something understandable

In [233]:
# Maps the raw survey column codes returned by full_query to readable names.
# Keys are the lower-case column names as they come back from the database.
colnames = {
    # identifiers, demographics and the body-fat measurement
    "seqn": "id",
    "sddsrvyr": "data_cycle",
    "riagendr": "gender",
    "ridageyr": "age",
    "dxdtopf": "body_fat",
    # paq.* columns (the *_days names hold day counts in the outputs below)
    "paq610": "vig_work_days",
    "paq625": "mod_work_days",
    "paq640": "walk_bike_days",
    "paq655": "vig_rec_days",
    "paq670": "mod_rec_days",
    # whq.* columns (weight history questions)
    "whq030": "self_image",
    "whq070": "try_to_lose",
    "whd080a": "ate_less",
    "whd080b": "low_cal_foods",
    "whd080c": "ate_less_fat",
    "whd080d": "exercise",
    "whd080e": "skip_meals",
    "whd080f": "'diet'_foods",
    "whd080g": "liquid_diet",
    "whd080h": "wt_loss_program",
    "whd080i": "rx_diet_pills",
    "whd080j": "non-rx_diet_pills",
    "whd080k": "lax_vomit",
    "whd080m": "water",
    "whd080n": "special_diet_wh",
    "whd080o": "low_carb_wh",
    "whd080p": "smoke",
    "whd080q": "fruit_veg_salad",
    "whd080r": "change_eat_habits",
    "whd080s": "less_sugar",
    "whd080t": "less_junk_fast",
    # NOTE(review): whq190 is not selected by full_query in this notebook, so
    # this entry has nothing to rename there.
    "whq190": "surgery",
    # dr1tot.* columns (special-diet questions)
    "drqsdiet": "special_diet_dr",
    "drqsdt1": "low_cal",
    "drqsdt2": "low_fat",
    "drqsdt3": "low_salt",
    "drqsdt4": "low_sugar",
    "drqsdt5": "low_fiber",
    "drqsdt6": "high_fiber",
    "drqsdt7": "diabetic_diet",
    "drqsdt8": "bulking",
    "drqsdt9": "low_carb_dr",
    "drqsdt10": "high_protein",
    "drqsdt11": "gluten_free",
    "drqsdt12": "renal_kidney",
    "drqsdt91": "other_diet",
}

In [234]:
full_df = full_df.rename(columns = colnames)

In [235]:
full_df

Unnamed: 0,id,data_cycle,age,gender,body_fat,self_image,try_to_lose,ate_less,low_cal_foods,ate_less_fat,...,high_protein,gluten_free,renal_kidney,other_diet,vig_work_days,mod_work_days,walk_bike_days,vig_rec_days,mod_rec_days,category
0,62161.0,7.0,22.0,M,24.3,3.0,,10.0,,,...,,,,,,,,,,Average
1,62164.0,7.0,44.0,F,25.9,3.0,,10.0,11.0,12.0,...,,,,,5.0,,,5.0,1.0,Average
2,62169.0,7.0,21.0,M,19.8,3.0,2.0,,,,...,,,,,,,,,,Average
3,62172.0,7.0,43.0,F,42.2,1.0,1.0,10.0,,12.0,...,,,,,,,,,,Obese
4,62179.0,7.0,55.0,M,27.6,1.0,2.0,,,,...,,,,,,,5.0,,,Obese
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11821,102935.0,10.0,27.0,F,33.7,3.0,1.0,10.0,11.0,,...,,,,,3.0,,4.0,1.0,1.0,Obese
11822,102944.0,10.0,55.0,M,33.4,1.0,1.0,,11.0,12.0,...,,,,,,5.0,,,,Obese
11823,102948.0,10.0,31.0,F,27.7,3.0,2.0,,,,...,,,,,,,,5.0,,Average
11824,102949.0,10.0,33.0,M,15.5,3.0,2.0,,,,...,,,,,3.0,7.0,7.0,,,Fitness


In [236]:
full_df.isnull().sum()

id                      0
data_cycle              0
age                     0
gender                  0
body_fat                0
self_image              0
try_to_lose           327
ate_less             1733
low_cal_foods        2019
ate_less_fat         2046
exercise             1592
skip_meals           2175
'diet'_foods         2344
liquid_diet          2362
wt_loss_program      2409
rx_diet_pills        2420
non-rx_diet_pills    2376
lax_vomit            2433
water                1713
special_diet_wh      2368
low_carb_wh          2073
smoke                2435
fruit_veg_salad      1777
change_eat_habits    1929
less_sugar           1879
less_junk_fast       1847
special_diet_dr       151
low_cal              2240
low_fat              2419
low_salt             2419
low_sugar            2442
low_fiber            2451
high_fiber           2450
diabetic_diet        2424
bulking              2444
low_carb_dr          2413
high_protein         2439
gluten_free          2447
renal_kidney

In [237]:
full_df = full_df.fillna(value=0)

In [238]:
full_df = full_df.replace(to_replace='NaN', value = 0)

### Consider which fields to binarize, as well as consolidating certain values that are immaterial (e.g. only 1 response where answer is 'Don't Know')

**Informational fields**
- ID: should stay unique
- Data Cycle: stay unique
- Age: keep with range, though should not use for analysis
- Gender: on purpose will not use for analysis. Gender was already used to target variable category (fit status). should not have bearing on analysis, and since this is not something that can be changed not needed for classification. One argument to include gender for analysis is that men or women might need to take different strategies, however, a generalized result would be better.
- Body fat: keep as float, will not be used for analysis
- Self-image: not relevant for analysis. Accidentally ran the model with this, and it is highly predictive (makes sense as those with better self image are likely to be in better shape.) But also, this is not useful as it is not an action that can be taken.

**For Analysis**
- Most fields should be binarized, they are Y/N questions
- For the number of days of exercise, these should be included as float. (While there is a different field for Y/N on exercise, I am opting for number of days as it naturally has more information and can be more predictive. For Bernoulli Naive Bayes, this can then be binarized just for that case)
- Category: This is the target variable and should stay as is

In [239]:
# Print each column name next to its distinct values, to help decide which
# fields can be binarized and which sentinel codes need cleaning first.
for col_name, col_values in full_df.items():
    print(col_name, col_values.unique())

id [ 62161.  62164.  62169. ... 102948. 102949. 102954.]
data_cycle ['7.0' '8.0' '9.0' '10.0']
age [22. 44. 21. 43. 55. 35. 26. 57. 42. 36. 28. 38. 31. 41. 32. 54. 19. 48.
 23. 58. 24. 34. 18. 47. 52. 27. 29. 33. 30. 56. 46. 37. 50. 40. 25. 20.
 53. 49. 39. 59. 51. 45.]
gender ['M' 'F']
body_fat [24.3 25.9 19.8 42.2 27.6 31.7 17.1 28.8 28.  26.4 31.  21.8 41.1 41.6
 49.7 24.  23.4 18.5 35.7 47.9 41.5 14.3 48.7 38.1 22.8 29.5 26.3 12.4
 32.2 24.4 49.8 20.  43.5 28.1 35.8 35.3 28.5 37.3 19.3 28.9 33.6 18.6
 34.4 25.  30.9 33.3 45.  29.8 25.3 30.  17.  18.2 45.8 37.  33.9 42.7
 27.4 25.7 18.4 42.5 33.2 44.6 19.2 28.2 31.6 18.9 32.9 43.4 42.8 30.4
 17.3 45.6 44.2 22.4 27.5 36.5 37.6 14.8 23.2 36.2 39.2 27.9 41.7 34.2
 28.3 19.9 29.  24.1 47.4 22.9 25.6 34.1 39.8 33.  39.5 20.5 36.1 33.1
 38.9 34.3 14.9 42.  35.2 27.1 26.6 31.8 22.7 49.2 13.  45.9 41.8 30.7
 43.1 42.4 34.7 19.6 15.7 40.1 39.3 47.5 32.1 39.1 36.8 39.9 43.7 43.
 32.8 31.2 19.1 26.  23.3 39.6 39.  16.4 38.7 40.7 35.5 36.3 20.8

### Data manipulation to prepare for analysis
prior to binarizing, need to manipulate certain data points

In [240]:
full_df.groupby('try_to_lose')['id'].nunique()

try_to_lose
0      1558
1.0    4122
2.0    6143
7.0       1
9.0       2
Name: id, dtype: int64

for those who "tried to lose weight", 1 means yes, 2 is No, 7 is refused to answer, 9 is don't know, and 0 is missing. 

A significant amount were missing. However, given that I am mostly interested in positive indicators if something exists or not. Willing to sacrifice a little bit of information here to binarize this field. Also, if they did not answer, then they likely were not trying to lose weight. and they would not have a response for any of the other weight loss survey questions (wh__)

In [241]:
# Fold "no" (2), "refused" (7) and "don't know" (9) into 0 so that only
# "yes" stays non-zero.
# The codes are matched as strings. NOTE(review): this relies on the column
# holding text such as '2.0' rather than floats -- confirm the dtype if the
# counts below do not change.
full_df['try_to_lose'] = full_df['try_to_lose'].replace(to_replace=['2.0','7.0','9.0'], value = 0)
# Re-check the distribution after the replacement.
full_df.groupby('try_to_lose')['id'].nunique()

try_to_lose
0      7704
1.0    4122
Name: id, dtype: int64

In [242]:
full_df.groupby('special_diet_dr')['id'].nunique()

special_diet_dr
0       571
1.0    1570
2.0    9639
9.0      46
Name: id, dtype: int64

for special_diet_dr, 1 means yes, 2 is no, 9 is don't know, and 0 is missing. Will convert the no and don't know responses into 0 (the same value already used for missing), since there are relatively few don't knows and I am mostly interested in positive indicators of whether something exists or not. Willing to sacrifice a little bit of information here to binarize this field.

In [243]:
# Fold "no" (2) and "don't know" (9) into 0 so that only "yes" stays non-zero.
# NOTE(review): codes are matched as strings ('2.0', '9.0'); this assumes the
# column holds text values -- confirm the dtype.
full_df['special_diet_dr'] = full_df['special_diet_dr'].replace(to_replace=['2.0','9.0'], value = 0)

In [244]:
full_df.groupby('special_diet_dr')['id'].nunique()

special_diet_dr
0      10256
1.0     1570
Name: id, dtype: int64

In [245]:
full_df.groupby('vig_work_days')['id'].nunique()

vig_work_days
0.0     9037
1.0      234
2.0      342
3.0      432
4.0      286
5.0      939
6.0      302
7.0      253
99.0       1
Name: id, dtype: int64

Because only one value is "don't know", will convert to 0

In [246]:
full_df['vig_work_days'] = full_df['vig_work_days'].replace(to_replace=[99], value = 0)
full_df.groupby('vig_work_days')['id'].nunique()

vig_work_days
0.0    9038
1.0     234
2.0     342
3.0     432
4.0     286
5.0     939
6.0     302
7.0     253
Name: id, dtype: int64

In [247]:
full_df.groupby('mod_work_days')['id'].nunique()

mod_work_days
0.0     7105
1.0      287
2.0      502
3.0      713
4.0      540
5.0     1669
6.0      421
7.0      583
77.0       1
99.0       5
Name: id, dtype: int64

Because only one value is "77/refuse", and 5 are"99/Don't Know", these are immaterial and will convert to 0

In [248]:
full_df['mod_work_days'] = full_df['mod_work_days'].replace(to_replace=[77,99], value = 0)
full_df.groupby('mod_work_days')['id'].nunique()

mod_work_days
0.0    7111
1.0     287
2.0     502
3.0     713
4.0     540
5.0    1669
6.0     421
7.0     583
Name: id, dtype: int64

In [249]:
full_df.groupby('vig_rec_days')['id'].nunique()

vig_rec_days
0.0     7972
1.0      515
2.0      785
3.0     1019
4.0      623
5.0      531
6.0      190
7.0      190
99.0       1
Name: id, dtype: int64

because only 1 value is "don't know", will convert to 0

In [250]:
full_df['vig_rec_days'] = full_df['vig_rec_days'].replace(to_replace=[77,99], value = 0)
full_df.groupby('vig_rec_days')['id'].nunique()

vig_rec_days
0.0    7973
1.0     515
2.0     785
3.0    1019
4.0     623
5.0     531
6.0     190
7.0     190
Name: id, dtype: int64

In [251]:
full_df.groupby('mod_rec_days')['id'].nunique()

mod_rec_days
0.0     6558
1.0      774
2.0     1158
3.0     1273
4.0      609
5.0      735
6.0      178
7.0      539
99.0       2
Name: id, dtype: int64

because only 2 values are "don't know", will convert to 0

In [252]:
full_df['mod_rec_days'] = full_df['mod_rec_days'].replace(to_replace=[77,99], value = 0)
full_df.groupby('mod_rec_days')['id'].nunique()

mod_rec_days
0.0    6560
1.0     774
2.0    1158
3.0    1273
4.0     609
5.0     735
6.0     178
7.0     539
Name: id, dtype: int64

In [278]:
full_df.groupby('walk_bike_days')['id'].nunique()

walk_bike_days
0.0     8308
1.0      186
2.0      344
3.0      467
4.0      315
5.0      963
6.0      181
7.0     1057
99.0       5
Name: id, dtype: int64

In [279]:
full_df['walk_bike_days'] = full_df['walk_bike_days'].replace(to_replace=[77,99], value = 0)
full_df.groupby('walk_bike_days')['id'].nunique()

walk_bike_days
0.0    8313
1.0     186
2.0     344
3.0     467
4.0     315
5.0     963
6.0     181
7.0    1057
Name: id, dtype: int64

In [281]:
full_df.columns

Index(['id', 'data_cycle', 'age', 'gender', 'body_fat', 'self_image',
       'try_to_lose', 'ate_less', 'low_cal_foods', 'ate_less_fat', 'exercise',
       'skip_meals', ''diet'_foods', 'liquid_diet', 'wt_loss_program',
       'rx_diet_pills', 'non-rx_diet_pills', 'lax_vomit', 'water',
       'special_diet_wh', 'low_carb_wh', 'smoke', 'fruit_veg_salad',
       'change_eat_habits', 'less_sugar', 'less_junk_fast', 'special_diet_dr',
       'low_cal', 'low_fat', 'low_salt', 'low_sugar', 'low_fiber',
       'high_fiber', 'diabetic_diet', 'bulking', 'low_carb_dr', 'high_protein',
       'gluten_free', 'renal_kidney', 'other_diet', 'vig_work_days',
       'mod_work_days', 'walk_bike_days', 'vig_rec_days', 'mod_rec_days',
       'category'],
      dtype='object')

## Binarize the Y/N fields

In [282]:
from sklearn.preprocessing import Binarizer

In [283]:
binarizer = Binarizer()

In [284]:
# Yes/no style survey fields to collapse to 0/1.
yes_no_cols = [
    'try_to_lose', 'ate_less', 'low_cal_foods', 'ate_less_fat', 'exercise',
    'skip_meals', "'diet'_foods", 'liquid_diet', 'wt_loss_program', 'rx_diet_pills',
    'non-rx_diet_pills', 'lax_vomit', 'water', 'special_diet_wh',
    'low_carb_wh', 'smoke', 'fruit_veg_salad', 'change_eat_habits',
    'less_sugar', 'less_junk_fast', 'special_diet_dr', 'low_cal', 'low_fat',
    'low_salt', 'low_sugar', 'low_fiber', 'high_fiber', 'diabetic_diet',
    'bulking', 'low_carb_dr', 'high_protein', 'gluten_free', 'renal_kidney',
    'other_diet',
]

# `binarizer` is a Binarizer() built with its default settings in the cell above.
bin_df = binarizer.transform(full_df[yes_no_cols])


In [285]:
bin_df.shape

(11826, 34)

In [286]:
# Wrap the binarized array back into a labelled DataFrame.
# Passing full_df's index (instead of letting pandas create a fresh 0..n-1
# index) keeps these rows aligned with full_df when the frames are joined
# column-wise with pd.concat(axis=1), even if full_df has been filtered or
# re-indexed beforehand.
bin_df = pd.DataFrame(bin_df, index = full_df.index,
        columns = ['try_to_lose','ate_less', 'low_cal_foods', 'ate_less_fat', 'exercise',
        'skip_meals', "'diet'_foods", 'liquid_diet', 'wt_loss_program', 'rx_diet_pills',
        'non-rx_diet_pills', 'lax_vomit', 'water', 'special_diet_wh',
        'low_carb_wh', 'smoke', 'fruit_veg_salad', 'change_eat_habits',
        'less_sugar', 'less_junk_fast', 'special_diet_dr', 'low_cal', 'low_fat',
        'low_salt', 'low_sugar', 'low_fiber', 'high_fiber', 'diabetic_diet',
        'bulking', 'low_carb_dr', 'high_protein', 'gluten_free', 'renal_kidney',
        'other_diet'])

In [287]:
bin_df.describe()

Unnamed: 0,try_to_lose,ate_less,low_cal_foods,ate_less_fat,exercise,skip_meals,'diet'_foods,liquid_diet,wt_loss_program,rx_diet_pills,...,low_sugar,low_fiber,high_fiber,diabetic_diet,bulking,low_carb_dr,high_protein,gluten_free,renal_kidney,other_diet
count,11826.0,11826.0,11826.0,11826.0,11826.0,11826.0,11826.0,11826.0,11826.0,11826.0,...,11826.0,11826.0,11826.0,11826.0,11826.0,11826.0,11826.0,11826.0,11826.0,11826.0
mean,0.348554,0.279554,0.154152,0.145019,0.331304,0.084052,0.040166,0.026044,0.020125,0.015052,...,0.004566,8.5e-05,0.000846,0.012261,0.002621,0.012092,0.004313,0.002452,0.000761,0.005243
std,0.476533,0.448799,0.36111,0.352135,0.470702,0.277478,0.196356,0.159274,0.140434,0.121763,...,0.067422,0.009196,0.029068,0.110054,0.051134,0.109301,0.065531,0.049461,0.027578,0.072219
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Recombine the data set

In [288]:
# Columns taken from full_df unchanged: the informational fields go in front of
# the binarized block, the day-count fields and the target go after it.
info_cols = ['id', 'data_cycle', 'age', 'gender', 'body_fat', 'self_image']
activity_and_target_cols = [
    'vig_work_days', 'mod_work_days', 'walk_bike_days',
    'vig_rec_days', 'mod_rec_days', 'category',
]

frames = [
    full_df[info_cols],
    bin_df,
    full_df[activity_and_target_cols],
]

In [289]:
final_df = pd.concat(frames, axis=1,sort=False)

In [290]:
final_df

Unnamed: 0,id,data_cycle,age,gender,body_fat,self_image,try_to_lose,ate_less,low_cal_foods,ate_less_fat,...,high_protein,gluten_free,renal_kidney,other_diet,vig_work_days,mod_work_days,walk_bike_days,vig_rec_days,mod_rec_days,category
0,62161.0,7.0,22.0,M,24.3,3.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Average
1,62164.0,7.0,44.0,F,25.9,3.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,5.0,0.0,0.0,5.0,1.0,Average
2,62169.0,7.0,21.0,M,19.8,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Average
3,62172.0,7.0,43.0,F,42.2,1.0,1.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Obese
4,62179.0,7.0,55.0,M,27.6,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,Obese
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11821,102935.0,10.0,27.0,F,33.7,3.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,3.0,0.0,4.0,1.0,1.0,Obese
11822,102944.0,10.0,55.0,M,33.4,1.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,Obese
11823,102948.0,10.0,31.0,F,27.7,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,Average
11824,102949.0,10.0,33.0,M,15.5,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,7.0,7.0,0.0,0.0,Fitness


In [291]:
with open('data/final_df.pickle', 'wb') as f:
    pickle.dump(final_df, f)

In [277]:
final_df.pivot_table(index=['data_cycle','try_to_lose','ate_less'],columns='category',aggfunc='count')['id']

Unnamed: 0_level_0,Unnamed: 1_level_0,category,Athlete,Average,Fitness,Obese
data_cycle,try_to_lose,ate_less,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10.0,0.0,0.0,25.0,357.0,115.0,844.0
10.0,0.0,1.0,,25.0,4.0,168.0
10.0,1.0,0.0,1.0,52.0,4.0,335.0
10.0,1.0,1.0,2.0,38.0,4.0,478.0
7.0,0.0,0.0,27.0,538.0,151.0,1096.0
7.0,0.0,1.0,1.0,35.0,,197.0
7.0,1.0,0.0,,61.0,5.0,317.0
7.0,1.0,1.0,,54.0,6.0,477.0
8.0,0.0,0.0,17.0,572.0,139.0,1222.0
8.0,0.0,1.0,,23.0,1.0,222.0


Note: for the "Weight History" questionnaire there is an initial question, "try_to_lose weight (WHQ070)", that is supposed to prompt follow-up questions ONLY if the answer is yes. However, from the data it appears that some who answered No/Refused/Don't know/Missing did have a response for a follow-up question (e.g. exercise). While this does mean we may be missing some responses, for these actions that likely have a health benefit or are neutral, missing responses may not be fatal to the analysis. We can still identify actions that have an impact, though we may understate the impact of actions that have missing data.

In [271]:
full_df = full_df.fillna(0)

In [275]:
# NaN is spelled with the built-in float so this cell does not need numpy:
# no `import numpy as np` is visible in this notebook, so `np.nan` would raise
# NameError on a fresh kernel.
full_df = full_df.replace(float("nan"),0)

In [276]:
full_df = full_df.replace('NaN',0)