## Data preprocessing and filtering
Load all data from NHNES dataset (https://www.kaggle.com/datasets/cdc/national-health-and-nutrition-examination-survey/code?datasetId=731&sortBy=voteCount and https://wwwn.cdc.gov/Nchs/Nhanes/Search/variablelist.aspx?Component=Dietary&CycleBeginYear=2013)

In [32]:
import pandas as pd
import numpy as np

# NOTE: Medications has a weird multiple issue for SEQN number (participant ID, essentially)
#  - for now, let's not use it
# medications = pd.read_csv('data/NHNES/medications.csv', encoding = "ISO-8859-1")

demographic = pd.read_csv('data/NHNES/demographic.csv', encoding = "ISO-8859-1")
diet = pd.read_csv('data/NHNES/diet.csv', encoding = "ISO-8859-1")
examination = pd.read_csv('data/NHNES/examination.csv', encoding = "ISO-8859-1")
labs = pd.read_csv('data/NHNES/labs.csv', encoding = "ISO-8859-1")
questionnaire = pd.read_csv('data/NHNES/questionnaire.csv', encoding = "ISO-8859-1")

In [33]:
# Merge dataframes
temp1 = demographic.merge(diet, on='SEQN')
temp2 = temp1.merge(examination, on='SEQN')
temp3 = temp2.merge(labs, on='SEQN')
unfiltered_data = temp3.merge(questionnaire, on='SEQN')

In [34]:
rename_map_intake = {
    "DR1TKCAL": "total_calories",
    "DR1TPROT": "total_protein",
    "DR1TCARB": "total_carbohydrates",
    "DR1TSUGR": "total_sugar",
    "DR1TFIBE": "total_fiber",
    "DR1TTFAT": "total_fat",
    "DR1TCHOL": "total_cholesterol",
    "DR1TCALC": "total_calcium",
    "DR1TCOPP": "total_copper",
    "DR1TIRON": "total_iron",
    "DR1TLYCO": "total_lycopene",
    "DR1TMAGN": "total_magnesium",
    "DR1TMFAT": "total_monounsaturated_fat",
    "DR1TPFAT": "total_polyunsaturated_fat",
    "DR1TSODI": "total_sodium",
    "DR1TPOTA": "total_potassium",
    "DR1TVC": "total_vitamin_c",
    "DR1TVD": "total_vitamin_d",
    "DR1TVK": "total_vitamin_k",
    "DR1TATOC": "total_vitamin_e",
    "DR1TVB1": "total_vitamin_b1",
    "DR1TVB2": "total_vitamin_b2",
    "DR1TVB6": "total_vitamin_b6",
    "DR1TVB12": "total_vitamin_b12",
    "DR1TFOLA": "total_folate",
    "DR1TZINC": "total_zinc",
    "DR1TSELE": "total_selenium",
    "DR1TCAFF": "total_caffeine",
    "DR1TALCO": "total_alcohol",
    "DR1.320Z": "total_water",
}

rename_map_diet = {
    "DRQSDT1": "diet_weight_loss_low_calorie_diet",
    "DRQSDT2": "diet_low_fat_diet",
    "DRQSDT3": "diet_low_salt_diet",
    "DRQSDT4": "diet_low_sugar_diet",
    "DRQSDT5": "diet_low_fiber_diet",
    "DRQSDT6": "diet_high_fiber_diet",
    "DRQSDT7": "diet_diabetic",
    "DRQSDT8": "diet_weight_gain_muscle",
    "DRQSDT9": "diet_low_carb",
    "DRQSDT10": "diet_high_protein",
    "DRQSDT11": "diet_celiac",
    "DRQSDT91": "diet_other",
}

rename_map_demographics = {
    "RIDAGEYR": "age",
    "RIAGENDR": "gender",
    "RIDEXPRG": "pregnant",
    "DMDEDUC2": "education_1",
    "DMDEDUC3": "education_2",
    "DMDFMSIZ": "family_size",
    "DMDHHSIZ": "household_size",
    "DMDMARTL": "marital_status",
    "DMQMILIZ": "military",
    "INDFMIN2": "income",
    "INDFMPIR": "poverty_index",
    "INDHHIN2": "household_income",
    "RIDRETH3": "race",
    "DMDBORN4": "birthplace",
}

rename_map_body_measurements = {
    "BMXWT": "weight",
    "BMXHT": "height",
    "BMXBMI": "bmi",
    "BPQ020": "blood_pressure",
    "BPQ080": "high_cholesterol",
}

rename_map_questionnaire = {
    "HSD010": "healthy",
    "SMD641": "smoke_30_days",
    "SMQ670": "trying_to_quit",
    'DUQ370' : 'needle_drugs',
    "CBQ505": "fast_food",
    "PAQ677": "exercise",
    "ALQ130": "alcohol_frequency",
    'IND235' : 'monthly_income',
    'INQ244' : 'family_savings',
    'CBD070' : 'grocery_budget',
    'CBD090' : 'nonfood_budget',
    'CBD110' : 'food_budget',
    'CBD120' : 'restaurant_budget',
    'CBD130' : 'food_delivery_budget',
    'CBQ550' : 'eat_restaurants',
    'CBQ552' : 'eat_chain_restaurants',
    'CDQ010' : 'short_breath_stairs',
    'DBD100' : 'salt_frequency',
    'DBD895' : 'meals_not_homemade',
    'DBD900' : 'meals_fast_food',
    'DBD905' : 'meals_prepackaged',
    'DBD910' : 'frozen_meals_per_month',
    'FSD032A' : 'food_insecure',
    'FSD032B' : 'not_enough_food',
    'FSDHH' : 'household_food_secure',
    'DBQ095Z' : 'salt_type',
    'DBQ197' : 'milk_product_per_month',
    'DBQ229' : 'milk_drinker',
    'DBQ700' : 'healthy_diet',
    'DIQ010' : 'diabetes',
    'DIQ050' : 'taking_insulin',
    'DIQ160' : 'prediabetes',
    'DIQ170' : 'diabetes_risk',
    'DIQ172' : 'diabetes_concern',
    'DIQ180' : 'blood_test_3y',
    'DLQ010' : 'deaf',
    'DLQ020' : 'blind',
    'DPQ100' : 'depression_difficulty',
    'HSQ590' : 'hiv',
    'SXD021' : 'sex_ever',
    'URXUCR' : 'creatinine_urine',
    'WHD010' : 'height_in',
    'WHD020' : 'current_weight_lb',
    'WHD120' : 'weight_age_25',
    'WHD140' : 'greatest_weight',
    'WHQ030' : 'overweight_self',
    'WHQ040' : 'weightloss_desire',
    'WHQ070' : 'weightloss_attempt',
    'WHQ150' : 'age_when_heaviest',
    'SLD010H' : 'sleep_hours',
    'SLQ050' : 'trouble_sleeping',
    'SLQ060' : 'sleep_disorder',
    'PAD680' : 'sedentary_time',
    'PAQ605' : 'vigorous_work',
    'PAQ620' : 'moderate_work',
    'PAQ635' : 'walk_or_bike',
    'PAQ650' : 'vigorous_recreation',
    'PAQ665' : 'moderate_recreation',
    'PAQ710' : 'tv_hours',
    'PAQ715' : 'pc_hours',
}

rename_map_health = {
    "MCQ220": "cancer",
    'MCQ010' : 'asthma_ever',
    'MCQ025' : 'asthma_age',
    'MCQ035' : 'asthma',
    'MCQ040' : 'asthma_year',
    'MCQ050' : 'asthma_ER',
    'MCQ053' : 'anemia',
    'MCQ070' : 'psoriasis',
    'MCQ080' : 'overweight',
    'MCQ082' : 'celiac_disease',
    'MCQ086' : 'gluten_free',
    'MCQ092' : 'blood_transfusion',
    'MCQ149' : 'menstruate',
    'MCQ151' : 'menstruate_age',
    'MCQ160A' : 'arthritis',
    'MCQ160B' : 'congestive_heart_failure',
    'MCQ160C' : 'coronary_heart_disease',
    'MCQ160D' : 'angina',
    'MCQ160E' : 'heart_attack',
    'MCQ160F' : 'stroke',
    'MCQ160G' : 'emphysema',
    'MCQ160K' : 'bronchitis_ever',
    'MCQ160L' : 'liver_condition_ever',
    'MCQ160M' : 'thyroid_ever',
    'MCQ160N' : 'gout',
    'MCQ160O' : 'COPD',
    'MCQ170K' : 'bronchitis_now',
    'MCQ170L' : 'liver_condition',
    'MCQ170M' : 'thyroid_now',
    'MCQ180A' : 'arthritis_age',
    'MCQ180B' : 'heart_failure_age',
    'MCQ180C' : 'heart_disease_age',
    'MCQ180D' : 'angina_age',
    'MCQ180E' : 'heart_attack_age',
    'MCQ180F' : 'stroke_age',
    'MCQ180G' : 'emphysema_age',
    'MCQ180K' : 'bronchitis_age',
    'MCQ180L' : 'liver_condition_age',
    'MCQ180M' : 'thyroid_age',
    'MCQ180N' : 'gout_age',
    'MCQ195' : 'arthritis_type',
    'MCQ203' : 'jaundice',
    'MCQ206' : 'jaundice_age',
    'MCQ220' : 'cancer',
    'MCQ230A' : 'cancer_type1',
    'MCQ230B' : 'cancer_type2',
    'MCQ230C' : 'cancer_type3',
    'MCQ230D' : 'cancer_type4',
    'MCQ240A' : 'bladder_cancer_age',
    'MCQ240AA' : 'test_cancer_age',
    'MCQ240B' : 'blood_cancer_age',
    'MCQ240BB' : 'thyroid_cancer_age',
    'MCQ240C' : 'bone_cancer_age',
    'MCQ240CC' : 'uterine_cancer_age',
    'MCQ240D' : 'brain_cancer_age',
    'MCQ240DK' : 'cancer_age',
    'MCQ240E' : 'breast_cancer_age',
    'MCQ240F' : 'cervical_cancer_age',
    'MCQ240G' : 'colon_cancer_age',
    'MCQ240H' : 'esoph_cancer_age',
    'MCQ240I' : 'gallbladder_cancer_age',
    'MCQ240J' : 'kidney_cancer_age',
    'MCQ240K' : 'larynx_cancer_age',
    'MCQ240L' : 'leukemia_age',
    'MCQ240M' : 'liver_cancer_age',
    'MCQ240N' : 'lung_cancer_age',
    'MCQ240O' : 'lymphoma_age',
    'MCQ240P' : 'melanoma_age',
    'MCQ240Q' : 'mouth_cancer_age',
    'MCQ240R' : 'nervous_cancer_age',
    'MCQ240S' : 'ovarian_cancer_age',
    'MCQ240T' : 'pancreatic_cancer_age',
    'MCQ240U' : 'prostate_cancer_age',
    'MCQ240V' : 'rectal_cancer_age',
    'MCQ240X' : 'skin_cancer_age',
    'MCQ240Y' : 'soft_cancer_age',
    'MCQ240Z' : 'stomach_cancer_age',
    'MCQ300A' : 'relative_heart_attack',
    'MCQ300B' : 'relative_asthma',
    'MCQ300C' : 'relative_diabetes',
    'MCQ365A' : 'need_weight_loss',
    'MCQ365B' : 'need_exercise',
    'MCQ365C' : 'need_reduce_salt',
    'MCQ365D' : 'need_reduce_calories',
    'MCQ370A' : 'losing_weight',
    'MCQ370B' : 'excercising',
    'MCQ370C' : 'reducing_salt',
    'MCQ370D' : 'reducing_fat',
}

In [35]:
# At minimum, participants need to have reported their total calories
filtered_data = unfiltered_data[unfiltered_data['DR1TKCAL'].notnull()]

# Rename columns
filtered_data = filtered_data.rename(columns=rename_map_intake)
filtered_data = filtered_data.rename(columns=rename_map_diet)
filtered_data = filtered_data.rename(columns=rename_map_demographics)
filtered_data = filtered_data.rename(columns=rename_map_body_measurements)
filtered_data = filtered_data.rename(columns=rename_map_questionnaire)
filtered_data = filtered_data.rename(columns=rename_map_health)

# Drop all columns but the renamed ones
column_list = list(rename_map_intake.values()) + \
    list(rename_map_diet.values()) + \
        list(rename_map_demographics.values()) + \
            list(rename_map_body_measurements.values()) + \
                list(rename_map_questionnaire.values()) + \
                    list(rename_map_health.values())

filtered_data = filtered_data[column_list]

# Simple model
Demonstrates relevance of the data by building a simple model to predict overweight status

In [51]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report


X_train, X_test, y_train, y_test = train_test_split(filtered_data.drop('overweight', axis=1), filtered_data['overweight'], test_size=0.2, random_state=42)

model = RandomForestClassifier()
imputer = SimpleImputer(strategy='constant', fill_value=-1)
imputer.fit(X_train)
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)
y_test = y_test.fillna(-1)
y_train = y_train.fillna(-1)

model.fit(X_train, y_train)
predictions = model.predict(X_test)

print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

        -1.0       1.00      1.00      1.00       583
         1.0       0.77      0.74      0.75       372
         2.0       0.87      0.89      0.88       752

    accuracy                           0.89      1707
   macro avg       0.88      0.87      0.88      1707
weighted avg       0.89      0.89      0.89      1707



In [54]:
filtered_data.to_csv('data/filtered_data.csv', index=False)