## Diet Exploration
This notebook builds and saves a clean dataset of diet variables and analyzes their effects on dementia.

In [11]:
# Data Management/Investigation
import pandas as pd
import numpy as np
import missingno as miss
from plotnine import *
import matplotlib.pyplot as plt
import pylab as pl
import warnings
warnings.filterwarnings("ignore")

# For pre-processing data 
from sklearn import preprocessing as pp 
from sklearn.compose import ColumnTransformer 

# For splits and CV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold # Cross validation 
from sklearn.model_selection import cross_validate # Cross validation 
from sklearn.model_selection import GridSearchCV # Cross validation + param. tuning.

# Machine learning methods 
from sklearn.linear_model import LinearRegression as LM
from sklearn.neighbors import KNeighborsRegressor as KNN
from sklearn.tree import DecisionTreeRegressor as DTree
from sklearn import tree # For plotting the decision tree rules
from sklearn.ensemble import BaggingRegressor as Bag
from sklearn.ensemble import RandomForestRegressor as RF

# For evaluating our model's performance
import sklearn.metrics as m

# Pipeline to combine modeling elements
from sklearn.pipeline import Pipeline

In [16]:
# Read the cleaned data and keep only the rows that have no missing values.
dta = pd.read_csv('data/clean_data.csv').dropna()

# Second input file; it is loaded here but not referenced in the cells shown below.
dta_dummy = pd.read_csv('data/dummy_data.csv')

# Show the available column names.
dta.columns

Index(['Participant_ID', 'Category', 'Age_yrs', 'Age_category',
       'Education_yrs', 'Education_category', 'Sex', 'Marital_status',
       'Income', 'Occupation_class', 'Living_area', 'BP_sistol', 'BP_diastol',
       'BMI', 'GDS', 'Glucose', 'Triglyceride', 'HDL', 'Hypertension',
       'BMI_category', 'Diabetes', 'Tri_200', 'HDL_40', 'Smoking_status',
       'Stroke', 'Depression', 'INA_AD8', 'ADL', 'AMT', 'Intellectual_1',
       'Intellectual_2', 'Intellectual_3', 'Intellectual_4', 'Intellectual_5',
       'Intellectual_6', 'Intellectual_7', 'Intellectual_8', 'Social_1',
       'Social_2', 'Social_3', 'Social_4', 'Social_5', 'Social_6', 'Social_7',
       'Social_8', 'Social_9', 'Recreational_1', 'Recreational_2',
       'Recreational_3', 'Recreational_4', 'Recreational_5', 'Recreational_6',
       'Recreational_7', 'Recreational_8', 'Physical_1', 'Physical_2',
       'Physical_3', 'Physical_4', 'Physical_5', 'Intellectually_active',
       'Socially_active', 'Recreationally_act

In [24]:
# One-hot encode the diet frequency categories. Each listed column is replaced
# by one indicator column per level, named "<column>_<level>".
diet_category_cols = [
    'Carbo_category',
    'Protein_category',
    'Vegetable_category',
    'Fruit_category',
    'Salted_fish_category',
    'Instant_noodle_category',
    'Tempe_category',
]

# dtype=int pins the indicators to 0/1 integers. Without it, pandas >= 2.0
# produces bool columns, which would be written to CSV as True/False rather
# than the 1/0 values this notebook displays and exports.
# NOTE: this cell consumes the original category columns, so re-running it
# without first re-running the load cell raises a KeyError.
dta = pd.get_dummies(dta, columns=diet_category_cols, dtype=int)

In [25]:
# Inspect the column names after one-hot encoding the diet categories.
dta.columns

Index(['Participant_ID', 'Category', 'Age_yrs', 'Age_category',
       'Education_yrs', 'Education_category', 'Sex', 'Marital_status',
       'Income', 'Occupation_class', 'Living_area', 'BP_sistol', 'BP_diastol',
       'BMI', 'GDS', 'Glucose', 'Triglyceride', 'HDL', 'Hypertension',
       'BMI_category', 'Diabetes', 'Tri_200', 'HDL_40', 'Smoking_status',
       'Stroke', 'Depression', 'INA_AD8', 'ADL', 'AMT', 'Intellectual_1',
       'Intellectual_2', 'Intellectual_3', 'Intellectual_4', 'Intellectual_5',
       'Intellectual_6', 'Intellectual_7', 'Intellectual_8', 'Social_1',
       'Social_2', 'Social_3', 'Social_4', 'Social_5', 'Social_6', 'Social_7',
       'Social_8', 'Social_9', 'Recreational_1', 'Recreational_2',
       'Recreational_3', 'Recreational_4', 'Recreational_5', 'Recreational_6',
       'Recreational_7', 'Recreational_8', 'Physical_1', 'Physical_2',
       'Physical_3', 'Physical_4', 'Physical_5', 'Intellectually_active',
       'Socially_active', 'Recreationally_act

In [28]:
# Participant identifier, the Category label (Normal / Dementia in the output
# below), and the age and education columns.
id_and_covariate_cols = ['Participant_ID', 'Category', 'Age_yrs', 'Education_yrs']

# "Frequent" indicator column for each one-hot encoded diet category.
frequent_diet_cols = [
    'Carbo_category_Frequent',
    'Protein_category_Frequent',
    'Vegetable_category_Frequent',
    'Fruit_category_Frequent',
    'Salted_fish_category_Frequent',
    'Instant_noodle_category_Frequent',
    'Tempe_category_Frequent',
]

# Keep only the selected columns, in the order listed above.
dta_diet = dta[id_and_covariate_cols + frequent_diet_cols]
dta_diet

Unnamed: 0,Participant_ID,Category,Age_yrs,Education_yrs,Carbo_category_Frequent,Protein_category_Frequent,Vegetable_category_Frequent,Fruit_category_Frequent,Salted_fish_category_Frequent,Instant_noodle_category_Frequent,Tempe_category_Frequent
37,80042,Normal,65.0,4.0,1,0,0,0,1,1,1
39,80044,Normal,65.0,6.0,1,0,0,0,1,1,1
40,80045,Normal,72.0,6.0,0,0,0,0,1,1,1
43,80048,Normal,68.0,6.0,1,0,1,0,1,1,0
47,80054,Dementia,73.0,3.0,1,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...
647,80700,Normal,65.0,6.0,0,0,1,0,1,0,1
648,80701,Normal,68.0,2.0,1,0,1,0,0,1,1
649,80702,Dementia,81.0,2.0,1,0,1,0,1,1,1
673,80726,Dementia,75.0,3.0,1,1,1,0,0,1,1


In [30]:
# Write the diet subset to disk; index=False leaves the row index out of the file.
diet_csv_path = r'data/diet_data.csv'
dta_diet.to_csv(diet_csv_path, index=False)