In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.decomposition import PCA
from feature_engine.outliers import Winsorizer
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.impute import KNNImputer

0.  impute missing values -- yes or no, if no drop
1.  drop persistent outliers -- yes, no
2.  log transform y -- yes or no
3.  reduce y range -- yes or no
4.  use transformed nutrients -- yes or no
5.  PCA -- yes or no
6.  Winsorize X -- yes or no
7.  include chronic conditions -- yes or no
8.  include alcohol and tobacco -- yes or no
9.  transformed alcohol -- yes or no
10.  transformed physical activity -- yes or no

Want to save model number, r2, adjusted r2, training mae, test mae, training rmse, test rmse


To get an idea of what to do for the model, I'm training ten models picked randomly
1.  Drop missing values, drop persistent outliers, log transform y, use transformed nutrients, PCA, no winsorization, no chronic conditions, include alcohol and tobacco, don't use transformed alcohol, use transformed physical activity
2.  Drop missing values, don't drop outliers, don't log transform y, reduce y range, use transformed nutrients, no PCA, winsorize x values, include chronic conditions, dont include alcohol and tobacco, transformed physical activity.
3.  drop missing values, drop outliers, don't log transform, reduce y range, use transformed nutrients, use PCA, no winsorizatiion, no chronic conditions, include alcohol and tobacco, include transformed alcohol, don't transform physical activity.
4. impute missing values, don't drop outliers, don't log transform y, reduce y range, use transformed nutrients, no PCA, winsorize x, include chronic conditions, don't include alcohol and tobacco, don't transform physical activity
5. impute missing values, drop outliers, log transform y, don't use transformed nutrients, no PCA, don't winsorize, include chronic conditions, include alcohol and tobacco, use transformed alcohol, don't transform physical activity
6.  drop missing values, drop outliers, don't log transform y, reduce y range, don't use transformed nutrients, PCA, winsorize X, include chronic conditions, don't include alcohol and tobacco, don't use transformed activity
7. impute missing values, drop persistent outliers, don't log transform y, reduce y range, use transformed nutrients, no PCA, no winsorization, don't include chronic conditions, include alcohol and tobacco, use transformed alcohol, don't transform physical activity
8. drop missing values, drop persistent outliers, don't log transform y, don't reduce range, use transformed nutrients, use PCA, winsorize X, don't include chronic conditions, don't include alcohol and tobacco, don't transform physicial activity


could add use height and weight instead of bmi
include irregular pulse

In [7]:
df_missing = pd.read_csv('../Data/df_transformed.csv')
df_missing.drop('Unnamed: 0',axis=1,inplace=True)

In [6]:
df_imputed = pd.read_csv('../Data/dfi_transformed.csv')
df_imputed.drop('Unnamed: 0',axis=1,inplace=True)

In [8]:
df_missing.columns

Index(['LBXTC', 'RIAGENDR', 'RIDRETH3', 'RIDAGEYR', 'BMXWT', 'BMXHT', 'BMXBMI',
       'BPXPLS', 'BPXPULS', 'DR1TKCAL', 'DR1TPROT', 'DR1TCARB', 'DR1TSUGR',
       'DR1TFIBE', 'DR1TTFAT', 'DR1TSFAT', 'DR1TMFAT', 'DR1TPFAT', 'DR1TCHOL',
       'DR1TSODI', 'DR1TVD', 'DR1TCALC', 'DR1TIRON', 'DR1TPOTA', 'DR1_300',
       'ALQ120Q', 'ALQ120U', 'ALQ130', 'BPQ020', 'BPQ050A', 'BPQ080',
       'BPQ100D', 'DIQ010', 'MCQ170M', 'SMQ681', 'PAQ610', 'PAD615', 'PAQ625',
       'PAD630', 'PAQ655', 'PAD660', 'PAQ670', 'PAD675', 'Systolic',
       'Diastolic', 'outlier', 'MET_work', 'MET_rec', 'DR1TKCAL_t',
       'DR1TPROT_t', 'DR1TCARB_t', 'DR1TSUGR_t', 'DR1TFIBE_t', 'DR1TTFAT_t',
       'DR1TSFAT_t', 'DR1TMFAT_t', 'DR1TPFAT_t', 'DR1TCHOL_t', 'DR1TSODI_t',
       'DR1TVD_t', 'DR1TCALC_t', 'DR1TIRON_t', 'DR1TPOTA_t', 'avgALC',
       'log_LBXTC'],
      dtype='object')

In [9]:
df_imputed.columns

Index(['LBXTC', 'RIAGENDR', 'RIDRETH3', 'RIDAGEYR', 'BMXWT', 'BMXHT', 'BMXBMI',
       'BPXPLS', 'BPXPULS', 'DR1TKCAL', 'DR1TPROT', 'DR1TCARB', 'DR1TSUGR',
       'DR1TFIBE', 'DR1TTFAT', 'DR1TSFAT', 'DR1TMFAT', 'DR1TPFAT', 'DR1TCHOL',
       'DR1TSODI', 'DR1TVD', 'DR1TCALC', 'DR1TIRON', 'DR1TPOTA', 'DR1_300',
       'ALQ120Q', 'ALQ120U', 'ALQ130', 'BPQ020', 'BPQ050A', 'BPQ080',
       'BPQ100D', 'DIQ010', 'MCQ170M', 'SMQ681', 'PAQ610', 'PAD615', 'PAQ625',
       'PAD630', 'PAQ655', 'PAD660', 'PAQ670', 'PAD675', 'Systolic',
       'Diastolic', 'outlier', 'BMXWT_i', 'BMXHT_i', 'BMXBMI_i', 'BPXPLS_i',
       'BPXPULS_i', 'DR1_300_i', 'ALQ120Q_i', 'ALQ120U_i', 'ALQ130_i',
       'BPQ020_i', 'BPQ050A_i', 'BPQ080_i', 'BPQ100D_i', 'DIQ010_i',
       'MCQ170M_i', 'SMQ681_i', 'PAQ610_i', 'PAD615_i', 'PAQ625_i', 'PAD630_i',
       'PAQ655_i', 'PAD660_i', 'PAQ670_i', 'PAD675_i', 'Systolic_i',
       'Diastolic_i', 'MET_work', 'MET_rec', 'DR1TKCAL_t', 'DR1TPROT_t',
       'DR1TCARB_t', 'DR1TSUGR_t

In [10]:
Models = {
    'number':[],
    'r2':[],
    'adjusted r2':[],
    'training MAE':[],
    'testing MAE':[],
    'training RMSE':[],
    'testing RMSE':[]
}

Model 1
Drop missing values, drop persistent outliers, log transform y, use transformed nutrients, PCA, no winsorization, no chronic conditions, include alcohol and tobacco, don't use transformed alcohol, use transformed physical activity

In [11]:
df = df_missing[df_missing.outlier==0].copy()

nutrients = ['DR1TKCAL_t', 'DR1TPROT_t',
       'DR1TCARB_t', 'DR1TSUGR_t', 'DR1TFIBE_t', 'DR1TTFAT_t', 'DR1TSFAT_t',
       'DR1TMFAT_t', 'DR1TPFAT_t', 'DR1TCHOL_t', 'DR1TSODI_t', 'DR1TVD_t',
       'DR1TCALC_t', 'DR1TIRON_t', 'DR1TPOTA_t']
alc_tob = ['ALQ120Q', 'ALQ120U', 'ALQ130','SMQ681']
variables = nutrients + alc_tob + ['MET_rec','log_LBXTC','RIAGENDR', 'RIDRETH3', 'RIDAGEYR','Systolic',
       'Diastolic','BMXWT', 'BMXHT','BPXPLS']
df = df[variables].dropna()
y = df.log_LBXTC
X = df.drop('log_LBXTC',axis=1)
numerical_variables = ['DR1TKCAL_t', 'DR1TPROT_t',
       'DR1TCARB_t', 'DR1TSUGR_t', 'DR1TFIBE_t', 'DR1TTFAT_t', 'DR1TSFAT_t',
       'DR1TMFAT_t', 'DR1TPFAT_t', 'DR1TCHOL_t', 'DR1TSODI_t', 'DR1TVD_t',
       'DR1TCALC_t', 'DR1TIRON_t', 'DR1TPOTA_t','ALQ120Q','ALQ130','MET_rec','RIDAGEYR','Systolic',
       'Diastolic','BMXWT', 'BMXHT','BPXPLS']
categorical_variables = ['ALQ120U','SMQ681','RIAGENDR', 'RIDRETH3']



In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=675, test_size=0.2)

In [15]:
numeric_transformer = Pipeline(steps=[('scalar',MinMaxScaler())])
cat_transformer = Pipeline(steps=[('one_hot',OneHotEncoder(handle_unknown='ignore'))])
preprocessor = ColumnTransformer(transformers=[
    ('num',numeric_transformer,numerical_variables),
    ('cat',cat_transformer,categorical_variables)])
knn1 = Pipeline(steps =[
    ('preprocessor',preprocessor),
    ('PCA',PCA(0.95)),
    ('kneighbors', KNeighborsRegressor())])

In [17]:
knn1.fit(X_train,y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scalar',
                                                                   MinMaxScaler())]),
                                                  ['DR1TKCAL_t', 'DR1TPROT_t',
                                                   'DR1TCARB_t', 'DR1TSUGR_t',
                                                   'DR1TFIBE_t', 'DR1TTFAT_t',
                                                   'DR1TSFAT_t', 'DR1TMFAT_t',
                                                   'DR1TPFAT_t', 'DR1TCHOL_t',
                                                   'DR1TSODI_t', 'DR1TVD_t',
                                                   'DR1TCALC_t', 'DR1TIRON_t',
                                                   'DR1TPOTA_t', 'ALQ120Q',
                                                   'ALQ130', 'MET_rec',
                                                

In [21]:
Models['number'].append(1)
r2 = knn1.score(X_train,y_train)
Models['r2'].append(r2)

In [24]:
Models['adjusted r2'].append(((1-r2)/(X_train.shape[0]-1))/(X_train.shape[0]-X_train.shape[1]-1))

In [25]:
training_preds = knn1.predict(X_train)

In [26]:
ypreds = knn1.predict(X_test)

In [27]:
Models['training MAE'].append(mean_absolute_error(y_train,training_preds))
Models['testing MAE'].append(mean_absolute_error(y_test,ypreds))
Models['training RMSE'].append(mean_squared_error(y_train,training_preds,squared=False))
Models['testing RMSE'].append(mean_squared_error(y_test,ypreds,squared=False))

In [28]:
Models

{'number': [1],
 'r2': [0.27477294125010665],
 'adjusted r2': [8.903822887199862e-08],
 'training MAE': [0.14642303850119112],
 'testing MAE': [0.17462034466022128],
 'training RMSE': [0.18658239612019176],
 'testing RMSE': [0.22191043832633375]}