In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, FunctionTransformer
from sklearn.decomposition import PCA
from feature_engine.outliers import Winsorizer
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.impute import KNNImputer

0.  impute missing values -- yes or no, if no drop -- done
1.  drop persistent outliers -- yes, no -- done
2.  use log transformed y -- yes or no -- done
3.  reduce y range -- yes or no -- done
4.  include nutrients -- yes or no-- done
5.  if so, which nutrients -- original, transformed, both  -- done

6.  PCA -- yes or no
7.  Winsorize X -- yes or no  -- done

8.  include chronic conditions -- yes or no -- done
9.  include alcohol and tobacco -- yes or no -- done
10.  if alchohol included, which -- original, transformed, both -- done
11. transformed physical activity -- yes or no -- done
12. use height and weight -- yes or no, if no, bmi  -- done
13.  include irregular pulse -- yes,no -- done

14.  log transform all x numerical values -- yes, no  -- done

Want to save model number, description,r2, adjusted r2, training mae, test mae, training rmse, test rmse


In [2]:
df_missing = pd.read_csv('../Data/df_transformed.csv')
df_missing.drop('Unnamed: 0',axis=1,inplace=True)

In [3]:
df_imputed = pd.read_csv('../Data/dfi_transformed.csv')
df_imputed.drop('Unnamed: 0',axis=1,inplace=True)

In [4]:
Models = {
    'number':[],
    'description':[],
    'r2':[],
    'adjusted r2':[],
    'training MAE':[],
    'testing MAE':[],
    'training RMSE':[],
    'testing RMSE':[]
}

In [5]:
def model(number):
    global Models,df_missing,df_imputed
    Models['number'].append(number)
    m = ""
    imputed = False
    num_variables=['RIDAGEYR','BPXPLS','Systolic','Diastolic']
    cat_variables =['RIAGENDR','RIDRETH3']
    if number % 2 == 0: # use imputed values (remember to drop na before train test split)
        df = df_missing
        m = "drop missing, "
    else:
        df = df_imputed
        imputed = True
        m = "imputed, "
        cat_variables.extend(['BPXPLS_i','Systolic_i','Diastolic_i'])
    number = (number - number%2)//2
    if number%2==1: # drop outliers
        m = m + "drop outliers, "
        df = df[df.outlier==0].copy()
    number = (number - number%2)//2
    if number % 2 == 1: # reduce y range
        m = m + "reduce y range, "
        number -= 1
        df = df[(df.LBXTC>=100)&(df.LBXTC<400)].copy()
    number = number//2
    if number % 2 == 1: # use height and weight
        m = m + "use height and weight, "
        number = number - 1
        num_variables.extend([ 'BMXWT', 'BMXHT'])
        if imputed:
            cat_variables.extend(['BMXWT_i', 'BMXHT_i'])
    else:
        m = m + "use bmi, "
        num_variables.append('BMXBMI')
        if imputed:
            cat_variables.append('BMXBMI_i')
    number = number//2
    if number%2==1: # include irregular pulse
        m = m + "include irregular pulse, "
        number -= 1
        cat_variables.append('BPXPULS')
        if imputed:
            cat_variables.append('BPXPULS_i')
    number = number//2
    if number % 2 == 1: # include chronic conditions
        number -= 1
        m = m + "include chronic conditions, "
        cat_variables.extend(['BPQ020', 'BPQ050A', 'BPQ080','BPQ100D', 'DIQ010', 'MCQ170M'])
        if imputed:
            cat_variables.extend(['BPQ020_i', 'BPQ050A_i', 'BPQ080_i', 'BPQ100D_i', 'DIQ010_i','MCQ170M_i'])
    number = number//2
    if number % 2 == 1: # use transformed physical activity
        number -= 1
        m = m + "transformed physical activity,"
        num_variables.append('MET_rec')
        if imputed:
            cat_variables.append('MET_rec_i')
    else:
        num_variables.extend(['PAQ655','PAD660','PAQ670','PAD675'])
        if imputed:
            cat_variables.extend(['PAQ655_i','PAD660_i','PAQ670_i','PAD675_i'])
    number = number // 2
    if number%2 == 0: # don't include nutrients
        number = (number - number%6)//6
    else:
        number = (number-1)//2
        if number % 3 == 0: # include original nutrients
            num_variables.extend(['DR1TKCAL', 'DR1TPROT', 'DR1TCARB', 'DR1TSUGR',
       'DR1TFIBE', 'DR1TTFAT', 'DR1TSFAT', 'DR1TMFAT', 'DR1TPFAT', 'DR1TCHOL',
       'DR1TSODI', 'DR1TVD', 'DR1TCALC', 'DR1TIRON', 'DR1TPOTA'])
            cat_variables.append('DR1_300')
            if imputed:
                cat_variables.append('DR1_300_i')
            m = m + "original nutrients included, "
        elif number % 3 == 1: # include transformed nutrients
            m = m + "transformed nutrients included"
            num_variables.extend(['DR1TKCAL_t',
       'DR1TPROT_t', 'DR1TCARB_t', 'DR1TSUGR_t', 'DR1TFIBE_t', 'DR1TTFAT_t',
       'DR1TSFAT_t', 'DR1TMFAT_t', 'DR1TPFAT_t', 'DR1TCHOL_t', 'DR1TSODI_t',
       'DR1TVD_t', 'DR1TCALC_t', 'DR1TIRON_t', 'DR1TPOTA_t'])
            if imputed:
                cat_variables.append('DR1_300_i')
        else: # include both
            m = m + "both nutrients included, "
            num_variables.extend(['DR1TKCAL', 'DR1TPROT', 'DR1TCARB', 'DR1TSUGR',
       'DR1TFIBE', 'DR1TTFAT', 'DR1TSFAT', 'DR1TMFAT', 'DR1TPFAT', 'DR1TCHOL',
       'DR1TSODI', 'DR1TVD', 'DR1TCALC', 'DR1TIRON', 'DR1TPOTA','DR1TKCAL_t',
       'DR1TPROT_t', 'DR1TCARB_t', 'DR1TSUGR_t', 'DR1TFIBE_t', 'DR1TTFAT_t',
       'DR1TSFAT_t', 'DR1TMFAT_t', 'DR1TPFAT_t', 'DR1TCHOL_t', 'DR1TSODI_t',
       'DR1TVD_t', 'DR1TCALC_t', 'DR1TIRON_t', 'DR1TPOTA_t'])
            if imputed:
                cat_variables.append('DR1_300_i')
        number = (number - number%3)//3
    if number % 2 == 0: # don't include alcohol and tobacco
        number = (number - number % 6)//6
    else: # include alcohol and tobacco
        number = (number - 1)//2
        # add tobacco here
        cat_variables.append('SMQ681')
        if imputed:
            cat_variables.append('SMQ681_i')
        if number % 3 == 0: # include original alcohol quantities
            m = m + "tobacco and alcohol questionairre variables"
            num_variables.extend([ 'ALQ120Q','ALQ130'])
            cat_variables.append('ALQ120U')
            if imputed:
                cat_variables.extend(['ALQ120Q_i', 'ALQ120U_i', 'ALQ130_i'])
        elif number % 3 == 1: # include transformed alcohol quantities
            m = m + "tobacco and average alcohol, "
            num_variables.append('avgALC')
            if imputed:
                cat_variables.append('avgALC_i')
        else: # include both
            m = m + "tobacco and both alcohol variables"
            num_variables.extend([ 'ALQ120Q','ALQ130','avgALC'])
            cat_variables.append('ALQ120U')
            if imputed:
                cat_variables.extend(['ALQ120Q_i', 'ALQ120U_i', 'ALQ130_i','avgALC_i'])
        number = (number-number%3)//3
    df = df[num_variables+cat_variables+['LBXTC','log_LBXTC']].copy()
    df.dropna(inplace=True)
    y = df.LBXTC
    if number % 2 == 1: # use log transformed y
        m = m + "log transform y"
        number -= 1
        y = df.log_LBXTC
    number = number//2
    X = df[num_variables+cat_variables].copy()
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=675, test_size=0.2)
    numeric_transformer = Pipeline(steps=[('scalar',MinMaxScaler())])
    cat_transformer = Pipeline(steps=[('one_hot',OneHotEncoder(handle_unknown='ignore'))])
    if number % 2 == 1: # Winsorize x
        m = m + "Winsorize x, "
        numeric_transformer.steps.insert(0,('winsorizor',Winsorizer()))
        number -= 1
    number = number//2
    if number % 2 == 1: # log transform x
        m = m + "Log transform x values, "
        number -= 1
        numeric_transformer.steps.insert(0,('log transform x',FunctionTransformer(np.log1p)))
    number = number // 2
    preprocessor = ColumnTransformer(transformers=[
        ('num',numeric_transformer,num_variables),
        ('cat',cat_transformer,cat_variables)])
    knn1 = Pipeline(steps =[
        ('preprocessor',preprocessor),
        ('kneighbors', KNeighborsRegressor())])
    if number % 2 == 1: # add PCA
        knn1.steps.insert(1,("pca",PCA(0.95)))
        m = m + "added PCA"
    Models['description'].append(m)
    knn1.fit(X_train,y_train)
    r2 = knn1.score(X_train,y_train)
    Models['r2'].append(r2)
    k = knn1.named_steps['kneighbors'].n_features_in_
    n = knn1.named_steps['kneighbors'].n_samples_fit_
    ar2 = ((1-r2)/(n-1))/(n-k-1)
    Models['adjusted r2'].append(ar2)
    y_train_preds = knn1.predict(X_train)
    y_preds = knn1.predict(X_test)
    Models['training MAE'].append(mean_absolute_error(y_train,y_train_preds))
    Models['testing MAE'].append(mean_absolute_error(y_test,y_preds))
    Models['training RMSE'].append(mean_squared_error(y_train,y_train_preds,squared=False))
    Models['testing RMSE'].append(mean_squared_error(y_test,y_preds,squared=False))

In [6]:
model_numbers = []
for i in range(20):
    n = np.random.randint(73728)
    while n in model_numbers:
        n = np.random.randint(73728)
    model_numbers.append(n)
    model(n)
models = pd.DataFrame(Models)
models

Unnamed: 0,number,description,r2,adjusted r2,training MAE,testing MAE,training RMSE,testing RMSE
0,15171,"imputed, drop outliers, use bmi, transformed p...",0.293789,5.202227e-08,0.144811,0.175273,0.18502,0.222001
1,26469,"imputed, reduce y range, use bmi, include chro...",0.340959,4.902624e-08,0.135476,0.16341,0.171898,0.206004
2,62251,"imputed, drop outliers, use height and weight,...",0.325005,4.943953e-08,0.142471,0.167757,0.180885,0.21376
3,32121,"imputed, use height and weight, include irregu...",0.334131,4.925449e-08,26.219758,32.331539,33.56723,44.445762
4,10764,"drop missing, reduce y range, use height and w...",0.267705,5.678819e-08,27.153513,32.631521,34.492215,41.332062
5,2003,"imputed, drop outliers, use bmi, include irreg...",0.298998,5.172323e-08,27.247341,33.935922,35.13787,44.238091
6,14366,"drop missing, drop outliers, reduce y range, u...",0.277048,5.637732e-08,0.142178,0.180982,0.179037,0.226011
7,23350,"drop missing, drop outliers, reduce y range, u...",0.354435,5.198492e-08,0.133126,0.167229,0.16883,0.209489
8,73607,"imputed, drop outliers, reduce y range, use bm...",0.257365,5.510739e-08,0.144605,0.178982,0.181935,0.223895
9,39370,"drop missing, drop outliers, use height and we...",0.276741,8.833009e-08,27.573649,34.283287,36.035932,43.224038


In [None]:
# I can modify this so it only adds to Models models with r2 > 0.3