In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import zipfile
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error

In [15]:
# Load the lab dataset into the module-level frame `df`; PredictionBreakdown
# reads this global directly rather than taking the frame as an argument.
#
# Alternative when only the zip archive is available (note that the archive
# path below is relative to a different directory than the CSV path):
# zf = zipfile.ZipFile('../Data/nhanes_labs2.zip')
# if zipped use zf.open('nhanes_labs2.csv')
#
# NOTE(review): the truncated warning this cell emits looks like a pandas
# DtypeWarning (mixed-type columns) -- confirm, and consider passing explicit
# dtypes if so.
df = pd.read_csv('Data/nhanes_labs2.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [16]:
def PredictionBreakdown(Gender, RaceEth, Age_Lower=1, Age_Upper=85, full_output=False,
                        random_state=42):
    """Rank features by importance for predicting age within one subgroup.

    Rows of the module-level ``df`` are filtered to ``RIAGENDR == Gender``,
    ``RIDRETH1 == RaceEth`` and ``Age_Lower <= RIDAGEYR <= Age_Upper``. Sparse
    columns and rows are removed, the remaining features are mean-imputed and
    standardised, and a random forest is fitted to predict ``RIDAGEYR`` on an
    80/20 train/test split.

    Parameters
    ----------
    Gender
        Value matched against ``df.RIAGENDR``.
    RaceEth
        Value matched against ``df.RIDRETH1``.
    Age_Lower, Age_Upper
        Inclusive bounds applied to ``df.RIDAGEYR``.
    full_output : bool, default False
        When true, return a dict with extra diagnostics instead of just the
        importance table.
    random_state : int or None, default 42
        Seed used for both the train/test split and the forest, so repeated
        calls give the same result. The split used seed 42 before this
        parameter existed, so the default keeps the same split.

    Returns
    -------
    pandas.DataFrame
        Columns ``Factor``, ``Relative Importance``, ``Gender``, ``RIDRETH1``;
        one row per feature whose importance is at least 1e-6.
    dict
        Only when ``full_output`` is true: ``Feature_Importances`` (the table
        above), ``N`` (shape of the filtered frame), ``MAE`` (test-set mean
        absolute error) and ``Errors`` (prediction minus actual, indexed like
        the test targets).

    Raises
    ------
    ValueError
        If fewer than two rows survive filtering, so no split is possible.
    """
    in_group = (
        (df.RIDAGEYR >= Age_Lower) & (df.RIDAGEYR <= Age_Upper)
        & (df.RIAGENDR == Gender) & (df.RIDRETH1 == RaceEth)
    )
    adults = df[in_group]

    # Keep columns populated in at least 5% of the subgroup. Rounding up gives
    # the same cut as the raw float (count >= 12.3 <=> count >= 13) while
    # passing dropna the integer it documents.
    min_non_null_per_column = int(np.ceil(len(adults) * 0.05))
    adults = adults.dropna(thresh=min_non_null_per_column, axis=1)

    # Keep rows that have at least 20 non-missing values. The explicit copy
    # guarantees the column assignments below never target a slice of `df`.
    min_non_null_per_row = 20
    adults = adults.dropna(thresh=min_non_null_per_row, axis=0).copy()

    if len(adults) < 2:
        raise ValueError(
            "Not enough rows to train on for Gender={!r}, RaceEth={!r}, ages {!r}-{!r} "
            "(found {}).".format(Gender, RaceEth, Age_Lower, Age_Upper, len(adults))
        )

    # Any of these columns may have been pruned by the sparsity filter above,
    # so only convert the ones that are still present.
    for column in ("RIAGENDR", "RIDRETH1", "LBXHE1"):
        if column in adults.columns:
            adults[column] = pd.Categorical(adults[column])

    y = adults["RIDAGEYR"]

    # Drop the target and the other non-feature columns. errors="ignore" skips
    # names that were already pruned; the previous bare-except fallback left
    # some of these columns in X whenever only one of them was missing.
    non_feature_columns = ["RIDAGEYR", "WTMEC2YR", "SDDSRVYR", "LBXBVPH", "LBXINSI",
                           "LBXGLUSI", "RIDRETH1", "RIAGENDR", "INDFMPIR"]
    X = adults.drop(columns=non_feature_columns, errors="ignore")

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state)

    my_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="mean")),
        ('std_scaler', StandardScaler())
    ])
    X_train = my_pipeline.fit_transform(X_train)
    X_test = my_pipeline.transform(X_test)

    # SimpleImputer discards columns that are entirely missing in the training
    # fold (their fitted statistic is NaN), so align names with what survived.
    imputer_stats = my_pipeline.named_steps['imputer'].statistics_
    feature_names = X.columns[~np.isnan(imputer_stats)]
    if len(feature_names) != X_train.shape[1]:
        # Unexpected imputer configuration: fall back to the raw column order.
        feature_names = X.columns

    # Cap max_features at the number of available features; some scikit-learn
    # versions reject a value larger than n_features.
    forest_reg = RandomForestRegressor(
        n_estimators=200,
        max_features=min(50, X_train.shape[1]),
        n_jobs=-1,
        bootstrap=True,
        random_state=random_state,
    )
    forest_reg.fit(X_train, y_train)

    # Predict once and reuse for both the error metric and the residuals.
    forest_preds = forest_reg.predict(X_test)
    mae = mean_absolute_error(y_test, forest_preds)
    resids = forest_preds - y_test

    varimp = [
        (name, score)
        for name, score in zip(feature_names, forest_reg.feature_importances_)
        if score >= 0.000001
    ]
    # Naming the columns up front keeps this working when nothing passes the cutoff.
    varimps = pd.DataFrame(varimp, columns=['Factor', 'Relative Importance'])
    varimps["Gender"] = Gender
    varimps["RIDRETH1"] = RaceEth

    if full_output:
        return {'Feature_Importances': varimps,
                "N": adults.shape,
                "MAE": mae,
                "Errors": resids}
    return varimps

In [17]:
# Smoke test: run the breakdown for a single subgroup (Gender=2, RaceEth=2)
# with the default age bounds and inspect the returned importance table.
PredictionBreakdown(Gender=2, RaceEth=2)

Unnamed: 0,Factor,Relative Importance,Gender,RIDRETH1
0,LBXWBCSI,0.002695,2,2
1,LBXLYPCT,0.003618,2,2
2,LBXMOPCT,0.002598,2,2
3,LBXNEPCT,0.003121,2,2
4,LBXEOPCT,0.001983,2,2
5,LBXBAPCT,0.002305,2,2
6,LBXRBCSI,0.002504,2,2
7,LBXHGB,0.002325,2,2
8,LBXHCT,0.002615,2,2
9,LBXMCVSI,0.008387,2,2


In [18]:
# Codes to sweep: each (gender, race/ethnicity) pairing gets its own model fit.
Gender = [1, 2]
Ridreth1 = [1, 2, 3, 4, 5]

# One importance table per subgroup, ordered gender-major then race/ethnicity.
bd = [
    PredictionBreakdown(gender_code, race_code)
    for gender_code in Gender
    for race_code in Ridreth1
]

In [19]:
# Stack every per-subgroup table held in `bd`. Passing the list itself, rather
# than ten hard-coded indices, keeps this correct if the swept code lists grow
# or shrink. Each table keeps its own row labels, as before.
group_breakdown = pd.concat(bd)

In [20]:
# Preview the first 50 rows of the combined per-subgroup importance table.
group_breakdown.head(50)

Unnamed: 0,Factor,Relative Importance,Gender,RIDRETH1
0,LBXWBCSI,0.001323,1,1
1,LBXLYPCT,0.002172,1,1
2,LBXMOPCT,0.001469,1,1
3,LBXNEPCT,0.002021,1,1
4,LBXEOPCT,0.001443,1,1
5,LBXBAPCT,0.001029,1,1
6,LBXRBCSI,0.006748,1,1
7,LBXHGB,0.008806,1,1
8,LBXHCT,0.007557,1,1
9,LBXMCVSI,0.025214,1,1
