In [111]:
"""
This file builds a model with limited features to restrict the amount of information the web app will need
to make a prediction
"""
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import pickle

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error

web_app_features = ["age","race","ethnicity","sex","other_language_spoken_at_home","born_in_usa",
                "marriage_status","military_status","total_income", "employment_status",
                "insurance_status","highest_education","number_of_visits","diag_amt","total_expenditure"]

In [112]:
def get_cat_num_features(df): 
    num_unique = df.nunique()
    categorical_features = num_unique[num_unique <= 10].index.tolist()
    numerical_features = [f for f in df.columns if f not in categorical_features]
    return categorical_features, numerical_features

def normalize_target(df, target_column):
    # medical expenditure is strongly positively skewed so best practice is to normalized it for our model
    vals = df[target_column].values 
    return np.array([0 if v == 0 else np.log(v) for v in vals])

def readin_and_split(path, target_column, features):
    df = pd.read_csv(path) 
    df = df[features]
    # remove employment status == 2
    df = df[df.employment_status != 2]
    df = df[df['age'] >= 0] 
    df = df[df['marriage_status'] >= 0]
    df = df[df['total_income'] >= 0]
    df = df[(df[["other_language_spoken_at_home","born_in_usa",
                "marriage_status","military_status","employment_status",
                "insurance_status","highest_education"]] >= -1).all(1)]
    df[target_column] = normalize_target(df, target_column)
    X = df.drop(target_column, axis=1)
    y = df[target_column]
    return train_test_split(X, y, test_size=0.2, random_state=0)


#TODO: Replace with pd get dummies and see if it fixes the issue
def one_hot_scikit(df):
    cat_cols = get_cat_num_features(df)[0]
    encoder = OneHotEncoder(sparse=False, handle_unknown="ignore")
    df_enc = pd.DataFrame(encoder.fit_transform(df[cat_cols]))
    df_enc.columns = encoder.get_feature_names(cat_cols)
    df.drop(cat_cols, axis = 1, inplace = True)
    df = pd.concat([df, df_enc], axis =1)
    return df

In [113]:
 X_train, X_test, y_train, y_test = readin_and_split("../data/meps_data_2019_new_feats.csv", "total_expenditure",  web_app_features)

In [114]:
X_train = one_hot_scikit(X_train)
X_test = one_hot_scikit(X_test)

In [115]:
X_train.head()

Unnamed: 0,age,total_income,number_of_visits,diag_amt,race_1.0,race_2.0,race_3.0,race_4.0,race_5.0,ethnicity_1.0,...,insurance_status_7.0,insurance_status_8.0,highest_education_1.0,highest_education_2.0,highest_education_3.0,highest_education_4.0,highest_education_5.0,highest_education_6.0,highest_education_7.0,highest_education_8.0
0,32.0,40000.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,42.0,40000.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,16.0,0.0,10.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,72.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,,,,,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
