In [61]:
# -*- coding: utf-8 -*-
"""
Created on Mon Oct 19 12:48:59 2020

@author: mathemacode

Machine learning modeling of ERAU PSF data after
feature engineering done to create "ML_frame.csv".

Data file built in ./R/all_numbers_merge_data.R

"""

import pandas as pd
import shap
import eli5
import os
from eli5.sklearn import PermutationImportance
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.datasets import make_regression
from xgboost import XGBRegressor


os.getcwd()
os.chdir("C:\\Users\\dell\\Documents\\GitHub\\PSF_ERAU")

# Import data
df = pd.read_csv('./data/ml/ML_frame.csv')

df.head()

Unnamed: 0,id,zip,zip_count,number_removals,number_placements,number_participants,case_duration_yrs,number_caregivers,age_child,avg_age_caregiver,avg_gross_income_zip,first_placement
0,6017341000,34479,1,4,17,25,4.4,8,13.6,48.4,42992,1
1,9123872000,32091,68,1,1,3,0.6,2,15.4,42.6,48438,2
2,1152740000,32780,1,4,25,10,2.1,4,16.1,37.0,170585,1
3,10101546999,32696,42,1,1,8,1.2,3,14.4,47.1,44626,3
4,3544513075,32601,82,2,5,15,1.9,3,13.3,47.9,54063,2


In [62]:
# Normalize df
min_max_scaler = preprocessing.MinMaxScaler()
np_scaled = min_max_scaler.fit_transform(df)
df_norm = pd.DataFrame(np_scaled)

df_norm.columns = df.columns
df_norm.head()

Unnamed: 0,id,zip,zip_count,number_removals,number_placements,number_participants,case_duration_yrs,number_caregivers,age_child,avg_age_caregiver,avg_gross_income_zip,first_placement
0,0.595406,0.827528,0.0,0.214286,0.347826,0.666667,0.619718,0.5,0.6,0.514894,0.063069,0.0
1,0.902792,0.027796,0.475177,0.0,0.0,0.055556,0.084507,0.071429,0.68,0.432624,0.071524,0.5
2,0.114062,0.25854,0.0,0.214286,0.521739,0.25,0.295775,0.214286,0.711111,0.353191,0.261153,0.0
3,0.999532,0.230409,0.29078,0.0,0.0,0.194444,0.169014,0.142857,0.635556,0.496454,0.065606,1.0
4,0.350724,0.198593,0.574468,0.071429,0.086957,0.388889,0.267606,0.142857,0.586667,0.507801,0.080256,0.5


In [63]:
# Use these rows for prediction:
features = ['zip', 'zip_count', 'number_participants',
            'case_duration_yrs', 'number_caregivers',
            'age_child', 'avg_age_caregiver',
            'avg_gross_income_zip', 'first_placement']
X = df_norm[features]

# What to predict:
y = df_norm.number_removals

# Train/test split
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)

In [64]:
# Random Forest
rf_model = RandomForestRegressor(random_state=1)
rf_model.fit(train_X, train_y)

rf_predictions = rf_model.predict(val_X)
print("\nRandom Forest Mean Absolute Error: \n", 
      mean_absolute_error(val_y, rf_predictions))

perm = PermutationImportance(rf_model, random_state=1).fit(val_X, val_y)
eli5.show_weights(perm, feature_names = val_X.columns.tolist())

The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.



Random Forest Mean Absolute Error: 
 0.03773363095238095


Weight,Feature
0.3654  ± 0.2006,first_placement
0.1952  ± 0.0966,age_child
0.1949  ± 0.0596,case_duration_yrs
0.1326  ± 0.0499,number_participants
0.1167  ± 0.0533,zip_count
0.0759  ± 0.0169,avg_age_caregiver
0.0271  ± 0.0692,avg_gross_income_zip
0.0242  ± 0.0447,number_caregivers
-0.0013  ± 0.0202,zip


In [65]:
# XGBoost
xg_model = XGBRegressor(n_estimators=500, learning_rate=0.05)
xg_model.fit(train_X, train_y,
             early_stopping_rounds=5, 
             eval_set=[(val_X, val_y)], 
             verbose=False)

xg_predictions = xg_model.predict(val_X)
print("\nXGBoost Mean Absolute Error: \n",
      str(mean_absolute_error(xg_predictions, val_y)))

perm = PermutationImportance(xg_model, random_state=1).fit(val_X, val_y)
eli5.show_weights(perm, feature_names = val_X.columns.tolist())


XGBoost Mean Absolute Error: 
 0.043509737221258026


Weight,Feature
0.6170  ± 0.2167,first_placement
0.4305  ± 0.3732,age_child
0.4022  ± 0.1904,case_duration_yrs
0.2016  ± 0.0899,number_participants
0.0729  ± 0.0264,zip_count
0.0695  ± 0.0555,number_caregivers
0.0272  ± 0.0597,avg_age_caregiver
0.0193  ± 0.0539,avg_gross_income_zip
-0.0183  ± 0.0166,zip


In [68]:
# Neural Network
X, y = make_regression(n_samples=200, random_state=1)

regr = MLPRegressor(random_state=1, max_iter=500).fit(train_X, train_y)

regr_predictions = regr.predict(val_X)
print("\nNeural Network Mean Absolute Error: \n",
      str(mean_absolute_error(regr_predictions, val_y)))

# regr.score(val_X, val_y)
#regr.predict(val_X[:3])


Neural Network Mean Absolute Error: 
 0.04071396582630648


In [69]:
perm = PermutationImportance(regr, random_state=1).fit(val_X, val_y)
eli5.show_weights(perm, feature_names = val_X.columns.tolist())

Weight,Feature
0.2144  ± 0.0485,first_placement
0.0559  ± 0.0584,age_child
0.0466  ± 0.0246,case_duration_yrs
0.0380  ± 0.0290,number_participants
0.0336  ± 0.0148,zip_count
0.0128  ± 0.0120,zip
0.0121  ± 0.0176,avg_gross_income_zip
0.0049  ± 0.0072,number_caregivers
-0.0036  ± 0.0104,avg_age_caregiver
