In [23]:
# -*- coding: utf-8 -*-
"""
Created on Mon Oct 19 12:48:59 2020

@author: mathemacode

Machine learning modeling of ERAU PSF data after
feature engineering done to create "ML_frame.csv".

Data file built in ./R/all_numbers_merge_data.R

"""

import pandas as pd
import shap
import eli5
import os
from eli5.sklearn import PermutationImportance
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.datasets import make_regression
from xgboost import XGBRegressor


os.getcwd()
os.chdir("C:\\Users\\dell\\Documents\\GitHub\\PSF_ERAU")

# Import data
df = pd.read_csv('./data/ml/ML_frame.csv')

In [24]:
# Normalize df
min_max_scaler = preprocessing.MinMaxScaler()
np_scaled = min_max_scaler.fit_transform(df)
df_norm = pd.DataFrame(np_scaled)

df_norm.columns = df.columns


In [25]:
# Use these rows for prediction:
features = ['zip', 'zip_count', 'number_participants',
            'case_duration_yrs', 'number_caregivers',
            'age_child', 'avg_age_caregiver',
            'avg_gross_income_zip']
X = df_norm[features]

# What to predict:
y = df_norm.number_removals

# Train/test split
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)

In [30]:
# Random Forest
rf_model = RandomForestRegressor(random_state=1)
rf_model.fit(train_X, train_y)

rf_predictions = rf_model.predict(val_X)
print("\nRandom Forest Mean Absolute Error: \n", 
      mean_absolute_error(val_y, rf_predictions))

perm = PermutationImportance(rf_model, random_state=1).fit(val_X, val_y)
eli5.show_weights(perm, feature_names = val_X.columns.tolist())

The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.



Random Forest Mean Absolute Error: 
 0.044691964285714286


Weight,Feature
0.2120  ± 0.2617,number_participants
0.1210  ± 0.2409,age_child
0.0608  ± 0.1366,case_duration_yrs
0.0492  ± 0.0952,zip_count
0.0406  ± 0.0927,number_caregivers
0.0326  ± 0.0698,avg_age_caregiver
-0.0504  ± 0.0856,avg_gross_income_zip
-0.0686  ± 0.0439,zip


In [27]:
# XGBoost
xg_model = XGBRegressor(n_estimators=500, learning_rate=0.05)
xg_model.fit(train_X, train_y,
             early_stopping_rounds=5, 
             eval_set=[(val_X, val_y)], 
             verbose=False)

xg_predictions = xg_model.predict(val_X)
print("\nXGBoost Mean Absolute Error: \n",
      str(mean_absolute_error(xg_predictions, val_y)))

perm = PermutationImportance(xg_model, random_state=1).fit(val_X, val_y)
eli5.show_weights(perm, feature_names = val_X.columns.tolist())


XGBoost Mean Absolute Error: 
 0.047301701988492685


Weight,Feature
0.2331  ± 0.1776,number_participants
0.2255  ± 0.3081,age_child
0.1212  ± 0.1289,zip_count
0.1060  ± 0.1364,avg_age_caregiver
0.0758  ± 0.0823,case_duration_yrs
0.0246  ± 0.0342,number_caregivers
0.0083  ± 0.0225,zip
0.0073  ± 0.0475,avg_gross_income_zip


In [28]:
# Neural Network
X, y = make_regression(n_samples=200, random_state=1)

regr = MLPRegressor(random_state=1, max_iter=500).fit(train_X, train_y)

regr.score(val_X, val_y)

0.011437775823711527

In [29]:
regr.predict(val_X[:3])

array([0.03890451, 0.01209132, 0.00684114])