Author: Marissa Munoz-Ruiz <br>
SLU Capstone: HDS 5960 <br>
Databricks Notebook: SLU_Capstone_ML

---
##### Goal of Script: Process and Compare Regression ML Models 

* Multiple Linear Regression, Extreme Gradient Boosting, and Neural Networks were used
  * Note: NN models can't handle null values  
* RMSE was used as the performance metric

In [0]:
%pip install xgboost tensorflow

Python interpreter will be restarted.
Collecting xgboost
  Downloading xgboost-1.7.2-py3-none-manylinux2014_x86_64.whl (193.6 MB)
Collecting tensorflow
  Downloading tensorflow-2.11.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (588.3 MB)
Collecting flatbuffers>=2.0
  Downloading flatbuffers-22.12.6-py2.py3-none-any.whl (26 kB)
Collecting tensorflow-estimator<2.12,>=2.11.0
  Downloading tensorflow_estimator-2.11.0-py2.py3-none-any.whl (439 kB)
Collecting termcolor>=1.1.0
  Downloading termcolor-2.1.1-py3-none-any.whl (6.2 kB)
Collecting tensorboard<2.12,>=2.11
  Downloading tensorboard-2.11.0-py3-none-any.whl (6.0 MB)
Collecting astunparse>=1.6.0
  Downloading astunparse-1.6.3-py2.py3-none-any.whl (12 kB)
Collecting grpcio<2.0,>=1.24.3
  Downloading grpcio-1.51.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.8 MB)
Collecting keras<2.12,>=2.11.0
  Downloading keras-2.11.0-py2.py3-none-any.whl (1.7 MB)
Collecting google-pasta>=0.1.1
  Downloading google_pasta-0

In [0]:
%run ./SLU_Capstone_Preprocessing 



Out[11]: 'gross_revenue ~ total_bad_debt_expense + medicaid_charges + cost_to_charge_ratio + buildings + major_movable_equipment + salaries_wages_and_fees_payable + total_unreimbursed_and_uncompensated_care + net_income + cash_on_hand_and_in_banks + total_assets + cost_of_uncompensated_care + depreciation_cost + inventory + other_assets + total_fund_balances + total_salaries_from_worksheet_a + less_total_operating_expense + total_days_v_xviii_xix_unknown_total_for_all_subproviders + accounts_payable + total_bed_days_available + less_contractual_allowance_and_discounts_on_patients_accounts + net_revenue_from_medicaid + net_income_from_service_to_patients + outpatient_total_charges + overhead_nonsalary_costs + prepaid_expenses + total_discharges_v_xviii_xix_unknown_total_for_all_subproviders + fte_employees_on_payroll + accounts_receivable + general_fund_balance + total_liabilities'

  correlation_coefficient /= X_norms
  f_statistic = corr_coef_squared / (1 - corr_coef_squared) * deg_of_freedom


Unnamed: 0,gross_revenue
count,53508.0
mean,540965200.0
std,1082218000.0
min,-177031900.0
25%,38115930.0
50%,127172800.0
75%,591822500.0
max,29390140000.0


#### Import Libraries

In [0]:
## Import Modules
import os
import sys
import time
import random 
#import numpy as np
#import pandas as pd
import seaborn as sns
from patsy import dmatrices

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error as MSE

from xgboost import XGBRegressor
from xgboost import cv
from xgboost import DMatrix

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.regularizers import l1

#### Develop Functions for Machine Learning Models

In [0]:
def LinearRegression_predict(data_df,y_var):
    
    t0 = time.time()
    
    ## Create formula for all variables in model
    vars_remove = [y_var]
    vars_left = set(data_df.columns) - set(vars_remove)
    formula = y_var + " ~ " + " + ".join(vars_left)

    ## Use Patsy to create model matrices
    Y,X = dmatrices(formula,data_df,return_type='dataframe')

    ## Split Data into training and sample
    X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                        np.ravel(Y), # prevents dimensionality error later!
                                                        test_size=0.20,
                                                        random_state=30)

    ## Fit Linear Regression model
    model = LinearRegression(fit_intercept=True) 
    model.fit(X_train,Y_train)

    ## Get 5-CV train results
    cv = KFold(n_splits=5,shuffle=True,random_state=None)
    train_results = cross_val_score(model,X_train,Y_train,scoring='neg_mean_squared_error',cv=cv,n_jobs=-1)
    train_rmse = np.sqrt(np.absolute(train_results).mean())

    ## Predict Linear model
    pred = model.predict(X_test)
    test_rmse = np.sqrt(MSE(Y_test, pred))

    t1 = time.time()

    model_str = "ScikitLearn_LinearRegression"

    return [model_str,train_rmse,test_rmse,(t1-t0)]

In [0]:
def xgb_predict(data_df,y_var):
    
    t0 = time.time()
    X,Y = data_df.loc[:,data_df.columns != y_var], data_df.loc[:,data_df.columns == y_var]
    X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                        Y, # prevents dimensionality error later!
                                                        test_size=0.20,
                                                        random_state=30)

    ## Use Dmatrix object optimized for XGBoost
    train_dmatrix = DMatrix(data = X_train, label = Y_train)
    test_dmatrix = DMatrix(data = X_test, label = Y_test)

    ## Create CV df w/ hyperparameters 
    params = {"objective":"reg:linear",
              'colsample_bytree': 1, 
              'learning_rate': 0.1,
              'gamma': 0,
              'max_depth': 1, 
              'min_child_weight':1,
              'subsample':1,
              'nthread':3}
    
#     params = {"objective":"reg:linear",
#                   'colsample_bytree': 1, 
#                   'learning_rate': 0.1,
#                   'gamma': 0,
#                   'max_depth': 5, 
#                   'min_child_weight':1,
#                   'subsample':1,
#                   'nthread':3}
        
    ## Fit XGB model
    model = XGBRegressor(**params, verbosity=0)
    model.fit(X_train,Y_train)

    cv = KFold(n_splits=5,shuffle=True,random_state=None)

    ## Get 5-CV train results
    train_results = cross_val_score(model,X_train,Y_train,scoring='neg_mean_squared_error',cv=cv,n_jobs=-1)
    train_rmse = np.sqrt(np.absolute(train_results).mean())

    ## Predict XGB model
    pred = model.predict(X_test)
    test_rmse = np.sqrt(MSE(Y_test, pred))

    t1 = time.time()
    
    model_str = "ScikitLearn_XGboost"

    return [model_str,train_rmse,test_rmse,(t1-t0)]

In [0]:
def NN_model(data_df,y_var):
    
    t0 = time.time()
    
    # create training & testing data sets
    X,Y = data_df.loc[:,data_df.columns != y_var], data_df.loc[:,data_df.columns == y_var]
    X_train, X_test, Y_train, Y_test = train_test_split(X,Y,train_size=0.8,random_state=30)

    ## Create NN model structure
    model = Sequential()
    model.add(Dense(64,
                    activation='relu',
                    input_dim=X_train.shape[1]))
    model.add(Dense(64,
                    activation='relu'))
    model.add(Dense(1,
                    activation='linear'))
    model.compile(optimizer='rmsprop',
                  loss='mse',
                  metrics=['mse'])

    # fit NN model/architecture to data
    model.fit(X_train,
              Y_train,
              epochs=500,
              validation_split=0.2,
              verbose=0)

    # performance metrics 
    epoch_mse = np.sqrt(np.min(model.history.history['val_mse']))
    epoch_pos = np.argmin(model.history.history['val_mse'])
    train_rmse = np.sqrt(model.history.history['mse'][epoch_pos])
    
    test_mse = model.evaluate(x=X_test,y=Y_test,verbose=0)
    test_rmse = np.sqrt(test_mse[0])
    
    t1 = time.time()
    
    model_str = "Tensorflow_NN"
    
    return [model_str,train_rmse,test_rmse,(t1-t0)]

#### ML Model Comparison

In [0]:
#data_test = MLdata_pdf[0:100]
data_test = MLdata_pdf
y_var = 'gross_revenue'

results = []
results.append(LinearRegression_predict(data_test,y_var))
results.append(xgb_predict(data_test,y_var))
results.append(NN_model(data_test,y_var))


In [0]:
result_df = pd.DataFrame(results, columns = ['ML Method','Training_RMSE','Testing_RMSE','Time_Taken(s)'])
result_df.sort_values(by=['Testing_RMSE','Training_RMSE'],ascending=True)

Unnamed: 0,ML Method,Training_RMSE,Testing_RMSE,Time_Taken(s)
0,ScikitLearn_LinearRegression,69017910.0,53986140.0,2.139336
1,ScikitLearn_XGboost,108243100.0,84503590.0,5.924739
2,Tensorflow_NN,,,983.395139
