# Machine Learning Model Production

Reference Article: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

Reference Video: https://www.youtube.com/watch?v=7O4dpR9QMIM&list=PL2zq7klxX5ASFejJj80ob9ZAnBHdz5O1t&index=5

**Tasks**
1. Import Libraries
2. Load Data
3. Choose Relevant Columns
4. Train Test Split
5. Multiple Linear Regression
6. Lasso Regression
7. Random Forest
8. Tune Models
9. Test


In [84]:
#Import Libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn as skl
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error

In [3]:
#Load Data
insurance = pd.read_csv("Datasets/insuranceForExploring.csv")
insurance.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,bmiR,ageR,regionR
0,19,female,27.9,0,yes,southwest,16884.924,2,1,1
1,18,male,33.77,1,no,southeast,1725.5523,3,1,0
2,28,male,33.0,3,no,southeast,4449.462,3,1,0
3,33,male,22.705,0,no,northwest,21984.47061,1,2,3
4,32,male,28.88,0,no,northwest,3866.8552,2,2,3


In [4]:
#Choose Relevant Columns
insurance.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges', 'bmiR',
       'ageR', 'regionR'],
      dtype='object')

In [5]:
insuranceModel = insurance[["ageR","regionR", "sex", "children", "smoker", "bmiR", "charges" ]]
insuranceModel.head()

Unnamed: 0,ageR,regionR,sex,children,smoker,bmiR,charges
0,1,1,female,0,yes,2,16884.924
1,1,0,male,1,no,3,1725.5523
2,1,0,male,3,no,3,4449.462
3,2,3,male,0,no,1,21984.47061
4,2,3,male,0,no,2,3866.8552


In [6]:
#get dummy data
insurance_dum = pd.get_dummies(insuranceModel)
insurance_dum.head()

Unnamed: 0,ageR,regionR,children,bmiR,charges,sex_female,sex_male,smoker_no,smoker_yes
0,1,1,0,2,16884.924,1,0,0,1
1,1,0,1,3,1725.5523,0,1,1,0
2,1,0,3,3,4449.462,0,1,1,0
3,2,3,0,1,21984.47061,0,1,1,0
4,2,3,0,2,3866.8552,0,1,1,0


## Train Test Split

In [7]:
x = insurance_dum.drop("charges", axis = 1)
y = insurance_dum.charges.values

In [88]:
insurance_dum.charges

0       16884.92400
1        1725.55230
2        4449.46200
3       21984.47061
4        3866.85520
           ...     
1333    10600.54830
1334     2205.98080
1335     1629.83350
1336     2007.94500
1337    29141.36030
Name: charges, Length: 1338, dtype: float64

In [89]:
insurance_dum.charges.values

array([16884.924 ,  1725.5523,  4449.462 , ...,  1629.8335,  2007.945 ,
       29141.3603])

In [11]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 42)

## Multiple Linear Regression

In [14]:
#Create Constant
X_sm = X = sm.add_constant(x)

In [15]:
#Build Model
model = sm.OLS(y, X_sm)

In [16]:
#View model fit summary
model.fit().summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.751
Model:,OLS,Adj. R-squared:,0.75
Method:,Least Squares,F-statistic:,669.9
Date:,"Wed, 19 Oct 2022",Prob (F-statistic):,0.0
Time:,13:21:02,Log-Likelihood:,-13547.0
No. Observations:,1338,AIC:,27110.0
Df Residuals:,1331,BIC:,27140.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3720.0911,307.945,12.080,0.000,3115.980,4324.202
ageR,2643.1438,125.006,21.144,0.000,2397.914,2888.374
regionR,250.2464,150.615,1.661,0.097,-45.223,545.716
children,600.8838,137.396,4.373,0.000,331.347,870.421
bmiR,1979.2161,160.487,12.333,0.000,1664.381,2294.051
sex_female,1943.4253,225.772,8.608,0.000,1500.518,2386.333
sex_male,1776.6658,227.439,7.812,0.000,1330.489,2222.843
smoker_no,-1.005e+04,236.140,-42.550,0.000,-1.05e+04,-9584.571
smoker_yes,1.377e+04,276.625,49.771,0.000,1.32e+04,1.43e+04

0,1,2,3
Omnibus:,311.005,Durbin-Watson:,2.096
Prob(Omnibus):,0.0,Jarque-Bera (JB):,752.796
Skew:,1.248,Prob(JB):,3.4100000000000003e-164
Kurtosis:,5.696,Cond. No.,1.11e+16


## Findings:
1. Our model explains ~75% of the variance in medical charges for our customers
2. Our "coef" can explain the following:
    - For each move away from the base category we can increase our charges by the amount in "coef" column
    - Smoking Status is our largest increase with over \$10,000 expected difference between "yes and no"
    - Age is our next largest accounting for an increase of over \$2600 per category increase

In [18]:
#Linear Model
lm = LinearRegression()
lm.fit(X_train, y_train)

In [23]:
#Cross Validation - gives us good sense of model performance
cross_val_score(lm, X_train, y_train, scoring = "neg_mean_absolute_error")

array([-4439.83883465, -4241.65053381, -4637.32119668, -4525.98979201,
       -4124.32732776])

In [24]:
np.mean(cross_val_score(lm, X_train, y_train, scoring = "neg_mean_absolute_error"))

-4393.82553698265

#### This shows that our model is off by approximately \$4400 

# ______________________________________________________________________________________________________________________________

## Lasso Regression

In [28]:
lm_l = Lasso()

In [29]:
np.mean(cross_val_score(lm_l, X_train, y_train, scoring = "neg_mean_absolute_error"))

-4393.705116237838

In [31]:
#Tuning Lasso Model

alpha = []
error = []

In [59]:
for i in range(1,100):
    alpha.append(i/100)
    lml = Lasso(alpha=(i/100))
    error.append(np.mean(cross_val_score(lml, X_train, y_train, scoring = "neg_mean_absolute_error")))

In [60]:
err = tuple(zip(alpha, error))
insurance_err = pd.DataFrame(err, columns = ["alpha","error"])

In [61]:
insurance_err[insurance_err.error == max(insurance_err.error)]

Unnamed: 0,alpha,error
594,9.9,-4392.59419


## Random Forest

In [66]:
rf = RandomForestRegressor()

In [68]:
#Find Cross Validation Score
np.mean(cross_val_score(rf, X_train, y_train, scoring = "neg_mean_absolute_error"))

-3163.214606189927

In [69]:
#Random Forest model has provided best error reducing by over $1200

## Tune Model

In [78]:
parameters = {'n_estimators':(range(10,300,10)), "criterion":("mse","mae"), "max_features":("auto", "sqrt", "log2")}

In [79]:
gs = GridSearchCV(rf, parameters, scoring = "neg_mean_absolute_error")

In [80]:
gs.fit(X_train, y_train)

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


In [81]:
gs.best_score_

-2708.5138257844337

In [82]:
gs.best_estimator_

## Test Ensembles

In [83]:
#New Lasso Model with Alpha 9.9
lm_l = Lasso(alpha=9.9)
lm_l.fit(X_train, y_train)

In [85]:
tpred_lm= lm.predict(X_test)

In [86]:
tpred_lml= lm_l.predict(X_test)

In [87]:
tpred_rf= gs.best_estimator_.predict(X_test)

### Mean Absolute Error

In [91]:
mean_absolute_error(y_test, tpred_lm)

4228.764586389593

In [92]:
mean_absolute_error(y_test, tpred_lml)

4229.948681276244

In [93]:
mean_absolute_error(y_test, tpred_rf)

2539.4724888373976