# Medical Cost with Regression

### Workflow

In [267]:
workflow = ["1. Prepare Data",
            "2. Choose Model",
            "3. Training",
            "4. Evaluation",
            "5. Tuning",
            "6. Prediction"]

### Imports

In [268]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as pl
%matplotlib inline

### 1. Prepare Data

In [269]:
# Read the csv into pandas
data = pd.read_csv('insurance.csv')

In [270]:
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [271]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [272]:
# Lucky us, there is no null values, and minimal conversion needed.

In [273]:
# Convert categorical columns into dtype category
# data[['sex', 'smoker', 'region']] = data[['sex', 'smoker', 'region']].astype('category')
# data.dtypes

In [274]:
# Convert categories into numerical values
from sklearn.preprocessing import LabelEncoder
label = LabelEncoder()

In [275]:
# Convert sex to numerical
label.fit(data.sex.drop_duplicates())
data.sex = label.transform(data.sex)

In [276]:
# Convert smoker to numerical
label.fit(data.smoker.drop_duplicates())
data.smoker = label.transform(data.smoker)

In [277]:
# Convert region to numerical
label.fit(data.region.drop_duplicates())
data.region = label.transform(data.region)

In [278]:
# Round charges to nearest cent
data.charges = round(data.charges, 2)

In [279]:
data.dtypes

age           int64
sex           int64
bmi         float64
children      int64
smoker        int64
region        int64
charges     float64
dtype: object

In [280]:
# All data is in numerical format

In [281]:
data.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [282]:
# There are no null values to fill

### 2. Choose Model

In [283]:
# Choosing between two models
# RandomForestRegressor
# LinearRegression
# Will choose the model after training, testing and tuning each.

### 3. Training

In [284]:
# Import scikit-learn training
from sklearn.model_selection import train_test_split

In [285]:
# Create X/y
X = data.drop(["charges"], axis=1)
y = data["charges"]

In [286]:
# Split to train/test sets
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

In [287]:
# Verify shape
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1070, 6), (268, 6), (1070,), (268,))

In [288]:
# Normally, would OneHotEncode here but our data is already in numerical format
# Generally, the more narrow the category LabelEncoder is more e

In [289]:
# RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
model.fit(X_train, y_train)

RandomForestRegressor()

In [290]:
model.score(X_test, y_test)

0.871155752579035

### 4. Evaluation

In [291]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score

In [292]:
X = data.drop(["charges"], axis=1)
y = data["charges"]
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

In [293]:
model = RandomForestRegressor(n_estimators=5)
model.fit(X_train, y_train)

RandomForestRegressor(n_estimators=5)

In [294]:
# Scoring the train set (should be close to 1.0)
model.score(X_train, y_train)

0.9637270551507771

In [295]:
# Scoring the test set
model.score(X_test, y_test)

0.8161465552645224

In [296]:
# Cross validation
np.random.seed(99)
np.mean(cross_val_score(model, X, y, cv=5)) * 100

80.57805356764837

### 5. Tuning

In [297]:
from sklearn.metrics import r2_score, mean_squared_error

##### Tuning the RandomForestRegressor

In [298]:
model = RandomForestRegressor(n_estimators=5)
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.8028584088254199

In [299]:
# 83% seems a little bit low, let's adjust the estimators

In [300]:
np.random.seed(99)
for i in range(10, 100, 10):
    model = RandomForestRegressor(n_estimators=i).fit(X_train, y_train)
    print(f"Accuracy with {i} estimators: {model.score(X_test, y_test) * 100}%")

Accuracy with 10 estimators: 84.73902631735224%
Accuracy with 20 estimators: 83.68222858658231%
Accuracy with 30 estimators: 85.10173832461967%
Accuracy with 40 estimators: 84.75884841510994%
Accuracy with 50 estimators: 85.08489272200673%
Accuracy with 60 estimators: 85.14801255231139%
Accuracy with 70 estimators: 85.1942867036696%
Accuracy with 80 estimators: 85.38958626038517%
Accuracy with 90 estimators: 84.96458201939544%


In [301]:
# 60 estimators is much better

In [302]:
model = RandomForestRegressor(n_estimators=60)
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.8480495819912851

In [303]:
# Let's check the R2 score

In [304]:
r2_score(y_test, model.predict(X_test))

0.8480495819912851

#### Tuning the LinearRegression model

In [305]:
X = data.drop(["charges"], axis=1)
y = data["charges"]

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

model = LinearRegression()
model

LinearRegression()

In [306]:
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.781194044939986

In [307]:
# Let's try and improve

In [308]:
X = data.drop(["charges"], axis=1)
y = data["charges"]

for i in range(1, 6, 1):
    pf = PolynomialFeatures(degree=i)
    pfX = pf.fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(pfX, y, random_state = 0)
    model = LinearRegression().fit(X_train, y_train)
    print(f"Accuracy with Polynomial degree of {i}: {model.score(X_test, y_test) * 100}%")

Accuracy with Polynomial degree of 1: 79.62732107222696%
Accuracy with Polynomial degree of 2: 88.4628370764262%
Accuracy with Polynomial degree of 3: 87.90556044548325%
Accuracy with Polynomial degree of 4: 85.78909979500838%
Accuracy with Polynomial degree of 5: 78.18568666291672%


In [309]:
# Degree of 2 is clearly the best

In [325]:
X = data.drop(["charges"], axis=1)
y = data["charges"]
pf = PolynomialFeatures(degree=2)
pfX = pf.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(pfX, y, random_state=0)
model = LinearRegression().fit(X_train, y_train)
model.score(X_test, y_test)

(1003, 28)

#### Results of Tuning both models

In [311]:
# RandomForestRegressor
# Start 0.8386016203764185
# Tuned 0.8703725249262994

# LinearRegression
# Start 0.7641859401555667
# Tuned 0.8846283707642619

### 6. Prediction

In [312]:
# Here we need to save and load the model

In [2]:
import pickle

In [314]:
pickle.dump(model, open("lrmodel.pkl", "wb"))

In [315]:
# Now we can load the model

In [3]:
model = pickle.load(open("lrmodel.pkl", "rb"))

In [317]:
# And use the model

In [360]:
model.score(X_test, y_test)

0.8846283707642619

In [405]:
###################################################################

In [417]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [418]:
# create dataframe
data = pd.read_csv('insurance_clean.csv')

In [408]:
# create X, y
X = data.drop(["charges"], axis=1)
y = data["charges"]

In [419]:
model = RandomForestRegressor(n_estimators=60)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
model.fit(X_train,y_train)
model.score(X_test,y_test)

0.8517273640547705

In [430]:
model.predict([[28,0,25,0,0,0]])

array([3881.61989683])

In [431]:
import pickle

In [432]:
pickle.dump(model, open("rfmodel.pkl", "wb"))

In [433]:
model = pickle.load(open("rfmodel.pkl", "rb"))

In [434]:
model.predict([[28,0,25,0,0,0]])

array([3881.61989683])