In [66]:
#First, load in the necessary libraries - we're going to use Pandas to explore and preprocess the dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import font_manager as fm
prop = fm.FontProperties(fname='../extras/fonts/Nunito-Medium.ttf')
soft_white = '#FBFAF5'
discord_dark = '#424549'

#Get our machine learning libraries
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import make_column_transformer

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
pd.set_option('display.max_columns', None)

In [55]:
#Next, load in the dataset from the url below. After doing so, print out each column and it's related data type (dtype) as well as a small data sample
#The dataset is a breakdown of charges to an individual's insurance, and includes characteristics of the individual that are age, sex, bmi, # of children, smoker, and region.
df = pd.read_csv('https://raw.githubusercontent.com/stedy/Machine-Learning-with-R-datasets/master/insurance.csv')
display(df.head())
print('-----------------')
for c in df:
    print(c, df[c].dtype)


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


-----------------
age int64
sex object
bmi float64
children int64
smoker object
region object
charges float64


In [97]:
#Preprocess data - #Start by converting string to int
df['sex'] = pd.Categorical(df['sex']).codes
df['children'] = pd.Categorical(df['children']).codes
df['smoker'] = pd.Categorical(df['smoker']).codes
df['region'] = pd.Categorical(df['region']).codes

#Create transformer
transformer = make_column_transformer(
    (MinMaxScaler(), ['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'])
)

#Transform and split data
transformer.fit(df)
data = transformer.transform(df)

#Split data
y_data = data[:,-1]
X_data = df.iloc[:,:-1]

test_size = 100
X_train, X_test = X_data[:-test_size], X_data[-test_size:]
y_train, y_test = y_data[:-test_size], y_data[-test_size:]

#What does our data look like now?
display(df.head())

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.88,0,0,1,3866.8552


In [99]:
#Create model
model = Sequential()
model.add(Dense(256, input_shape=(X_data.shape[1],), activation="relu"))
model.add(Dense(128, activation="relu"))
model.add(Dense(1))

#Compile and fit model
model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(), metrics='mse')
model.fit(X_train, y_train, batch_size=8, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x18352534280>

In [85]:
#Get predictions and mse
preds = model.predict(X_test)
mse = mean_squared_error(y_test, preds)
x = np.arange(len(preds))

#Plot data
plt.figure(facecolor=soft_white)
ax = plt.axes()
ax.plot(x, y_test, label='True')
ax.plot(x, preds, label='Preds')

#Styling
ax.set_facecolor(soft_white)
ax.set_xlabel('Sample Number', fontproperties=prop)
ax.set_ylabel('Mean Squared Error (MSE)', fontproperties=prop)
ax.set_title(f'Regression Neural Network MSE ({round(mse, 5)})')
for label in ax.get_xticklabels():
    label.set_fontproperties(prop)
for label in ax.get_yticklabels():
    label.set_fontproperties(prop)

ax.legend()
plt.savefig('../extras/figures/nn_mse.png')



In [102]:
#sklearn regression - SVM, SGD, KNN, Gaussian, Bayesian, Decision Tree, AdaBoostRegressor, DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor, GradientBoostingRegressor

ensembles = [('adaboost_mse.png', AdaBoostRegressor()), ('randomforest_mse.png', RandomForestRegressor()), ('gradientboost_mse.png', GradientBoostingRegressor())]
for savepath, e in ensembles:
    e.fit(X_train, y_train)
    preds = e.predict(X_test)
    mse = mean_squared_error(y_test, preds)
    print(e, mse)

    x = np.arange(len(preds))
    
    #Plot data
    plt.figure(facecolor=soft_white)
    ax = plt.axes()
    ax.plot(x, y_test, label='True')
    ax.plot(x, preds, label='Preds')

    #Styling
    ax.set_facecolor(soft_white)
    ax.set_xlabel('Sample Number', fontproperties=prop)
    ax.set_ylabel('Mean Squared Error (MSE)', fontproperties=prop)
    ax.set_title(f'{str(e)[:-2]} MSE ({round(mse, 5)})')
    for label in ax.get_xticklabels():
        label.set_fontproperties(prop)
    for label in ax.get_yticklabels():
        label.set_fontproperties(prop)

    ax.legend()
    plt.savefig(f'../extras/figures/{savepath}')


BayesianRidge() 0.010070940238109234
AdaBoostRegressor() 0.006146852104194975
RandomForestRegressor() 0.003976904703289654
GradientBoostingRegressor() 0.00463148916592886
