In [1]:
import pandas as pd

# Load the insurance dataset from a CSV file in the working directory.
data = pd.read_csv("insurance.csv")
# Bare last expression so the notebook renders the frame
# (pandas shows only the first and last rows of a long frame).
data

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [2]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


## Experiment 1

In [3]:
# Split the frame into features and target: every column except the last
# goes into X, and the last column becomes the label vector y.
X, y = data.iloc[:, :-1], data.iloc[:, -1]

# Preview the first rows of each part.
for part in (X, y):
    print(part.head())

   age     sex     bmi  children smoker     region
0   19  female  27.900         0    yes  southwest
1   18    male  33.770         1     no  southeast
2   28    male  33.000         3     no  southeast
3   33    male  22.705         0     no  northwest
4   32    male  28.880         0     no  northwest
0    16884.92400
1     1725.55230
2     4449.46200
3    21984.47061
4     3866.85520
Name: charges, dtype: float64


In [4]:
#Note that data has categorical features in 3 cols:
# sex, smoker and region.
#numerize them
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the text columns and pass the remaining columns through unchanged.
# cols2trans holds positional indices into X: 1 = sex, 4 = smoker, 5 = region.
cols2trans = [1,4,5]
columnTransformer = ColumnTransformer([('encoder', OneHotEncoder(), cols2trans)], remainder='passthrough')

# from sklearn.feature_extraction.text import CountVectorizer
# count_vect = CountVectorizer()
# Fit the transformer on X and build the encoded feature matrix.
X_counts = columnTransformer.fit_transform(X)
print(X.shape)
print(X_counts.shape) # one-hot encoding widens the matrix from 6 to 11 columns

(1338, 6)
(1338, 11)


In [5]:
print(X_counts[:2,:])
# Columns are rearranged: the one-hot encoded columns come first, followed by the passthrough columns.
# OneHotEncoder orders each feature's categories alphabetically, so the layout is:
# sex:female, sex:male, smoker:no, smoker:yes, region:northeast, region:northwest, region:southeast, region:southwest, age, bmi, children

[[ 1.    0.    0.    1.    0.    0.    0.    1.   19.   27.9   0.  ]
 [ 0.    1.    1.    0.    0.    0.    1.    0.   18.   33.77  1.  ]]


In [6]:
# Baseline: fit on the numeric columns only (age, bmi, children) -- the score is poor.
from sklearn.linear_model import LinearRegression
Xn = X.iloc[:,[0,2,3]] 
reg = LinearRegression().fit(Xn, y)
# The score is computed on the same rows used for fitting (no held-out split yet).
print(reg.score(Xn, y))
# print(reg.coef_)
# print(reg.intercept_)

0.12009819576246927


In [7]:
# Refit on the full one-hot encoded matrix: the score on the fitting data
# is much better than with the numeric columns alone.
from sklearn.linear_model import LinearRegression

reg = LinearRegression()
reg.fit(X_counts, y)

# Report the score, then the fitted coefficients and the intercept.
for fitted_value in (reg.score(X_counts, y), reg.coef_, reg.intercept_):
    print(fitted_value)

0.7509130345985207
[    65.6571797     -65.6571797  -11924.26727096  11924.26727096
    587.00923503    234.0453356    -448.01281436   -373.04175627
    256.85635254    339.19345361    475.50054515]
-666.9377199366809


## Experiment 2

We don't want to repeat these preprocessing steps by hand for every new dataset.

Instead, create a pipeline that automates the process for new data.


In [8]:
# Show the feature column names and one example row for reference.
print(list(X.columns))
print(X.head(1))

['age', 'sex', 'bmi', 'children', 'smoker', 'region']
   age     sex   bmi  children smoker     region
0   19  female  27.9         0    yes  southwest


In [9]:
#We don't want to manually do this. 
#Create a pipeline for automating the process for new data

#see: 
# https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html
#we have no missing values to fill here, but... 

from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn import svm
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import PolynomialFeatures

# Fixed seed so the train/test split -- and therefore the reported scores and
# the test files saved later -- are identical on every Restart & Run All.
RANDOM_STATE = 42

# Numeric columns: fill missing values with the median, standardise,
# then expand with polynomial terms of degree 2.
numeric_features = ['age', 'bmi','children']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(2))])

# Categorical columns: fill missing values with the constant 'missing',
# one-hot encode (categories unseen during fit are ignored at predict time),
# then expand with polynomial terms of degree 2.
categorical_features = ['sex', 'smoker','region']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ('poly', PolynomialFeatures(2))])

# Route each group of columns through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Append the estimator to the preprocessing pipeline to get a full prediction
# pipeline. The step is named 'classifier' although the estimator is a
# regressor; the name is kept because parameter paths such as
# 'classifier__fit_intercept' depend on it.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LinearRegression(fit_intercept=True))])
#                     ('classifier', svm.SVR(kernel='linear'))])

# Hold out 20% of the rows for testing; seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE)

clf.fit(X_train, y_train)
print("model TRAIN score: %.3f" % clf.score(X_train, y_train))
print("model TEST score: %.3f" % clf.score(X_test, y_test))

model TRAIN score: 0.768
model TEST score: 0.734


In [10]:
# Visualise the pipeline: switch the display setting to 'diagram' so the
# bare `clf` expression below renders as a diagram in the notebook.
from sklearn import set_config
set_config(display='diagram')
clf

In [11]:
# List every tunable parameter name of the pipeline. Nested steps are
# addressed with double underscores, e.g. 'preprocessor__num__poly__degree'.
clf.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'preprocessor', 'classifier', 'preprocessor__n_jobs', 'preprocessor__remainder', 'preprocessor__sparse_threshold', 'preprocessor__transformer_weights', 'preprocessor__transformers', 'preprocessor__verbose', 'preprocessor__num', 'preprocessor__cat', 'preprocessor__num__memory', 'preprocessor__num__steps', 'preprocessor__num__verbose', 'preprocessor__num__imputer', 'preprocessor__num__scaler', 'preprocessor__num__poly', 'preprocessor__num__imputer__add_indicator', 'preprocessor__num__imputer__copy', 'preprocessor__num__imputer__fill_value', 'preprocessor__num__imputer__missing_values', 'preprocessor__num__imputer__strategy', 'preprocessor__num__imputer__verbose', 'preprocessor__num__scaler__copy', 'preprocessor__num__scaler__with_mean', 'preprocessor__num__scaler__with_std', 'preprocessor__num__poly__degree', 'preprocessor__num__poly__include_bias', 'preprocessor__num__poly__interaction_only', 'preprocessor__num__poly__order', 'preprocessor__cat__

In [12]:
# Optional: tune the pipeline with a grid search.

# Candidate values, keyed by the pipeline's double-underscore parameter paths.
param_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    # 'classifier__C': [0.1, 1.0, 10, 100],
    'classifier__fit_intercept': [True, False],
    'preprocessor__num__poly__degree': [1,2,3,4],
}

# Search every combination with 2-fold cross-validation on the training split.
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=2)
grid_search.fit(X_train, y_train)

# Score the fitted search object on the held-out test split.
grid_test_score = grid_search.score(X_test, y_test)
print("best TEST score from grid search: %.3f" % grid_test_score)

best TEST score from grid search: 0.738


## Save model

In [13]:
# First, save the held-out test split so it can be reloaded later.
# index=False leaves the row index out of the CSV files.
X_test.to_csv('X_test.csv',index=False)
y_test.to_csv('y_test.csv',index=False)

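(Leftover snippet: `clf2 = pickle.loads(s)`. Neither `pickle` nor `s` is defined in this notebook; the model is saved and reloaded with joblib in the next cells.)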

In [14]:
# Then save the model with joblib.
# Note: this persists `clf` (the pipeline fitted before the grid search),
# not the tuned estimator held by `grid_search`.
from joblib import dump, load
dump(clf, 'ins_linreg.joblib') 

['ins_linreg.joblib']

In [15]:
# Round-trip check: reload the saved pipeline and score it on the test split.
# The value should match the TEST score printed when `clf` was fitted.
clf2 = load('ins_linreg.joblib') 
print(clf2.score(X_test,y_test))
# print(clf2.predict(X_test))

0.7343689246935985


## Plot

In [16]:
# import matplotlib.pyplot as plt
# matplotlib is not installed here. TODO: build the equivalent chart with Altair.
import altair as alt
# Predictions for the test split, intended as input for the plot below.
y_pred = clf2.predict(X_test)


# Original matplotlib sketch, kept for reference until the Altair version exists:
# plt.scatter(X_test, y_test,  color='black')
# plt.plot(X_test, y_pred, color='blue', linewidth=3)
# plt.xticks(())
# plt.yticks(())
# plt.show()

In [17]:
# %whos

In [18]:
# Test the CSV round trip: read the saved test features back from disk and
# predict with the model reloaded from its joblib file.

# Use a dedicated name instead of rebinding `data`, so the original dataset
# loaded at the top of the notebook is not overwritten and the earlier cells
# can still be re-run.
X_test_loaded = pd.read_csv('X_test.csv')
model = load('ins_linreg.joblib')
predictions = model.predict(X_test_loaded)

# The linear model can produce negative charges; print them, then clamp to zero.
print(predictions[predictions<0])
predictions[predictions<0] = 0.

[-1732.  -580.]


In [19]:
# Peek at one test row to see the column layout the model takes as input.
print(X_test.head(1))

    age   sex    bmi  children smoker     region
92   59  male  29.83         3    yes  northeast


In [20]:
#test online pred

def predict(model, input_df):
    """Return the model's predictions for the given input.

    Args:
        model: Any object exposing a ``predict`` method.
        input_df: Input rows, forwarded unchanged to ``model.predict``.

    Returns:
        Whatever ``model.predict(input_df)`` returns.
    """
    return model.predict(input_df)

# Reload the saved pipeline from disk.
model = load('ins_linreg.joblib')

# Build a one-row frame with the same column names as the training features.
age=19
sex='male'
bmi=25.0
children=1
smoker='no'
region='southwest'
input_dict = {'age' : age, 'sex' : sex, 'bmi' : bmi, 'children' : children, 'smoker' : smoker, 'region' : region}
input_df = pd.DataFrame([input_dict])

# Compare shapes: the column count must match the test features.
print(input_df.shape)
print(X_test.shape)

# Predict with the pipeline reloaded earlier in the notebook.
y_blah = clf2.predict(input_df)
print(y_blah)

# Predict again through the helper, using the freshly loaded model.
output = predict(model=model, input_df=input_df)[0]
# Format as currency with exactly two decimals; str(round(x, 2)) drops the
# trailing zero and would print '$1550.0' instead of '$1550.00'.
output = '$%.2f' % output
print(output)


(1, 6)
(268, 6)
[1550.]
$1550.0


In [22]:
#not sure how to do this either...
#don't think there is a native option.
#prob need flask/flask_restful

# import requests
# url = 'https://streamlit-blah.herokuapp.com/predict_api'
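# NOTE: 'smoker' should be 'yes' or 'no' as in the training data; 'male' below looks like a copy-paste slip.
# pred = requests.post(url,json={'age':55, 'sex':'male', 'bmi':59, 'children':1, 'smoker':'male', 'region':'northwest'})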
# print(pred.json())