In [1]:
# This takes a dataset, prepares the data, splits it, tries out different ML models,
# picks the best one based on test data, checks it on validation data, saves the pipeline,
# and maybe tries out a web app

# (In the style of Chpt. 2 exercises, p. 84)

In [85]:
import os
import time
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from scipy.stats import uniform

%matplotlib qt

In [2]:
STEM = "/Users/bandari/Documents/git.repos/ml/"
DATA_PATH = os.path.join(STEM, "datasets")

### Function defs

In [3]:
def load_data(data_path=DATA_PATH):
    csv_path = os.path.join(DATA_PATH, "heart.csv")
    return pd.read_csv(csv_path)

In [4]:
def split_train_test(data, test_ratio):
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [5]:
def display_scores(scores):
    print("Scores: ", scores)
    print("Mean: ", scores.mean())
    print("Standard deviation: ", scores.std())

### Read in data

In [6]:
df = load_data()

### View it 

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [8]:
df.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


### Some pre-processing

#### encoding before splitting

In [9]:
# turn some string quantities into numerical ones ('encode')
# and show what the numbers stand for

# remove NaNs first
#df["Expedition"].fillna("unk", inplace=True)

ordinal_encoder = OrdinalEncoder()
df["Sex_encoded"] = ordinal_encoder.fit_transform(df[["Sex"]])
print(ordinal_encoder.categories_)
print("----")
df["ChestPainType_encoded"] = ordinal_encoder.fit_transform(df[["ChestPainType"]])
print(ordinal_encoder.categories_)
print("----")
df["RestingECG_encoded"] = ordinal_encoder.fit_transform(df[["RestingECG"]])
print(ordinal_encoder.categories_)
print("----")
df["ExerciseAngina_encoded"] = ordinal_encoder.fit_transform(df[["ExerciseAngina"]])
print(ordinal_encoder.categories_)
print("----")
df["ST_Slope_encoded"] = ordinal_encoder.fit_transform(df[["ST_Slope"]])
print(ordinal_encoder.categories_)
print("----")

[array(['F', 'M'], dtype=object)]
----
[array(['ASY', 'ATA', 'NAP', 'TA'], dtype=object)]
----
[array(['LVH', 'Normal', 'ST'], dtype=object)]
----
[array(['N', 'Y'], dtype=object)]
----
[array(['Down', 'Flat', 'Up'], dtype=object)]
----


In [10]:
# scatter matrix

scatter_matrix(df)

array([[<AxesSubplot:xlabel='Age', ylabel='Age'>,
        <AxesSubplot:xlabel='RestingBP', ylabel='Age'>,
        <AxesSubplot:xlabel='Cholesterol', ylabel='Age'>,
        <AxesSubplot:xlabel='FastingBS', ylabel='Age'>,
        <AxesSubplot:xlabel='MaxHR', ylabel='Age'>,
        <AxesSubplot:xlabel='Oldpeak', ylabel='Age'>,
        <AxesSubplot:xlabel='HeartDisease', ylabel='Age'>,
        <AxesSubplot:xlabel='Sex_encoded', ylabel='Age'>,
        <AxesSubplot:xlabel='ChestPainType_encoded', ylabel='Age'>,
        <AxesSubplot:xlabel='RestingECG_encoded', ylabel='Age'>,
        <AxesSubplot:xlabel='ExerciseAngina_encoded', ylabel='Age'>,
        <AxesSubplot:xlabel='ST_Slope_encoded', ylabel='Age'>],
       [<AxesSubplot:xlabel='Age', ylabel='RestingBP'>,
        <AxesSubplot:xlabel='RestingBP', ylabel='RestingBP'>,
        <AxesSubplot:xlabel='Cholesterol', ylabel='RestingBP'>,
        <AxesSubplot:xlabel='FastingBS', ylabel='RestingBP'>,
        <AxesSubplot:xlabel='MaxHR', ylabel='Re

### Split into training and validation datasets

In [11]:
# note this is before feature scaling

train_set, test_set = split_train_test(df, 0.2)

In [12]:
train_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 735 entries, 681 to 517
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Age                     735 non-null    int64  
 1   Sex                     735 non-null    object 
 2   ChestPainType           735 non-null    object 
 3   RestingBP               735 non-null    int64  
 4   Cholesterol             735 non-null    int64  
 5   FastingBS               735 non-null    int64  
 6   RestingECG              735 non-null    object 
 7   MaxHR                   735 non-null    int64  
 8   ExerciseAngina          735 non-null    object 
 9   Oldpeak                 735 non-null    float64
 10  ST_Slope                735 non-null    object 
 11  HeartDisease            735 non-null    int64  
 12  Sex_encoded             735 non-null    float64
 13  ChestPainType_encoded   735 non-null    float64
 14  RestingECG_encoded      735 non-null    

### Feature-scale only the training set

In [13]:
# make a 'pipeline' to feature scale

num_pipeline = Pipeline([('std_scaler', StandardScaler())])

# drop non-numerical features
train_set_dropped = train_set.drop(columns=["Sex","ChestPainType","RestingECG","ExerciseAngina","ST_Slope"], inplace=False)

# the syntax here is necessary to restore column keys
train_set_preprocessed = pd.DataFrame(num_pipeline.fit_transform(train_set_dropped),columns = train_set_dropped.columns)

### Separate training set into predictors and labels

In [14]:
# predictors: the things that help us predict
# labels: the things we want to predict

data_predictors = train_set_preprocessed.drop("HeartDisease", axis=1)
data_labels = train_set_preprocessed["HeartDisease"].copy()

In [15]:
type(data_predictors)

pandas.core.frame.DataFrame

### Try individual ML models with a grid search for each

#### linear model

In [16]:
# linear model

lin_reg = LinearRegression()
lin_reg.fit(data_predictors, data_labels)
data_predcns_lin = lin_reg.predict(data_predictors)
mean_squared_error(data_labels,data_predcns_lin)

0.48643517760502203

In [17]:
# see a few examples

print("Predictions: ", lin_reg.predict(data_predictors)[:5])
print("Labels: ", list(data_labels)[:5])

Predictions:  [-0.36163091 -0.12913792 -0.45140984  0.43995668 -0.95392987]
Labels:  [-1.1643750387092773, 0.858829815785566, -1.1643750387092773, 0.858829815785566, -1.1643750387092773]


In [18]:
data_predcns_lin.shape

(735,)

In [19]:
# How good are the predictions?

plt.hist(data_predcns_lin[np.where(data_labels > 0)], color="k", alpha=0.5)
plt.axvline(x=0.8878745226, color="k", alpha=1)
plt.hist(data_predcns_lin[np.where(data_labels < 0)], color="blue", alpha=0.5)
plt.axvline(x=-1.126285, color="blue", alpha=1)
#plt.legend()
plt.show()

In [20]:
# see rmse of whole dataset

lin_mse = mean_squared_error(data_labels, data_predcns_lin) # squeeze so both are pandas series
lin_rmse = np.sqrt(lin_mse)

print(lin_rmse)

0.6974490501857623


In [21]:
# see coefficients 
# (ref. https://scikit-learn.org/stable/modules/linear_model.html )

lin_reg.coef_

array([ 0.02490006,  0.0087377 , -0.10778071,  0.1101188 , -0.12886041,
        0.10822668,  0.13456085, -0.16688587, -0.0198067 ,  0.17376875,
       -0.2970542 ])

#### decision tree model

In [22]:
tree_reg = DecisionTreeRegressor()
#tree_reg.fit(data_preds, data_labels)
tree_reg.fit(data_predictors, data_labels)
data_predcns_dt = tree_reg.predict(data_predictors) # _dt: decision_tree
tree_mse = mean_squared_error(data_labels, data_predcns_dt)
tree_rmse = np.sqrt(tree_mse)

print(tree_rmse)

8.884521570044615e-16


In [23]:
# cross-validate to avoid overfitting

scores_tree = cross_val_score(tree_reg, data_predictors, data_labels, 
                         scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores_tree)

scores_lin = cross_val_score(lin_reg, data_predictors, data_labels, 
                         scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-scores_lin)

In [24]:
display_scores(lin_rmse_scores)

Scores:  [0.64007831 0.69610174 0.67199793 0.79153637 0.69641026 0.69697922
 0.68949356 0.74762391 0.65951315 0.7903745 ]
Mean:  0.7080108936816955
Standard deviation:  0.04932975991189266


In [25]:
display_scores(tree_rmse_scores)

Scores:  [0.91089775 0.7437449  0.9697247  0.88001083 0.78004623 0.97634406
 0.74882171 0.97634406 0.94719287 0.94719287]
Mean:  0.8880319976199023
Standard deviation:  0.09039808406223959


In [26]:
# How good are the predictions?

plt.hist(data_predcns_dt[np.where(data_labels > 0)], color="k", alpha=0.5)
plt.axvline(x=0.8878745226, color="k", alpha=1)
plt.hist(data_predcns_dt[np.where(data_labels < 0)], color="blue", alpha=0.5)
plt.axvline(x=-1.126285, color="blue", alpha=1)
#plt.legend()
plt.show()

In [27]:
data_predictors.shape

(735, 11)

#### random forest regressor

In [28]:
# note the way the parameters being grid-searched are named, given that the search takes in
# an entire pipeline, and not just one step in isolation

pipeline_rf = Pipeline(steps = [("rf_reg", RandomForestRegressor())])

param_grid = [
    {'rf_reg__n_estimators': [3,10,30], 'rf_reg__max_features': [2,4,6,8]},
    {'rf_reg__bootstrap': [False], 'rf_reg__n_estimators': [3,10], 'rf_reg__max_features': [2,3,4]}
]

grid_search = GridSearchCV(pipeline_rf, param_grid, cv=5,
                        scoring="neg_mean_squared_error",
                        return_train_score=True)

grid_search.fit(data_predictors, data_labels)

print("Best parameter (CV score=%0.3f):" % grid_search.best_score_)
print(grid_search.best_params_)

Best parameter (CV score=-0.417):
{'rf_reg__max_features': 4, 'rf_reg__n_estimators': 30}


In [29]:
pipeline_rf.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'rf_reg', 'rf_reg__bootstrap', 'rf_reg__ccp_alpha', 'rf_reg__criterion', 'rf_reg__max_depth', 'rf_reg__max_features', 'rf_reg__max_leaf_nodes', 'rf_reg__max_samples', 'rf_reg__min_impurity_decrease', 'rf_reg__min_impurity_split', 'rf_reg__min_samples_leaf', 'rf_reg__min_samples_split', 'rf_reg__min_weight_fraction_leaf', 'rf_reg__n_estimators', 'rf_reg__n_jobs', 'rf_reg__oob_score', 'rf_reg__random_state', 'rf_reg__verbose', 'rf_reg__warm_start'])

#### support vector machine regressor

In [30]:
svm_reg = SVR(kernel="linear", gamma="scale", C=0.8)
#tree_reg.fit(data_preds, data_labels)
svm_reg.fit(data_predictors, data_labels)
data_predcns_svm = svm_reg.predict(data_predictors) # _svm: support vector machine
svm_mse = mean_squared_error(data_labels, data_predcns_svm)
svm_rmse = np.sqrt(svm_mse)

print(svm_rmse)

0.7400159204443904


In [31]:
# How good are the predictions?

plt.hist(data_predcns_svm[np.where(data_labels > 0)], color="k", alpha=0.5)
plt.axvline(x=0.8878745226, color="k", alpha=1)
plt.hist(data_predcns_svm[np.where(data_labels < 0)], color="blue", alpha=0.5)
plt.axvline(x=-1.126285, color="blue", alpha=1)
#plt.legend()
plt.show()

In [32]:
# try a grid search

pipeline_svm = Pipeline(steps = [("svm_reg", SVR())])

param_grid = [
    {'svm_reg__epsilon': [0.1,0.2,0.3],
    'svm_reg__gamma': ["scale", "auto"], 
     'svm_reg__kernel': ["linear", "poly", "rbf", "sigmoid"], 
     'svm_reg__C': [0.1,0.3]}
]

grid_search = GridSearchCV(pipeline_svm, param_grid, cv=5,
                        scoring="neg_mean_squared_error",
                        return_train_score=True)

grid_search.fit(data_predictors, data_labels)

print("Best parameter (CV score=%0.3f):" % grid_search.best_score_)
print(grid_search.best_params_)

Best parameter (CV score=-0.423):
{'svm_reg__C': 0.3, 'svm_reg__epsilon': 0.3, 'svm_reg__gamma': 'auto', 'svm_reg__kernel': 'rbf'}


In [33]:
# apply those best-fit parameters and see the results

svm_reg = SVR(kernel=grid_search.best_params_["svm_reg__kernel"], 
              gamma=grid_search.best_params_["svm_reg__gamma"], 
              epsilon=grid_search.best_params_["svm_reg__epsilon"],
              C=grid_search.best_params_["svm_reg__C"])
#tree_reg.fit(data_preds, data_labels)
svm_reg.fit(data_predictors, data_labels)
data_predcns_svm = svm_reg.predict(data_predictors) # _svm: support vector machine
svm_mse = mean_squared_error(data_labels, data_predcns_svm)
svm_rmse = np.sqrt(svm_mse)

plt.hist(data_predcns_svm[np.where(data_labels > 0)], color="k", alpha=0.5)
plt.axvline(x=0.8878745226, color="k", alpha=1)
plt.hist(data_predcns_svm[np.where(data_labels < 0)], color="blue", alpha=0.5)
plt.axvline(x=-1.126285, color="blue", alpha=1)
#plt.legend()
plt.show()

In [34]:
pipeline_rf.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'rf_reg', 'rf_reg__bootstrap', 'rf_reg__ccp_alpha', 'rf_reg__criterion', 'rf_reg__max_depth', 'rf_reg__max_features', 'rf_reg__max_leaf_nodes', 'rf_reg__max_samples', 'rf_reg__min_impurity_decrease', 'rf_reg__min_impurity_split', 'rf_reg__min_samples_leaf', 'rf_reg__min_samples_split', 'rf_reg__min_weight_fraction_leaf', 'rf_reg__n_estimators', 'rf_reg__n_jobs', 'rf_reg__oob_score', 'rf_reg__random_state', 'rf_reg__verbose', 'rf_reg__warm_start'])

In [35]:
# now try a RandomizedSearchCV

distributions = dict(rf_reg__max_features=np.arange(1,5), rf_reg__max_samples=np.arange(1,5))

random_search = RandomizedSearchCV(pipeline_rf, 
                                   distributions, cv=5,
                        scoring="neg_mean_squared_error",
                        return_train_score=True)

random_search.fit(data_predictors, data_labels)
data_predcns_rf_rs = random_search.predict(data_predictors) # _rf_rs: random forest, random search

print("Best parameter (CV score=%0.3f):" % random_search.best_score_)
print(random_search.best_params_)


plt.hist(data_predcns_rf_rs[np.where(data_labels > 0)], color="k", alpha=0.5)
plt.axvline(x=0.8878745226, color="k", alpha=1)
plt.hist(data_predcns_rf_rs[np.where(data_labels < 0)], color="blue", alpha=0.5)
plt.axvline(x=-1.126285, color="blue", alpha=1)
#plt.legend()
plt.show()

Best parameter (CV score=-0.668):
{'rf_reg__max_samples': 4, 'rf_reg__max_features': 4}


In [36]:
svm_reg = SVR(kernel="linear", gamma="scale", C=0.8)
svm_reg.get_params().keys()

dict_keys(['C', 'cache_size', 'coef0', 'degree', 'epsilon', 'gamma', 'kernel', 'max_iter', 'shrinking', 'tol', 'verbose'])

### Which model appears to perform best?

#### consolidate grid searches of each model that is non-linear: decision tree, random forest, support vector machine

In [37]:
# linear model, rehash here for comparison

#lin_mse = mean_squared_error(data_labels, data_predcns_lin)
#lin_rmse = np.sqrt(lin_mse)

# cross-validate

scores_lin = cross_val_score(lin_reg, data_predictors, data_labels, 
                         scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-scores_lin)

In [38]:
print("Linear rmse scores: ", lin_rmse_scores)
print("Avg: ", np.mean(lin_rmse_scores))

Linear rmse scores:  [0.64007831 0.69610174 0.67199793 0.79153637 0.69641026 0.69697922
 0.68949356 0.74762391 0.65951315 0.7903745 ]
Avg:  0.7080108936816955


In [39]:
pipeline_dt.get_params().keys()

NameError: name 'pipeline_dt' is not defined

#### decision tree

In [40]:
# grid search, then cross-validate

pipeline_dt = Pipeline(steps = [("dt_reg", DecisionTreeRegressor())])

param_grid = [
    {'dt_reg__max_leaf_nodes': [1,2,3], 'dt_reg__min_weight_fraction_leaf': [0.0,0.3]},
    {'dt_reg__min_impurity_decrease': [0.0,0.1]}
]

grid_search = GridSearchCV(pipeline_dt, param_grid, cv=5,
                        scoring="neg_mean_squared_error",
                        return_train_score=True)

grid_search.fit(data_predictors, data_labels)

print("Best parameter (CV score=%0.3f):" % grid_search.best_score_)
print(grid_search.best_params_)


scores_tree = cross_val_score(pipeline_dt, data_predictors, data_labels, 
                         scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores_tree)

Traceback (most recent call last):
  File "/Users/bandari/anaconda3/envs/ml_env/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/bandari/anaconda3/envs/ml_env/lib/python3.7/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/bandari/anaconda3/envs/ml_env/lib/python3.7/site-packages/sklearn/tree/_classes.py", line 1256, in fit
    X_idx_sorted=X_idx_sorted)
  File "/Users/bandari/anaconda3/envs/ml_env/lib/python3.7/site-packages/sklearn/tree/_classes.py", line 295, in fit
    "or larger than 1").format(max_leaf_nodes))
ValueError: max_leaf_nodes 1 must be either None or larger than 1

Traceback (most recent call last):
  File "/Users/bandari/anaconda3/envs/ml_env/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_

Best parameter (CV score=-0.559):
{'dt_reg__max_leaf_nodes': 3, 'dt_reg__min_weight_fraction_leaf': 0.0}


 -0.82424077 -0.60607335]
 -5.30632325e-01 -6.00539385e-01 -1.53202355e-30 -6.00539385e-01]


In [41]:
print("Decision tree rmse scores: ", tree_rmse_scores)
print("Avg: ", np.mean(tree_rmse_scores))

Decision tree rmse scores:  [0.88001083 0.78004623 0.99783849 0.81473171 0.81473171 1.00464975
 0.82029309 0.91711555 0.8860178  0.91711555]
Avg:  0.8832550710374422


#### random forest regressor

In [42]:
pipeline_rf = Pipeline(steps = [("rf_reg", RandomForestRegressor())])

param_grid = [
    {'rf_reg__n_estimators': [3,10,30], 'rf_reg__max_features': [2,4,6,8]},
    {'rf_reg__bootstrap': [False], 'rf_reg__n_estimators': [3,10], 'rf_reg__max_features': [2,3,4]}
]

grid_search = GridSearchCV(pipeline_rf, param_grid, cv=5,
                        scoring="neg_mean_squared_error",
                        return_train_score=True)

grid_search.fit(data_predictors, data_labels)

print("Best parameter (CV score=%0.3f):" % grid_search.best_score_)
print(grid_search.best_params_)

scores_rf = cross_val_score(pipeline_rf, data_predictors, data_labels, 
                         scoring="neg_mean_squared_error", cv=10)
rf_rmse_scores = np.sqrt(-scores_rf)

Best parameter (CV score=-0.403):
{'rf_reg__max_features': 2, 'rf_reg__n_estimators': 30}


In [43]:
print("Random forest rmse scores: ", rf_rmse_scores)
print("Avg: ", np.mean(rf_rmse_scores))

Random forest rmse scores:  [0.64698967 0.61269865 0.6585776  0.66297672 0.6538994  0.70383664
 0.59469363 0.66150674 0.58460392 0.66364364]
Avg:  0.6443426609331281


#### support vector machine regressor

In [44]:
pipeline_svm = Pipeline(steps = [("svm_reg", SVR())])

param_grid = [
    {'svm_reg__epsilon': [0.1,0.2,0.3],
    'svm_reg__gamma': ["scale", "auto"], 
     'svm_reg__kernel': ["linear", "poly", "rbf", "sigmoid"], 
     'svm_reg__C': [0.1,0.3]}
]

grid_search = GridSearchCV(pipeline_svm, param_grid, cv=5,
                        scoring="neg_mean_squared_error",
                        return_train_score=True)

grid_search.fit(data_predictors, data_labels)

print("Best parameter (CV score=%0.3f):" % grid_search.best_score_)
print(grid_search.best_params_)

scores_svm = cross_val_score(pipeline_svm, data_predictors, data_labels, 
                         scoring="neg_mean_squared_error", cv=10)
svm_rmse_scores = np.sqrt(-scores_svm)

Best parameter (CV score=-0.423):
{'svm_reg__C': 0.3, 'svm_reg__epsilon': 0.3, 'svm_reg__gamma': 'auto', 'svm_reg__kernel': 'rbf'}


In [45]:
print("Support vector machine rmse scores: ", svm_rmse_scores)
print("Avg: ", np.mean(svm_rmse_scores))

Support vector machine rmse scores:  [0.64202369 0.60788866 0.60124862 0.68982056 0.63324615 0.64226174
 0.5874549  0.68893765 0.54627377 0.74539151]
Avg:  0.6384547229428599


### Check performance of all models on validation data. 
### Is the best model based on the training data also the best based on the validation data?

In [46]:
# drop the non-numerical stuff

test_set_dropped = test_set.drop(
    columns=["Sex","ChestPainType","RestingECG","ExerciseAngina","ST_Slope"], 
    inplace=False)

In [47]:
# apply transformation to the validation set, as applied to the training set
# * note that this uses the function transform()---not fit_transform()---so that
# the same transformation is applied while blind to the content of the test data

# the syntax here is necessary to restore column keys
test_set_preprocessed = pd.DataFrame(num_pipeline.transform(train_set_dropped),columns = test_set_dropped.columns)

In [48]:
test_set_preprocessed

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_encoded,ChestPainType_encoded,RestingECG_encoded,ExerciseAngina_encoded,ST_Slope_encoded
0,-0.297668,0.390043,0.592328,-0.571590,1.966222,-0.842863,-1.164375,0.506370,-0.789753,-1.554706,1.164375,1.074942
1,0.653576,0.914717,0.565434,-0.571590,0.822046,1.517154,0.858830,-1.974842,-0.789753,-1.554706,-0.858830,-0.554094
2,0.019413,-0.659304,0.699904,-0.571590,0.545866,0.518685,-1.164375,-1.974842,0.256136,0.021444,-0.858830,-0.554094
3,0.336495,1.701727,0.843339,1.749507,-0.479947,0.064836,0.858830,0.506370,-0.789753,-1.554706,-0.858830,-0.554094
4,-1.988771,-0.659304,1.013668,-0.571590,1.729496,-0.842863,-1.164375,0.506370,0.256136,-1.554706,-0.858830,1.074942
...,...,...,...,...,...,...,...,...,...,...,...,...
730,-1.460301,1.019651,0.251670,-0.571590,1.768951,-0.842863,0.858830,0.506370,-0.789753,0.021444,-0.858830,1.074942
731,1.393434,1.439390,1.694985,1.749507,-0.243221,-0.842863,0.858830,0.506370,-0.789753,1.597595,1.164375,-0.554094
732,-1.354607,-1.498782,-1.747455,-0.571590,-0.992854,-0.842863,-1.164375,0.506370,-0.789753,1.597595,-0.858830,1.074942
733,0.864964,-0.921641,-1.747455,1.749507,-2.531574,-1.296713,0.858830,0.506370,-0.789753,0.021444,1.164375,-0.554094


In [49]:
# predictors: the things that help us predict
# labels: the things we want to predict

test_data_predictors = test_set_preprocessed.drop("HeartDisease", axis=1)
test_data_labels = test_set_preprocessed["HeartDisease"].copy()

In [50]:
test_data_labels

0     -1.164375
1      0.858830
2     -1.164375
3      0.858830
4     -1.164375
         ...   
730    0.858830
731    0.858830
732   -1.164375
733    0.858830
734    0.858830
Name: HeartDisease, Length: 735, dtype: float64

### Agglomerate into a single pipeline, and save the best one

In [51]:
# do grid searches with all the models we tried on the training
# data

# linear model
print("Linear model...")
lin_reg = LinearRegression()
lin_reg.fit(test_data_predictors, test_data_labels)
data_predcns_lin = lin_reg.predict(test_data_predictors)
print("Score: ", -mean_squared_error(test_data_labels, data_predcns_lin))
print("--------------------")

# decision tree
print("Decision tree...")
param_grid_dt = [
    {'dt_reg__max_leaf_nodes': [1,2,3], 'dt_reg__min_weight_fraction_leaf': [0.0,0.3]},
    {'dt_reg__min_impurity_decrease': [0.0,0.1]}
]
grid_search_dt = GridSearchCV(pipeline_dt, param_grid_dt, cv=5,
                        scoring="neg_mean_squared_error",
                        return_train_score=True)
grid_search_dt.fit(test_data_predictors, test_data_labels)
print("Best parameter (CV score=%0.3f):" % grid_search_dt.best_score_)
print(grid_search_dt.best_params_)
print("--------------------")

# random forest regressor
print("Random forest...")
param_grid_rf = [
    {'rf_reg__n_estimators': [3,10,30], 'rf_reg__max_features': [2,4,6,8]},
    {'rf_reg__bootstrap': [False], 'rf_reg__n_estimators': [3,10], 'rf_reg__max_features': [2,3,4]}
]
grid_search_rf = GridSearchCV(pipeline_rf, param_grid_rf, cv=5,
                        scoring="neg_mean_squared_error",
                        return_train_score=True)
grid_search_rf.fit(test_data_predictors, test_data_labels)
print("Best parameter (CV score=%0.3f):" % grid_search_rf.best_score_)
print(grid_search_rf.best_params_)
print("--------------------")

# support vector machine
print("Support vector machine...")
param_grid_svm = [
    {'svm_reg__epsilon': [0.1,0.2,0.3],
    'svm_reg__gamma': ["scale", "auto"], 
     'svm_reg__kernel': ["linear", "poly", "rbf", "sigmoid"], 
     'svm_reg__C': [0.1,0.3]}
]
grid_search_svm = GridSearchCV(pipeline_svm, param_grid_svm, cv=5,
                        scoring="neg_mean_squared_error",
                        return_train_score=True)
grid_search_svm.fit(test_data_predictors, test_data_labels)
print("Best parameter (CV score=%0.3f):" % grid_search_svm.best_score_)
print(grid_search_svm.best_params_)
print("--------------------")


Linear model...
Score:  -0.48643517760502203
--------------------
Decision tree...


Traceback (most recent call last):
  File "/Users/bandari/anaconda3/envs/ml_env/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/bandari/anaconda3/envs/ml_env/lib/python3.7/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/bandari/anaconda3/envs/ml_env/lib/python3.7/site-packages/sklearn/tree/_classes.py", line 1256, in fit
    X_idx_sorted=X_idx_sorted)
  File "/Users/bandari/anaconda3/envs/ml_env/lib/python3.7/site-packages/sklearn/tree/_classes.py", line 295, in fit
    "or larger than 1").format(max_leaf_nodes))
ValueError: max_leaf_nodes 1 must be either None or larger than 1

Traceback (most recent call last):
  File "/Users/bandari/anaconda3/envs/ml_env/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_

 -0.7963948  -0.60607335]
 -5.30632325e-01 -6.00539385e-01 -1.53225833e-30 -6.00539385e-01]


Best parameter (CV score=-0.559):
{'dt_reg__max_leaf_nodes': 3, 'dt_reg__min_weight_fraction_leaf': 0.0}
--------------------
Random forest...
Best parameter (CV score=-0.414):
{'rf_reg__max_features': 4, 'rf_reg__n_estimators': 30}
--------------------
Support vector machine...
Best parameter (CV score=-0.423):
{'svm_reg__C': 0.3, 'svm_reg__epsilon': 0.3, 'svm_reg__gamma': 'auto', 'svm_reg__kernel': 'rbf'}
--------------------


In [54]:
plt.clf()
plt.hist(test_set_preprocessed["HeartDisease"].sort_values(ascending=False))
plt.show()

In [55]:
test_set_preprocessed["HeartDisease"]

0     -1.164375
1      0.858830
2     -1.164375
3      0.858830
4     -1.164375
         ...   
730    0.858830
731    0.858830
732   -1.164375
733    0.858830
734    0.858830
Name: HeartDisease, Length: 735, dtype: float64

### View feature importances

In [None]:
feature_names = data_predictors.keys()

In [76]:
# decision tree regression

tree_reg = DecisionTreeRegressor()
tree_reg.fit(data_predictors, data_labels)

importances_tr = tree_reg.feature_importances_

In [77]:
plt.barh(feature_names,importances_tr)
plt.show()

In [80]:
# support vector machine

svm_reg = SVR(kernel="linear", gamma="scale", C=0.8)
svm_reg.fit(data_predictors, data_labels)

importances_svm = svm_reg.feature_importances_

AttributeError: 'SVR' object has no attribute 'feature_importances_'

In [None]:
plt.barh(feature_names,importances)
plt.show()

In [82]:
svm_reg.feature_names_in_

AttributeError: 'SVR' object has no attribute 'feature_names_in_'

### Save a model

In [87]:
filename="junk.sav"
pickle.dump(tree_reg, open(filename, 'wb'))
 
# some time later...
 
# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(data_predictors, data_labels)
print(result)

1.0


### Try a web app (may need to read bookmarked websites)

In [59]:
svm_

Best parameter (CV score=-0.405):
{'svm_reg__C': 0.3, 'svm_reg__epsilon': 0.1, 'svm_reg__gamma': 'auto', 'svm_reg__kernel': 'rbf'}
--------------------
