In [1]:
# This takes a dataset, prepares the data, splits it, tries out different ML models,
# picks the best one based on test data, checks it on validation data, saves the pipeline,
# and maybe tries out a web app

# (In the style of Chpt. 2 exercises, p. 84)

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import uniform

%matplotlib qt

In [2]:
STEM = "/Users/bandari/Documents/git.repos/ml/"
DATA_PATH = os.path.join(STEM, "datasets")

### Function defs

In [3]:
def load_data(data_path=DATA_PATH):
    csv_path = os.path.join(DATA_PATH, "heart.csv")
    return pd.read_csv(csv_path)

In [4]:
def split_train_test(data, test_ratio):
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [5]:
def display_scores(scores):
    print("Scores: ", scores)
    print("Mean: ", scores.mean())
    print("Standard deviation: ", scores.std())

### Read in data

In [6]:
df = load_data()

### View it 

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [8]:
df.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


### Some pre-processing

#### encoding before splitting

In [9]:
# turn some string quantities into numerical ones ('encode')
# and show what the numbers stand for

# remove NaNs first
#df["Expedition"].fillna("unk", inplace=True)

ordinal_encoder = OrdinalEncoder()
df["Sex_encoded"] = ordinal_encoder.fit_transform(df[["Sex"]])
print(ordinal_encoder.categories_)
print("----")
df["ChestPainType_encoded"] = ordinal_encoder.fit_transform(df[["ChestPainType"]])
print(ordinal_encoder.categories_)
print("----")
df["RestingECG_encoded"] = ordinal_encoder.fit_transform(df[["RestingECG"]])
print(ordinal_encoder.categories_)
print("----")
df["ExerciseAngina_encoded"] = ordinal_encoder.fit_transform(df[["ExerciseAngina"]])
print(ordinal_encoder.categories_)
print("----")
df["ST_Slope_encoded"] = ordinal_encoder.fit_transform(df[["ST_Slope"]])
print(ordinal_encoder.categories_)
print("----")

[array(['F', 'M'], dtype=object)]
----
[array(['ASY', 'ATA', 'NAP', 'TA'], dtype=object)]
----
[array(['LVH', 'Normal', 'ST'], dtype=object)]
----
[array(['N', 'Y'], dtype=object)]
----
[array(['Down', 'Flat', 'Up'], dtype=object)]
----


In [10]:
# scatter matrix

scatter_matrix(df)

array([[<AxesSubplot:xlabel='Age', ylabel='Age'>,
        <AxesSubplot:xlabel='RestingBP', ylabel='Age'>,
        <AxesSubplot:xlabel='Cholesterol', ylabel='Age'>,
        <AxesSubplot:xlabel='FastingBS', ylabel='Age'>,
        <AxesSubplot:xlabel='MaxHR', ylabel='Age'>,
        <AxesSubplot:xlabel='Oldpeak', ylabel='Age'>,
        <AxesSubplot:xlabel='HeartDisease', ylabel='Age'>,
        <AxesSubplot:xlabel='Sex_encoded', ylabel='Age'>,
        <AxesSubplot:xlabel='ChestPainType_encoded', ylabel='Age'>,
        <AxesSubplot:xlabel='RestingECG_encoded', ylabel='Age'>,
        <AxesSubplot:xlabel='ExerciseAngina_encoded', ylabel='Age'>,
        <AxesSubplot:xlabel='ST_Slope_encoded', ylabel='Age'>],
       [<AxesSubplot:xlabel='Age', ylabel='RestingBP'>,
        <AxesSubplot:xlabel='RestingBP', ylabel='RestingBP'>,
        <AxesSubplot:xlabel='Cholesterol', ylabel='RestingBP'>,
        <AxesSubplot:xlabel='FastingBS', ylabel='RestingBP'>,
        <AxesSubplot:xlabel='MaxHR', ylabel='Re

### Split into training and validation datasets

In [10]:
# note this is before feature scaling

train_set, test_set = split_train_test(df, 0.2)

In [11]:
train_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 735 entries, 491 to 252
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Age                     735 non-null    int64  
 1   Sex                     735 non-null    object 
 2   ChestPainType           735 non-null    object 
 3   RestingBP               735 non-null    int64  
 4   Cholesterol             735 non-null    int64  
 5   FastingBS               735 non-null    int64  
 6   RestingECG              735 non-null    object 
 7   MaxHR                   735 non-null    int64  
 8   ExerciseAngina          735 non-null    object 
 9   Oldpeak                 735 non-null    float64
 10  ST_Slope                735 non-null    object 
 11  HeartDisease            735 non-null    int64  
 12  Sex_encoded             735 non-null    float64
 13  ChestPainType_encoded   735 non-null    float64
 14  RestingECG_encoded      735 non-null    

### Feature-scale only the training set

In [12]:
# make a 'pipeline' to feature scale

num_pipeline = Pipeline([('std_scaler', StandardScaler())])

# drop non-numerical features
train_set_dropped = train_set.drop(columns=["Sex","ChestPainType","RestingECG","ExerciseAngina","ST_Slope"], inplace=False)

# the syntax here is necessary to restore column keys
train_set_preprocessed = pd.DataFrame(num_pipeline.fit_transform(train_set_dropped),columns = train_set_dropped.columns)

### Separate training set into predictors and labels

In [13]:
# predictors: the things that help us predict
# labels: the things we want to predict

data_predictors = train_set_preprocessed.drop("HeartDisease", axis=1)
data_labels = train_set_preprocessed["HeartDisease"].copy()

In [16]:
type(data_predictors)

pandas.core.frame.DataFrame

### Try individual ML models with a grid search for each

#### linear model

In [16]:
# linear model

lin_reg = LinearRegression()
lin_reg.fit(data_predictors, data_labels)
data_predcns_lin = lin_reg.predict(data_predictors)

In [17]:
# see a few examples

print("Predictions: ", lin_reg.predict(data_predictors)[:5])
print("Labels: ", list(data_labels)[:5])

Predictions:  [1.58437966 0.9057943  0.52973211 0.88754973 0.11836307]
Labels:  [0.9126432521723156, 0.9126432521723156, 0.9126432521723156, 0.9126432521723156, 0.9126432521723156]


In [18]:
data_predcns_lin.shape

(735,)

In [19]:
# How good are the predictions?

plt.hist(data_predcns_lin[np.where(data_labels > 0)], color="k", alpha=0.5)
plt.axvline(x=0.8878745226, color="k", alpha=1)
plt.hist(data_predcns_lin[np.where(data_labels < 0)], color="blue", alpha=0.5)
plt.axvline(x=-1.126285, color="blue", alpha=1)
#plt.legend()
plt.show()

In [20]:
# see rmse of whole dataset

lin_mse = mean_squared_error(data_labels, data_predcns_lin) # squeeze so both are pandas series
lin_rmse = np.sqrt(lin_mse)

print(lin_rmse)

0.6979503765930667


In [21]:
# see coefficients 
# (ref. https://scikit-learn.org/stable/modules/linear_model.html )

lin_reg.coef_

array([ 0.03788052,  0.010703  , -0.09758579,  0.13869747, -0.0773848 ,
        0.09548141,  0.14664717, -0.18829068, -0.03163875,  0.16580564,
       -0.32096646])

#### decision tree model

In [22]:
tree_reg = DecisionTreeRegressor()
#tree_reg.fit(data_preds, data_labels)
tree_reg.fit(data_predictors, data_labels)
data_predcns_dt = tree_reg.predict(data_predictors) # _dt: decision_tree
tree_mse = mean_squared_error(data_labels, data_predcns_dt)
tree_rmse = np.sqrt(tree_mse)

print(tree_rmse)

6.336593434619975e-16


In [23]:
# cross-validate to avoid overfitting

scores_tree = cross_val_score(tree_reg, data_predictors, data_labels, 
                         scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores_tree)

scores_lin = cross_val_score(lin_reg, data_predictors, data_labels, 
                         scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-scores_lin)

In [24]:
display_scores(lin_rmse_scores)

Scores:  [0.69046722 0.67382764 0.78626005 0.79999187 0.70500155 0.70622461
 0.62357806 0.68815448 0.75775499 0.63428938]
Mean:  0.7065549860405064
Standard deviation:  0.05610995612643165


In [25]:
display_scores(tree_rmse_scores)

Scores:  [0.80875445 0.84177832 0.99051786 1.01766036 0.77432343 0.84752431
 0.9402438  0.77960898 0.84752431 0.87951754]
Mean:  0.8727453379916795
Standard deviation:  0.08009986378399449


In [26]:
# How good are the predictions?

plt.hist(data_predcns_dt[np.where(data_labels > 0)], color="k", alpha=0.5)
plt.axvline(x=0.8878745226, color="k", alpha=1)
plt.hist(data_predcns_dt[np.where(data_labels < 0)], color="blue", alpha=0.5)
plt.axvline(x=-1.126285, color="blue", alpha=1)
#plt.legend()
plt.show()

In [27]:
data_predictors.shape

(735, 11)

#### random forest regressor

In [28]:
# note the way the parameters being grid-searched are named, given that the search takes in
# an entire pipeline, and not just one step in isolation

pipeline_rf = Pipeline(steps = [("rf_reg", RandomForestRegressor())])

param_grid = [
    {'rf_reg__n_estimators': [3,10,30], 'rf_reg__max_features': [2,4,6,8]},
    {'rf_reg__bootstrap': [False], 'rf_reg__n_estimators': [3,10], 'rf_reg__max_features': [2,3,4]}
]

grid_search = GridSearchCV(pipeline_rf, param_grid, cv=5,
                        scoring="neg_mean_squared_error",
                        return_train_score=True)

grid_search.fit(data_predictors, data_labels)

print("Best parameter (CV score=%0.3f):" % grid_search.best_score_)
print(grid_search.best_params_)

Best parameter (CV score=-0.429):
{'rf_reg__max_features': 4, 'rf_reg__n_estimators': 30}


In [29]:
pipeline_rf.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'rf_reg', 'rf_reg__bootstrap', 'rf_reg__ccp_alpha', 'rf_reg__criterion', 'rf_reg__max_depth', 'rf_reg__max_features', 'rf_reg__max_leaf_nodes', 'rf_reg__max_samples', 'rf_reg__min_impurity_decrease', 'rf_reg__min_impurity_split', 'rf_reg__min_samples_leaf', 'rf_reg__min_samples_split', 'rf_reg__min_weight_fraction_leaf', 'rf_reg__n_estimators', 'rf_reg__n_jobs', 'rf_reg__oob_score', 'rf_reg__random_state', 'rf_reg__verbose', 'rf_reg__warm_start'])

#### support vector machine regressor

In [30]:
svm_reg = SVR(kernel="linear", gamma="scale", C=0.8)
#tree_reg.fit(data_preds, data_labels)
svm_reg.fit(data_predictors, data_labels)
data_predcns_svm = svm_reg.predict(data_predictors) # _svm: support vector machine
svm_mse = mean_squared_error(data_labels, data_predcns_svm)
svm_rmse = np.sqrt(svm_mse)

print(svm_rmse)

0.7523313471510406


In [31]:
# How good are the predictions?

plt.hist(data_predcns_svm[np.where(data_labels > 0)], color="k", alpha=0.5)
plt.axvline(x=0.8878745226, color="k", alpha=1)
plt.hist(data_predcns_svm[np.where(data_labels < 0)], color="blue", alpha=0.5)
plt.axvline(x=-1.126285, color="blue", alpha=1)
#plt.legend()
plt.show()

In [32]:
# try a grid search

pipeline_svm = Pipeline(steps = [("svm_reg", SVR())])

param_grid = [
    {'svm_reg__epsilon': [0.1,0.2,0.3],
    'svm_reg__gamma': ["scale", "auto"], 
     'svm_reg__kernel': ["linear", "poly", "rbf", "sigmoid"], 
     'svm_reg__C': [0.1,0.3]}
]

grid_search = GridSearchCV(pipeline_svm, param_grid, cv=5,
                        scoring="neg_mean_squared_error",
                        return_train_score=True)

grid_search.fit(data_predictors, data_labels)

print("Best parameter (CV score=%0.3f):" % grid_search.best_score_)
print(grid_search.best_params_)

Best parameter (CV score=-0.437):
{'svm_reg__C': 0.3, 'svm_reg__epsilon': 0.3, 'svm_reg__gamma': 'auto', 'svm_reg__kernel': 'rbf'}


In [33]:
# apply those best-fit parameters and see the results

svm_reg = SVR(kernel=grid_search.best_params_["svm_reg__kernel"], 
              gamma=grid_search.best_params_["svm_reg__gamma"], 
              epsilon=grid_search.best_params_["svm_reg__epsilon"],
              C=grid_search.best_params_["svm_reg__C"])
#tree_reg.fit(data_preds, data_labels)
svm_reg.fit(data_predictors, data_labels)
data_predcns_svm = svm_reg.predict(data_predictors) # _svm: support vector machine
svm_mse = mean_squared_error(data_labels, data_predcns_svm)
svm_rmse = np.sqrt(svm_mse)

plt.hist(data_predcns_svm[np.where(data_labels > 0)], color="k", alpha=0.5)
plt.axvline(x=0.8878745226, color="k", alpha=1)
plt.hist(data_predcns_svm[np.where(data_labels < 0)], color="blue", alpha=0.5)
plt.axvline(x=-1.126285, color="blue", alpha=1)
#plt.legend()
plt.show()

In [34]:
pipeline_rf.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'rf_reg', 'rf_reg__bootstrap', 'rf_reg__ccp_alpha', 'rf_reg__criterion', 'rf_reg__max_depth', 'rf_reg__max_features', 'rf_reg__max_leaf_nodes', 'rf_reg__max_samples', 'rf_reg__min_impurity_decrease', 'rf_reg__min_impurity_split', 'rf_reg__min_samples_leaf', 'rf_reg__min_samples_split', 'rf_reg__min_weight_fraction_leaf', 'rf_reg__n_estimators', 'rf_reg__n_jobs', 'rf_reg__oob_score', 'rf_reg__random_state', 'rf_reg__verbose', 'rf_reg__warm_start'])

In [35]:
# now try a RandomizedSearchCV

distributions = dict(rf_reg__max_features=np.arange(1,5), rf_reg__max_samples=np.arange(1,5))

random_search = RandomizedSearchCV(pipeline_rf, 
                                   distributions, cv=5,
                        scoring="neg_mean_squared_error",
                        return_train_score=True)

random_search.fit(data_predictors, data_labels)
data_predcns_rf_rs = random_search.predict(data_predictors) # _rf_rs: random forest, random search

print("Best parameter (CV score=%0.3f):" % random_search.best_score_)
print(random_search.best_params_)


plt.hist(data_predcns_rf_rs[np.where(data_labels > 0)], color="k", alpha=0.5)
plt.axvline(x=0.8878745226, color="k", alpha=1)
plt.hist(data_predcns_rf_rs[np.where(data_labels < 0)], color="blue", alpha=0.5)
plt.axvline(x=-1.126285, color="blue", alpha=1)
#plt.legend()
plt.show()

Best parameter (CV score=-0.687):
{'rf_reg__max_samples': 4, 'rf_reg__max_features': 4}


In [36]:
svm_reg = SVR(kernel="linear", gamma="scale", C=0.8)
svm_reg.get_params().keys()

dict_keys(['C', 'cache_size', 'coef0', 'degree', 'epsilon', 'gamma', 'kernel', 'max_iter', 'shrinking', 'tol', 'verbose'])

### Which model appears to perform best?

#### consolidate grid searches of each model that is non-linear: decision tree, random forest, support vector machine

In [38]:
# linear model, rehash here for comparison

#lin_mse = mean_squared_error(data_labels, data_predcns_lin)
#lin_rmse = np.sqrt(lin_mse)

# cross-validate

scores_lin = cross_val_score(lin_reg, data_predictors, data_labels, 
                         scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-scores_lin)

In [58]:
print("Linear rmse scores: ", lin_rmse_scores)
print("Avg: ", np.mean(lin_rmse_scores))

Linear rmse scores:  [0.69046722 0.67382764 0.78626005 0.79999187 0.70500155 0.70622461
 0.62357806 0.68815448 0.75775499 0.63428938]
Avg:  0.7065549860405064


In [40]:
pipeline_dt.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'dt_reg', 'dt_reg__ccp_alpha', 'dt_reg__criterion', 'dt_reg__max_depth', 'dt_reg__max_features', 'dt_reg__max_leaf_nodes', 'dt_reg__min_impurity_decrease', 'dt_reg__min_impurity_split', 'dt_reg__min_samples_leaf', 'dt_reg__min_samples_split', 'dt_reg__min_weight_fraction_leaf', 'dt_reg__random_state', 'dt_reg__splitter'])

#### decision tree

In [45]:
# grid search, then cross-validate

pipeline_dt = Pipeline(steps = [("dt_reg", DecisionTreeRegressor())])

param_grid = [
    {'dt_reg__max_leaf_nodes': [1,2,3], 'dt_reg__min_weight_fraction_leaf': [0.0,0.3]},
    {'dt_reg__min_impurity_decrease': [0.0,0.1]}
]

grid_search = GridSearchCV(pipeline_dt, param_grid, cv=5,
                        scoring="neg_mean_squared_error",
                        return_train_score=True)

grid_search.fit(data_predictors, data_labels)

print("Best parameter (CV score=%0.3f):" % grid_search.best_score_)
print(grid_search.best_params_)


scores_tree = cross_val_score(pipeline_dt, data_predictors, data_labels, 
                         scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores_tree)

Traceback (most recent call last):
  File "/Users/bandari/anaconda3/envs/ml_env/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/bandari/anaconda3/envs/ml_env/lib/python3.7/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/bandari/anaconda3/envs/ml_env/lib/python3.7/site-packages/sklearn/tree/_classes.py", line 1256, in fit
    X_idx_sorted=X_idx_sorted)
  File "/Users/bandari/anaconda3/envs/ml_env/lib/python3.7/site-packages/sklearn/tree/_classes.py", line 295, in fit
    "or larger than 1").format(max_leaf_nodes))
ValueError: max_leaf_nodes 1 must be either None or larger than 1

Traceback (most recent call last):
  File "/Users/bandari/anaconda3/envs/ml_env/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_

Best parameter (CV score=-0.560):
{'dt_reg__max_leaf_nodes': 3, 'dt_reg__min_weight_fraction_leaf': 0.0}


 -0.71889886 -0.61803595]
 -5.36329047e-01 -6.10590100e-01 -3.72105387e-31 -6.10590100e-01]


In [57]:
print("Decision tree rmse scores: ", tree_rmse_scores)
print("Avg: ", np.mean(tree_rmse_scores))

Decision tree rmse scores:  [0.69270578 0.61375707 0.74202945 0.8444774  0.63869843 0.62209937
 0.63425526 0.64689611 0.68199685 0.54210988]
Avg:  0.665902558461936


#### random forest regressor

In [51]:
pipeline_rf = Pipeline(steps = [("rf_reg", RandomForestRegressor())])

param_grid = [
    {'rf_reg__n_estimators': [3,10,30], 'rf_reg__max_features': [2,4,6,8]},
    {'rf_reg__bootstrap': [False], 'rf_reg__n_estimators': [3,10], 'rf_reg__max_features': [2,3,4]}
]

grid_search = GridSearchCV(pipeline_rf, param_grid, cv=5,
                        scoring="neg_mean_squared_error",
                        return_train_score=True)

grid_search.fit(data_predictors, data_labels)

print("Best parameter (CV score=%0.3f):" % grid_search.best_score_)
print(grid_search.best_params_)

scores_rf = cross_val_score(pipeline_rf, data_predictors, data_labels, 
                         scoring="neg_mean_squared_error", cv=10)
rf_rmse_scores = np.sqrt(-scores_rf)

Best parameter (CV score=-0.434):
{'rf_reg__max_features': 2, 'rf_reg__n_estimators': 30}


In [56]:
print("Random forest rmse scores: ", rf_rmse_scores)
print("Avg: ", np.mean(rf_rmse_scores))

Random forest rmse scores:  [0.67096975 0.61622985 0.74131658 0.84806494 0.62111078 0.64211665
 0.6542131  0.64391257 0.66970511 0.51882484]
Avg:  0.6626464162528128


#### support vector machine regressor

In [53]:
pipeline_svm = Pipeline(steps = [("svm_reg", SVR())])

param_grid = [
    {'svm_reg__epsilon': [0.1,0.2,0.3],
    'svm_reg__gamma': ["scale", "auto"], 
     'svm_reg__kernel': ["linear", "poly", "rbf", "sigmoid"], 
     'svm_reg__C': [0.1,0.3]}
]

grid_search = GridSearchCV(pipeline_svm, param_grid, cv=5,
                        scoring="neg_mean_squared_error",
                        return_train_score=True)

grid_search.fit(data_predictors, data_labels)

print("Best parameter (CV score=%0.3f):" % grid_search.best_score_)
print(grid_search.best_params_)

scores_svm = cross_val_score(pipeline_svm, data_predictors, data_labels, 
                         scoring="neg_mean_squared_error", cv=10)
svm_rmse_scores = np.sqrt(-scores_svm)

Best parameter (CV score=-0.437):
{'svm_reg__C': 0.3, 'svm_reg__epsilon': 0.3, 'svm_reg__gamma': 'auto', 'svm_reg__kernel': 'rbf'}


In [55]:
print("Support vector machine rmse scores: ", svm_rmse_scores)
print("Avg: ", np.mean(svm_rmse_scores))

Support vector machine rmse scores:  [0.63671376 0.61410371 0.79190585 0.89267856 0.63394097 0.57032208
 0.6247518  0.60991254 0.70810432 0.52425747]
Avg:  0.6606691078891382


### Check performance of all models on validation data. 
### Is the best model based on the training data also the best based on the validation data?

In [22]:
# drop the non-numerical stuff

test_set_dropped = test_set.drop(
    columns=["Sex","ChestPainType","RestingECG","ExerciseAngina","ST_Slope"], 
    inplace=False)

In [23]:
# apply transformation to the validation set, as applied to the training set
# * note that this uses the function transform()---not fit_transform()---so that
# the same transformation is applied while blind to the content of the test data

# the syntax here is necessary to restore column keys
test_set_preprocessed = pd.DataFrame(num_pipeline.transform(train_set_dropped),columns = test_set_dropped.columns)

In [25]:
test_set_preprocessed

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_encoded,ChestPainType_encoded,RestingECG_encoded,ExerciseAngina_encoded,ST_Slope_encoded
0,2.274636,2.011374,0.014028,1.809214,-1.147527,-0.823146,0.885427,0.523288,-0.808224,1.620157,-0.846930,-0.583614
1,-0.625470,-1.367360,-0.349970,-0.552726,1.468190,0.985020,-1.129399,-1.910994,-0.808224,0.042918,-0.846930,1.066218
2,1.415345,1.030451,0.687423,-0.552726,1.351069,-0.823146,-1.129399,-1.910994,1.287171,0.042918,-0.846930,1.066218
3,0.018998,-0.713412,0.405325,-0.552726,-1.069446,-0.823146,-1.129399,0.523288,0.239474,0.042918,-0.846930,1.066218
4,1.200522,0.104024,0.478124,-0.552726,-0.405756,1.708287,0.885427,0.523288,-0.808224,-1.534321,-0.846930,-0.583614
...,...,...,...,...,...,...,...,...,...,...,...,...
730,-0.625470,-0.985890,-1.833259,1.809214,-0.366716,-0.823146,0.885427,0.523288,-0.808224,0.042918,-0.846930,-0.583614
731,1.522756,0.648981,-1.833259,1.809214,-0.054391,0.804204,0.885427,0.523288,-0.808224,0.042918,-0.846930,1.066218
732,-0.195825,-0.713412,1.124220,-0.552726,1.351069,-0.642330,-1.129399,0.523288,0.239474,0.042918,-0.846930,1.066218
733,1.737579,-0.168455,1.096920,-0.552726,-1.108486,1.346654,0.885427,0.523288,-0.808224,-1.534321,-0.846930,-0.583614


In [62]:
# predictors: the things that help us predict
# labels: the things we want to predict



test_data_predictors = test_set_preprocessed.drop("HeartDisease", axis=1)
test_data_labels = test_set_preprocessed["HeartDisease"].copy()

AttributeError: 'numpy.ndarray' object has no attribute 'drop'

### Agglomerate into a single pipeline, and save the best one

### Try a web app (may need to read bookmarked websites)