In [1]:
%config Completer.use_jedi=False

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's "whitegrid" style to figures drawn in this notebook.
sns.set_style("whitegrid")

## 2. Choosing the right estimator/algorithm 

Check the Scikit-Learn machine learning map: https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html

In [3]:
from sklearn.datasets import load_boston

In [4]:
# Load the Boston data directly as a (features, target) pair.
# NOTE(review): X and y are reassigned from boston_df in the regression cell
# further down before anything reads them, so this cell looks redundant — confirm.
# NOTE(review): load_boston was deprecated and later removed from scikit-learn
# (1.2+) — confirm the environment pins an older release.
X, y = load_boston(return_X_y=True)

In [5]:
# Load the full dataset object; its "data", "feature_names" and "target"
# entries are read when the DataFrame is built below.
boston = load_boston()
# Trailing semicolon suppresses the cell's output.
boston;

In [6]:
# Build one frame: the feature matrix under its column names, with the
# target values appended as a final "target" column.
boston_df = (
    pd.DataFrame(boston["data"], columns=boston["feature_names"])
    .assign(target=pd.Series(boston["target"]))
)
boston_df.head(3)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7


In [7]:
# (rows, columns) of the assembled frame, target column included.
boston_df.shape

(506, 14)

### 2.1 Picking a model for a regression problem

In [9]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split

In [10]:
# Seed the global NumPy RNG first: train_test_split is given no random_state
# here, so the seed is what makes the shuffle repeatable.
np.random.seed(42)

# The label is the "target" column; the features are every other column.
y = boston_df["target"]
X = boston_df.drop(columns="target")

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)



In [11]:
# Ridge regression with default settings; fit() returns the estimator itself,
# so construction and training are chained.
model = Ridge().fit(X_train, y_train)

# For a regressor, score() reports R^2 on the data it is given.
model.score(X_test, y_test)

0.6662221670168523

### Improve model

1. scale data?
2. normalize data?

### 2.1.1 Try different estimator

In [14]:
from sklearn.ensemble import RandomForestRegressor

In [16]:
# No random_state is passed to the forest, so the global NumPy seed is what
# makes this fit repeatable.
np.random.seed(42)

# Default random forest regressor, trained on the same split as Ridge.
rf_model = RandomForestRegressor().fit(X_train, y_train)

# R^2 on the held-out rows.
rf_model.score(X_test, y_test)

0.8922527442109116

### Thoughts
Random forests are largely insensitive to feature scaling, so it is not surprising that the RandomForestRegressor outperforms Ridge regression here with no feature engineering done at all.

In [17]:
from sklearn.svm import SVR

# Seed kept for parity with the other model cells.
np.random.seed(42)

# Support vector regressor with default settings on the same split.
# NOTE(review): the features are passed in unscaled; the low score may
# reflect that rather than the model family — worth retrying with a scaler.
sv_model = SVR().fit(X_train, y_train)

# R^2 on the held-out rows.
sv_model.score(X_test, y_test)

0.27948125010200275

## 2.2 Choose estimator for classification

In [18]:
# Read the heart-disease table; the path is relative to the notebook's
# working directory.
heart_disease = pd.read_csv("../data/heart-disease.csv")

In [19]:
# Preview the first three rows.
heart_disease.head(3)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1


In [20]:
# X and y are rebound here, replacing the Boston features/labels assigned
# earlier in the notebook.
y = heart_disease["target"]
X = heart_disease.drop(columns="target")

In [21]:
from sklearn.svm import LinearSVC

In [30]:
# Seed the global NumPy RNG; neither the split nor the classifier below is
# given its own random_state.
np.random.seed(42)

# 80/20 split of the heart-disease data. The classifier cells that follow
# reuse these same four variables.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Linear SVM with the iteration cap set to 10000; fit() returns the estimator.
svc = LinearSVC(max_iter=10000).fit(X_train, y_train)

# For a classifier, score() reports mean accuracy on the data it is given.
svc.score(X_test, y_test)



0.8688524590163934

In [23]:
# Show the hyperparameters the fitted LinearSVC is using.
# NOTE(review): the stored output lists max_iter=1000 while the fitting cell
# passes max_iter=10000, so this output appears stale — re-run to refresh.
svc.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': True,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'loss': 'squared_hinge',
 'max_iter': 1000,
 'multi_class': 'ovr',
 'penalty': 'l2',
 'random_state': None,
 'tol': 0.0001,
 'verbose': 0}

In [27]:
# Count rows per class label to see how balanced the two classes are.
heart_disease["target"].value_counts()

1    165
0    138
Name: target, dtype: int64

In [31]:
from sklearn.ensemble import RandomForestClassifier

In [34]:
# NOTE(review): this cell seeds with 45 while the other cells use 42 —
# confirm that is intentional.
np.random.seed(45)

# Default random forest classifier on the split made in the LinearSVC cell.
rf_clf = RandomForestClassifier().fit(X_train, y_train)

# Mean accuracy on the held-out rows.
rf_clf.score(X_test, y_test)

0.8524590163934426

In [35]:
from sklearn.neighbors import KNeighborsClassifier

In [37]:
# Seed kept for parity with the other model cells.
np.random.seed(42)

# k-nearest-neighbours classifier with default settings, same split as above.
knn = KNeighborsClassifier().fit(X_train, y_train)

# Mean accuracy on the held-out rows.
knn.score(X_test, y_test)

0.6885245901639344