# Scikit-Learn Doc and Exercise

## 0. An end-to-end Scikit-Learn Workflow

In [1]:
import pandas as pd
import numpy as np


## 1. Getting the data ready

In [2]:
hd = pd.read_csv('heart-disease.csv')

In [11]:
# Create X which is the feature Matrix & Y which is the label

X = hd.drop('target', axis=1)
y = hd['target']

## 2. Choose the right estimator/algorithm for our problem

In [12]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()

# We'll keep the default Hyperparameters

clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

## 3. Fit model to the data

In [14]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [16]:
clf.fit(X_train, y_train);

In [20]:
# make a prediction
y_preds = clf.predict(X_test)
y_preds

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0])

## 4. Evaluate our Model

In [21]:
clf.score(X_test,y_test)

0.7213114754098361

In [23]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(classification_report(y_test,y_preds))

              precision    recall  f1-score   support

           0       0.62      0.80      0.70        25
           1       0.83      0.67      0.74        36

    accuracy                           0.72        61
   macro avg       0.73      0.73      0.72        61
weighted avg       0.74      0.72      0.72        61



In [24]:
confusion_matrix(y_test,y_preds)

array([[20,  5],
       [12, 24]])

In [25]:
accuracy_score(y_test,y_preds)

0.7213114754098361

## 5. Improve our model

In [27]:
# Try different amounts of n_estimators (which is another word for hyperparams)

np.random.seed(42)

for i in range(10,100,10):
    print(f"Trying model with {i} estimators")
    clf = RandomForestClassifier(n_estimators=i)
    clf.fit(X_train, y_train)
    print(f"Model accuracy on test set: {clf.score(X_test, y_test) * 100:.2f}%")

Trying model with 10 estimators
Model accuracy on test set: 70.49%
Trying model with 20 estimators
Model accuracy on test set: 77.05%
Trying model with 30 estimators
Model accuracy on test set: 73.77%
Trying model with 40 estimators
Model accuracy on test set: 73.77%
Trying model with 50 estimators
Model accuracy on test set: 73.77%
Trying model with 60 estimators
Model accuracy on test set: 72.13%
Trying model with 70 estimators
Model accuracy on test set: 70.49%
Trying model with 80 estimators
Model accuracy on test set: 77.05%
Trying model with 90 estimators
Model accuracy on test set: 72.13%


## 6. Save the model and load it

In [28]:
import pickle

pickle.dump(clf, open('randomforest_model1.pkl', 'wb'))

In [29]:
#load the model 

loaded_model = pickle.load(open('randomforest_model1.pkl','rb'))
loaded_model.score(X_test, y_test)

0.7213114754098361

In [40]:
car_sales = pd.read_csv('car-sales-extended.csv')

X = car_sales.drop('Price', axis=1)
y = car_sales['Price']


## Transform the categorical data into numeric data for using in machine learning models

In [43]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ['Make', 'Colour', 'Doors']
one_hot = OneHotEncoder()
transformer = ColumnTransformer([('one_hot',
                                 one_hot,
                                 categorical_features)],
                               remainder='passthrough')

transformed_X = transformer.fit_transform(X)
transformed_X

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 3.54310e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+00, 1.92714e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 8.47140e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 6.66040e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.15883e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.48360e+05]])

In [44]:
np.random.seed(42)

X_train, X_test, y_train, y_test = train_test_split(transformed_X, y, test_size=0.2)

from sklearn.ensemble import RandomForestRegressor
clf = RandomForestRegressor()

clf.fit(X_train, y_train)
clf.score(X_test,y_test)

0.3235867221569877

## Handling missing data

* Fill them with some value (aka imputation)
* Remove the samples with missing data altogether

In [11]:
car_sales_missing = pd.read_csv('car-sales-extended-missing-data.csv')
car_sales_missing

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


--------------------
# Handling Missing Values

In [34]:
X = car_sales_missing.drop('Price', axis = 1)
y = car_sales_missing['Price']

### Option1: Fill missing data using Pandas

In [36]:
""" This step is NOT required in the new version of Scikit-learn as
Scikit-learn handles missing data itself """

## Fill the 'Make' column

car_sales_missing['Make'].fillna('missing', inplace=True)

## Fill the 'Colour' column

car_sales_missing['Colour'].fillna('missing', inplace=True)

## Fill the 'Odometer KM' column

car_sales_missing['Odometer (KM)'].fillna(car_sales_missing['Odometer (KM)'].mean(), inplace=True)

## Fill the 'Doors' column

car_sales_missing['Doors'].fillna(4, inplace=True)

In [37]:
car_sales_missing.isna().sum()

Make             0
Colour           0
Odometer (KM)    0
Doors            0
Price            0
dtype: int64

## Remove rows with missing price values

In [38]:
car_sales_missing.dropna(inplace=True)

In [39]:
car_sales_missing.isna().sum()

Make             0
Colour           0
Odometer (KM)    0
Doors            0
Price            0
dtype: int64

In [40]:
# Convert data to numbers

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer


categorical_features = ['Make', 'Colour', 'Doors']
one_hot = OneHotEncoder()
transformer = ColumnTransformer([('one_hot',
                                 one_hot,
                                 categorical_features)],
                               remainder='passthrough')

transformed_X = transformer.fit_transform(car_sales_missing)
transformed_X

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        3.54310e+04, 1.53230e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        1.92714e+05, 1.99430e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        8.47140e+04, 2.83430e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 0.00000e+00,
        6.66040e+04, 3.15700e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        2.15883e+05, 4.00100e+03],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        2.48360e+05, 1.27320e+04]])

## Option2: Fill missing values with scikit-learn

### Extension: Feature Scaling
Once your data is all in numerical format, there's one more transformation you'll probably want to do to it.

It's called Feature Scaling.

In other words, making sure all of your numerical data is on the same scale.

For example, say you were trying to predict the sale price of cars and the number of kilometres on their odometers varies from 6,000 to 345,000 but the median previous repair cost varies from 100 to 1,700. A machine learning algorithm may have trouble finding patterns in these wide-ranging variables.

To fix this, there are two main types of feature scaling.

Normalization (also called min-max scaling) - This rescales all the numerical values to between 0 and 1, with the lowest value being close to 0 and the highest previous value being close to 1. Scikit-Learn provides functionality for this in the MinMaxScalar class. SRC: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html

Standardization - This subtracts the mean value from all of the features (so the resulting features have 0 mean). It then scales the features to unit variance (by dividing the feature by the standard deviation). Scikit-Learn provides functionality for this in the StandardScalar class. SRC: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html

**A couple of things to note.**

Feature scaling usually isn't required for your target variable.

Feature scaling is usually not required with tree-based models (e.g. Random Forest) since they can handle varying features.

#### Extra reading

For further information on this topic, I'd suggest the following resources. 

* Feature Scaling - why is it required? by Rahul Saini https://medium.com/@rahul77349/feature-scaling-why-it-is-required-8a93df1af310

* Feature Scaling with Scikit-Learn by Ben Alex Keen https://benalexkeen.com/feature-scaling-with-scikit-learn/

* Feature Scaling for Machine Learning: Understanding the Difference Between Normalization vs. Standardization by Aniruddha Bhandari https://www.analyticsvidhya.com/blog/2020/04/feature-scaling-machine-learning-normalization-standardization/

#### Challenge

After reading up on feature scaling, a good idea would be to practice it on one of the problems you're working on and see how it affects the results. If you find anything interesting, be sure to share it.

### Some notes to take when preparing data

* Split your data first (into train/test), always keep your training & test data separate

* Fill/transform the training set and test sets separately (this goes for filling data with pandas as well)

* Don't use data from the future (test set) to fill data from the past (training set)


In [102]:
car_missing_val = pd.read_csv('car-sales-extended-missing-data.csv')
car_missing_val.head()
car_missing_val.dropna(subset=['Price'],inplace=True)

car_missing_val.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [103]:
car_missing_val.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [104]:
car_missing_val.dropna(subset=['Price'],inplace=True)
car_missing_val.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [105]:
X = car_missing_val.drop('Price', axis=1)
y = car_missing_val['Price']

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_train

Unnamed: 0,Make,Colour,Odometer (KM),Doors
416,Honda,White,18933.0,4.0
100,Honda,Blue,146233.0,4.0
371,,Blue,23545.0,4.0
20,Toyota,,124844.0,4.0
995,Toyota,Black,35820.0,4.0
...,...,...,...,...
306,Honda,Red,108681.0,4.0
175,Toyota,Blue,51155.0,4.0
449,Honda,White,146703.0,4.0
25,Honda,Blue,125819.0,4.0


In [111]:
## Fill missing values in scikit-learn

from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Fill categorical values with missing and numerical values with mean.

cat_imputer = SimpleImputer(strategy='constant', fill_value = 'missing')
door_imputer = SimpleImputer(strategy='constant', fill_value=4)
num_imputer = SimpleImputer(strategy='mean')

## Define columns

cat_features= ['Make', 'Colour']
door_feature= ['Doors']
num_features= ['Odometer (KM)']

## Create an imputer, something that fills missing data

imputer = ColumnTransformer([('cat_imputer', cat_imputer, cat_features),
                            ('door_imputer', door_imputer, door_feature),
                            ('num_imputer', num_imputer, num_features)])

filled_X_train = imputer.fit_transform(X_train)
filled_X_test = imputer.fit_transform(X_test)

df = pd.DataFrame(filled_X_train)
df

Unnamed: 0,0,1,2,3
0,Honda,White,4.0,18933.0
1,Honda,Blue,4.0,146233.0
2,missing,Blue,4.0,23545.0
3,Toyota,missing,4.0,124844.0
4,Toyota,Black,4.0,35820.0
...,...,...,...,...
755,Honda,Red,4.0,108681.0
756,Toyota,Blue,4.0,51155.0
757,Honda,White,4.0,146703.0
758,Honda,Blue,4.0,125819.0


In [112]:
# Convert data to numbers

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer


categorical_features = [0,1,2]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([('one_hot',
                                 one_hot,
                                 categorical_features)],
                               remainder='passthrough')

# X_train = pd.DataFrame(filled_X_train,columns=['Make', 'Colour', 'Doors', 'Odometer (KM)'])
# X_test = pd.DataFrame(filled_X_test,columns=['Make', 'Colour', 'Doors', 'Odometer (KM)'])


transformed_X_train = transformer.fit_transform(filled_X_train)
transformed_X_test = transformer.fit_transform(filled_X_test)


In [113]:
transformed_X_train

<760x15 sparse matrix of type '<class 'numpy.float64'>'
	with 3040 stored elements in Compressed Sparse Row format>

In [114]:
### Now lets fit the model to our data

np.random.seed(42)
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()
model.fit(transformed_X_train, y_train)
model.score(transformed_X_test, y_test)

0.17680708165422154

## Choosing the right estimator (model) for your problem

Some things to note:
    
    * Sklearn refers to machine learning models, or algorithms as estimators
    * Classification problem - predicting a category (heart disease or not)
        * sometimes you'll see 'clf' used for classification estimator
    * Regression problem - predicting a number (selling price of a car)

In [3]:
# Get 'California housing' Dataset from sklearn

from sklearn.datasets import fetch_california_housing

housing = fetch_california_housing()
housing_df = pd.DataFrame(housing['data'], columns=housing['feature_names'])

In [11]:
housing_df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [14]:
housing_df['target'] = housing['target']

In [29]:
housing_df.drop(['MedHouseVal'],axis=1,inplace=True)

In [30]:
np.random.seed(42)

# separate features and target
X = housing_df.drop(['target'], axis=1)
y = housing_df['target']


# split data

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# instantiate and fit the model

from sklearn.linear_model import Ridge,Lasso

model = Ridge()

model.fit(X_train, y_train) 
model.score(X_test,y_test)



0.5758549611440126

In [31]:
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)


model = RandomForestRegressor()


model.fit(X_train,y_train)
model.score(X_test,y_test)

0.8051230593157366

In [42]:
### Classification Task

heart_disease = pd.read_csv('heart-disease.csv')
heart_disease.head()

np.random.seed(42)
X = heart_disease.drop('target', axis=1)
y = heart_disease['target']

from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


# model = LinearSVC(max_iter=1000)
model = RandomForestClassifier()
model.fit(X_train,y_train)
model.score(X_test,y_test)


0.8524590163934426

In [41]:
heart_disease['target'].value_counts()

1    165
0    138
Name: target, dtype: int64

In [46]:
from sklearn.metrics import accuracy_score

y_pred= model.predict(X_test)

accuracy_score(y_test,y_pred)

model.predict_proba(X_test[:6])


array([[0.89, 0.11],
       [0.49, 0.51],
       [0.43, 0.57],
       [0.84, 0.16],
       [0.18, 0.82],
       [0.14, 0.86]])

In [47]:
model.predict(X_test[:6])

array([0, 1, 1, 0, 1, 1])

In [59]:
from sklearn.ensemble import RandomForestRegressor


np.random.seed(42)


X = housing_df.drop('target', axis=1)
y = housing_df['target']

## split into train-test set

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)


## create the model instance
model = RandomForestRegressor()

model.fit(X_train,y_train)


y_preds = model.predict(X_test)


In [60]:
y_preds[:10]

array([0.49384  , 0.75494  , 4.9285964, 2.54029  , 2.33176  , 1.6549701,
       2.34323  , 1.66182  , 2.47489  , 4.8344779])

In [61]:
np.array([y_test[:10]])

array([[0.477  , 0.458  , 5.00001, 2.186  , 2.78   , 1.587  , 1.982  ,
        1.575  , 3.4    , 4.466  ]])

In [17]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

np.random.seed(42)

## Create the data

housing_df['target'] = housing['target']
X = housing_df.drop(['target'], axis=1)
y = housing_df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


model = RandomForestRegressor()

model.fit(X_train, y_train)

model.score(X_test,y_test)
# y_preds = model.predict(X_test)



0.8066196804802649

In [13]:
y_preds[:10]

array([0.49384  , 0.75494  , 4.9285964, 2.54029  , 2.33176  , 1.6549701,
       2.34323  , 1.66182  , 2.47489  , 4.8344779])

In [14]:
## Compare predictions to the truth

from sklearn.metrics import mean_absolute_error

mean_absolute_error(y_test,y_preds)

#this means that on average, each one of our test data predictions (y_preds) is ~0.32 different than the true value (y_test)

0.3265721842781009

## 4. Evaluating a Machine learning model

Three ways to evaluate scikit-learn models/estimtors:
   1. Estimator's built-in `score()` method
   2. The `scoring` parameter
   3. Problem-specific metric functions
    

### 4.1 Evaluating a model using `score()` method

In [19]:
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

## Create X and y

heart_disease = pd.read_csv('heart-disease.csv')
X = heart_disease.drop(['target'], axis=1)
y = heart_disease['target']

## Create train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

## Create classifier model instance

model = RandomForestClassifier()

## Fit classifier to training data
model.fit(X_train,y_train)
model.score(X_test,y_test)


0.8524590163934426

In [20]:
### Using Cross-validation

from sklearn.model_selection import cross_val_score

cross_val_score(model, X, y, cv=5)

array([0.81967213, 0.86885246, 0.81967213, 0.78333333, 0.76666667])