# Introduction to Scikit-learn

This notebook is an into to scikit-learn library
1. Scikit-learn Workflow
2. Getting the data ready
3. Choose the right algorithm/estimator for our problem
4. Evaluating a model
5. Improve a model
6. Save and load a trained model



## Scikit-learn workflow


In [133]:
import pandas as pd
import numpy as np

#the whole process took about 20 minutes
heart_disease = pd.read_csv('heart-disease.csv')
heart_disease

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


# 2. Getting the data ready

In [134]:
# Create X (features matrix)
#rows have axis 0, columns have axis 1
# drop all columns except for target column along the axis 1
X = heart_disease.drop("target", axis=1)

# Create Y (labels)
Y = heart_disease["target"]

X.head()



Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2


In [135]:
Y.head()

0    1
1    1
2    1
3    1
4    1
Name: target, dtype: int64

In [136]:
#Three main things we have to do with data to be used with ML
## 1. Split the data into features and labels (X and Y)
## 2. Filling (imputing) or disregarding missing values
## 3. Converting non-numerical values to numerical (called feature encoding)

## Wrangling data (munging)  - clear, transform, reduce data to make it useful.

# Let's change sex column type to string
X_categorical = X
X_categorical['sex'] = X_categorical['sex'].apply(lambda v: 'Female' if v == 1 else 'Male')
X_categorical

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,Female,3,145,233,1,0,150,0,2.3,0,0,1
1,37,Female,2,130,250,0,1,187,0,3.5,0,0,2
2,41,Male,1,130,204,0,0,172,0,1.4,2,0,2
3,56,Female,1,120,236,0,1,178,0,0.8,2,0,2
4,57,Male,0,120,354,0,1,163,1,0.6,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,Male,0,140,241,0,1,123,1,0.2,1,0,3
299,45,Female,3,110,264,0,1,132,0,1.2,1,0,3
300,68,Female,0,144,193,1,1,141,0,3.4,1,2,3
301,57,Female,0,130,131,0,1,115,1,1.2,1,1,3


## 2.1 Convert categorical values to numerical, one hot encoding

In [137]:
# Let's use one hot encoding/engineering with scikit learn
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_feature = ['sex']

one_hot = OneHotEncoder()

transformer = ColumnTransformer([('one_hot', one_hot, categorical_feature)], remainder='passthrough') 

transformed_X = transformer.fit_transform(X_categorical)
pd.DataFrame(transformed_X)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1.0,0.0,63.0,3.0,145.0,233.0,1.0,0.0,150.0,0.0,2.3,0.0,0.0,1.0
1,1.0,0.0,37.0,2.0,130.0,250.0,0.0,1.0,187.0,0.0,3.5,0.0,0.0,2.0
2,0.0,1.0,41.0,1.0,130.0,204.0,0.0,0.0,172.0,0.0,1.4,2.0,0.0,2.0
3,1.0,0.0,56.0,1.0,120.0,236.0,0.0,1.0,178.0,0.0,0.8,2.0,0.0,2.0
4,0.0,1.0,57.0,0.0,120.0,354.0,0.0,1.0,163.0,1.0,0.6,2.0,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,0.0,1.0,57.0,0.0,140.0,241.0,0.0,1.0,123.0,1.0,0.2,1.0,0.0,3.0
299,1.0,0.0,45.0,3.0,110.0,264.0,0.0,1.0,132.0,0.0,1.2,1.0,0.0,3.0
300,1.0,0.0,68.0,0.0,144.0,193.0,1.0,1.0,141.0,0.0,3.4,1.0,2.0,3.0
301,1.0,0.0,57.0,0.0,130.0,131.0,0.0,1.0,115.0,1.0,1.2,1.0,1.0,3.0


## 2.2 Handling missing values


In [138]:
# Many machine learning models don't work well when there are missing values in the data.

# There are two main options when dealing with missing values:

# 1. Fill them with some given value, imputing data
#    For example, you might fill missing values of a numerical column with the mean of all the other values. 
#    The practice of filling missing values is often referred to as imputation.
# 2. Remove them. If a row has missing values, you may opt to remove them completely from your sample completely.
#    However, this potentially results in using less data to build your model.

# Dealing with missing values is a problem to problem issue. And there's often no best way to do it.

In [139]:
# Check if there are  missing values 
car_sales_missing = pd.read_csv("car-sales-missing-data.csv")
car_sales_missing


# Fill missing values with Pandas
car_sales_missing['Make'].fillna('missing', inplace=True)
car_sales_missing['Colour'].fillna('missing', inplace=True)
car_sales_missing["Odometer"].fillna(X_train["Odometer"].mean(), inplace=True)
car_sales_missing["Doors"].fillna(4, inplace=True)




## 2.2.1 Fill missing values with Scikit-Learn

In [140]:
car_sales_missing = pd.read_csv("car-sales-extended-missing-data.csv")
car_sales_missing.head()


Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [141]:
car_sales_missing.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [142]:
car_sales_missing.dropna(subset = ["Price"], inplace=True)
car_sales_missing.isna().sum()


Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [144]:
car_sales_missing


Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [145]:
#Fill missing values with Scikit-Learn
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Fill categorical values with 'missing' & numerical with mean
#Categorical features
cat_imputer = SimpleImputer(strategy="constant", fill_value="missing")
#Door feature that is num or categorical
door_imputer = SimpleImputer(strategy="constant", fill_value=4)
#Numertical feature
num_imputer = SimpleImputer(strategy="mean")

cat_features = ["Make", "Colour"]
door_feature = ["Doors"]
num_features = ["Odometer (KM)"]

#Create an imputer 
imputer = ColumnTransformer([
    ("cat_imputer", cat_imputer, cat_features),
    ("door_imputer", door_imputer, door_feature),
    ("num_imputer", num_imputer, num_features),

])

# Transform the data
car_sales_missing_filled= imputer.fit_transform(car_sales_missing)
car_sales_missing_filled



array([['Honda', 'White', 4.0, 35431.0],
       ['BMW', 'Blue', 5.0, 192714.0],
       ['Honda', 'White', 4.0, 84714.0],
       ...,
       ['Nissan', 'Blue', 4.0, 66604.0],
       ['Honda', 'White', 4.0, 215883.0],
       ['Toyota', 'Blue', 4.0, 248360.0]], dtype=object)

In [146]:
car_sales_filled = pd.DataFrame(car_sales_missing_filled, columns =["Make", "Colour", "Doors", "Odometer"])
car_sales_filled

Unnamed: 0,Make,Colour,Doors,Odometer
0,Honda,White,4.0,35431.0
1,BMW,Blue,5.0,192714.0
2,Honda,White,4.0,84714.0
3,Toyota,White,4.0,154365.0
4,Nissan,Blue,3.0,181577.0
...,...,...,...,...
945,Toyota,Black,4.0,35820.0
946,missing,White,3.0,155144.0
947,Nissan,Blue,4.0,66604.0
948,Honda,White,4.0,215883.0


In [None]:
car_sales_filled.isna().sum()


Make        0
Colour      0
Doors       0
Odometer    0
dtype: int64

# 3. Choose the right model and hyperparameters


In [148]:
X.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,Female,3,145,233,1,0,150,0,2.3,0,0,1
1,37,Female,2,130,250,0,1,187,0,3.5,0,0,2
2,41,Male,1,130,204,0,0,172,0,1.4,2,0,2
3,56,Female,1,120,236,0,1,178,0,0.8,2,0,2
4,57,Male,0,120,354,0,1,163,1,0.6,2,0,2


In [149]:
# We have a classification problem, because we want to classify whether someone has a heart deseas or not.

# import classification ML model, learning patterns in data and classifing whether a sample (row) is one thing or
# another thing.
from sklearn.ensemble import RandomForestClassifier
#clf is a short for classifier in sklearn, can use model word.
clf =  RandomForestClassifier(n_estimators=100)

# We keep the default hyperparameters, we will see the parameters.
clf.get_params()



{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [None]:
# Fit the model to the data: train the model on training data set
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape



((242, 13), (61, 13), (242,), (61,))

In [None]:
# Classification model find the patterns in the training data

clf.fit(X_train, Y_train)

RandomForestClassifier()

In [None]:
# make a prediction, y_preds is a conventional name
# y_label = clf.predict(np.array([0, 2, 3, 4])) - doesn't work as shape is not like X_train or X_test

y_preds = clf.predict(X_test)
y_preds




array([1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1,
       1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1])

In [None]:
Y_test

107    1
187    0
129    1
81     1
47     1
      ..
225    0
182    0
154    1
52     1
92     1
Name: target, Length: 61, dtype: int64

# 4. Evaluate the model

In [None]:
# Evaluate the model on the training data, score returns mean accuracy on the given test data and labels.
clf.score(X_train, Y_train)

1.0

In [None]:
# Evaluate the model on the test data
clf.score(X_test, Y_test)

0.819672131147541

In [None]:
#use other metrics to evaluate the model
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

#metrics to compare test labels with prediction labels.
print(classification_report(Y_test, y_preds))

              precision    recall  f1-score   support

           0       0.88      0.72      0.79        29
           1       0.78      0.91      0.84        32

    accuracy                           0.82        61
   macro avg       0.83      0.82      0.82        61
weighted avg       0.83      0.82      0.82        61



In [None]:
#metrics to compare test labels with prediction labels.
confusion_matrix(Y_test,y_preds)

array([[21,  8],
       [ 3, 29]])

In [None]:
#metrics to compare test labels with prediction labels.
accuracy_score(Y_test,y_preds)

0.819672131147541

# 5. Improve the model

In [None]:
# Try different hyperparameters values:  n_estimators
for i in range(10, 100, 10):
    print(f"Trying model with {i} estimators...")
    clf = RandomForestClassifier(n_estimators=i).fit(X_train, Y_train)
    print(f"Model accuracy on test set: {clf.score(X_test, Y_test)*100:.2f} %")
    print("")
print(" The best score with 40 estimators")

Trying model with 10 estimators...
Model accuracy on test set: 81.97 %

Trying model with 20 estimators...
Model accuracy on test set: 83.61 %

Trying model with 30 estimators...
Model accuracy on test set: 81.97 %

Trying model with 40 estimators...
Model accuracy on test set: 81.97 %

Trying model with 50 estimators...
Model accuracy on test set: 83.61 %

Trying model with 60 estimators...
Model accuracy on test set: 83.61 %

Trying model with 70 estimators...
Model accuracy on test set: 83.61 %

Trying model with 80 estimators...
Model accuracy on test set: 81.97 %

Trying model with 90 estimators...
Model accuracy on test set: 81.97 %

 The best score with 40 estimators


# 6. Save a model and load

In [None]:
import pickle

#save model to file, wb - write binary
pickle.dump(clf, open("random_forest_model1.pkl", "wb"))

In [None]:
# load a model from file
loaded_model = pickle.load(open("random_forest_model1.pkl", "rb"))

#check model accuracy on test data
loaded_model.score(X_test, Y_test)

0.819672131147541