In [1]:
What_we_going_to_cover = [
"0. An end-to-end Scikit-Learn workflow",
"1. Getting the data ready",
"2. Choose the right estimator/algorithm for our problems",
"3. Fit the model/algorithm and use it to make predictions on our data",
"4. Evaluating a model",
"5. Improve a model",
"6. Save and load a trained model",
"7. Putting it all together!",
]

In [2]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## 1. Getting the data ready

Three main things we have to do:

    1. Split the data into features and labels (usually 'X' & 'y')
    2. Filling (also called imputing) or disregarding missing values
    3. Converting non-numerical values to numerical values (also called feature encoding)
    

In [38]:
heart_disease = pd.read_csv("csv/heart-disease.csv")

In [39]:
x = heart_disease.drop("target", axis=1) # features
y = heart_disease["target"] # labels

## Split the data into training and test sets


In [44]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.2)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
269,56,1,0,130,283,1,0,103,1,1.6,0,0,3
6,56,0,1,140,294,0,0,153,0,1.3,1,0,2
191,58,1,0,128,216,0,0,131,1,2.2,1,3,3
140,51,0,2,120,295,0,0,157,0,0.6,2,0,2
133,41,1,1,110,235,0,1,153,0,0.0,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,63,1,0,140,187,0,0,144,1,4.0,2,2,3
203,68,1,2,180,274,1,0,150,1,1.6,1,0,3
218,65,1,0,135,254,0,0,127,0,2.8,1,1,3
200,44,1,0,110,197,0,0,177,0,0.0,2,1,2


In [45]:
x_train.shape , x_test.shape, y_train.shape, y_test.shape
# train -> 80% of data
# test -> 20% of data

((242, 13), (61, 13), (242,), (61,))

In [47]:
x_train

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
269,56,1,0,130,283,1,0,103,1,1.6,0,0,3
6,56,0,1,140,294,0,0,153,0,1.3,1,0,2
191,58,1,0,128,216,0,0,131,1,2.2,1,3,3
140,51,0,2,120,295,0,0,157,0,0.6,2,0,2
133,41,1,1,110,235,0,1,153,0,0.0,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,63,1,0,140,187,0,0,144,1,4.0,2,2,3
203,68,1,2,180,274,1,0,150,1,1.6,1,0,3
218,65,1,0,135,254,0,0,127,0,2.8,1,1,3
200,44,1,0,110,197,0,0,177,0,0.0,2,1,2


## 1.1 Make sure it's all numerical

In [58]:

car_sales = pd.read_csv("csv/car-sales-extended.csv")
car_sales.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043


In [59]:
car_sales.dtypes
car_sales.isna().sum() # missing values

Make             0
Colour           0
Odometer (KM)    0
Doors            0
Price            0
dtype: int64

In [60]:
# Split into x/y
from sklearn.model_selection import train_test_split
x = car_sales.drop("Price", axis=1) # features
y = car_sales["Price"] # labels

# Split into train/test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.2)

In [29]:
# build machine learnig model
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()

model.fit(x_train, y_train)

model.score(x_test, y_test)

# Error : cannot convert string to float: 'Toyota' -> need to convert string to numbers


ValueError: could not convert string to float: 'Toyota'

In [61]:
# Turn  catogerical variables into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer 

catogerical_features=["Make","Colour","Doors"]

one_hot=OneHotEncoder() #Turns catogery into number 

transformer=ColumnTransformer([("one_hot",one_hot,catogerical_features)],remainder="passthrough") 
#List of (name, transformer, columns) tuples specifying the transformer objects to be applied to subsets of the data.

transformed_x=transformer.fit_transform(x)

transformed_x


array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 3.54310e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+00, 1.92714e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 8.47140e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 6.66040e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.15883e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.48360e+05]])

In [57]:
pd.DataFrame(transformed_x)

Unnamed: 0,0
0,"(0, 1)\t1.0\n (0, 9)\t1.0\n (0, 12)\t1.0\n..."
1,"(0, 0)\t1.0\n (0, 6)\t1.0\n (0, 13)\t1.0\n..."
2,"(0, 1)\t1.0\n (0, 9)\t1.0\n (0, 12)\t1.0\n..."
3,"(0, 3)\t1.0\n (0, 9)\t1.0\n (0, 12)\t1.0\n..."
4,"(0, 2)\t1.0\n (0, 6)\t1.0\n (0, 11)\t1.0\n..."
...,...
995,"(0, 3)\t1.0\n (0, 5)\t1.0\n (0, 12)\t1.0\n..."
996,"(0, 4)\t1.0\n (0, 9)\t1.0\n (0, 11)\t1.0\n..."
997,"(0, 2)\t1.0\n (0, 6)\t1.0\n (0, 12)\t1.0\n..."
998,"(0, 1)\t1.0\n (0, 9)\t1.0\n (0, 12)\t1.0\n..."


In [62]:
# let refit the model
np.random.seed(42)

x_train, x_test, y_train, y_test = train_test_split(transformed_x, y, test_size=.2)

model.fit(x_train, y_train)

model.score(x_test, y_test)


0.30431565439836705

In [63]:
# 5. Improve a model
# Try different amount of n_estimators
np.random.seed(22)
from time import sleep

for i in range(10,100,10):
    print(f"Trying model with {i} estimator")

    model=RandomForestRegressor(n_estimators=i)
    model.fit(x_train,y_train)
    print(f"Model accuracy on test set:{model.score(x_test,y_test) * 100:.2f} %") # result should be 2 decimal point
    print("")


Trying model with 10 estimator
Model accuracy on test set:25.54 %

Trying model with 20 estimator
Model accuracy on test set:31.36 %

Trying model with 30 estimator
Model accuracy on test set:31.33 %

Trying model with 40 estimator
Model accuracy on test set:30.36 %

Trying model with 50 estimator
Model accuracy on test set:31.64 %

Trying model with 60 estimator
Model accuracy on test set:30.10 %

Trying model with 70 estimator
Model accuracy on test set:30.93 %

Trying model with 80 estimator
Model accuracy on test set:30.25 %

Trying model with 90 estimator
Model accuracy on test set:29.61 %



### 1.2 what if there was missing Values ?
1.Fill the with some (also known as impoutation)

2.remove the sample with missing data altogether

In [65]:
car_sale_missing=pd.read_csv("csv/car-sales-extended-missing-data.csv")

In [66]:
car_sale_missing.isna().sum() # missing values

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [81]:
# Fill missing data with pandas

car_sale_missing["Make"].fillna("missing",inplace=True) # fill missing data with missing ( bad idea)

car_sale_missing["Colour"].fillna("missing",inplace=True) # fill missing data with missing ( bad idea)

car_sale_missing["Odometer (KM)"].fillna(car_sale_missing["Odometer (KM)"].mean(),inplace=True) # fill missing data with mean

car_sale_missing["Doors"].fillna(4,inplace=True) # fill missing data with 4


car_sale_missing.isna().sum() # missing values

Make              0
Colour            0
Odometer (KM)     0
Doors             0
Price            50
dtype: int64

In [82]:
# remove rows with missing price value

car_sale_missing.dropna(inplace=True)


In [83]:
from sklearn.model_selection import train_test_split

x=car_sale_missing.drop("Price",axis=1)
y=car_sale_missing["Price"]

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=.2)

Make              0
Colour            0
Odometer (KM)     0
Doors             0
Price            50
dtype: int64

In [87]:
# Turn  catogerical variables into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

catogerical_features=["Make","Colour","Doors"]

one_hot=OneHotEncoder()

transformer=ColumnTransformer([("one_hot",one_hot,catogerical_features)],remainder="passthrough")

transformed_x=transformer.fit_transform(car_sale_missing)

transformed_x


array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        3.54310e+04, 1.53230e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        1.92714e+05, 1.99430e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        8.47140e+04, 2.83430e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 0.00000e+00,
        6.66040e+04, 3.15700e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        2.15883e+05, 4.00100e+03],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        2.48360e+05, 1.27320e+04]])

## Fill missing data with scikit-learn

In [89]:
car_sale_missing2=pd.read_csv("csv/car-sales-extended-missing-data.csv")
car_sale_missing2.isna().sum() # missing values

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [91]:
# drop rows with no labels

car_sale_missing2.dropna(subset=["Price"],inplace=True)

car_sale_missing2.isna().sum() # missing values

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [92]:
# Split into x/y

x=car_sale_missing2.drop("Price",axis=1) # features
y=car_sale_missing2["Price"] # labels

In [95]:
# fill missing values with scikit-learn
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# fill catogerical values with missing and numerical values with mean
catogerical_imputer=SimpleImputer(strategy="constant",fill_value="missing")
door_imputer=SimpleImputer(strategy="constant",fill_value=4)
numerical_imputer=SimpleImputer(strategy="mean")

# define columns

catogerical_features=["Make","Colour"]
door_features=["Doors"]
numerical_features=["Odometer (KM)"]

# create an imputer (something that fills missing data)

imputer=ColumnTransformer([
    ("catogerical_imputer",catogerical_imputer,catogerical_features),
    ("door_imputer",door_imputer,door_features),
    ("numerical_imputer",numerical_imputer,numerical_features)
])

# (name of the imputer, imputer, features)

# transform the data

filled_x=imputer.fit_transform(x)

filled_x



array([['Honda', 'White', 4.0, 35431.0],
       ['BMW', 'Blue', 5.0, 192714.0],
       ['Honda', 'White', 4.0, 84714.0],
       ...,
       ['Nissan', 'Blue', 4.0, 66604.0],
       ['Honda', 'White', 4.0, 215883.0],
       ['Toyota', 'Blue', 4.0, 248360.0]], dtype=object)

Let's fill the missing values. We'll fill the training and test values separately to ensure training data stays with the training data and test data stays with the test data.

**Note:** We use `fit_transform()` on the training data and `transform()` on the testing data. In essence, we learn the patterns in the training set and transform it via imputation (fit, then transform). Then we take those same patterns and fill the test set (transform only).

In [96]:
car_sales_filled=pd.DataFrame(filled_x,columns=["Make","Colour","Doors","Odometer (KM)"])

car_sales_filled.isna().sum() # missing values

Make             0
Colour           0
Doors            0
Odometer (KM)    0
dtype: int64

In [99]:
# Turn  catogerical variables into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

catogerical_features=["Make","Colour","Doors"]
one_hot=OneHotEncoder() # Turns catogery into number

transformer=ColumnTransformer([("one_hot",one_hot,catogerical_features)],remainder="passthrough")

transformed_x=transformer.fit_transform(car_sales_filled)

transformed_x


<950x15 sparse matrix of type '<class 'numpy.float64'>'
	with 3800 stored elements in Compressed Sparse Row format>

In [102]:
# Now we have our data as numbers and filled (no missing values)
# Let's fit a model

np.random.seed(42)

x_train,x_test,y_train,y_test=train_test_split(transformed_x,y,test_size=.2)

from sklearn.ensemble import RandomForestRegressor

model=RandomForestRegressor(n_estimators=20)

model.fit(x_train,y_train)

model.score(x_test,y_test)


0.20783328634326415

In [104]:
# print sklearn version
import sklearn
print(sklearn.__version__)


1.0.2


# Choosing the right Estimator/ algorithm for your problem

some things to note:

* Sklearn refer to machine learning models, algorithm as estimators
* classification problem - predicting a category (heart disease or not)
	* Something you'will see `clf` (classifier) used as a classification estimator
 * Regression problem - predicting a number (selling price of a car)

* Step 1 - Check the sklearn machine learning map... https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html

### 2.1 Picking a machine learning model for a regression problem

let's use the California housing dataset



In [13]:
from sklearn.datasets import  fetch_california_housing

house_data=fetch_california_housing()


In [14]:
house_df=pd.DataFrame(house_data["data"],columns=house_data["feature_names"])
house_df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [115]:
# check for missing values
house_df.isna().sum()

MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
dtype: int64

In [15]:
house_df["target"]=house_data["target"] # add target column to dataframe
house_df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [16]:
# split into x/y

x=house_df.drop("target",axis=1)
y=house_df["target"]

# split into train/test
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=.2)




In [17]:
# build machine learning model

from sklearn.ensemble import RandomForestRegressor

model=RandomForestRegressor()

model.fit(x_train,y_train)

model.score(x_test,y_test)

0.8060140778732707

In [19]:
# try another model
from sklearn.linear_model import Ridge

model=Ridge()

model.fit(x_train,y_train)

model.score(x_test,y_test) # this give COV (coefficient of variation) not accuracy

# cov means  the reationship b/w two variables in a dataset (how much they change together)



0.5970823910441018

In [20]:
# lets try Lasso
from sklearn.linear_model import Lasso

model=Lasso()

model.fit(x_train,y_train)

model.score(x_test,y_test) # this give COV (coefficient of variation) not accuracy

0.2773328374340006

## lets do classification problem

In [137]:
heart_disease=pd.read_csv("csv/heart-disease.csv")

In [140]:
heart_disease.isna().sum() # checking missing values

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [143]:
x=heart_disease.drop("target",axis=1)
y=heart_disease["target"]

In [144]:
# split into train/test
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=.2)

In [145]:
# build machine learning model
# first try RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
model=RandomForestClassifier() 

model.fit(x_train,y_train)


model.score(x_test,y_test)

0.8360655737704918

In [146]:
# try Linear SVC
from sklearn.svm import LinearSVC

model=LinearSVC()

model.fit(x_train,y_train)

model.score(x_test,y_test)



0.7704918032786885