In [2]:
# Table of contents for this notebook: each entry is "<section number>. <topic>".
_topics = (
    "An end-to-end Scikit-Learn workflow",
    "Getting the data ready",
    "Choose the right estimator/algorithm for our problems",
    "Fit the model/algorithm and use it to make predictions on our data",
    "Evaluating a model",
    "Improve a model",
    "Save and load a trained model",
    "Putting it all together!",
)
What_we_going_to_cover = [f"{number}. {topic}" for number, topic in enumerate(_topics)]

In [14]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## 1. Getting the data ready

Three main things we have to do:

    1. Split the data into features and labels (usually 'X' & 'y')
    2. Filling (also called imputing) or disregarding missing values
    3. Converting non-numerical values to numerical values (also called feature encoding)
    

In [4]:
# Load the heart-disease dataset (path is relative to the notebook's working directory).
heart_disease = pd.read_csv(filepath_or_buffer="csv/heart-disease.csv")

In [5]:
# Separate the label column from the feature columns.
y = heart_disease["target"]  # labels
x = heart_disease.drop(columns="target")  # features: every column except the label

## Split the data into training and test sets


In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the rest is used for training.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
)

In [7]:
# Shapes of the four splits, in the order (x_train, x_test, y_train, y_test).
# train -> 80% of data
# test -> 20% of data
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

((242, 13), (61, 13), (242,), (61,))

In [8]:
# Display the training features produced by the split above.
x_train

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
281,52,1,0,128,204,1,1,156,1,1.0,1,0,0
238,77,1,0,125,304,0,0,162,1,0.0,2,3,2
97,52,1,0,108,233,1,1,147,0,0.1,2,3,3
276,58,1,0,146,218,0,1,105,0,2.0,1,1,3
139,64,1,0,128,263,0,1,105,1,0.2,1,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
202,58,1,0,150,270,0,0,111,1,0.8,2,0,3
59,57,0,0,128,303,0,0,159,0,0.0,2,1,2
274,47,1,0,110,275,0,0,118,1,1.0,1,1,2
230,47,1,2,108,243,0,1,152,0,0.0,2,0,2


## 1.1 Make sure it's all numerical

In [9]:

# Load the extended car-sales dataset and preview its first five rows.
car_sales = pd.read_csv(filepath_or_buffer="csv/car-sales-extended.csv")
car_sales.head(n=5)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043


In [10]:
# A notebook cell only renders its LAST bare expression, so the dtypes must be
# displayed explicitly; otherwise they are computed and silently discarded.
display(car_sales.dtypes)  # column data types

car_sales.isna().sum()  # number of missing values per column

Make             0
Colour           0
Odometer (KM)    0
Doors            0
Price            0
dtype: int64

In [11]:
from sklearn.model_selection import train_test_split

# Split into x/y: "Price" is the label, every other column is a feature.
y = car_sales["Price"]  # labels
x = car_sales.drop(columns="Price")  # features

# Split into train/test (20% of the rows held out for testing).
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
)

In [12]:
# Build a machine learning model on the raw car-sales features.
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()

# The features still contain string columns ("Make", "Colour"), so fitting is
# expected to fail with "could not convert string to float". The error is the
# point of this cell, so catch and show it instead of letting it abort
# "Restart Kernel -> Run All".
try:
    model.fit(x_train, y_train)
except ValueError as error:
    print(f"ValueError: {error}")
    print("-> need to convert strings to numbers before fitting")
else:
    # Only reached if the data was already numeric.
    print(model.score(x_test, y_test))


ValueError: could not convert string to float: 'Honda'

In [None]:
# Turn categorical variables into numbers
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Columns to be one-hot encoded; every other column is passed through unchanged.
catogerical_features = ["Make", "Colour", "Doors"]

one_hot = OneHotEncoder()  # turns each category into its own 0/1 column

# ColumnTransformer takes a list of (name, transformer, columns) tuples that
# specify which transformer is applied to which subset of the columns.
transformer = ColumnTransformer(
    transformers=[("one_hot", one_hot, catogerical_features)],
    remainder="passthrough",
)

transformed_x = transformer.fit_transform(x)

transformed_x


In [None]:
# View the encoded feature matrix as a DataFrame (columns are positional integers).
pd.DataFrame(data=transformed_x)

In [None]:
# Refit the model, this time on the one-hot encoded features.
np.random.seed(42)

x_train, x_test, y_train, y_test = train_test_split(
    transformed_x,
    y,
    test_size=0.2,
)

model.fit(x_train, y_train)

# Score the refitted model on the held-out test split.
model.score(x_test, y_test)


In [None]:
# 5. Improve a model
# Try different numbers of trees (n_estimators) and compare the test-set score.
np.random.seed(22)
from time import sleep  # NOTE(review): unused in this cell; kept in case a later cell relies on it

for i in range(10, 100, 10):
    print(f"Trying model with {i} estimators")

    model = RandomForestRegressor(n_estimators=i)
    model.fit(x_train, y_train)
    # A regressor's score() is R^2 (coefficient of determination), not an
    # accuracy, so report it as a plain score rather than as a percentage.
    print(f"Model R^2 score on test set: {model.score(x_test, y_test):.2f}")  # 2 decimal places
    print("")


### 1.2 What if there were missing values?
1. Fill them with some value (also known as imputation)

2. Remove the samples with missing data altogether

In [None]:
# Load the car-sales variant that contains missing values.
car_sale_missing = pd.read_csv(filepath_or_buffer="csv/car-sales-extended-missing-data.csv")

In [None]:
# Count the missing values in each column.
car_sale_missing.isna().sum(axis=0)

In [None]:
# Fill missing data with pandas.
#
# Each filled column is assigned back to the frame. Calling
# fillna(..., inplace=True) on a column selection is chained in-place
# assignment: it raises a FutureWarning in pandas 2.x and leaves the DataFrame
# unchanged once Copy-on-Write is enabled.

# Text columns: use a placeholder category (a crude choice)
car_sale_missing["Make"] = car_sale_missing["Make"].fillna("missing")
car_sale_missing["Colour"] = car_sale_missing["Colour"].fillna("missing")

# Numeric column: use the column mean
car_sale_missing["Odometer (KM)"] = car_sale_missing["Odometer (KM)"].fillna(
    car_sale_missing["Odometer (KM)"].mean()
)

# Doors: fill with 4
car_sale_missing["Doors"] = car_sale_missing["Doors"].fillna(4)

car_sale_missing.isna().sum()  # remaining missing values per column

In [None]:
# Remove rows with a missing price value. dropna() drops every row that still
# contains any missing value, in place on the same DataFrame object.
car_sale_missing.dropna(axis=0, how="any", inplace=True)


In [None]:
from sklearn.model_selection import train_test_split

# "Price" is the label; all remaining columns are features.
y = car_sale_missing["Price"]
x = car_sale_missing.drop(columns="Price")

# Hold out 20% of the rows for testing.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
)

In [None]:
# Turn categorical variables into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Columns to one-hot encode; the remaining feature columns are passed through.
catogerical_features = ["Make", "Colour", "Doors"]

one_hot = OneHotEncoder()

transformer = ColumnTransformer([("one_hot", one_hot, catogerical_features)], remainder="passthrough")

# Encode the feature frame `x` only. Passing the whole `car_sale_missing` frame
# would let remainder="passthrough" carry the "Price" label into the feature
# matrix, which is target leakage.
transformed_x = transformer.fit_transform(x)

transformed_x


## Fill missing data with scikit-learn

In [None]:
# Reload the raw data with missing values for the scikit-learn imputation route.
car_sale_missing2 = pd.read_csv(filepath_or_buffer="csv/car-sales-extended-missing-data.csv")

# Count the missing values in each column.
car_sale_missing2.isna().sum(axis=0)

In [None]:
# Drop rows that have no label ("Price"); rows missing only feature values are kept.
car_sale_missing2.dropna(axis=0, subset=["Price"], inplace=True)

# Missing values that remain in each column.
car_sale_missing2.isna().sum(axis=0)

In [None]:
# Split into x/y: the label column first, then everything else as features.
y = car_sale_missing2["Price"]  # labels
x = car_sale_missing2.drop(columns="Price")  # features

In [None]:
# Fill missing values with scikit-learn
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Column groups, one per imputation strategy.
catogerical_features = ["Make", "Colour"]
door_features = ["Doors"]
numerical_features = ["Odometer (KM)"]

# Text columns get the constant "missing", doors get the constant 4,
# and the numeric column gets its mean.
catogerical_imputer = SimpleImputer(strategy="constant", fill_value="missing")
door_imputer = SimpleImputer(strategy="constant", fill_value=4)
numerical_imputer = SimpleImputer(strategy="mean")

# Create an imputer (something that fills missing data).
# Each entry is (name of the imputer, imputer, features).
imputer = ColumnTransformer(
    transformers=[
        ("catogerical_imputer", catogerical_imputer, catogerical_features),
        ("door_imputer", door_imputer, door_features),
        ("numerical_imputer", numerical_imputer, numerical_features),
    ]
)

# Transform the data. NOTE(review): this fits the imputers on the full feature
# set `x`, i.e. before any train/test split.
filled_x = imputer.fit_transform(x)

filled_x



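Let's fill the missing values. Ideally we fill the training and test values separately to ensure training data stays with the training data and test data stays with the test data. (Note: for simplicity, the imputation cell in this section calls `fit_transform()` on the full feature set `x` before the train/test split, so it does not yet follow this advice.)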

**Note:** We use `fit_transform()` on the training data and `transform()` on the testing data. In essence, we learn the patterns in the training set and transform it via imputation (fit, then transform). Then we take those same patterns and fill the test set (transform only).

In [None]:
# Wrap the imputed array in a DataFrame. The column order matches the order of
# the column groups given to the imputer above.
car_sales_filled = pd.DataFrame(
    data=filled_x,
    columns=["Make", "Colour", "Doors", "Odometer (KM)"],
)

# Count the missing values that remain in each column.
car_sales_filled.isna().sum(axis=0)

In [None]:
# Turn categorical variables into numbers
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

catogerical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()  # turns each category into its own 0/1 column

# One-hot encode the categorical columns; pass the remaining columns through as-is.
transformer = ColumnTransformer(
    transformers=[("one_hot", one_hot, catogerical_features)],
    remainder="passthrough",
)

transformed_x = transformer.fit_transform(car_sales_filled)

transformed_x


In [None]:
# The data is now numeric and has no missing values, so a model can be fitted.
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)

x_train, x_test, y_train, y_test = train_test_split(
    transformed_x,
    y,
    test_size=0.2,
)

model = RandomForestRegressor(n_estimators=20)

model.fit(x_train, y_train)

# Score on the held-out test split.
model.score(x_test, y_test)


In [None]:
# Show which scikit-learn version this notebook ran with.
import sklearn

print(f"{sklearn.__version__}")


## 2. Choosing the right estimator/algorithm for your problem

Some things to note:

* Sklearn refers to machine learning models and algorithms as estimators
* Classification problem - predicting a category (heart disease or not)
	* Sometimes you'll see `clf` (classifier) used as a classification estimator
* Regression problem - predicting a number (selling price of a car)

* Step 1 - Check the sklearn machine learning map... https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html

### 2.1 Picking a machine learning model for a regression problem

let's use the California housing dataset



In [None]:
from sklearn.datasets import fetch_california_housing

# Fetch the California housing dataset as a dict-like bundle.
house_data = fetch_california_housing()


In [None]:
# Put the feature matrix into a DataFrame with named columns, then preview it.
house_df = pd.DataFrame(
    data=house_data["data"],
    columns=house_data["feature_names"],
)
house_df.head(n=5)

In [None]:
# Check for missing values: count them per column.
house_df.isna().sum(axis=0)

In [None]:
# Add the target values to the frame as a "target" column, then preview it.
house_df["target"] = house_data["target"]
house_df.head(n=5)

In [None]:
from sklearn.model_selection import train_test_split

# Split into x/y: "target" is the label, all other columns are features.
y = house_df["target"]
x = house_df.drop(columns="target")

# Split into train/test (20% held out for testing).
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
)




In [None]:
# Build a machine learning model: a random forest regressor with default settings.
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()
model.fit(x_train, y_train)

# Score on the held-out test split.
model.score(x_test, y_test)

In [None]:
# Try another model: Ridge (a linear model) with default settings.
from sklearn.linear_model import Ridge

model=Ridge()

model.fit(x_train,y_train)

model.score(x_test,y_test) # for regressors score() returns R^2 (coefficient of determination), not accuracy

# R^2 compares the model's predictions with always predicting the mean of the targets: 1.0 is a perfect fit, 0.0 is no better than the mean



In [None]:
# Let's try Lasso (another linear model) with default settings.
from sklearn.linear_model import Lasso

model=Lasso()

model.fit(x_train,y_train)

model.score(x_test,y_test) # for regressors score() returns R^2 (coefficient of determination), not accuracy

## Let's do a classification problem

In [None]:
# Reload the heart-disease dataset for the classification examples.
heart_disease = pd.read_csv(filepath_or_buffer="csv/heart-disease.csv")

In [None]:
# Check for missing values: count them per column.
heart_disease.isna().sum(axis=0)

In [None]:
# "target" is the label; every other column is a feature.
y = heart_disease["target"]
x = heart_disease.drop(columns="target")

In [None]:
from sklearn.model_selection import train_test_split

# Split into train/test (20% held out for testing).
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
)

In [None]:
# Build a machine learning model.
# First try: RandomForestClassifier with default settings.
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
model.fit(x_train, y_train)

# Score on the held-out test split.
model.score(x_test, y_test)

In [None]:
# Second try: LinearSVC with default settings.
from sklearn.svm import LinearSVC

model = LinearSVC()
model.fit(x_train, y_train)

# Score on the held-out test split.
model.score(x_test, y_test)

## 3. Fit the model / algorithm on our data and use it to make predictions
### 3.1 Fitting the model to the data

Different names for:
* `X` = features, features variables, data
* `y` = labels, targets, target variables


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Set up the random seed
np.random.seed(42)

# Make the data
y = heart_disease["target"]  # labels
x = heart_disease.drop(columns="target")  # features

# Split into train/test
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
)

# Choose the model and its hyperparameters.
# Apart from n_estimators, the default hyperparameters are kept.
model = RandomForestClassifier(n_estimators=100)

# Fit the model to the training data
model.fit(x_train, y_train)

# Evaluate the model on the test data
model.score(x_test, y_test)


What's happening here?

Calling the fit() method will cause the machine learning algorithm to attempt to find patterns between X and y. Or if there's no y, it'll only find the patterns within X.

Let's see X.

Passing X and y to fit() will cause the model to go through all of the examples in X (data) and see what their corresponding y (label) is.

How the model does this is different depending on the model you use.

Explaining the details of each would take an entire textbook.

For now, you could imagine it similar to how you would figure out patterns if you had enough time.

You'd look at the feature variables, X, the age, sex, chol (cholesterol) and see what different values led to the labels, y, 1 for heart disease, 0 for not heart disease.

This concept, regardless of the problem, is similar throughout all of machine learning.

*During training (finding patterns in data):*

A machine learning algorithm looks at a dataset, finds patterns, tries to use those patterns to predict something and corrects itself as best it can with the available data and labels. It stores these patterns for later use.

*During testing or in production (using learned patterns):*

A machine learning algorithm uses the patterns it has previously learned in a dataset to make a prediction on some unseen data.



## 3.2 Making predictions using a machine learning model

Now that we've got a trained model, one which has hopefully learned patterns in the data, you'll want to use it to make predictions.

Scikit-Learn enables this in several ways. Two of the most common and useful are `predict()` and `predict_proba()`.

Let's see them in action.

In [None]:
# Use the trained model to make predictions

model.predict(x_test) # x_test has the same feature columns the model was fitted on in the cell above

In [None]:
# Show the true labels and the predictions side by side, one row per test sample.
# (pd.DataFrame's second positional argument is `index`, so passing y_test there
# would turn the labels into row labels instead of a column to compare against.)
pd.DataFrame(
    {"Actual": np.asarray(y_test), "Prediction": model.predict(x_test)},
    index=x_test.index,
)

In [None]:
# Compare predictions to the truth labels: the fraction of matches evaluates the model.
y_preds = model.predict(x_test)

(y_preds == y_test).mean()

In [None]:
# Find the wrong predictions: print every (truth, prediction) pair that disagrees.
y_preds = model.predict(x_test)
y_test = np.array(y_test)  # note: rebinds y_test to a plain NumPy array

for truth, guess in zip(y_test, y_preds):
    if truth != guess:
        print(f"True:{truth},Predicted:{guess}")

In [None]:
# The same comparison using scikit-learn's metric function.
from sklearn.metrics import accuracy_score

accuracy_score(y_true=y_test, y_pred=y_preds)



In [None]:
## Making predictions with predict_proba()

# predict_proba() returns the probabilities of each classification label;
# here for the first five test samples.
model.predict_proba(x_test.head(5))


In [None]:
# Predicted labels for the same first five test samples, for comparison.
model.predict(x_test.head(5))

## Let's predict for a regression problem
`predict()` can also be used for regression models.

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Split the data: "target" is the label, the remaining columns are features.
y = house_df["target"]
x = house_df.drop(columns="target")

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
)

# Fit the model
model = RandomForestRegressor()
model.fit(x_train, y_train)

# Score on the held-out test split
model.score(x_test, y_test)



In [None]:
# Predict house prices for the test features and show the first ten predictions.
y_preds = model.predict(x_test)
y_preds[0:10]

In [None]:
# First ten true target values, as a plain array, for comparison with the predictions.
np.array(y_test.head(10))

In [None]:
# Compare the predictions to the truth using the mean absolute error.
from sklearn.metrics import mean_absolute_error

mean_absolute_error(y_true=y_test, y_pred=y_preds)

## 4. Evaluating a machine learning model

Three ways to evaluate Scikit-Learn models/estimators:

1. Estimator's built-in `score()` method

2. The `scoring` parameter

3. Problem-specific metric functions

you can read more about it here: https://scikit-learn.org/stable/modules/model_evaluation.html

### 4.1 Evaluate model using `scoring` parameter

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

np.random.seed(42)

# "target" is the label; every other column is a feature.
y = heart_disease["target"]
x = heart_disease.drop(columns="target")
clf = RandomForestClassifier()

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
)

# Fit the model
clf.fit(x_train, y_train)

# Score on the single held-out test split
clf.score(x_test, y_test)


0.8524590163934426

In [18]:
# 5-fold cross-validation on the full data: one score per fold.
cross_val_score(estimator=clf, X=x, y=y, cv=5)

array([0.81967213, 0.86885246, 0.81967213, 0.78333333, 0.76666667])

In [49]:
# Score from the single training/test split
clf_single_score = clf.score(x_test, y_test)

# Mean of the 5-fold cross-validation scores (cv -> number of splits to test)
clf_cross_val_score = cross_val_score(clf, x, y, cv=5).mean()

# Compare the two
clf_single_score, clf_cross_val_score

(0.8524590163934426, 0.8183606557377049)