# Introduction to scikit-learn

This notebook demonstrates some of the most useful functions of the beautiful Scikit-Learn library.

What we're going to cover


0. An end-to-end Scikit-Learn workflow
1. Getting the data ready
2. Choose the right estimator/algorithm for our problem
3. Fit the model/algorithm and use it to make predictions on our data
4. Evaluating a model
5. Improve a model
6. Save and load a trained model
7. Putting it all together!

In [None]:
# Let's listify the contents
what_were_covering = [
    "0. An end-to-end Scikit-Learn workflow",
    "1. Getting the data ready",
    "2. Choose the right estimator/algorithm for our problems",
    "3. Fit the model/algorithm and use it to make predictions on our data",
    "4. Evaluating a model",
    "5. Improve a model",
    "6. Save and load a trained model",
    "7. Putting it all together!"]

## 0. An end-to-end Scikit-Learn workflow

In [None]:
# 1. Get the data ready
import pandas as pd
import numpy as np

heart_disease = pd.read_csv("heart-disease.csv")
heart_disease.head()

In [None]:
# Create X (feautres matrix)
X =  heart_disease.drop("target", axis=1)

# Create y (labels)
y = heart_disease["target"]

In [None]:
# 2. Choose the right model and hyperparameters
# This is a classification problem because we want to determine if X = heart disease

from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators = 90)

# We'll keep the default hyperparameters
clf.get_params()

In [None]:
# Fit the model to the data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
clf.fit(X_train, y_train);
X_train.head()

In [None]:
# Make a prediction
y_label = clf.predict(np.array([0, 2, 3, 4]))

In [None]:
y_preds = clf.predict(X_test)

In [None]:
y_preds

In [None]:
y_test.head()

In [None]:
# 4. Evaluate the model on the training data and test data
clf.score(X_train, y_train)

In [None]:
clf.score(X_test, y_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(classification_report(y_test, y_preds))

In [None]:
confusion_matrix(y_test, y_preds)

In [None]:
accuracy_score(y_test, y_preds)

In [None]:
# 5. Improve a model
# Try different amount of n_estimators

np.random.seed(10)
for i in range(10, 100, 10):
    print("Trying model with {} estimators".format(i))
    clf = RandomForestClassifier(n_estimators=i).fit(X_train, y_train)
    print("Model accuracy on test set: {} %".format(clf.score(X_test, y_test)))
    print()

In [None]:
# 6. Save a model and load it
import pickle

pickle.dump(clf, open("random-forest-model1.pkl", "wb"))

In [None]:
loaded_model = pickle.load(open("Random-forest-model1.pkl", "rb"))
loaded_model.score(X_test, y_test)

# Retry again

In [None]:
heart_data = pd.read_csv("heart-disease.csv")
heart_data.head()

In [None]:
X = heart_data.drop("target", axis=1)
X

In [None]:
y = heart_data["target"]
y

In [None]:
clf = RandomForestClassifier()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
clf.fit(X_train, y_train);

In [None]:
y_pred = clf.predict(X_test)
y_pred

In [None]:
clf.score(X_train, y_train)

In [None]:
clf.score(X_test, y_test)

# Retry again

In [None]:
heart_data = pd.read_csv("heart-disease.csv")
X = heart_data.drop("target", axis=1)
y = heart_data["target"]

clf = RandomForestClassifier()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

clf.score(X_test, y_test)

In [None]:
what_were_covering

# 1. Getting your data ready

Three main things we have to do:
    1. Split the data into features and labels (Usually "X" and "y")
    2. Filling (also called imputing) or disregarding missing values
    3. Converting non-numerical values into numerical values (also called feature encoding)

In [None]:
heart_disease.head()

In [None]:
X = heart_disease.drop("target", axis=1)
X.head()

In [None]:
y = heart_disease["target"]
y.head()

In [None]:
# Split the data into training and test sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape # Important, make sure shapes match.

# 1.1 Make sure its all numerical

In [None]:
car_sales = pd.read_csv("car-sales-extended.csv")
car_sales.head()

In [None]:
len(car_sales)

In [None]:
car_sales.dtypes

In [None]:
# Split the data into x and y
X = car_sales.drop("Price", axis=1)
X.head()

In [None]:
y = car_sales["Price"]
y.head()

In [None]:
# SPlit into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Build machine learning model
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()
model.fit(X_train, y_train) # FIt on training data
model.score(X_test, y_test) # Evaluate on test data

In [None]:
# Turn the categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",
                                   one_hot,
                                   categorical_features)],
                                   remainder="passthrough")
transformed_x = transformer.fit_transform(X)
transformed_x

In [None]:
X.head()

In [None]:
pd.DataFrame(transformed_x).head()

In [None]:
# dummies = pd.get_dummies(car_sales[["Make", "Colour", "Doors"]])
# dummies.head()

In [None]:
# Lets try refit the model
np.random.seed(42)

X_train, X_test, y_train, y_test = train_test_split(transformed_x,
                                                   y,
                                                   test_size=0.2)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
model.fit(X_train, y_train);

In [None]:
model.score(X_test, y_test)

# 1.2 What if there were missing values?
1. Fill them with some value (also known as imputation)
2. Remove the samples with missing data altogether

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Import car sales missing data
missing_data = pd.read_csv("car-sales-extended-missing-data.csv")
missing_data.head()
missing_data.dtypes

In [None]:
missing_data.isna().sum()

In [None]:
# Create X and y
X = missing_data.drop("Price", axis=1)
X.head()

In [None]:
y = missing_data["Price"]
y.head()

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer


categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",
                                 one_hot,
                                 categorical_features)],
                               remainder="passthrough")

transformed_x = transformer.fit_transform(X)

In [None]:
X.isna().sum()

#### Option 1: Fill missing  data with pandas

In [None]:
# Fill the "Make" column
missing_data["Make"].fillna("missing", inplace=True)

# Fill the "Color" column
missing_data["Colour"].fillna("missing", inplace=True)

# Fill missing "Odometer (KM)" with mean of Odometer
missing_data["Odometer (KM)"].fillna(missing_data["Odometer (KM)"].mean(), inplace=True)

# Fill the "Doors" column with the average of doors
missing_data["Doors"].value_counts()
missing_data["Doors"].fillna(4, inplace=True)

# CHeck our dataframe again
missing_data.isna().sum()

In [None]:
# Remove rows with missing price value
missing_data.dropna(inplace=True)
missing_data.isna().sum()

In [None]:
len(missing_data)

In [None]:
X = missing_data.drop("Price", axis=1)
y = missing_data["Price"]

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",
                                 one_hot,
                                 categorical_features)],
                               remainder="passthrough")

transformed_X = transformer.fit_transform(X)
transformed_X

In [None]:
import pandas as pd
import numpy as np

In [None]:
heart_disease = pd.read_csv("heart-disease.csv")
heart_disease.head()

In [None]:
X = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

In [None]:
heart_disease.isna().sum()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
clf.fit(X_train, y_train);

In [None]:
clf.score(X_train, y_train)

In [None]:
clf.score(X_test, y_test)

# Try again making data numerical and running ML

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Get the data ready
car_sales = pd.read_csv("car-sales-extended.csv")
car_sales.head(), len(car_sales), car_sales.dtypes

In [None]:
X = car_sales.drop("Price", axis=1)
y = car_sales["Price"]

In [None]:
feature_data = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",
                                 one_hot,
                                 feature_data)],
                               remainder="passthrough")

transformed_X = transformer.fit_transform(X)
X_pd = pd.DataFrame(data=transformed_X)
X_pd.head()
# model = RandomForestRegressor()
# transformed_X

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_pd, y, test_size=0.2)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
model.fit(X_train, y_train)
model.score(X_train, y_train)

In [None]:
model.score(X_test, y_test)

# Restart

In [None]:
what_were_covering

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Getting the data ready
missing_data = pd.read_csv("car-sales-extended-missing-data.csv")
missing_data.head(), len(missing_data), missing_data.dtypes

In [None]:
missing_data.isna().sum()

In [None]:
# FIll missing data in "Make", "Color", "Odometer", "Doors" with "missing" and drop missing data in price
# Make string data numerical

missing_data["Make"].fillna("missing", inplace=True)
missing_data["Colour"].fillna("missing", inplace=True)
missing_data["Odometer (KM)"].fillna(missing_data["Odometer (KM)"].mean(), inplace=True)
missing_data["Doors"].fillna(4, inplace=True)

missing_data.dropna(inplace=True)

In [None]:
missing_data.isna().sum()

In [None]:
# Now that all missing data has been sorted. Sort the data into X and y
X = missing_data.drop("Price", axis=1)
y = missing_data["Price"]

In [None]:
X.head(), y.head()

In [None]:
# Turn the data into numerical data
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

feature_data = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",
                                 one_hot,
                                 feature_data)],
                               remainder="passthrough",
                               sparse_threshold=0)

transformed_X = transformer.fit_transform(X)
X_pd = pd.DataFrame(transformed_X)
X_pd.head()

In [None]:
# Fit the model. We are trying to get an estimate of price
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_estimators=60)
X_train, X_test, y_train, y_test = train_test_split(X_pd, y, test_size=0.2)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Train the data
model.fit(X_train, y_train);

In [None]:
# Test the trained data
model.score(X_train, y_train)

In [None]:
# Test the test data
model.score(X_test, y_test)

In [None]:
for i in range(10, 100, 25):
    model = RandomForestRegressor(n_estimators=i)
    X_train, X_test, y_train, y_test = train_test_split(X_pd, y, test_size=0.2)
    model.fit(X_train, y_train);
    model.score(X_train, y_train)
    print("Testing {} estimators: Result = {}".format(i, model.score(X_test, y_test)))

# Option 2. FIll missing values with scikit-learn

In [15]:
missing_data = pd.read_csv("car-sales-extended-missing-data.csv")
missing_data.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [16]:
missing_data

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [17]:
# Drop the rows with no labeks
missing_data.dropna(subset=["Price"], inplace=True)
missing_data.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [18]:
missing_data

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [19]:
X = missing_data.drop("Price", axis=1)
y = missing_data["Price"]

In [20]:
missing_data

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [21]:
# Fill missing values with scikit-learn
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Fill categorical values with missing and numerical values with mean
cat_imputer = SimpleImputer(strategy="constant", fill_value="missing")
door_imputer = SimpleImputer(strategy="constant", fill_value=4)
num_imputer = SimpleImputer(strategy="mean")

# Define columns
cat_feature = ["Make", "Colour"]
door_feature = ["Doors"]
num_feature = ["Odometer (KM)"]

# Create an imputer (Something that fills missing data)
imputer = ColumnTransformer([
    ("cat_imputer", cat_imputer, cat_feature),
    ("door_imputer", door_imputer, door_feature),
    ("num_imputer", num_imputer, num_feature)
])

# Transform the data
filled_X = imputer.fit_transform(X)
filled_X

array([['Honda', 'White', 4.0, 35431.0],
       ['BMW', 'Blue', 5.0, 192714.0],
       ['Honda', 'White', 4.0, 84714.0],
       ...,
       ['Nissan', 'Blue', 4.0, 66604.0],
       ['Honda', 'White', 4.0, 215883.0],
       ['Toyota', 'Blue', 4.0, 248360.0]], dtype=object)

In [None]:
X_pd = pd.DataFrame(filled_X,
                    columns=["Make", "Colour", "Doors", "Odometer (KM)"])
X_pd.head()

In [None]:
X_pd.isna().sum()

In [None]:
# Convert catagorical value to numerical
from sklearn.preprocessing import OneHotEncoder


cat_data = ["Make", "Colour", "Doors"]
hot_one = OneHotEncoder()
transformer = ColumnTransformer([("hot_one", hot_one, cat_data)],
                                remainder="passthrough",
                               sparse_threshold=0)

transform_X = transformer.fit_transform(X_pd)
transform_X

In [None]:
# Now that our data is numbers and filled (No missing values). Fit a model
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(transform_X, y, test_size=0.2)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
model = RandomForestRegressor()
model.fit(X_train, y_train)
model.score(X_test, y_test)

# Retry

In [None]:
what_were_covering

In [None]:
# Getting the data ready
# Import the data
# Check for missing data
# Make categorical data numerical
# Choose the right model
# fit the model
# Make predictions

In [62]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

In [72]:
# 1. import the data
missing_data = pd.read_csv("car-sales-extended-missing-data.csv")
missing_data.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [64]:
# 2. check for missing data
missing_data.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [73]:
# Drop data with no labels
missing_data.dropna(subset=["Price"], inplace=True)
missing_data.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [74]:
# Split the data
X = missing_data.drop("Price", axis=1)
y = missing_data["Price"]

In [75]:
X.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
dtype: int64

In [76]:
# Fix the missing data in X
cat_imputer = SimpleImputer(strategy="constant", fill_value="missing")
mean_imputer = SimpleImputer(strategy="mean")
door_imputer = SimpleImputer(strategy="constant", fill_value=4)

cat_feature = ["Make", "Colour"]
mean_feature = ["Odometer (KM)"]
door_feature = ["Doors"]

imputer = ColumnTransformer([
    ("cat_imputer", cat_imputer, cat_feature),
    ("mean_imputer", mean_imputer,mean_feature),
    ("door_imputer", door_imputer, door_feature)])

filled_X = imputer.fit_transform(X)
filled_X[:5]

array([['Honda', 'White', 35431.0, 4.0],
       ['BMW', 'Blue', 192714.0, 5.0],
       ['Honda', 'White', 84714.0, 4.0],
       ['Toyota', 'White', 154365.0, 4.0],
       ['Nissan', 'Blue', 181577.0, 3.0]], dtype=object)

In [77]:
# Put data back into a dataframe
X_pd = pd.DataFrame(filled_X, columns=["Make", "Colour", "Odometer (KM)", "Doors"])

In [78]:
X_pd.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,35431.0,4.0
1,BMW,Blue,192714.0,5.0
2,Honda,White,84714.0,4.0
3,Toyota,White,154365.0,4.0
4,Nissan,Blue,181577.0,3.0


In [79]:
# Change categorical data into numerical data

categorical_data = ["Make", "Colour", "Odometer (KM)", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot", one_hot, categorical_data)], remainder="passthrough", sparse_threshold=0)

transformed_X = transformer.fit_transform(X_pd)

In [80]:
# Sort data into training and test data
X_train, X_test, y_train, y_test = train_test_split(transformed_X, y, test_size=0.2)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((760, 915), (190, 915), (760,), (190,))

In [82]:
model = RandomForestRegressor()
model.fit(X_train, y_train);

In [83]:
model.score(X_train, y_train)

0.8272852712890146

In [84]:
model.score(X_test, y_test)

0.08859473548157593