In [1]:
# Ordered outline of the topics covered, one entry per section.
What_we_going_to_cover = [
    "0. An end-to-end Scikit-Learn workflow",
    "1. Getting the data ready",
    "2. Choose the right estimator/algorithm for our problems",
    "3. Fit the model/algorithm and use it to make predictions on our data",
    "4. Evaluating a model",
    "5. Improve a model",
    "6. Save and load a trained model",
    "7. Putting it all together!",
]

In [2]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## 1. Getting the data ready

Three main things we have to do:

    1. Split the data into features and labels (usually 'X' & 'y')
    2. Fill (also called imputing) or disregard missing values
    3. Convert non-numerical values to numerical values (also called feature encoding)
    

In [38]:
# Load the heart-disease dataset from a CSV file (relative path) into a DataFrame.
heart_disease = pd.read_csv("csv/heart-disease.csv")

In [39]:
# Separate the column to predict from the columns used to predict it.
y = heart_disease["target"]  # labels
x = heart_disease.drop(columns="target")  # features: every column except "target"

## Split the data into training and test sets


In [44]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. A fixed random_state makes the shuffle
# deterministic, so restarting the kernel and re-running yields the same split.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
269,56,1,0,130,283,1,0,103,1,1.6,0,0,3
6,56,0,1,140,294,0,0,153,0,1.3,1,0,2
191,58,1,0,128,216,0,0,131,1,2.2,1,3,3
140,51,0,2,120,295,0,0,157,0,0.6,2,0,2
133,41,1,1,110,235,0,1,153,0,0.0,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,63,1,0,140,187,0,0,144,1,4.0,2,2,3
203,68,1,2,180,274,1,0,150,1,1.6,1,0,3
218,65,1,0,135,254,0,0,127,0,2.8,1,1,3
200,44,1,0,110,197,0,0,177,0,0.0,2,1,2


In [45]:
# Shapes of the four splits, in the order (x_train, x_test, y_train, y_test).
# The train parts hold 80% of the data, the test parts the remaining 20%.
tuple(split.shape for split in (x_train, x_test, y_train, y_test))

((242, 13), (61, 13), (242,), (61,))

In [47]:
# Display the training features produced by the split.
x_train

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
269,56,1,0,130,283,1,0,103,1,1.6,0,0,3
6,56,0,1,140,294,0,0,153,0,1.3,1,0,2
191,58,1,0,128,216,0,0,131,1,2.2,1,3,3
140,51,0,2,120,295,0,0,157,0,0.6,2,0,2
133,41,1,1,110,235,0,1,153,0,0.0,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,63,1,0,140,187,0,0,144,1,4.0,2,2,3
203,68,1,2,180,274,1,0,150,1,1.6,1,0,3
218,65,1,0,135,254,0,0,127,0,2.8,1,1,3
200,44,1,0,110,197,0,0,177,0,0.0,2,1,2


## 1.1 Make sure it's all numerical

In [3]:

# Read the car-sales CSV into a DataFrame, then preview its first five rows.
car_sales = pd.read_csv("csv/car-sales-extended-missing-data.csv")
car_sales.head(n=5)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [4]:
# A notebook cell only renders its final expression, so the dtypes are shown
# explicitly with display() (injected by IPython) instead of being discarded.
display(car_sales.dtypes)
car_sales.isna().sum()  # number of missing values per column

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [5]:
# Split into x/y
from sklearn.model_selection import train_test_split

x = car_sales.drop("Price", axis=1)  # features
y = car_sales["Price"]  # labels

# Split into train/test: 20% of the rows are held out for testing. A fixed
# random_state makes the shuffle deterministic across notebook re-runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [11]:
# Build a machine learning model
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()

# Fitting on the raw frame is expected to fail with
# "could not convert string to float: 'Toyota'": the string columns must be
# converted to numbers first. The error is caught and printed so the
# demonstration stays visible without halting a full notebook run.
try:
    model.fit(x_train, y_train)
except ValueError as err:
    print(f"ValueError: {err}")
else:
    # Only reached once the features are fully numeric.
    print(model.score(x_test, y_test))


ValueError: could not convert string to float: 'Toyota'

In [8]:
# Turn categorical variables into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Columns that are one-hot encoded.
catogerical_features = ["Make", "Colour", "Doors"]

one_hot = OneHotEncoder()  # turns categories into numbers

# Each entry is a (name, transformer, columns) tuple naming the transformer to
# apply to a subset of the data; remainder="passthrough" keeps the columns that
# are not listed.
transformer = ColumnTransformer(
    transformers=[("one_hot", one_hot, catogerical_features)],
    remainder="passthrough",
)

transformed_x = transformer.fit_transform(x)

transformed_x


<1000x16 sparse matrix of type '<class 'numpy.float64'>'
	with 4000 stored elements in Compressed Sparse Row format>

In [10]:
# Preview the encoded features, then show the original frame for comparison.
# The encoder output may be a SciPy sparse matrix, which pd.DataFrame does not
# unpack into numeric columns, so it is converted to a dense array first.
# display() (injected by IPython) is required because a cell only renders its
# final expression automatically.
transformed_x_dense = (
    transformed_x.toarray() if hasattr(transformed_x, "toarray") else transformed_x
)
display(pd.DataFrame(transformed_x_dense).head())
x


Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,35431.0,4.0
1,BMW,Blue,192714.0,5.0
2,Honda,White,84714.0,4.0
3,Toyota,White,154365.0,4.0
4,Nissan,Blue,181577.0,3.0
...,...,...,...,...
995,Toyota,Black,35820.0,4.0
996,,White,155144.0,3.0
997,Nissan,Blue,66604.0,4.0
998,Honda,White,215883.0,4.0
