## Import relevant libraries

In [47]:
import numpy as np
import pandas as pd 

## 1. Getting our data ready to be used with machine learning
**3 main things we have to do:**
   1. Split the data into features and labels (usually "x" , "y")
   2. Filling (also called imputing) or disregarding missing values
   3. Converting non-numerical values to numerical values (also called feature encoding)

In [48]:
# Load the heart-disease dataset and preview its first five rows.
heart_disease = pd.read_csv(filepath_or_buffer="../../data/heart-disease.csv")
heart_disease.head(5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [49]:
# Features: every column except the "target" label.
x = heart_disease.drop(columns=["target"])
x.head(5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2


In [50]:
# Labels: the "target" column on its own.
y = heart_disease.loc[:, "target"]
y

0      1
1      1
2      1
3      1
4      1
      ..
298    0
299    0
300    0
301    0
302    0
Name: target, Length: 303, dtype: int64

In [51]:
# Split the data into training and test sets (80% train, the rest test).
# random_state pins the shuffle so the same rows land in each set on every
# run; without it the split changes each time the cell is executed.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, random_state=42)

In [52]:
# Shapes of the four pieces returned by train_test_split, in the same order.
tuple(part.shape for part in (x_train, y_train, x_test, y_test))

((242, 13), (242,), (61, 13), (61,))

In [53]:
# Full feature shape next to 80% of the row count, to compare with the training split size.
x.shape, 0.8 * len(x)

((303, 13), 242.4)

In [54]:
# Total number of rows in the heart-disease data.
heart_disease.shape[0]

303

### Make sure all features are numerical

In [55]:
# Load the extended car-sales dataset (contains text columns as well as numbers).
car_sales = pd.read_csv(filepath_or_buffer="../../data/car-sales-extended.csv")

In [56]:
# Preview the first five rows.
car_sales.head(5)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043


In [57]:
# Total number of rows in the car-sales data.
car_sales.shape[0]

1000

In [58]:
# Show each column's dtype to see which columns are text (object) and which are numeric.
car_sales.dtypes

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
Price             int64
dtype: object

In [59]:
# Separate the "Price" label (y) from the remaining feature columns (x).
y = car_sales["Price"]
x = car_sales.drop(columns="Price")

In [60]:
# Preview the feature columns.
x.head(5)

Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,35431,4
1,BMW,Blue,192714,5
2,Honda,White,84714,4
3,Toyota,White,154365,4
4,Nissan,Blue,181577,3


In [61]:
# Encode the categorical columns as one-hot indicator columns.
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# "Doors" is stored as a number but is encoded as a category here too.
categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()

# remainder="passthrough" keeps the columns not listed above unchanged.
transformer = ColumnTransformer(
    transformers=[("one_hot", one_hot, categorical_features)],
    remainder="passthrough",
)
transformed_x = transformer.fit_transform(x)
transformed_x

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 3.54310e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+00, 1.92714e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 8.47140e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 6.66040e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.15883e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.48360e+05]])

In [62]:
# Alternative: pd.get_dummies. As the output shows, "Doors" is left unencoded
# because it is already a numeric column; only "Make" and "Colour" are expanded.
dummies = pd.get_dummies(car_sales.loc[:, ["Make", "Colour", "Doors"]])
dummies

Unnamed: 0,Doors,Make_BMW,Make_Honda,Make_Nissan,Make_Toyota,Colour_Black,Colour_Blue,Colour_Green,Colour_Red,Colour_White
0,4,0,1,0,0,0,0,0,0,1
1,5,1,0,0,0,0,1,0,0,0
2,4,0,1,0,0,0,0,0,0,1
3,4,0,0,0,1,0,0,0,0,1
4,3,0,0,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...
995,4,0,0,0,1,1,0,0,0,0
996,3,0,0,1,0,0,0,0,0,1
997,4,0,0,1,0,0,1,0,0,0
998,4,0,1,0,0,0,0,0,0,1


In [63]:
# Seed NumPy's global random generator so the split below is repeatable.
np.random.seed(42)
# Split the encoded features and the labels into training and test sets (80% train).
x_train, x_test, y_train, y_test = train_test_split(transformed_x, y, train_size=0.8)

In [64]:
# Fit a random-forest regressor with default settings on the training split.
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()
model.fit(X=x_train, y=y_train)

RandomForestRegressor()

In [65]:
# Score the fitted model on the held-out test split.
model.score(X=x_test, y=y_test)

0.3235867221569877

### What if there were missing values? 
    1. Fill them with some value (also known as imputation)
    2. Remove the samples with missing data altogether

In [66]:
# Load the car-sales dataset that contains missing values and preview it.
car_sales_missing = pd.read_csv(filepath_or_buffer="../../data/car-sales-extended-missing-data.csv")
car_sales_missing.head(5)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [67]:
# Count the missing values in each column.
car_sales_missing.isnull().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [68]:
# Separate the "Price" label (y) from the remaining feature columns (x).
y = car_sales_missing["Price"]
x = car_sales_missing.drop(columns="Price")

In [69]:
# Set up a one-hot encoder for the categorical columns (built here, not yet applied).
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

one_hot = OneHotEncoder()
categorical_features = ["Make", "Colour", "Doors"]

transformer = ColumnTransformer(
    transformers=[("one_hot", one_hot, categorical_features)],
    remainder="passthrough",
)

In [70]:
# Mean number of doors, rounded to a whole number (used below as the fill value).
np.round(car_sales_missing["Doors"].mean()).astype(int)

4

#### Option 1: Filling missing data with Pandas

In [71]:
# Fill the missing values column by column.
# Each filled column is assigned back to the DataFrame instead of calling
# fillna(inplace=True) on the selected column: that form acts on a temporary
# Series, raises a chained-assignment warning in recent pandas, and leaves the
# DataFrame unchanged once Copy-on-Write is enabled.

# Text columns: mark the gaps with the literal string "missing".
car_sales_missing["Make"] = car_sales_missing["Make"].fillna("missing")
car_sales_missing["Colour"] = car_sales_missing["Colour"].fillna("missing")

# Odometer: use the column mean.
car_sales_missing["Odometer (KM)"] = car_sales_missing["Odometer (KM)"].fillna(
    car_sales_missing["Odometer (KM)"].mean()
)

# Doors: use the mean rounded to a whole number, then store the column as integers.
car_sales_missing["Doors"] = (
    car_sales_missing["Doors"]
    .fillna(car_sales_missing["Doors"].mean().round().astype(int))
    .astype(int)
)

In [72]:
# Re-count the missing values per column after the fills.
car_sales_missing.isnull().sum()

Make              0
Colour            0
Odometer (KM)     0
Doors             0
Price            50
dtype: int64

In [73]:
# Drop every row that still contains a missing value. With no subset given,
# a missing value in any column removes the row.
car_sales_missing.dropna(axis="index", how="any", inplace=True)

In [74]:
# Re-count the missing values per column after dropping rows.
car_sales_missing.isnull().sum()

Make             0
Colour           0
Odometer (KM)    0
Doors            0
Price            0
dtype: int64

#### Option 2: Filling missing values with Scikit-Learn

In [75]:
# Reload the CSV so the DataFrame starts again from the unfilled data.
car_sales_missing = pd.read_csv(filepath_or_buffer="../../data/car-sales-extended-missing-data.csv")

In [76]:
# Count the missing values per column in the reloaded data.
car_sales_missing.isnull().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [77]:
# Drop only the rows whose "Price" label is missing; other gaps are kept for imputation.
car_sales_missing.dropna(axis="index", subset=["Price"], inplace=True)

In [78]:
# Re-count the missing values per column after removing rows without a price.
car_sales_missing.isnull().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [79]:
# Separate the "Price" label (y) from the remaining feature columns (x).
y = car_sales_missing["Price"]
x = car_sales_missing.drop(columns="Price")

In [80]:
# Impute the missing feature values with Scikit-Learn.
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Column groups, one per imputation strategy.
categorical_features = ["Make", "Colour"]
door_features = ["Doors"]
numerical_features = ["Odometer (KM)"]

# Text columns are filled with the literal "missing", Doors with its rounded
# mean as a constant, and the odometer with the column mean.
categorical_imputer = SimpleImputer(strategy="constant", fill_value="missing")
door_imputer = SimpleImputer(strategy="constant", fill_value=x["Doors"].mean().round(0).astype(int))
numerical_imputer = SimpleImputer(strategy="mean")

# NOTE(review): the imputers are fit on all of x here, before any train/test
# split, so rows that later land in the test set influence the fill values.
imputer_transformer = ColumnTransformer(
    transformers=[
        ("categorical_imputer", categorical_imputer, categorical_features),
        ("door_imputer", door_imputer, door_features),
        ("numerical_imputer", numerical_imputer, numerical_features),
    ],
)

# Fit the imputers and fill the gaps in one step.
filled_x = imputer_transformer.fit_transform(x)

In [81]:
# Put the imputed values back into a DataFrame. The column names are listed in
# the order the imputer transformers were declared: Make, Colour, Doors, Odometer.
car_sales_filled = pd.DataFrame(
    data=filled_x,
    columns=["Make", "Colour", "Doors", "Odometer (KM)"],
)
car_sales_filled

Unnamed: 0,Make,Colour,Doors,Odometer (KM)
0,Honda,White,4,35431
1,BMW,Blue,5,192714
2,Honda,White,4,84714
3,Toyota,White,4,154365
4,Nissan,Blue,3,181577
...,...,...,...,...
945,Toyota,Black,4,35820
946,missing,White,3,155144
947,Nissan,Blue,4,66604
948,Honda,White,4,215883


In [82]:
# One-hot encode the categorical columns of the filled data.
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()

# remainder="passthrough" keeps the columns not listed above unchanged.
transformer = ColumnTransformer(
    transformers=[("one_hot", one_hot, categorical_features)],
    remainder="passthrough",
)
transformed_x = transformer.fit_transform(car_sales_filled)

In [83]:
# The data is now fully numeric with no missing values, so a model can be fitted.
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Seed NumPy's global random generator so the split and the forest are repeatable.
np.random.seed(42)

# Split into training and test sets (80% train), then fit on the training part.
x_train, x_test, y_train, y_test = train_test_split(transformed_x, y, train_size=0.8)
model = RandomForestRegressor(n_estimators=100)
model.fit(X=x_train, y=y_train)

RandomForestRegressor()

In [84]:
# Score the fitted model on the held-out test split.
model.score(X=x_test, y=y_test)

0.21990196728583944