# Initialization

In [None]:
import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler # standard normalization to scale data

# Loading Data

In [None]:
# Download the dataset only when it is not already present locally, so that
# re-running this cell does not hit the network again.
# (The unconditional alternative is the notebook shell command
#  `! wget <url>`, which downloads every time.)
import os
import urllib.request

Dataset_File = "CarEvaluation.csv"
Dataset_URL = "https://raw.githubusercontent.com/cnchi/datasets/master/" + Dataset_File

if not os.path.isfile(Dataset_File):
    # Use the standard library instead of shelling out to `wget`: this works
    # on systems without wget and raises an exception when the download fails
    # instead of silently continuing.
    partial_file = Dataset_File + ".part"
    urllib.request.urlretrieve(Dataset_URL, partial_file)
    # Rename only after a complete download so that an interrupted transfer
    # never leaves a truncated file that would pass the isfile() check above.
    os.replace(partial_file, Dataset_File)

# Data Preprocessing

## 1. Loading Dataset

In [None]:
# Load the CSV from storage into an in-memory DataFrame. The file name is
# taken from Dataset_File (set in the "Loading Data" cell) so the name is
# defined in exactly one place.
dataset = pd.read_csv(Dataset_File)

# pandas automatically represents missing values as NaN (Not a Number)
dataset

Unnamed: 0,City,Children,Age,Salary,ToBuy
0,Taipei,,44.0,72000.0,No
1,Taichung,0.0,27.0,48000.0,Yes
2,Kaohsiung,0.0,30.0,54000.0,No
3,Taichung,1.0,38.0,61000.0,No
4,Kaohsiung,2.0,40.0,,Yes
5,Taipei,2.0,35.0,58000.0,Yes
6,Taichung,1.0,,52000.0,No
7,Taipei,2.0,48.0,79000.0,Yes
8,Kaohsiung,1.0,50.0,83000.0,No
9,Taipei,2.0,37.0,67000.0,Yes


## 2. Splitting Independent and Dependent Variables

In [None]:
# Independent variables: every column except the last one.
X = dataset.iloc[:, :-1].values
# Dependent variable: the last column. It is indexed from the end (-1) rather
# than with a hard-coded position so that it always complements the slice
# used for X, whatever the number of columns in the dataset.
Y = dataset.iloc[:, -1].values

In [None]:
X, Y

(array([['Taipei', nan, 44.0, 72000.0],
        ['Taichung', 0.0, 27.0, 48000.0],
        ['Kaohsiung', 0.0, 30.0, 54000.0],
        ['Taichung', 1.0, 38.0, 61000.0],
        ['Kaohsiung', 2.0, 40.0, nan],
        ['Taipei', 2.0, 35.0, 58000.0],
        ['Taichung', 1.0, nan, 52000.0],
        ['Taipei', 2.0, 48.0, 79000.0],
        ['Kaohsiung', 1.0, 50.0, 83000.0],
        ['Taipei', 2.0, 37.0, 67000.0]], dtype=object),
 array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
       dtype=object))

## 3. Handling Missing Data

## Check whether there are any missing values

In [None]:
# Boolean mask with the same shape as the dataset: True marks a missing cell.
dataset.isna()

Unnamed: 0,City,Children,Age,Salary,ToBuy
0,False,True,False,False,False
1,False,False,False,False,False
2,False,False,False,False,False
3,False,False,False,False,False
4,False,False,False,True,False
5,False,False,False,False,False
6,False,False,True,False,False
7,False,False,False,False,False
8,False,False,False,False,False
9,False,False,False,False,False


In [None]:
# Number of missing cells in each column.
dataset.isna().sum()

City        0
Children    1
Age         1
Salary      1
ToBuy       0
dtype: int64

In [None]:
# Total number of missing cells in the whole dataset
# (per-column counts, then summed across columns).
dataset.isna().sum().sum()

3

In [None]:
# Per column: True when the column contains at least one missing cell.
dataset.isna().any()

City        False
Children     True
Age          True
Salary       True
ToBuy       False
dtype: bool

## Fill Missing Values

In [None]:
# Replace the NaN entries in columns 1-3 of X (Children, Age, Salary) with a
# per-column statistic. Available strategies include "mean", "median" and
# "most_frequent"; the column mean is used here.
# NOTE(review): the imputer is fitted on all rows, before the train/test
# split performed later in this notebook, so test rows contribute to the
# fill values.
imputer = SimpleImputer(strategy="mean", missing_values=np.nan)
X[:, 1:4] = imputer.fit_transform(X[:, 1:4])

In [None]:
X

array([['Taipei', 1.2222222222222223, 44.0, 72000.0],
       ['Taichung', 0.0, 27.0, 48000.0],
       ['Kaohsiung', 0.0, 30.0, 54000.0],
       ['Taichung', 1.0, 38.0, 61000.0],
       ['Kaohsiung', 2.0, 40.0, 63777.77777777778],
       ['Taipei', 2.0, 35.0, 58000.0],
       ['Taichung', 1.0, 38.77777777777778, 52000.0],
       ['Taipei', 2.0, 48.0, 79000.0],
       ['Kaohsiung', 1.0, 50.0, 83000.0],
       ['Taipei', 2.0, 37.0, 67000.0]], dtype=object)

## 4. Converting Categorical Data to Numeric Data

In [None]:
from sklearn.preprocessing import LabelEncoder

# Turn the class labels in Y into numeric codes with a label encoder.
# Y does not take part in the feature arithmetic, so plain label encoding
# (one integer per class) is sufficient; the codes are stored as float64.
labelEncoder = LabelEncoder().fit(Y)
Y = labelEncoder.transform(Y).astype("float64")

In [None]:
Y

array([0., 1., 0., 0., 1., 1., 0., 1., 0., 1.])

In [None]:
# One-hot encode the categorical first column of X (the city name).
# X takes part in the numeric computation, so each category gets its own 0/1
# indicator column instead of a single integer code.
# dtype is pinned to uint8 because pandas >= 2.0 returns boolean dummies by
# default; pinning keeps ary_dummies a 0/1 integer matrix on every version.
ary_dummies = pd.get_dummies(X[:, 0], dtype=np.uint8).values

# Put the indicator columns in front of the remaining numeric columns
# (axis=1 concatenates column-wise; axis=0 would stack rows) and convert the
# combined matrix to float64.
X = np.concatenate((ary_dummies, X[:, 1:4]), axis=1).astype("float64")

In [None]:
ary_dummies

array([[0, 0, 1],
       [0, 1, 0],
       [1, 0, 0],
       [0, 1, 0],
       [1, 0, 0],
       [0, 0, 1],
       [0, 1, 0],
       [0, 0, 1],
       [1, 0, 0],
       [0, 0, 1]], dtype=uint8)

In [None]:
X

array([[0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 1.22222222e+00,
        4.40000000e+01, 7.20000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        2.70000000e+01, 4.80000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        3.00000000e+01, 5.40000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 1.00000000e+00,
        3.80000000e+01, 6.10000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 2.00000000e+00,
        4.00000000e+01, 6.37777778e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 2.00000000e+00,
        3.50000000e+01, 5.80000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 1.00000000e+00,
        3.87777778e+01, 5.20000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 2.00000000e+00,
        4.80000000e+01, 7.90000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
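        5.00000000e+01, 8.30000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 2.00000000e+00,
        3.70000000e+01, 6.70000000e+04]])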

## 5. Splitting Training and Testing Dataset

In [None]:
# Hold out 20% of the rows for testing (train : test = 8 : 2).
# random_state is fixed so the split is the same on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

## 6. Feature Scaling

In [None]:
# Standardize the features: when all independent variables are on the same
# scale, training converges faster.
# The scaler is fitted on the training rows only; the test rows are then
# transformed with those same fitted statistics.
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

In [None]:
X_train

array([[ 2.64575131, -0.77459667, -1.        ,  0.87211946,  0.26306757,
         0.12381479],
       [-0.37796447, -0.77459667,  1.        ,  0.87211946, -0.25350148,
         0.46175632],
       [-0.37796447,  1.29099445, -1.        , -2.04846663, -1.97539832,
        -1.53093341],
       [-0.37796447,  1.29099445, -1.        , -0.58817359,  0.05261351,
        -1.11141978],
       [-0.37796447, -0.77459667,  1.        ,  0.87211946,  1.64058505,
         1.7202972 ],
       [-0.37796447,  1.29099445, -1.        , -0.58817359, -0.0813118 ,
        -0.16751412],
       [-0.37796447, -0.77459667,  1.        , -0.26366402,  0.95182631,
         0.98614835],
       [-0.37796447, -0.77459667,  1.        ,  0.87211946, -0.59788085,
        -0.48214934]])