##### 1. Importing the libraries
   **[numpy](http://www.numpy.org/)** is library which contains math function  
   **[pandas](https://pandas.pydata.org/)** is library used to manage import and dataset

In [1]:
import numpy as np
import pandas as pd

##### 2: Importing dataset
   Dataset are generally available in CSV format. We use the read_csv method to read local CSV file as dataframe

In [2]:
dataset = pd.read_csv('day_003_dataset.csv')
dataset.head(n = 10)
#dataset.head(10)

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


   We make seperate matrix and vector of independant(Predictor,Feature) and dependent(Response,Target) variable from dataframe

In [3]:
X = dataset.iloc[ : , :-1].values
Y = dataset.iloc[ : , 3].values
# Where 
# X is Predictor(Feature)
# Y is Response (Target)

In [4]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [5]:
Y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

##### 3: Handling the missing data
   We use [imputer](https://scikit-learn.org/stable/modules/impute.html) class of [sklearn.prepocessing](https://scikit-learn.org/stable/modules/preprocessing.html#preprocessing) for replace the missing data by Mean or Median of the entries column

In [6]:
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values = "NaN", strategy = "mean", axis = 0)
imputer = imputer.fit(X[ : , 1:3])
X[ : , 1:3] = imputer.transform(X[ : , 1:3])

In [7]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

##### 4: Encoding categorical data
   Categorical data are variable that contain label values rather than numeric values. The number of posible values is often limited to fixed set by fit_transform method. We need to encoding country variable into number by [LabelEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html) class from [sklearn.preprocessing](https://scikit-learn.org/stable/modules/preprocessing.html#preprocessing) library.

In [8]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X = LabelEncoder()
X[ : , 0] = labelencoder_X.fit_transform(X[ : , 0])

In [9]:
X

array([[0, 44.0, 72000.0],
       [2, 27.0, 48000.0],
       [1, 30.0, 54000.0],
       [2, 38.0, 61000.0],
       [1, 40.0, 63777.77777777778],
       [0, 35.0, 58000.0],
       [2, 38.77777777777778, 52000.0],
       [0, 48.0, 79000.0],
       [1, 50.0, 83000.0],
       [0, 37.0, 67000.0]], dtype=object)

Creating a dummy variable

In [11]:
onehotencoder = OneHotEncoder(categorical_features = [0])
X = onehotencoder.fit_transform(X).toarray()
labelencoder_Y = LabelEncoder()
Y = labelencoder_Y.fit_transform(Y)

In [12]:
X

array([[0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        4.40000000e+01, 7.20000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
        2.70000000e+01, 4.80000000e+04],
       [1.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
        3.00000000e+01, 5.40000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
        3.80000000e+01, 6.10000000e+04],
       [1.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
        4.00000000e+01, 6.37777778e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        3.50000000e+01, 5.80000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
        3.87777778e+01, 5.20000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        4.80000000e+01, 7.90000000e+04],
       [1.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
        5.00000000e+01, 

In [13]:
Y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

##### 5: Splitting the datasets into training sets and Test sets
   We make traning and testing data set by spliting. The split is generally 80/20 by [train_test_split()](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html) method from [sklearn.crossvalidation](https://scikit-learn.org/stable/modules/cross_validation.html) library.

In [14]:
from sklearn.cross_validation import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split( X , Y , test_size = 0.2, random_state = 0)



In [15]:
X_train

array([[1.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
        4.00000000e+01, 6.37777778e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        3.70000000e+01, 6.70000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
        2.70000000e+01, 4.80000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
        3.87777778e+01, 5.20000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        4.80000000e+01, 7.90000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
        3.80000000e+01, 6.10000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        4.40000000e+01, 7.20000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        3.50000000e+01, 5.80000000e+04]])

In [16]:
X_test

array([[1.0e+00, 0.0e+00, 1.0e+00, 0.0e+00, 3.0e+01, 5.4e+04],
       [1.0e+00, 0.0e+00, 1.0e+00, 0.0e+00, 5.0e+01, 8.3e+04]])

In [17]:
Y_train

array([1, 1, 1, 0, 1, 0, 0, 1])

In [18]:
Y_test

array([0, 0])

##### 6: Normalization by Feature Scaling
   Most of the machine learing algorithms use the Euclidean distance between two data points in their computation, feature highly varying in magnitudes, units and range pose problems. High magnitudes features will weigh more in the distance calculations than featues with low magnitudes. Done by feature standardization or Z-score normalization. We used StandardScalar of sklearn.preprocessing library.
   
   [ประโยชน์ของการทำ normalization](https://medium.com/@kasidissatangmongkol/machine-learning-101-%E0%B8%AA%E0%B8%A3%E0%B9%89%E0%B8%B2%E0%B8%87%E0%B9%82%E0%B8%A1%E0%B9%80%E0%B8%94%E0%B8%A5%E0%B9%81%E0%B8%A3%E0%B8%81%E0%B8%82%E0%B8%AD%E0%B8%87%E0%B8%84%E0%B8%B8%E0%B8%93%E0%B8%87%E0%B9%88%E0%B8%B2%E0%B8%A2%E0%B9%86%E0%B9%83%E0%B8%99-excel-55bad0f99545)<br/>
  + ตัวแปรจะถูกปรับค่าให้อยู่บน scale เดียวกัน เช่นมีค่าอยู่ระหว่าง 0 ถึง 1 มีประโยชน์อย่างมากถ้าเกิดตัวแปรใน dataset เราเก็บมาจาก scale ที่แตกต่างกันมากๆดังตัวอย่างวันนี้ ราคาบ้านมีค่าวิ่งอยู่ระหว่าง [199000, 405000] แต่ขนาดบ้านมีค่าวิ่งอยู่ระหว่าง [1100, 2450] ควรจะ normalized ก่อนจะ fit model<br/>
  + คอมพิวเตอร์รักข้อมูลที่ normalized แล้วฮะ เพราะใช้ ram ในการประมวลผลน้อยลง (i.e. processing time เร็วขึ้นแบบผิดหูผิดตา) คิดเล่นๆก็ได้ ยกกำลังสองค่า 240,000 หรือว่ายกกำลังสองค่า 0.20 แบบไหนคิดง่ายกว่ากัน

In [19]:
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.fit_transform(X_test)

In [20]:
X_train

array([[ 1.        , -1.        ,  2.64575131, -0.77459667,  0.26306757,
         0.12381479],
       [-1.        ,  1.        , -0.37796447, -0.77459667, -0.25350148,
         0.46175632],
       [ 1.        , -1.        , -0.37796447,  1.29099445, -1.97539832,
        -1.53093341],
       [ 1.        , -1.        , -0.37796447,  1.29099445,  0.05261351,
        -1.11141978],
       [-1.        ,  1.        , -0.37796447, -0.77459667,  1.64058505,
         1.7202972 ],
       [ 1.        , -1.        , -0.37796447,  1.29099445, -0.0813118 ,
        -0.16751412],
       [-1.        ,  1.        , -0.37796447, -0.77459667,  0.95182631,
         0.98614835],
       [-1.        ,  1.        , -0.37796447, -0.77459667, -0.59788085,
        -0.48214934]])

In [21]:
X_test

array([[ 0.,  0.,  0.,  0., -1., -1.],
       [ 0.,  0.,  0.,  0.,  1.,  1.]])