# Data preprocessing

## Processing data
Dataset can be found in ./datasets/data_preprocessing

In [3]:
# Import libs
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Import dataset
dataset = pd.read_csv('../datasets/data_preprocessing/Data.csv')
print(dataset)
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 3].values
print(x)
print(y)

   Country   Age   Salary Purchased
0   France  44.0  72000.0        No
1    Spain  27.0  48000.0       Yes
2  Germany  30.0  54000.0        No
3    Spain  38.0  61000.0        No
4  Germany  40.0      NaN       Yes
5   France  35.0  58000.0       Yes
6    Spain   NaN  52000.0        No
7   France  48.0  79000.0       Yes
8  Germany  50.0  83000.0        No
9   France  37.0  67000.0       Yes
[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]
['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


In [76]:
# Take care of missing data
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values = 'NaN', strategy = 'mean', axis = 0)
imputer = imputer.fit(x[:, 1:3])
x[:, 1:3] = imputer.transform(x[:, 1:3])
print(x)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [78]:
# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
label_encoder_x = LabelEncoder()
x[:, 0] = label_encoder_x.fit_transform(x[:, 0])
print(x)
one_hot_encoder = OneHotEncoder(categorical_features = [0])
x = one_hot_encoder.fit_transform(x).toarray()
np.set_printoptions(precision=3)
#np.set_printoptions(suppress=True)
print(x)

label_encoder_y = LabelEncoder()
y = label_encoder_y.fit_transform(y)
print(y)

[[     1.         0.         0.        44.     72000.   ]
 [     0.         0.         1.        27.     48000.   ]
 [     0.         1.         0.        30.     54000.   ]
 [     0.         0.         1.        38.     61000.   ]
 [     0.         1.         0.        40.     63777.778]
 [     1.         0.         0.        35.     58000.   ]
 [     0.         0.         1.        38.778  52000.   ]
 [     1.         0.         0.        48.     79000.   ]
 [     0.         1.         0.        50.     83000.   ]
 [     1.         0.         0.        37.     67000.   ]]
[[     0.         1.         0.         0.        44.     72000.   ]
 [     1.         0.         0.         1.        27.     48000.   ]
 [     1.         0.         1.         0.        30.     54000.   ]
 [     1.         0.         0.         1.        38.     61000.   ]
 [     1.         0.         1.         0.        40.     63777.778]
 [     0.         1.         0.         0.        35.     58000.   ]
 [   

In [79]:
# Split dataset to train and test sets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)
print(x_train)
print(x_test)
print(y_train)
print(y_test)


[[     1.         0.         1.         0.        40.     63777.778]
 [     0.         1.         0.         0.        37.     67000.   ]
 [     1.         0.         0.         1.        27.     48000.   ]
 [     1.         0.         0.         1.        38.778  52000.   ]
 [     0.         1.         0.         0.        48.     79000.   ]
 [     1.         0.         0.         1.        38.     61000.   ]
 [     0.         1.         0.         0.        44.     72000.   ]
 [     0.         1.         0.         0.        35.     58000.   ]]
[[     1.      0.      1.      0.     30.  54000.]
 [     1.      0.      1.      0.     50.  83000.]]
[1 1 1 0 1 0 0 1]
[0 0]


In [83]:
# feature scaling
from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
x_train = sc_x.fit_transform(x_train)
x_test = sc_x.transform(x_test)
print(x_train)
print(x_test)

[[ 1.    -1.     2.646 -0.775  0.263  0.124]
 [-1.     1.    -0.378 -0.775 -0.254  0.462]
 [ 1.    -1.    -0.378  1.291 -1.975 -1.531]
 [ 1.    -1.    -0.378  1.291  0.053 -1.111]
 [-1.     1.    -0.378 -0.775  1.641  1.72 ]
 [ 1.    -1.    -0.378  1.291 -0.081 -0.168]
 [-1.     1.    -0.378 -0.775  0.952  0.986]
 [-1.     1.    -0.378 -0.775 -0.598 -0.482]]
[[ 1.    -1.     2.646 -0.775 -1.459 -0.902]
 [ 1.    -1.     2.646 -0.775  1.985  2.14 ]]


-----------------------------------------------------------------------

# List of APIs used

1. pandas.read_csv()
2. sklearn.preprocessing.imputer.fit()
3. sklearn.preprocessing.LabelEncoder.fit_transform()
4. sklearn.preprocessing.OneHotEncoder.fit_transform()
5. sklearn.model_selection.train_test_split
6. sklearn.preprocessing.StandardScaler.fit_transform()

## List of packages uses
1. numpy
2. sklearn
3. pandas