In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

In [2]:
# Read the raw dataset from the CSV file and display the resulting DataFrame.
csv_path = 'data/DataPreprocessing.csv'
data = pd.read_csv(csv_path)
data

Unnamed: 0,Region,Age,Income,Online Shopper
0,India,49.0,86400.0,No
1,Brazil,32.0,57600.0,Yes
2,USA,35.0,64800.0,No
3,Brazil,43.0,73200.0,No
4,USA,45.0,,Yes
5,India,40.0,69600.0,Yes
6,Brazil,,62400.0,No
7,India,53.0,94800.0,Yes
8,USA,55.0,99600.0,No
9,India,42.0,80400.0,Yes


In [6]:
# Features: every column except the last one.
# `.values` converts the selected DataFrame into a NumPy array.
feature_frame = data.iloc[:, :-1]
x = feature_frame.values
x

array([['India', 49.0, 86400.0],
       ['Brazil', 32.0, 57600.0],
       ['USA', 35.0, 64800.0],
       ['Brazil', 43.0, 73200.0],
       ['USA', 45.0, nan],
       ['India', 40.0, 69600.0],
       ['Brazil', nan, 62400.0],
       ['India', 53.0, 94800.0],
       ['USA', 55.0, 99600.0],
       ['India', 42.0, 80400.0]], dtype=object)

In [7]:
# Target: the last column of the DataFrame, returned as a NumPy array.
target_series = data.iloc[:, -1]
y = target_series.values
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

### Missing values of Age and Income are filled with the column mean

In [8]:
# Replace NaN entries in the numeric columns (index 1 onward) with the mean
# of their column. Column 0 holds the Region strings and is left untouched.
si = SimpleImputer(missing_values=np.nan, strategy='mean')
# fit_transform learns the column means and applies them in a single call.
x[:, 1:] = si.fit_transform(x[:, 1:])

In [9]:
# Inspect the features now that the missing numeric entries are imputed.
x

array([['India', 49.0, 86400.0],
       ['Brazil', 32.0, 57600.0],
       ['USA', 35.0, 64800.0],
       ['Brazil', 43.0, 73200.0],
       ['USA', 45.0, 76533.33333333333],
       ['India', 40.0, 69600.0],
       ['Brazil', 43.77777777777778, 62400.0],
       ['India', 53.0, 94800.0],
       ['USA', 55.0, 99600.0],
       ['India', 42.0, 80400.0]], dtype=object)

### Convert the categorical Region column to numerical values

In [10]:
# Map each distinct Region string in column 0 to an integer code and write
# the codes back into the feature array in place.
# NOTE(review): scikit-learn documents LabelEncoder as an encoder for target
# values (y). The one-hot step applied afterwards can consume the raw strings
# directly, so this pass may be redundant -- confirm before removing.
le = LabelEncoder()
region_column = x[:, 0]
x[:, 0] = le.fit_transform(region_column)

In [11]:
# Inspect the features after column 0 has been replaced by integer codes.
x

array([[1, 49.0, 86400.0],
       [0, 32.0, 57600.0],
       [2, 35.0, 64800.0],
       [0, 43.0, 73200.0],
       [2, 45.0, 76533.33333333333],
       [1, 40.0, 69600.0],
       [0, 43.77777777777778, 62400.0],
       [1, 53.0, 94800.0],
       [2, 55.0, 99600.0],
       [1, 42.0, 80400.0]], dtype=object)

In [12]:
# One-hot encode column 0. remainder='passthrough' keeps the remaining
# columns in the result instead of dropping them.
region_one_hot = ('Country', OneHotEncoder(), [0])
ct = ColumnTransformer(
    transformers=[region_one_hot],
    remainder='passthrough',
)
x = ct.fit_transform(x)

In [13]:
# Inspect the features after one-hot encoding column 0.
x

array([[0.0, 1.0, 0.0, 49.0, 86400.0],
       [1.0, 0.0, 0.0, 32.0, 57600.0],
       [0.0, 0.0, 1.0, 35.0, 64800.0],
       [1.0, 0.0, 0.0, 43.0, 73200.0],
       [0.0, 0.0, 1.0, 45.0, 76533.33333333333],
       [0.0, 1.0, 0.0, 40.0, 69600.0],
       [1.0, 0.0, 0.0, 43.77777777777778, 62400.0],
       [0.0, 1.0, 0.0, 53.0, 94800.0],
       [0.0, 0.0, 1.0, 55.0, 99600.0],
       [0.0, 1.0, 0.0, 42.0, 80400.0]], dtype=object)

### Scaling values with standard score

In [14]:
# Standardize every column of x to zero mean and unit variance.
# NOTE(review): the scaler is fitted on the whole array, so the one-hot
# indicator columns are rescaled along with the numeric columns -- confirm
# that this is intended.
ss = StandardScaler()
x = ss.fit(x).transform(x)

In [15]:
# Inspect the standardized feature matrix.
x

array([[-0.65465367,  1.22474487, -0.65465367,  0.75887436,  0.74947325],
       [ 1.52752523, -0.81649658, -0.65465367, -1.71150388, -1.43817841],
       [-0.65465367, -0.81649658,  1.52752523, -1.27555478, -0.89126549],
       [ 1.52752523, -0.81649658, -0.65465367, -0.11302384, -0.25320042],
       [-0.65465367, -0.81649658,  1.52752523,  0.17760889,  0.        ],
       [-0.65465367,  1.22474487, -0.65465367, -0.54897294, -0.52665688],
       [ 1.52752523, -0.81649658, -0.65465367,  0.        , -1.0735698 ],
       [-0.65465367,  1.22474487, -0.65465367,  1.34013983,  1.38753832],
       [-0.65465367, -0.81649658,  1.52752523,  1.63077256,  1.75214693],
       [-0.65465367,  1.22474487, -0.65465367, -0.25834021,  0.29371249]])