# Importing libraries 

In [52]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Importing data 

In [53]:
data = pd.read_csv('data.csv')

# Checking the data 

In [54]:
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


## Spliting the data into features and dependent variable 

In [55]:
X = data.iloc[:, :-1]
y = data.iloc[:, -1:]

In [56]:
X_array = X.values
y_array = y.values

## Taking care of missing dataset 

### Sklearn simple imputer will help you to replace the missing values form your data by avg or m 

In [57]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.NaN, strategy='mean')

#### Here missing values nan is replaced by mean of the salary 

In [58]:
imputer.fit(X_array[:,1:3])
X_array[:, 1:3] = imputer.transform(X_array[:,1:3])

In [59]:
print(X_array)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


## Handling categorical values 

### Using one hot encoding 

In [75]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
X = np.array(ct.fit_transform(X))

In [76]:
X_array

array([[1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

## Using label encoding 

In [77]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_array = le.fit_transform(y_array)

In [78]:
y_array

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1], dtype=int64)

## We can use pandas liabray function get dummies to do label encoding 

## Scaling 

### Spliting data into test and traing set 

In [79]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [80]:
X_train

array([[    0.,     0.,     1.,    nan, 52000.],
       [    1.,     0.,     0.,    44., 72000.],
       [    1.,     0.,     0.,    37., 67000.],
       [    0.,     0.,     1.,    27., 48000.],
       [    1.,     0.,     0.,    35., 58000.],
       [    0.,     1.,     0.,    50., 83000.],
       [    1.,     0.,     0.,    48., 79000.],
       [    0.,     0.,     1.,    38., 61000.]])

In [81]:
X_test

array([[    0.,     1.,     0.,    30., 54000.],
       [    0.,     1.,     0.,    40.,    nan]])

In [82]:
y_train

Unnamed: 0,Purchased
6,No
0,No
9,Yes
1,Yes
5,Yes
8,No
7,Yes
3,No


In [83]:
y_test

Unnamed: 0,Purchased
2,No
4,Yes


# Standard scaling

In [84]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

In [85]:
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [86]:
X_train

array([[-1.        , -0.37796447,  1.29099445,         nan, -1.11066496],
       [ 1.        , -0.37796447, -0.77459667,  0.55728027,  0.59805036],
       [ 1.        , -0.37796447, -0.77459667, -0.38433122,  0.17087153],
       [-1.        , -0.37796447,  1.29099445, -1.72949049, -1.45240802],
       [ 1.        , -0.37796447, -0.77459667, -0.65336308, -0.59805036],
       [-1.        ,  2.64575131, -0.77459667,  1.36437583,  1.53784378],
       [ 1.        , -0.37796447, -0.77459667,  1.09534398,  1.19610072],
       [-1.        , -0.37796447,  1.29099445, -0.24981529, -0.34174306]])

In [87]:
X_test

array([[-1.        ,  2.64575131, -0.77459667, -1.32594271, -0.93979342],
       [-1.        ,  2.64575131, -0.77459667,  0.01921656,         nan]])