**Data preprocessing**

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# Load the raw dataset from the notebook's working directory and dump every row,
# so the missing Age / Salary entries are visible before any preprocessing.
df = pd.read_csv(filepath_or_buffer='Data.csv')
print(df)

   Country   Age   Salary Purchased
0   France  44.0  72000.0        No
1    Spain  27.0  48000.0       Yes
2  Germany  30.0  54000.0        No
3    Spain  38.0  61000.0        No
4  Germany  40.0      NaN       Yes
5   France  35.0  58000.0       Yes
6    Spain   NaN  52000.0        No
7   France  48.0  79000.0       Yes
8  Germany  50.0  83000.0        No
9   France  37.0  67000.0       Yes


**Separate the feature columns and the target column**

In [4]:
# Every column except the last is a feature; the last column is the target.
# `.values` turns both selections into NumPy arrays.
x, y = df.iloc[:, :-1].values, df.iloc[:, -1].values

# Show the feature matrix first, then the target vector on the following line.
print(x, y, sep='\n')

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]
['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


**Handling missing values**

In [5]:
from sklearn.impute import SimpleImputer
# Fill the NaN entries in columns 1 and 2 of `x` (Age and Salary in the table printed
# earlier) with each column's most frequent value, writing the result back in place.
# NOTE(review): every observed Age / Salary is unique in this dataset, and the printed
# result below shows the gaps filled with 27.0 and 48000.0 -- the smallest value of each
# column. Confirm that 'most_frequent' (rather than 'mean' or 'median') is really the
# intended strategy for these continuous columns.
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imputer.fit(x[:, 1:3])
x[:, 1:3] = imputer.transform(x[:, 1:3])
print(x)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 48000.0]
 ['France' 35.0 58000.0]
 ['Spain' 27.0 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


**Encoding the independent variable**

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# One-hot encode column 0 (the country name) and pass the remaining columns through
# unchanged. As the printed result shows, the three dummy columns come first, followed
# by the two numeric columns.
# Note: this cell rebinds `x`, so re-running it without re-running the cells above
# would feed an already-encoded array back into the encoder.
ct = ColumnTransformer(
    transformers=[
        ('encoder', OneHotEncoder(), [0]),
    ],
    remainder='passthrough',
)
x = np.array(ct.fit_transform(x))
print(x)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 48000.0]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 27.0 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


**Encoding the dependent variable**

In [7]:
from sklearn.preprocessing import LabelEncoder
# Map the string class labels in `y` to integer codes; the printed result shows
# 'No' encoded as 0 and 'Yes' as 1. The fitted encoder stays available as `le`.
le = LabelEncoder()
le.fit(y)
y = le.transform(y)
print(y)

[0 1 0 0 1 1 0 1 0 1]


**Feature scaling**

In [8]:
from sklearn.preprocessing import StandardScaler
# Standardize every column of `x` using statistics learned from `x` itself.
# NOTE(review): this scales all five columns, so the one-hot dummy columns are
# standardized together with Age and Salary (see the non-0/1 values in the first three
# columns of the output) -- confirm that scaling the dummies is intended.
sc = StandardScaler()
sc.fit(x)
x = sc.transform(x)
print(x)


[[ 1.22474487 -0.65465367 -0.65465367  0.8273403   0.82020574]
 [-0.81649658 -0.65465367  1.52752523 -1.37028238 -1.18846138]
 [-0.81649658  1.52752523 -0.65465367 -0.98246661 -0.6862946 ]
 [-0.81649658 -0.65465367  1.52752523  0.05170877 -0.10043336]
 [-0.81649658  1.52752523 -0.65465367  0.31025261 -1.18846138]
 [ 1.22474487 -0.65465367 -0.65465367 -0.336107   -0.35151675]
 [-0.81649658 -0.65465367  1.52752523 -1.37028238 -0.85368353]
 [ 1.22474487 -0.65465367 -0.65465367  1.34442799  1.40606699]
 [-0.81649658  1.52752523 -0.65465367  1.60297184  1.74084484]
 [ 1.22474487 -0.65465367 -0.65465367 -0.07756315  0.40173343]]
