# Data Preprocessing Tools


## Importing the libraries


In [13]:
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd

# np.set_printoptions(suppress=True)


## Importing the dataset


In [14]:
dataset = pd.read_csv('Data.csv')
X = dataset.iloc[:, :-1].values
Y = dataset.iloc[:, -1].values


In [15]:
print(X)


[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [16]:
print(Y)


['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


## Taking care of missing data


In [17]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, 1:] = imputer.fit_transform(X[:, 1:])


In [18]:
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


## Encoding categorical data


### Encoding the Independent Variable


In [19]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='drop')

# X1 = X[:, :1]

# X1 = ct.fit_transform(X1).tolist()
# X = list(map(lambda a: a[1:],X.tolist()))
# X = np.array(np.array(list(map(lambda a, b: a + b, X, X1))))

ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')

X = np.array(ct.fit_transform(X))

In [20]:
print(X)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


### Encoding the Dependent Variable


In [21]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
Y = le.fit_transform(Y)


In [22]:
print(Y)

[0 1 0 0 1 1 0 1 0 1]


## Splitting the dataset into the Training set and Test set


In [23]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

In [24]:
print(X_train)
print(X_test)
print(Y_train)
print(Y_test)

[[1.0 0.0 0.0 35.0 58000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]]
[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]]
[1 1 0 1 0 1 0 0]
[0 1]


## Feature Scaling


In [25]:
from sklearn.preprocessing import StandardScaler 

sc = StandardScaler()

scaler = sc.fit(X_train[:, 3:])
X_train[:, 3:] = sc.transform(X_train[:, 3:])
X_test[:, 3:] = sc.transform(X_test[:, 3:])

In [26]:
print(X_train)

[[1.0 0.0 0.0 -0.747990606595216 -0.6409837045065677]
 [1.0 0.0 0.0 1.367173163112102 1.3614281988280001]
 [0.0 0.0 1.0 -0.2598758905089118 -0.35492486117305805]
 [0.0 1.0 0.0 0.06553392021529096 -0.09005556179017836]
 [0.0 1.0 0.0 1.692582973836305 1.7428399899393465]
 [1.0 0.0 0.0 -0.4225807958710132 0.21719282549396138]
 [0.0 1.0 0.0 -1.561515133405723 -1.022395495617914]
 [0.0 0.0 1.0 -0.13332763078283283 -1.213101391173587]]


In [27]:
print(X_test)

[[1.0 0.0 0.0 0.7163535416636966 0.6939575643831442]
 [0.0 0.0 1.0 -2.0496298494920273 -1.5945131822849334]]
