# Data Preprocessing Tools


## Importing the libraries


In [1]:
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd

# Print floating-point numbers in fixed-point notation instead of scientific
# notation, so the arrays displayed later in the notebook are easier to read.
np.set_printoptions(suppress=True)


## Importing the dataset


In [2]:
# Read the CSV into a DataFrame and split it by position: every column except
# the last becomes the feature matrix X, the last column becomes the target Y.
dataset = pd.read_csv('Data.csv')
feature_frame, target_column = dataset.iloc[:, :-1], dataset.iloc[:, -1]

# .to_numpy() is used instead of .values: .values may hand back an ndarray-like
# extension array depending on the column dtype, whereas .to_numpy() always
# returns a real NumPy ndarray, which the later slicing/assignment relies on.
X = feature_frame.to_numpy()
Y = target_column.to_numpy()


In [3]:
# Show the feature matrix as loaded, before any missing values are filled in.
print(X)


[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [4]:
# Show the target values taken from the last column of the dataset.
print(Y)


['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


## Taking care of missing data


In [5]:
from sklearn.impute import SimpleImputer

# Replace every NaN in columns 1 onward with the mean of its column; column 0
# is excluded from the slice and therefore left untouched.
# NOTE(review): the imputer is fitted on the full matrix, i.e. before the
# train/test split performed later in this notebook, so rows that end up in
# the test set also contribute to the imputed means.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X[:, 1:])
X[:, 1:] = imputer.transform(X[:, 1:])


In [6]:
# Show the feature matrix after mean imputation of columns 1 onward.
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


## Encoding categorical data


### Encoding the Independent Variable


In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical column at index 0 and pass every other column
# through unchanged. The encoded columns come first in the result, followed by
# the passthrough columns.
#
# sparse_threshold=0 forces a dense result. With the default threshold (0.3)
# ColumnTransformer returns a SciPy sparse matrix once the stacked output is
# sparse enough (e.g. many categories), and np.array() around a sparse matrix
# yields a useless 0-d object array instead of a 2-D feature matrix.
#
# This cell overwrites X, so it must be run exactly once per kernel session:
# running it again would one-hot encode the first dummy column.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)

X = np.array(ct.fit_transform(X))

In [8]:
# Show the feature matrix after the ColumnTransformer has been applied.
print(X)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


### Encoding the Dependent Variable


In [9]:
from sklearn.preprocessing import LabelEncoder
# Map the target labels to integer codes. The fitted encoder is kept in `le`
# so the codes can be mapped back to the original labels later if needed.
le = LabelEncoder()
le.fit(Y)
Y = le.transform(Y)


In [10]:
# Show the target vector after label encoding.
print(Y)

[0 1 0 0 1 1 0 1 0 1]


## Splitting the dataset into the Training set and Test set


In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set. A fixed random_state makes the
# shuffle, and therefore every result derived from this split, reproducible
# under Restart Kernel -> Run All.
# NOTE: outputs recorded in this notebook from an earlier, unseeded run will
# not match this split; re-run the notebook to refresh them.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [12]:
# Display the four pieces of the split in order: training features, test
# features, training labels, test labels.
for split_part in (X_train, X_test, Y_train, Y_test):
    print(split_part)

[[1.0 0.0 0.0 35.0 58000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 37.0 67000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 48.0 79000.0]]
[[0.0 0.0 1.0 38.0 61000.0]
 [1.0 0.0 0.0 44.0 72000.0]]
[1 0 1 0 1 0 1 1]
[0 0]


## Feature Scaling


In [13]:
from sklearn.preprocessing import StandardScaler 

# Standardise only the columns from index 3 onward; the first three columns
# are excluded from the slice and keep their values.
sc = StandardScaler()
scaled_cols = slice(3, None)

# Learn the scaling parameters from the training split only, then apply that
# same fitted scaler to both splits so the test set is scaled with
# training-set statistics. fit() returns the scaler itself, so `scaler` and
# `sc` refer to the same fitted object.
scaler = sc.fit(X_train[:, scaled_cols])
X_train[:, scaled_cols] = scaler.transform(X_train[:, scaled_cols])
X_test[:, scaled_cols] = scaler.transform(X_test[:, scaled_cols])

In [14]:
# Show the training features after scaling columns 3 onward.
print(X_train)

[[1.0 0.0 0.0 -0.4329081551000039 -0.42992546479911053]
 [0.0 1.0 0.0 1.582353946227601 1.6787007930711877]
 [0.0 0.0 1.0 -1.507714609141393 -1.2733759679472298]
 [0.0 0.0 1.0 0.0746393370862078 -0.9359957666879821]
 [1.0 0.0 0.0 -0.16420654158965658 0.32917998803419685]
 [0.0 1.0 0.0 -1.1046621888758723 -0.7673056660583583]
 [0.0 1.0 0.0 0.2388458786758644 0.05740149257535867]
 [1.0 0.0 0.0 1.3136523327172536 1.34132059181194]]


In [15]:
# Show the test features, transformed with the scaler fitted on the training split.
print(X_test)

[[0.0 0.0 1.0 -0.02985573483448293 -0.17689031385467474]
 [1.0 0.0 0.0 0.776249105696559 0.7509052396082565]]
