# Importing Libraries

In [64]:
import numpy as np;
import matplotlib.pyplot as plt;
import pandas as pd;

# Importing Dataset

In [66]:
# Load the raw table. Every column except the last is treated as a feature
# and the last column is the label.
dataset = pd.read_csv('./Preprocessing.csv')

X = dataset.iloc[:, :-1].values
# Select the label relative to the end of the frame so it is always the
# column that X leaves out, instead of relying on the fixed position 3.
Y = dataset.iloc[:, -1].values

In [67]:
# Feature matrix as loaded; missing entries appear as nan.
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [68]:
# Label vector as loaded (string class names).
print(Y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


# Handling Missing Values (NaN)

In [70]:
from sklearn.impute import SimpleImputer

# Replace every NaN in feature columns 1 and 2 with that column's mean.
# Column 0 is skipped because it holds the country strings.
# NOTE(review): the means are computed on the full data set, before the
# train/test split further down - confirm this is acceptable.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

In [71]:
# Feature matrix after imputation: the former nan cells now hold column means.
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


# Encoding Categorical Data

## Encoding Independent Variable

In [74]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode column 0 (country) and pass the remaining columns through
# unchanged. The encoded indicator columns come first in the result.
#
# OneHotEncoder produces a sparse matrix, and ColumnTransformer keeps the
# stacked result sparse whenever its density falls below `sparse_threshold`.
# np.array() cannot densify a sparse matrix (it wraps it in a 0-d object
# array), so force a dense result with sparse_threshold=0.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
X = np.array(ct.fit_transform(X))

In [75]:
print(X);

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


## Encoding Dependent Variable

In [91]:
from sklearn.preprocessing import LabelEncoder

# Map the string class labels in Y to integer codes.
le = LabelEncoder()
Y = le.fit_transform(Y)

In [93]:
# Label vector after encoding (integer class codes).
print(Y)

[0 1 0 0 1 1 0 1 0 1]


# Splitting the Dataset into Training Set and Testing Set

In [95]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=1,
)

In [97]:
# Label, training features, then a trailing blank line.
print("X_train", X_train, "", sep="\n")

X_train
[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]



In [99]:
# Label, test features, then a trailing blank line.
print("X_test", X_test, "", sep="\n")

X_test
[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]



In [101]:
# Label, training targets, then a trailing blank line.
print("Y_train", Y_train, "", sep="\n")

Y_train
[0 1 0 0 1 1 0 1]



In [103]:
# Label, test targets, then a trailing blank line.
print("Y_test", Y_test, "", sep="\n")

Y_test
[0 1]



# Feature Scaling

In [112]:
from sklearn.preprocessing import StandardScaler

# Standardize only the columns from index 3 onward; columns 0-2 are the
# one-hot indicators and are left as they are.
# The scaler is fitted on the training rows alone and the same fitted
# statistics are then applied to the test rows.
sc = StandardScaler().fit(X_train[:, 3:])
X_train[:, 3:] = sc.transform(X_train[:, 3:])
X_test[:, 3:] = sc.transform(X_test[:, 3:])

In [114]:
# Training features after scaling: the last two columns are standardized.
print(X_train)

[[0.0 0.0 1.0 -0.19159184384578554 -1.0781259408412427]
 [0.0 1.0 0.0 -0.014117293757057846 -0.07013167641635404]
 [1.0 0.0 0.0 0.5667085065333239 0.6335624327104546]
 [0.0 0.0 1.0 -0.3045301939022487 -0.307866172742979]
 [0.0 0.0 1.0 -1.901801144700799 -1.4204636155515822]
 [1.0 0.0 0.0 1.1475343068237056 1.2326533634535488]
 [0.0 1.0 0.0 1.4379472069688966 1.5749910381638883]
 [1.0 0.0 0.0 -0.7401495441200352 -0.5646194287757336]]


In [116]:
# Test features after scaling.
# NOTE(review): the output saved with this cell still shows raw ages and
# salaries, so it was recorded before X_test was scaled. Restart the kernel
# and run all cells to refresh it.
print(X_test)

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]
