In [81]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import  OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# importing data
# The CSV location can be overridden by setting the DATA_CSV_PATH environment
# variable, so the notebook is not tied to one machine's directory layout.
# When the variable is unset, the original local path below is used unchanged.
DEFAULT_DATA_CSV_PATH = r"C:\Users\Maria Munir Stokes\Documents\DS Courses\Udemy\drive-download-20200629T190417Z-001\Part 1 - Data Preprocessing\Section 2 -------------------- Part 1 - Data Preprocessing --------------------\Python\Data.csv"
data_csv_path = os.environ.get("DATA_CSV_PATH", DEFAULT_DATA_CSV_PATH)

raw_data = pd.read_csv(data_csv_path)
raw_data.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [82]:
# Split the frame into the feature matrix (every column except the last)
# and the dependent variable (the last column), both as numpy arrays.
feature_columns = raw_data.iloc[:, :-1]
target_column = raw_data.iloc[:, -1]
x = feature_columns.values
y = target_column.values

# checking execution
print(x, y)


[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]] ['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


In [83]:
# Replace missing numeric entries with the mean of their column,
# using the SimpleImputer class from sklearn.
# NOTE(review): the imputer is fitted on every row of x, before the train/test
# split done in a later cell, so the column means include rows that end up in
# the test set.
impute_cols = slice(1, 3)  # columns 1 and 2 of x

impute_mean = SimpleImputer(missing_values=np.nan, strategy='mean')

# learn the column means and fill the gaps in a single step
x[:, impute_cols] = impute_mean.fit_transform(x[:, impute_cols])

# check
display(x)

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [84]:
# One-hot encode the country names held in column 0 of x.
# Each entry of `transformers` is (name, transformer, column indices);
# remainder='passthrough' keeps the columns that are not encoded.
# NOTE(review): this cell overwrites x, so re-running it without re-running the
# earlier cells would apply the encoder to the already-encoded array.
country_encoder = OneHotEncoder()
ct = ColumnTransformer(
    transformers=[('encode', country_encoder, [0])],
    remainder='passthrough',
)

# Fit the transformer to the data and apply it,
# forcing the result to be a numpy array.
encoded = ct.fit_transform(x)
x = np.array(encoded)
print(x)

# Encode the dependent variable with a label encoder.
le = LabelEncoder()
y = le.fit_transform(y)
print(y)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]
[0 1 0 0 1 1 0 1 0 1]


In [85]:
# Split the data set into training and testing parts: 20% of the rows are held
# out for testing, and the fixed random_state makes the split repeatable.
split_options = dict(test_size=0.2, random_state=1)

x_train, x_test, y_train, y_test = train_test_split(x, y, **split_options)

In [86]:
# Show each partition produced by the split, one labelled array at a time.
for label, part in (
    ("x train: ", x_train),
    ("x test: ", x_test),
    ("y train: ", y_train),
    ("y test: ", y_test),
):
    print(label, part)

x train:  [[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]
x test:  [[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]
y train:  [0 1 0 0 1 1 0 1]
y test:  [0 1]


In [89]:
# Feature scaling of the variables.
# Only the numerical columns are scaled; the encoded columns in front of them
# are left untouched.
scaled_cols = slice(3, None)  # column 3 onwards

scaler = StandardScaler()  # create a scaler instance

# learn the scaling parameters from the training data, then apply them to it
scaler.fit(x_train[:, scaled_cols])
x_train[:, scaled_cols] = scaler.transform(x_train[:, scaled_cols])

print("x train: ", x_train)

# The same transformation is applied to the x_test data.
# fit is deliberately not called again: it would re-fit the scaler on the test data.
x_test[:, scaled_cols] = scaler.transform(x_test[:, scaled_cols])
print("x test: ", x_test)


x train:  [[0.0 0.0 1.0 -0.19159184384578554 -1.0781259408412427]
 [0.0 1.0 0.0 -0.014117293757057846 -0.07013167641635404]
 [1.0 0.0 0.0 0.5667085065333239 0.6335624327104546]
 [0.0 0.0 1.0 -0.3045301939022487 -0.307866172742979]
 [0.0 0.0 1.0 -1.901801144700799 -1.4204636155515822]
 [1.0 0.0 0.0 1.1475343068237056 1.2326533634535488]
 [0.0 1.0 0.0 1.4379472069688966 1.5749910381638883]
 [1.0 0.0 0.0 -0.7401495441200352 -0.5646194287757336]]
x test:  [[0.0 1.0 0.0 -1.4661817944830127 -0.9069571034860731]
 [1.0 0.0 0.0 -0.4497366439748442 0.20564033932253026]]
