Importing the Libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

Importing the Dataset

In [4]:
# Load the raw data from Data.csv (a path relative to the notebook's working directory).
dataset = pd.read_csv('Data.csv')
# Bare expression on the last line so the notebook renders the DataFrame.
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


Splitting the dataset into Features and Labels

In [11]:
# Features: every row, every column except the last one.
x = dataset.iloc[:, :-1].to_numpy()
# Labels: every row, only the last column (index -1 means "the last column").
y = dataset.iloc[:, -1].to_numpy()
# In iloc[rows, columns] a bare ":" selects everything along that axis, and
# ":-1" selects everything up to (but not including) the last column.
#
# .to_numpy() is what pandas recommends over the older .values attribute for
# converting a DataFrame/Series to a NumPy array.
#
# Try this: remove .to_numpy() from both lines, then run this block and the
# next block again. What is the difference in x?

In [12]:
# Print the feature matrix: country, age and salary for every row (missing values show as nan).
print(x)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [8]:
# Print the label vector: the last column of the dataset ('Yes'/'No') for every row.
print(y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


Taking care of missing data

In [15]:
#if you have a large dataset, or only about 1% of the data is missing, you can
#just remove all the rows with missing data


#the second option is to replace each missing value with the average of
#all the other values in its column

#in this dataset we have 2 missing values: one in the Age column and
#one in the Salary column. let's replace them:

In [17]:
from sklearn.impute import SimpleImputer

# Imputer that fills gaps with the column average:
#   missing_values=np.nan -> the entries to treat as missing are NaN
#   strategy='mean'       -> each one is replaced by the mean of its column
imputer = SimpleImputer(
    strategy='mean',
    missing_values=np.nan,
)

In [18]:
# Columns 1 and 2 (age and salary) are the numeric ones. Column 0 (country) is
# text and cannot be averaged, so it is left out of the imputer.
#
# fit_transform does both steps in one call:
#   fit       -> looks at the columns and calculates the average of each one
#   transform -> returns a copy with every missing value replaced by that average
# The filled-in columns are then written back over the old ones in x.
x[:, 1:3] = imputer.fit_transform(x[:, 1:3])


In [19]:
# Print x again to confirm the two nan entries were replaced by their column means.
print(x)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


Encoding Categorical Data

In [22]:
#the first column holds the country names: France, Germany and Spain
#the model won't understand text values, so we have to turn them into
#numbers
x[:,0]

array(['France', 'Spain', 'Germany', 'Spain', 'Germany', 'France',
       'Spain', 'France', 'Germany', 'France'], dtype=object)

In [24]:
#one hot encoding:
#we could simply give each country a number, e.g. Spain 0,
#Germany 1 and France 2. This is INCORRECT, because our model
#would read the numbers as an order and make correlations like: 'the higher the country
#code, the higher the ... rate'. so we have to use one hot encoding instead:



Encoding the Independent Variable

In [25]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [26]:
# One-hot encode column 0 (the country names) and leave everything else alone.
# Each transformer is a (name, encoder, columns) triple: here the encoder is a
# OneHotEncoder and the only column it is applied to is column 0.
country_encoder = ('encoder', OneHotEncoder(), [0])
# remainder='passthrough' keeps the columns that are not listed above,
# so we don't lose the rest of the data.
ct = ColumnTransformer(
    transformers=[country_encoder],
    remainder='passthrough',
)

In [27]:
# Fit the ColumnTransformer (ct) on x and apply it in a single call, then store
# the result back in x as a NumPy array. The column holding 'Spain', 'Germany'
# and 'France' is replaced by one 0/1 column per country; the remaining columns
# pass through unchanged.
# NOTE: run this cell only once after x has been built -- running it again
# would one-hot encode the already-encoded first column.
x = np.array(ct.fit_transform(x))
print(x)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [28]:
#now if you compare this x with the one a couple of blocks earlier, you will notice that France = 1.0 0.0 0.0, Spain = 0.0 0.0 1.0
#and Germany = 0.0 1.0 0.0
#with this format, our model will not misunderstand the data and will not make incorrect correlations

In [29]:
# Now let's change the Yes/No labels into 1/0.
from sklearn.preprocessing import LabelEncoder

# Create an encoder object from the class we imported.
le = LabelEncoder()
# fit_transform learns the distinct labels and returns their integer codes
# (here 'No' -> 0 and 'Yes' -> 1); the encoded labels replace the old y.
y = le.fit_transform(y)
# Show the result of the replacement.
print(y)

[0 1 0 0 1 1 0 1 0 1]


Splitting the dataset into the training and test sets

In [33]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. random_state=1 pins the shuffle so the
# split matches the course's results; it is not otherwise required.
X_train, X_test, Y_train, Y_test = train_test_split(
    x, y, test_size=0.2, random_state=1
)

# Show each piece of the split.
for label, part in (
    ('X_test:', X_test),
    ('X_train:', X_train),
    ('Y_test:', Y_test),
    ('Y_train:', Y_train),
):
    print(label, part)

X_test: [[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]
X_train: [[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]
Y_test: [0 1]
Y_train: [0 1 0 0 1 1 0 1]


Feature Scaling

In [None]:
#feature scaling should always be done after splitting the dataset
#we don't do this in all of our projects because not all algorithms need feature
#scaling.
#feature scaling puts all of your features on a comparable scale
#so that your model can perform better. there are two main feature scaling formulas:
#Standardisation (mean 0 and standard deviation 1, so most values fall between about -3 and 3)
#and Normalization (values between 0 and 1)

In [34]:
# Here we apply Standardisation (the approach the teacher recommends).
from sklearn.preprocessing import StandardScaler

# Columns 0-2 are the one-hot "dummy" variables created for the country names
# and must not be scaled, so only columns 3 onwards are touched.
sc = StandardScaler()

# Learn the scaling parameters from the training set only ...
sc.fit(X_train[:, 3:])
# ... and apply that same fitted scaler to both sets. Fitting again on X_test
# would mean two different scalers, which is not OK.
X_train[:, 3:] = sc.transform(X_train[:, 3:])
X_test[:, 3:] = sc.transform(X_test[:, 3:])


In [35]:
# Display the training features after scaling (columns 3 onwards were standardised).
X_train

array([[0.0, 0.0, 1.0, -0.19159184384578545, -1.0781259408412425],
       [0.0, 1.0, 0.0, -0.014117293757057777, -0.07013167641635372],
       [1.0, 0.0, 0.0, 0.566708506533324, 0.633562432710455],
       [0.0, 0.0, 1.0, -0.30453019390224867, -0.30786617274297867],
       [0.0, 0.0, 1.0, -1.9018011447007988, -1.420463615551582],
       [1.0, 0.0, 0.0, 1.1475343068237058, 1.232653363453549],
       [0.0, 1.0, 0.0, 1.4379472069688968, 1.5749910381638885],
       [1.0, 0.0, 0.0, -0.7401495441200351, -0.5646194287757332]],
      dtype=object)

In [36]:
# Display the test features after applying the scaler that was fitted on X_train.
X_test

array([[0.0, 1.0, 0.0, -1.4661817944830124, -0.9069571034860727],
       [1.0, 0.0, 0.0, -0.44973664397484414, 0.2056403393225306]],
      dtype=object)