In [None]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the raw dataset from the CSV file and preview its first five rows
dataset = pd.read_csv(filepath_or_buffer="Data.csv")
dataset.head(n=5)

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [None]:
# Divide the data into independent variables X (every column except the last)
# and the dependent variable Y (the last column).
# .iloc selects by integer position: [rows, columns].
# .to_numpy() is used instead of .values because it always returns a NumPy
# ndarray, whereas .values can hand back a pandas extension array for some dtypes.
X = dataset.iloc[:, :-1].to_numpy()
Y = dataset.iloc[:, -1].to_numpy()

In [None]:
# Sanity check: X should be 2-D (rows x feature columns) and Y 1-D with one entry per row
X.shape, X.size, Y.shape, Y.size

((10, 3), 30, (10,), 10)

In [None]:
# Show the feature matrix as extracted, before any preprocessing (note the nan entries)
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [None]:
# Show the target labels as extracted, before encoding
print(Y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


In [None]:
# Handle missing data: replace each NaN in the numeric columns (indices 1 and 2)
# with the mean of the observed values in that same column.
# NOTE(review): the imputer is fitted on every row, before the train/test split
# further down, so test rows contribute to the means — confirm this is acceptable.
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(
    missing_values=np.nan,  # marker identifying the entries to fill in
    strategy='mean',        # fill value: the per-column mean
)
# Learn the column means and write the filled values back into X in place
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

In [None]:
# Show X again to confirm the nan entries in columns 1-2 were filled in by the imputer
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [None]:
# Encode the categorical independent variable (column 0, the country) with
# one-hot encoding: each distinct category becomes its own 0/1 indicator column.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# Each transformer entry is (name, transformer, column indices).
# remainder='passthrough' keeps the columns that are not encoded.
# sparse_threshold=0 makes the result always a dense array, so np.array() below
# never receives a SciPy sparse matrix.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
# Cast to float: X enters as an object-dtype array (it held strings), and without
# an explicit dtype the encoded matrix stays dtype=object even though every value
# is now numeric.
# NOTE: this cell overwrites X, so re-running it would encode column 0 of the
# already-encoded matrix; re-run from the data-loading cell instead.
X = np.array(ct.fit_transform(X), dtype=float)

In [None]:
# Show X after one-hot encoding of the country column
print(X)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [None]:
# Encode the dependent variable (Purchased): LabelEncoder maps each distinct
# label to an integer code, turning the two classes into 0 and 1.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(Y)            # learn the set of distinct labels
Y = le.transform(Y)  # replace each label with its integer code

In [None]:
# Show Y after label encoding
print(Y)

[0 1 0 0 1 1 0 1 0 1]


In [None]:
# Split the data into a training set (80%) and a test set (20%).
# random_state pins the shuffle so that Restart & Run All reproduces the same
# split; without it every run draws a different train/test partition.
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [None]:
# Training features (before scaling)
X_train

array([[1.0, 0.0, 0.0, 37.0, 67000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [1.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [1.0, 0.0, 0.0, 44.0, 72000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0]], dtype=object)

In [None]:
# Test features (before scaling)
X_test

array([[0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [0.0, 0.0, 1.0, 38.0, 61000.0]], dtype=object)

In [None]:
# Training labels
Y_train

array([1, 1, 1, 0, 0, 0, 1, 0])

In [None]:
# Test labels
Y_test

array([1, 0])

In [None]:
# Feature scaling: standardise the numeric columns (index 3 onward) so that they
# share a common scale. The one-hot dummy columns (0-2) are left untouched.
# The mean and standard deviation are computed from the training split only and
# then reused on the test split, so no information from the test data leaks in.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()  # default settings, no arguments
sc.fit(X_train[:, 3:])                         # learn mean/std from the training rows
X_train[:, 3:] = sc.transform(X_train[:, 3:])
X_test[:, 3:] = sc.transform(X_test[:, 3:])    # transform only; never fit on the test data

In [None]:
# Show the training features after scaling columns 3 onward
print(X_train)

[[1.0 0.0 0.0 -0.2243492425258068 0.23524565201180497]
 [0.0 0.0 1.0 -1.5270222636433952 -1.319421265631428]
 [1.0 0.0 0.0 -0.48488384674932444 -0.5011755195034107]
 [0.0 1.0 0.0 1.469125684927058 1.5444388458166327]
 [0.0 1.0 0.0 -1.1362203573081187 -0.8284738179546175]
 [1.0 0.0 0.0 0.6875218722565051 0.6443685250758137]
 [1.0 0.0 0.0 1.2085910807035405 1.2171405473654258]
 [0.0 0.0 1.0 0.007237072339542363 -0.992122967180221]]


In [None]:
# Show the test features after applying the training-set scaler to columns 3 onward
print(X_test)

[[0.0 1.0 0.0 0.16645266380946971 -0.028411310629444783]
 [0.0 0.0 1.0 -0.09408194041404794 -0.2557017956650054]]
