Importing The Libraries

In [2]:
import numpy as np
import matplotlib.pyplot as plot
import pandas as pd

Import DataSet

In [3]:
dataset = pd.read_csv('data/Data.csv')
print(dataset.head())
print(dataset.describe())

   Country   Age   Salary Purchased
0   France  44.0  72000.0        No
1    Spain  27.0  48000.0       Yes
2  Germany  30.0  54000.0        No
3    Spain  38.0  61000.0        No
4  Germany  40.0      NaN       Yes
             Age        Salary
count   9.000000      9.000000
mean   38.777778  63777.777778
std     7.693793  12265.579662
min    27.000000  48000.000000
25%    35.000000  54000.000000
50%    38.000000  61000.000000
75%    44.000000  72000.000000
max    50.000000  83000.000000


In [4]:
x = dataset.iloc[:,:-1]
print(x.head())

y = dataset.iloc[:,-1]
print(y.head())

X = x.values
Y = y.values
print(X)
print(Y)

   Country   Age   Salary
0   France  44.0  72000.0
1    Spain  27.0  48000.0
2  Germany  30.0  54000.0
3    Spain  38.0  61000.0
4  Germany  40.0      NaN
0     No
1    Yes
2     No
3     No
4    Yes
Name: Purchased, dtype: object
[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]
['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


Missing Data

In [5]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
imputer.fit(X[:, 1:])
X[:, 1:] = imputer.transform(X[:,1:])

print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


Encoding Categorical Data

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
X = np.array(ct.fit_transform(X))

print(X)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [7]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

Y = le.fit_transform(Y)

print(Y)

[0 1 0 0 1 1 0 1 0 1]


Splitting Data Set

In [9]:
from sklearn.model_selection import train_test_split

X_train,X_test, y_train, y_test = train_test_split(X, y, train_size= 0.8, test_size=0.2, random_state= 42)

print(X_train)
print(X_test)
print(y_train)
print(y_test)

[[1.0 0.0 0.0 35.0 58000.0]
 [1.0 0.0 0.0 44.0 72000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]]
[[0.0 1.0 0.0 50.0 83000.0]
 [0.0 0.0 1.0 27.0 48000.0]]
5    Yes
0     No
7    Yes
2     No
9    Yes
4    Yes
3     No
6     No
Name: Purchased, dtype: object
8     No
1    Yes
Name: Purchased, dtype: object


Feature Scaling

In [11]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

X_train[:,3:] = sc.fit_transform(X_train[:,3:])
X_test[:,3:] = sc.transform(X_test[:,3:])

print(X_train)

[[1.0 0.0 0.0 -0.7529426005471074 -0.6260377781240922]
 [1.0 0.0 0.0 1.008453807952985 1.013042950055349]
 [1.0 0.0 0.0 1.7912966561752484 1.8325833141450698]
 [0.0 1.0 0.0 -1.7314961608249366 -1.0943465576039326]
 [1.0 0.0 0.0 -0.3615211764359758 0.4276569757055486]
 [0.0 1.0 0.0 0.22561095973072173 0.05040823668012205]
 [0.0 0.0 1.0 -0.16581046438040992 -0.274806193514212]
 [0.0 0.0 1.0 -0.013591021670525248 -1.328500947343853]]
