In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the raw table from the CSV file that sits next to the notebook.
dataset = pd.read_csv(filepath_or_buffer="New Data.csv")

In [3]:
# Display the loaded frame to inspect its columns and spot missing entries.
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,Belgium,54.0,77000.0,No
1,Portugal,37.0,53000.0,Yes
2,Germany,40.0,59000.0,No
3,Portugal,48.0,66000.0,No
4,Germany,50.0,,Yes
5,Belgium,45.0,63000.0,Yes
6,Portugal,,57000.0,No
7,Belgium,58.0,84000.0,Yes
8,Germany,60.0,88000.0,No
9,Portugal,47.0,72000.0,Yes


In [4]:
# Separate the features (every column except the last) from the label (the last column).
# .copy() gives x its own data: the imputation step further down assigns into x in place,
# which on a bare slice of `dataset` triggers SettingWithCopyWarning and may write through
# to `dataset` itself.

x = dataset.iloc[:, :-1].copy()
y = dataset.iloc[:, -1]

In [5]:
# Show the feature frame and the label series, separated by two blank lines.
print(x, "\n", y, sep="\n")

    Country   Age   Salary
0   Belgium  54.0  77000.0
1  Portugal  37.0  53000.0
2   Germany  40.0  59000.0
3  Portugal  48.0  66000.0
4   Germany  50.0      NaN
5   Belgium  45.0  63000.0
6  Portugal   NaN  57000.0
7   Belgium  58.0  84000.0
8   Germany  60.0  88000.0
9  Portugal  47.0  72000.0


0     No
1    Yes
2     No
3     No
4    Yes
5    Yes
6     No
7    Yes
8     No
9    Yes
Name: Purchased, dtype: object


In [6]:
# Count the missing entries in each column.
dataset.isna().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

In [7]:
# A few values are missing and must be taken care of to avoid errors in our network

from sklearn.impute import SimpleImputer

In [8]:
# Replace every missing Age / Salary entry (columns 1 and 2) with the mean of its column.
# NOTE(review): the means are computed over all rows, i.e. before the train/test split below.
imputer = SimpleImputer(strategy="mean", missing_values=np.nan)
x.iloc[:, 1:3] = imputer.fit_transform(x.iloc[:, 1:3])

In [9]:
# Check the imputation result: Age and Salary should no longer contain NaN.
print(x)

    Country        Age        Salary
0   Belgium  54.000000  77000.000000
1  Portugal  37.000000  53000.000000
2   Germany  40.000000  59000.000000
3  Portugal  48.000000  66000.000000
4   Germany  50.000000  68777.777778
5   Belgium  45.000000  63000.000000
6  Portugal  48.777778  57000.000000
7   Belgium  58.000000  84000.000000
8   Germany  60.000000  88000.000000
9  Portugal  47.000000  72000.000000


In [10]:
# Let's create binary vectors for the categorical column by one-hot encoding

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [11]:
# One-hot encode column 0 (Country) and pass the other columns through untouched.
# sparse_threshold=0 forces a dense result, so wrapping the output in np.array()
# always yields a regular 2-D array even when the one-hot block is mostly zeros
# (a sparse result would otherwise be wrapped as a 0-d object array).
ct = ColumnTransformer(
    transformers=[("encoder", OneHotEncoder(), [0])],
    remainder="passthrough",
    sparse_threshold=0,
)

In [12]:
# Fit the column transformer on the feature frame and keep the result as a NumPy array.
# NOTE: x is rebound from a DataFrame to an ndarray here, so re-running this cell
# without re-running the cells above would apply the encoder to the already-encoded array.
x = np.array(ct.fit_transform(x))

In [13]:
# Inspect the encoded features: the one-hot Country columns come first, then Age and Salary.
print(x)

[[1.00000000e+00 0.00000000e+00 0.00000000e+00 5.40000000e+01
  7.70000000e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 3.70000000e+01
  5.30000000e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 4.00000000e+01
  5.90000000e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 4.80000000e+01
  6.60000000e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 5.00000000e+01
  6.87777778e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 4.50000000e+01
  6.30000000e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 4.87777778e+01
  5.70000000e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 5.80000000e+01
  8.40000000e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 6.00000000e+01
  8.80000000e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 4.70000000e+01
  7.20000000e+04]]


In [14]:
#The dependent variable requires similar encoding

from sklearn.preprocessing import LabelEncoder

# Learn the label classes first, then map each label string to its integer code.
label = LabelEncoder().fit(y)

y = label.transform(y)

In [15]:
# Check the encoded labels (integer codes instead of the original strings).
print(y)

[0 1 0 0 1 1 0 1 0 1]


In [16]:
# Our features and labels now seem to be ready to be passed into the network; first, split them into training and test sets

from sklearn.model_selection import train_test_split

In [17]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=1,
    test_size=0.2,
)

In [18]:
#Since we will be building a neural network, it could be important to do a bit of standard scaling to our features

from sklearn.preprocessing import StandardScaler
# Scaler instance; it is fitted further down on the training split only.
scaler = StandardScaler()

In [19]:
# Look at the training features before scaling.
print(x_train)

[[0.00000000e+00 0.00000000e+00 1.00000000e+00 4.87777778e+01
  5.70000000e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 5.00000000e+01
  6.87777778e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 5.40000000e+01
  7.70000000e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 4.80000000e+01
  6.60000000e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 3.70000000e+01
  5.30000000e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 5.80000000e+01
  8.40000000e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 6.00000000e+01
  8.80000000e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 4.50000000e+01
  6.30000000e+04]]


In [20]:
# Standardize only the numeric columns (index 3 onward); the one-hot columns stay 0/1.
# The scaler learns its statistics from the training split and is then applied,
# unchanged, to both the training and the test split.
scaler.fit(x_train[:, 3:])
x_train[:, 3:] = scaler.transform(x_train[:, 3:])
x_test[:, 3:] = scaler.transform(x_test[:, 3:])

In [21]:
# Look at the training features after scaling the numeric columns.
print(x_train)

[[ 0.          0.          1.         -0.19159184 -1.07812594]
 [ 0.          1.          0.         -0.01411729 -0.07013168]
 [ 1.          0.          0.          0.56670851  0.63356243]
 [ 0.          0.          1.         -0.30453019 -0.30786617]
 [ 0.          0.          1.         -1.90180114 -1.42046362]
 [ 1.          0.          0.          1.14753431  1.23265336]
 [ 0.          1.          0.          1.43794721  1.57499104]
 [ 1.          0.          0.         -0.74014954 -0.56461943]]


##### Our dataset is now ready to be passed into a neural network model