In [4]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the dataset from Data.csv (relative path, resolved against the
# kernel's current working directory).
data = pd.read_csv(filepath_or_buffer="Data.csv")

In [5]:
# Preview the first five rows to check the columns and spot missing values.
data.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


# Separating input and output columns

In [14]:
# Positional split: every column except the last is a feature, the last
# column is the target.
# `.to_numpy()` is used instead of `.values` because it always returns a
# NumPy ndarray; `.values` can return a pandas ExtensionArray for some
# dtypes, which the in-place slice assignments in later cells rely on not
# happening.
X = data.iloc[:, :-1].to_numpy()
y = data.iloc[:, -1].to_numpy()

In [15]:
# Show the feature matrix and the target vector together (displayed as a tuple).
X , y

(array([['France', 44.0, 72000.0],
        ['Spain', 27.0, 48000.0],
        ['Germany', 30.0, 54000.0],
        ['Spain', 38.0, 61000.0],
        ['Germany', 40.0, nan],
        ['France', 35.0, 58000.0],
        ['Spain', nan, 52000.0],
        ['France', 48.0, 79000.0],
        ['Germany', 50.0, 83000.0],
        ['France', 37.0, 67000.0]], dtype=object),
 array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
       dtype=object))

# Handling Missing Values

`SimpleImputer` from scikit-learn replaces missing values with a column statistic such as the mean or the median, depending on the value passed as the `strategy` argument.
The `fit` method computes that statistic for each column and stores it on the imputer,
whereas `transform` returns a copy of the data with the missing values replaced.

In [18]:
from sklearn.impute import SimpleImputer
# Replace NaNs in the numeric columns (everything after column 0) with the
# per-column mean, writing the result back into X in place.
# NOTE(review): the imputer is fitted on all rows, before the train/test
# split performed later in the notebook, so held-out rows contribute to the
# means.
imputer = SimpleImputer(strategy="mean", missing_values=np.nan)
X[:, 1:] = imputer.fit_transform(X[:, 1:])

In [19]:
# Inspect X after imputation: the former NaN entries should now hold the column means.
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

# Encoding non-numerical columns 

In [25]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [27]:
# One-hot encode column 0 and pass the remaining columns through unchanged.
# The encoded indicator columns are placed first, followed by the
# passthrough columns.
# sparse_threshold=0 forces a dense result: with the default threshold the
# transformer may return a SciPy sparse matrix when the output is mostly
# zeros, and np.array() on a sparse matrix produces a 0-d object array
# rather than a 2-D feature matrix.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
X = np.array(ct.fit_transform(X))

In [28]:
# Inspect X after encoding: column 0 has been replaced by leading indicator columns.
X

array([[1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

### Labelling the Purchased column with numerical values

In [34]:
from sklearn.preprocessing import LabelEncoder
# Map the string class labels in y to integer codes.
le = LabelEncoder()
le.fit(y)                      # learn the set of distinct labels
y = np.array(le.transform(y))  # replace each label with its integer code

In [31]:
# Inspect the encoded target: the labels are now integer codes.
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

# Splitting Data Into Training and Test Set

In [44]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# shuffle, and therefore the split, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [37]:
# Inspect the training features produced by the split.
X_train

array([[1.0, 0.0, 0.0, 35.0, 58000.0],
       [1.0, 0.0, 0.0, 44.0, 72000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0]], dtype=object)

In [38]:
# Inspect the training labels produced by the split.
y_train

array([1, 0, 1, 0, 1, 1, 0, 0])

In [39]:
# Inspect the held-out test features.
X_test

array([[0.0, 1.0, 0.0, 50.0, 83000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0]], dtype=object)

In [40]:
# Inspect the held-out test labels.
y_test

array([0, 1])

# Feature Scaling

In [47]:
from sklearn.preprocessing import StandardScaler
# Standardize the columns from index 3 onward (the columns that follow the
# one-hot indicators); the indicator columns are left untouched.
# The scaler is fitted on the training rows only and then reused for the
# test rows, so the test set does not influence the scaling statistics.
ss = StandardScaler()
X_train[:, 3:] = ss.fit_transform(X_train[:, 3:])
X_test[:, 3:] = ss.transform(X_test[:, 3:])

In [48]:
# Inspect the training features after scaling (columns 3 onward are standardized).
X_train

array([[1.0, 0.0, 0.0, -0.7529426005471074, -0.6260377781240922],
       [1.0, 0.0, 0.0, 1.008453807952985, 1.013042950055349],
       [1.0, 0.0, 0.0, 1.7912966561752484, 1.8325833141450698],
       [0.0, 1.0, 0.0, -1.7314961608249366, -1.0943465576039326],
       [1.0, 0.0, 0.0, -0.3615211764359758, 0.4276569757055486],
       [0.0, 1.0, 0.0, 0.22561095973072173, 0.05040823668012205],
       [0.0, 0.0, 1.0, -0.16581046438040992, -0.274806193514212],
       [0.0, 0.0, 1.0, -0.013591021670525248, -1.328500947343853]],
      dtype=object)

In [49]:
# Inspect the test features after applying the scaler fitted on the training rows.
X_test

array([[0.0, 1.0, 0.0, 2.18271808028638, 2.30089209362491],
       [0.0, 0.0, 1.0, -2.318628296991634, -1.796809726823693]],
      dtype=object)