# Exercise: Data Preprocessing Pipeline

Goal: Create a complete data preprocessing pipeline for a new dataset

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [2]:
# Load the customer records from a CSV file; the relative path is resolved
# against the kernel's current working directory.
dataset = pd.read_csv('customer_dataset.csv')

In [3]:
# Show the raw table exactly as loaded, before any preprocessing step.
print(dataset)

   Country   Age   Salary Subscribed
0   France  44.0  72000.0         No
1    Spain  27.0      NaN        Yes
2  Germany   NaN  54000.0         No
3    Italy  38.0  61000.0         No
4   France  40.0  58000.0        Yes
5    Spain  35.0  52000.0        Yes
6  Germany  28.0  79000.0         No
7    Italy  48.0  83000.0        Yes


## Creating the matrix of features

In [4]:
# Feature matrix: all rows, every column except the last one.
# to_numpy() is the accessor the pandas documentation recommends over .values
# for obtaining a NumPy array.
X = dataset.iloc[:, :-1].to_numpy()

In [5]:
# Inspect the feature matrix; the nan entries mark the missing values.
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 nan]
 ['Germany' nan 54000.0]
 ['Italy' 38.0 61000.0]
 ['France' 40.0 58000.0]
 ['Spain' 35.0 52000.0]
 ['Germany' 28.0 79000.0]
 ['Italy' 48.0 83000.0]]


## Creating the dependent variable vector

In [6]:
# Target vector: the last column of the dataset.
# to_numpy() always yields a NumPy ndarray, whereas Series.values may hand back
# a pandas extension array when the column has an extension dtype.
y = dataset.iloc[:, -1].to_numpy()

In [7]:
# Inspect the raw labels before they are encoded.
print(y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes']


## Taking care of missing data

In [8]:
from sklearn.impute import SimpleImputer

# Replace every NaN in the numeric columns (index 1 onward) with the mean of
# its column; column 0 holds the country names and is left untouched.
# NOTE: the means are learned from all rows, i.e. before the train/test split
# performed further down, so the held-out rows contribute to the imputed values.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, 1:] = imputer.fit_transform(X[:, 1:])

In [9]:
# Inspect X after imputation; the former nan entries now hold column means.
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 65571.42857142857]
 ['Germany' 37.142857142857146 54000.0]
 ['Italy' 38.0 61000.0]
 ['France' 40.0 58000.0]
 ['Spain' 35.0 52000.0]
 ['Germany' 28.0 79000.0]
 ['Italy' 48.0 83000.0]]


## Encoding categorical data

### Encoding the independent variable

In [10]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode column 0 (the country) and pass the remaining columns through
# unchanged; the indicator columns come first in the transformed output.
# sparse_threshold=0 forces a dense result: with the default threshold (0.3) a
# high-cardinality column would make fit_transform return a SciPy sparse
# matrix, which np.array() cannot turn into a regular 2-D array.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
X = np.array(ct.fit_transform(X))

In [11]:
# Inspect X after one-hot encoding: four indicator columns followed by the
# two numeric columns.
print(X)

[[1.0 0.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 0.0 1.0 27.0 65571.42857142857]
 [0.0 1.0 0.0 0.0 37.142857142857146 54000.0]
 [0.0 0.0 1.0 0.0 38.0 61000.0]
 [1.0 0.0 0.0 0.0 40.0 58000.0]
 [0.0 0.0 0.0 1.0 35.0 52000.0]
 [0.0 1.0 0.0 0.0 28.0 79000.0]
 [0.0 0.0 1.0 0.0 48.0 83000.0]]


### Encoding the dependent variable

In [12]:
from sklearn.preprocessing import LabelEncoder

# Learn the set of distinct labels first, then map each label to its integer
# code (the printed result shows 'No' -> 0 and 'Yes' -> 1).
le = LabelEncoder().fit(y)
y = le.transform(y)

In [13]:
# Inspect the integer-encoded labels.
print(y)

[0 1 0 0 1 1 0 1]


## Splitting the dataset into training set and test set

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing. Stratifying on y keeps the label
# proportions in both subsets, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

In [15]:
# Inspect the training features (6 of the 8 rows).
print(X_train)

[[0.0 0.0 0.0 1.0 27.0 65571.42857142857]
 [0.0 1.0 0.0 0.0 37.142857142857146 54000.0]
 [1.0 0.0 0.0 0.0 40.0 58000.0]
 [0.0 0.0 1.0 0.0 48.0 83000.0]
 [1.0 0.0 0.0 0.0 44.0 72000.0]
 [0.0 1.0 0.0 0.0 28.0 79000.0]]


In [16]:
# Inspect the held-out test features (the remaining 2 rows).
print(X_test)

[[0.0 0.0 1.0 0.0 38.0 61000.0]
 [0.0 0.0 0.0 1.0 35.0 52000.0]]


In [17]:
# Inspect the training labels; they follow the same row order as X_train.
print(y_train)

[1 0 1 1 0 0]


In [18]:
# Inspect the test labels; they follow the same row order as X_test.
print(y_test)

[0 1]


## Feature scaling

### Standardization

In [19]:
from sklearn.preprocessing import StandardScaler

# Standardize only the numeric columns (index 4 onward); the one-hot indicator
# columns 0-3 are left as they are.
# The scaler learns its mean and standard deviation from the training rows
# alone, and those same statistics are then applied to the test rows.
sc = StandardScaler().fit(X_train[:, 4:])
X_train[:, 4:] = sc.transform(X_train[:, 4:])
X_test[:, 4:] = sc.transform(X_test[:, 4:])

In [20]:
# Inspect the training features after scaling; only the last two columns
# have changed.
print(X_train)

[[0.0 0.0 0.0 1.0 -1.3384868828318128 -0.287922646830573]
 [0.0 1.0 0.0 0.0 -0.02769283205858845 -1.3897368701349688]
 [1.0 0.0 0.0 0.0 0.341544928722601 -1.0088628176346837]
 [0.0 0.0 1.0 0.0 1.3754106589099326 1.3716000104920987]
 [1.0 0.0 0.0 0.0 0.8584777938162668 0.32419636611631447]
 [0.0 1.0 0.0 0.0 -1.2092536665583964 0.9907259579918135]]


In [21]:
# Inspect the test features after applying the scaler fitted on the
# training rows.
print(X_test)

[[0.0 0.0 1.0 0.0 0.08307849617576811 -0.7232072782594697]
 [0.0 0.0 0.0 1.0 -0.30462115264448125 -1.5801738963851113]]
