# Exercise: Data Preprocessing Pipeline

Goal: Build a complete data preprocessing pipeline from scratch using a new dataset with missing values and categorical variables.

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [2]:
# Load the raw data into a DataFrame. 'dataset.csv' is a relative path, so it
# is resolved against the current working directory of the kernel.
dataset = pd.read_csv('dataset.csv')

In [3]:
# Print the loaded DataFrame in full to inspect its columns and spot missing values.
print(dataset)

  Category  Score Pass
0        A   85.5  Yes
1        B    NaN   No
2        A   92.0  Yes
3        C   78.3  Yes
4        B    NaN   No


## Creating the matrix of features

In [4]:
# Feature matrix: every row, every column except the last one (the target),
# materialised as a NumPy array.
X = dataset.iloc[:, :-1].to_numpy()

In [5]:
# Print the feature matrix extracted from the dataset.
print(X)

[['A' 85.5]
 ['B' nan]
 ['A' 92.0]
 ['C' 78.3]
 ['B' nan]]


## Creating the dependent variable vector

In [6]:
# Target vector: the last column of the dataset, for every row.
# `.to_numpy()` always yields a NumPy ndarray, whereas `Series.values` may
# return a pandas extension array for some dtypes; pandas recommends
# `.to_numpy()` when an ndarray is what is needed.
y = dataset.iloc[:, -1].to_numpy()

In [7]:
# Print the target vector extracted from the dataset.
print(y)

['Yes' 'No' 'Yes' 'Yes' 'No']


## Taking care of missing data

In [8]:
from sklearn.impute import SimpleImputer
# Mean-impute the columns from index 1 onward: each NaN is replaced by the
# mean of the non-missing values in its column, and the result is written
# back into X in place.
# NOTE(review): the mean is learned from all rows, before the train/test
# split performed later in this notebook, so test rows influence the imputed
# values — consider fitting the imputer on the training rows only.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, 1:] = imputer.fit_transform(X[:, 1:])

In [9]:
# Print the feature matrix after imputation to confirm no NaN values remain.
print(X)

[['A' 85.5]
 ['B' 85.26666666666667]
 ['A' 92.0]
 ['C' 78.3]
 ['B' 85.26666666666667]]


## Encoding categorical data

### Encoding the independent variable

In [10]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# One-hot encode column 0 of X and pass every other column through untouched.
# NOTE: X is overwritten with the encoded matrix, so this cell is not
# idempotent — re-running it without re-running the cells above would encode
# the new column 0 a second time.
ct = ColumnTransformer(
    transformers=[
        ('encoder', OneHotEncoder(), [0]),
    ],
    remainder='passthrough',
)
X = np.array(
    ct.fit_transform(X)
)

In [11]:
# Print the feature matrix after one-hot encoding column 0.
print(X)

[[1.0 0.0 0.0 85.5]
 [0.0 1.0 0.0 85.26666666666667]
 [1.0 0.0 0.0 92.0]
 [0.0 0.0 1.0 78.3]
 [0.0 1.0 0.0 85.26666666666667]]


### Encoding the dependent variable

In [12]:
from sklearn.preprocessing import LabelEncoder
# Learn the set of distinct labels in y, then replace each label with its
# integer code. `le` is kept so the codes can be mapped back to labels later.
le = LabelEncoder().fit(y)
y = le.transform(y)

In [13]:
# Print the integer-encoded target vector.
print(y)

[1 0 1 1 0]


## Splitting the dataset into training set and test set

In [14]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# shuffle (and therefore the split) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [15]:
# Print the training features produced by the split.
print(X_train)

[[0.0 1.0 0.0 85.26666666666667]
 [1.0 0.0 0.0 92.0]
 [1.0 0.0 0.0 85.5]
 [0.0 0.0 1.0 78.3]]


In [16]:
# Print the held-out test features produced by the split.
print(X_test)

[[0.0 1.0 0.0 85.26666666666667]]


In [17]:
# Print the training targets produced by the split.
print(y_train)

[0 1 1 1]


In [18]:
# Print the held-out test targets produced by the split.
print(y_test)

[0]


## Feature scaling

### Standardization

In [19]:
from sklearn.preprocessing import StandardScaler
# Standardise only the columns from index 3 onward; columns 0-2 hold the
# one-hot dummies (see the encoded matrix printed earlier) and are left as is.
# The scaler's statistics are learned from the training rows alone and then
# reused unchanged on the test rows.
sc = StandardScaler().fit(X_train[:, 3:])
X_train[:, 3:] = sc.transform(X_train[:, 3:])
X_test[:, 3:] = sc.transform(X_test[:, 3:])

In [20]:
# Print the training features after standardising the columns from index 3 onward.
print(X_train)

[[0.0 1.0 0.0 0.0]
 [1.0 0.0 0.0 1.3895227297937351]
 [1.0 0.0 0.0 0.048151777765129626]
 [0.0 0.0 1.0 -1.4376745075588648]]


In [21]:
# Print the test features after applying the scaler fitted on the training rows.
print(X_test)

[[0.0 1.0 0.0 0.0]]
