In [1]:
##### SCIKIT-LEARN or SKLEARN #####
#
#  - Python Machine Learning Module
#  - Simple and efficient tools for data mining and data analysis
#  - Accessible to everybody, and reusable in various contexts
#  - Built on NumPy, SciPy, and matplotlib
#  - Open source, commercially usable - BSD license
#
# https://scikit-learn.org/stable/index.html
# https://scikit-learn.org/stable/user_guide.html 
# https://scikit-learn.org/stable/modules/classes.html

In [2]:
##### PRE-PROCESSING DATA #####
#
# https://scikit-learn.org/stable/modules/preprocessing.html
#
# Encoding Categorical Variables:
# https://pandas.pydata.org/docs/reference/api/pandas.get_dummies.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html
#
# Transforming Prediction Targets:
# https://scikit-learn.org/stable/modules/preprocessing_targets.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelBinarizer.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MultiLabelBinarizer.html
#
# Standardization, Scaling, Normalization:
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Normalizer.html
#
# Discretization:
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.KBinsDiscretizer.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Binarizer.html
# 
# Missing Value Imputation:
# https://scikit-learn.org/stable/modules/impute.html
# 
# Polynomial Features:
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html
#
# Custom Transformers:
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.FunctionTransformer.html

In [3]:
import pandas as pd
import numpy as np

In [4]:
##### Introduction to Preprocessing Steps #####

In [5]:
# Toy training set of five passengers, written out one column at a time.
train = pd.DataFrame({
    'Id': [1, 2, 3, 4, 5],
    'Name': ['Aaron', 'Beth', 'Cathy', 'Dave', 'Erin'],
    'Pclass': ['Third', 'First', 'Second', 'First', 'Second'],
    'Age': [22, 38, 26, 60, 70],
    'Fare': [7.25, 71.28, 7.92, 71.28, 71.92],
    'Survived': ['no', 'yes', 'yes', 'yes', 'no'],
})

# Features: every column except the label.
train_X = train.drop(columns='Survived')
# Prediction target: the label column on its own.
train_Y = train['Survived']

# Last expression of the cell, so the full frame is displayed.
train

Unnamed: 0,Id,Name,Pclass,Age,Fare,Survived
0,1,Aaron,Third,22,7.25,no
1,2,Beth,First,38,71.28,yes
2,3,Cathy,Second,26,7.92,yes
3,4,Dave,First,60,71.28,yes
4,5,Erin,Second,70,71.92,no


In [6]:
# Encode an ordered categorical feature as an integer-valued column
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html
from sklearn.preprocessing import OrdinalEncoder

# instantiate the preprocessing model
# The category order is listed explicitly so the codes follow the real class
# ranking (First=0 < Second=1 < Third=2). Left to its default, the encoder
# derives the categories from the data it is fit on; for these three labels
# that happens to produce the same order, but only by coincidence of spelling.
oe = OrdinalEncoder(categories=[['First', 'Second', 'Third']])

# fit the model to training data
# ('Pclass' is read from `train`, which is never modified, so this cell can be
# re-run without hitting a missing-column error)
oe.fit(train[['Pclass']])

# fit and transform can also be done in a single step:
# oe.fit_transform(...) is equivalent to oe.fit(...) followed by oe.transform(...)

# transform the training data - using the model that was fit on training data,
# and swap the original 'Pclass' column for its encoded version.
# transform() yields one column per input feature; np.ravel flattens the
# single-feature result so it can be assigned as one DataFrame column.
train_X = (
    train
    .drop(columns=['Survived', 'Pclass'])
    .assign(Pclass_transformed=np.ravel(oe.transform(train[['Pclass']])))
)

train_X

Unnamed: 0,Id,Name,Age,Fare,Pclass_transformed
0,1,Aaron,22,7.25,2.0
1,2,Beth,38,71.28,0.0
2,3,Cathy,26,7.92,1.0
3,4,Dave,60,71.28,0.0
4,5,Erin,70,71.92,1.0


In [7]:
# Toy held-out set of five more passengers, written out one column at a time.
test = pd.DataFrame({
    'Id': [6, 7, 8, 9, 10],
    'Name': ['Fiona', 'Gina', 'Heather', 'Ingrid', 'John'],
    'Pclass': ['Second', 'Third', 'First', 'First', 'Third'],
    'Age': [2, 25, 30, 54, 66],
    'Fare': [50.25, 7.28, 71.92, 71.28, 7.92],
    'Survived': ['yes', 'no', 'no', 'yes', 'yes'],
})

# Features: every column except the label.
test_X = test.drop(columns='Survived')
# Prediction target: the label column on its own.
test_Y = test['Survived']

# Last expression of the cell, so the full frame is displayed.
test

Unnamed: 0,Id,Name,Pclass,Age,Fare,Survived
0,6,Fiona,Second,2,50.25,yes
1,7,Gina,Third,25,7.28,no
2,8,Heather,First,30,71.92,no
3,9,Ingrid,First,54,71.28,yes
4,10,John,Third,66,7.92,yes


In [8]:
# transform the test data - using the model that was fit on training data (do not fit the model again on test data!)
# test_X is rebuilt from `test` (which still has 'Pclass') instead of being
# edited in place, so re-running this cell does not fail on a missing column.
# np.ravel flattens the single-feature transform() result into one column.
test_X = (
    test
    .drop(columns=['Survived', 'Pclass'])
    .assign(Pclass_transformed=np.ravel(oe.transform(test[['Pclass']])))
)

test_X

Unnamed: 0,Id,Name,Age,Fare,Pclass_transformed
0,6,Fiona,2,50.25,1.0
1,7,Gina,25,7.28,2.0
2,8,Heather,30,71.92,0.0
3,9,Ingrid,54,71.28,0.0
4,10,John,66,7.92,2.0
