In [1]:
##### SCIKIT-LEARN or SKLEARN #####
#
#  - Python Machine Learning Module
#  - Simple and efficient tools for data mining and data analysis
#  - Accessible to everybody, and reusable in various contexts
#  - Built on NumPy, SciPy, and matplotlib
#  - Open source, commercially usable - BSD license
#
# https://scikit-learn.org/stable/index.html
# https://scikit-learn.org/stable/user_guide.html 
# https://scikit-learn.org/stable/modules/classes.html

In [2]:
##### PRE-PROCESSING DATA #####
#
# https://scikit-learn.org/stable/modules/preprocessing.html
#
# Encoding Categorical Variables:
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.get_dummies.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html
#
# Standardization, Scaling, Normalization:
# https://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MaxAbsScaler.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PowerTransformer.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Normalizer.html
#
# Discretization:
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.KBinsDiscretizer.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Binarizer.html
# 
# Missing Value Imputation
# https://scikit-learn.org/stable/modules/impute.html
# https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html
# https://scikit-learn.org/stable/modules/generated/sklearn.impute.IterativeImputer.html
# https://scikit-learn.org/stable/modules/generated/sklearn.impute.MissingIndicator.html
# 
# Polynomial Features
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html
#
# Custom Transformers
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.FunctionTransformer.html
#
# Transforming Prediction Targets:
# https://scikit-learn.org/stable/modules/preprocessing_targets.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelBinarizer.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MultiLabelBinarizer.html
#

In [3]:
import pandas as pd
import numpy as np

In [4]:
df = pd.read_csv('data/kaggleTitanic/sample.csv')
df.head()
#df['Pclass'].unique()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,no,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,yes,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,yes,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,yes,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,no,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
##### ENCODING CATEGORICAL VARIABLES #####
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.get_dummies.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html

In [6]:
# one-hot encoding of categorical variables: when you want to convert categorical levels into dummy variables
# Note: with the SKLearn OHE, you might need to take care of NaNs in your categorical data before using OHE 
#          since OHE might thrown an error otherwise

# let's say we wanted to one hot encode categorical vaiable Pclass
from sklearn.preprocessing import OneHotEncoder

# we can consider all k dummy variables associated with a categorical variable with k levels - by setting drop=None
ohe = OneHotEncoder(categories='auto', drop=None, handle_unknown='error', sparse=False, dtype=int) 
ohe.fit(df[['Pclass']])
ohe.transform(df[['Pclass']])

array([[0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [0, 0, 1],
       [0, 1, 0]])

In [7]:
# we could have done fit_transform in a single step
ohe = OneHotEncoder(categories='auto', drop=None, handle_unknown='error', sparse=False, dtype=int) 
ohe.fit_transform(df[['Pclass']])

array([[0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [0, 0, 1],
       [0, 1, 0]])

In [8]:
# you can get at the names of the new dummy variables that sklearn has created as follows
ohe.get_feature_names()

array(['x0_1', 'x0_2', 'x0_3'], dtype=object)

In [9]:
# so putting it all together:
#  - we can do a fit_transform in a single step, and
#  - put the resulting one-hot encoded dummy variables into a dataframe,
#    while using the dummy variable names as column names 
ohe = OneHotEncoder(categories='auto', drop=None, handle_unknown='error', sparse=False, dtype=int)
dfcat = pd.DataFrame(ohe.fit_transform(df[['Pclass']]), columns=ohe.get_feature_names())
dfcat.head()

Unnamed: 0,x0_1,x0_2,x0_3
0,0,0,1
1,1,0,0
2,0,0,1
3,1,0,0
4,0,0,1


In [10]:
# we can then concat the new dummy variables to the original dataframe 
dfcat = pd.concat([df, dfcat], axis=1)
dfcat.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,x0_1,x0_2,x0_3
0,1,no,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,0,1
1,2,yes,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,0,0
2,3,yes,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0,0,1
3,4,yes,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1,0,0
4,5,no,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0,0,1


In [11]:
# and then finally, drop the original categorical column
dfcat = dfcat.drop('Pclass', axis=1)
dfcat.head()

Unnamed: 0,PassengerId,Survived,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,x0_1,x0_2,x0_3
0,1,no,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,0,1
1,2,yes,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,0,0
2,3,yes,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0,0,1
3,4,yes,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1,0,0
4,5,no,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0,0,1


In [12]:
# we can also drop 1, and consider k-1 dummy variables associated with a categorical variable with k levels - by setting drop='first'
# this avoids multicilinearity, and is useful in situations where perfectly collinear features can cause issues, such as unregularized regression.
ohe = OneHotEncoder(categories='auto', drop='first', handle_unknown='error', sparse=False, dtype=int)
dfcat = pd.DataFrame(ohe.fit_transform(df[['Pclass']]), columns=ohe.get_feature_names())
dfcat = pd.concat([df, dfcat], axis=1)
dfcat = dfcat.drop('Pclass', axis=1)
dfcat.head()

Unnamed: 0,PassengerId,Survived,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,x0_2,x0_3
0,1,no,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,1
1,2,yes,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0,0
2,3,yes,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0,1
3,4,yes,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,0,0
4,5,no,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0,1


In [13]:
# one-hot encoding can also be done for multiple categorical variables at the same time in a similar fashion
categoricalvars = ['Pclass', 'Sex', 'Embarked']
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(categories='auto', drop='first', handle_unknown='error', sparse=False, dtype=int)
dfcat = pd.DataFrame(ohe.fit_transform(df[categoricalvars]), columns=ohe.get_feature_names())
dfcat = pd.concat([df, dfcat], axis=1)
dfcat = dfcat.drop(categoricalvars, axis=1)
dfcat.head()

Unnamed: 0,PassengerId,Survived,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,x0_2,x0_3,x1_male,x2_Q,x2_S
0,1,no,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,0,1,1,0,1
1,2,yes,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,0,0,0,0,0
2,3,yes,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,0,1,0,0,1
3,4,yes,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,0,0,0,0,1
4,5,no,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,0,1,1,0,1
