In [1]:
##### SCIKIT-LEARN or SKLEARN #####
#
#  - Python Machine Learning Module
#  - Simple and efficient tools for data mining and data analysis
#  - Accessible to everybody, and reusable in various contexts
#  - Built on NumPy, SciPy, and matplotlib
#  - Open source, commercially usable - BSD license
#
# https://scikit-learn.org/stable/index.html
# https://scikit-learn.org/stable/user_guide.html 
# https://scikit-learn.org/stable/modules/classes.html

In [2]:
##### PRE-PROCESSING DATA #####
#
# https://scikit-learn.org/stable/modules/preprocessing.html
#
# Encoding Categorical Variables:
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.get_dummies.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html
#
# Standardization, Scaling, Normalization:
# https://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MaxAbsScaler.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PowerTransformer.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Normalizer.html
#
# Discretization:
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.KBinsDiscretizer.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Binarizer.html
# 
# Missing Value Imputation
# https://scikit-learn.org/stable/modules/impute.html
# https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html
# https://scikit-learn.org/stable/modules/generated/sklearn.impute.IterativeImputer.html
# https://scikit-learn.org/stable/modules/generated/sklearn.impute.MissingIndicator.html
# 
# Polynomial Features
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html
#
# Custom Transformers
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.FunctionTransformer.html
#
# Transforming Prediction Targets:
# https://scikit-learn.org/stable/modules/preprocessing_targets.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelBinarizer.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MultiLabelBinarizer.html
#

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [4]:
# Load the Titanic sample into the working frame used by the scaling cells below.
df = pd.read_csv(filepath_or_buffer='data/kaggleTitanic/sample.csv')
# Preview only the first three rows.
df.head(n=3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,no,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,yes,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,yes,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [5]:
##### STANDARDIZATION, SCALING, NORMALIZATION #####
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MaxAbsScaler.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PowerTransformer.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Normalizer.html

In [6]:
#  Standardization: or mean removal and variance scaling
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
# xi_scale = (xi - xmean)/xsd
# resulting distribution has mean 0 and sd 1.
# sensitive to outliers, and cannot guarantee balanced scales in the presence of outliers.
# also, the outliers themselves are still present in the transformed data.

from sklearn.preprocessing import StandardScaler

# Columns to standardize; every other column is carried over unchanged.
numericvars = ['Age', 'Fare']
ss = StandardScaler(with_mean=True, with_std=True)
# By default fit_transform returns a bare array, so rebuild the frame on df's
# own index: pd.concat(axis=1) aligns rows by index label, and a fresh 0..n-1
# index would misalign (and pad with NaN) if df were ever filtered or re-indexed.
dfnumss = pd.DataFrame(
    ss.fit_transform(df[numericvars]),
    columns=['ss_' + x for x in numericvars],
    index=df.index,
)
# Append the scaled columns, then drop the raw columns they replace.
dfnumss = pd.concat([df, dfnumss], axis=1)
dfnumss = dfnumss.drop(numericvars, axis=1)
dfnumss.head()

# these will be close to 0
#dfnumss[['ss_Age']].mean()
#dfnumss[['ss_Fare']].mean()
# these will be close to 1
#dfnumss[['ss_Age']].std()
#dfnumss[['ss_Fare']].std()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,SibSp,Parch,Ticket,Cabin,Embarked,ss_Age,ss_Fare
0,1,no,3,"Braund, Mr. Owen Harris",male,1,0,A/5 21171,,S,-0.433703,-0.88299
1,2,yes,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,1,0,PC 17599,C85,C,0.701811,1.976819
2,3,yes,3,"Heikkinen, Miss. Laina",female,0,0,STON/O2. 3101282,,S,-0.149825,-0.852844
3,4,yes,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,1,0,113803,C123,S,0.488902,1.164729
4,5,no,3,"Allen, Mr. William Henry",male,0,0,373450,,S,0.488902,-0.847261


In [7]:
#  MinMaxScaler transforms features by scaling each feature to a given range (by default [0,1])
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html
# xi_scale = (xi - min(x))/(max(x) - min(x))
# by default, resulting distribution is in [0, 1] range.
# sensitive to outliers, and cannot guarantee balanced scales in the presence of outliers.
# also, the outliers themselves are still present in the transformed data.

from sklearn.preprocessing import MinMaxScaler

# Columns to rescale; every other column is carried over unchanged.
numericvars = ['Age', 'Fare']
mms = MinMaxScaler()
# Stored as dfnummms (not dfnumss) so the StandardScaler result is not
# overwritten, matching the dfnummas / dfnumrs / dfnumpt naming of the other scalers.
# By default fit_transform returns a bare array, so rebuild the frame on df's
# own index: pd.concat(axis=1) aligns rows by index label, and a fresh 0..n-1
# index would misalign (and pad with NaN) if df were ever filtered or re-indexed.
dfnummms = pd.DataFrame(
    mms.fit_transform(df[numericvars]),
    columns=['mms_' + x for x in numericvars],
    index=df.index,
)
# Append the scaled columns, then drop the raw columns they replace.
dfnummms = pd.concat([df, dfnummms], axis=1)
dfnummms = dfnummms.drop(numericvars, axis=1)
dfnummms.head()

# these will be 0
#dfnummms[['mms_Age']].min()
#dfnummms[['mms_Fare']].min()
# these will be 1
#dfnummms[['mms_Age']].max()
#dfnummms[['mms_Fare']].max()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,SibSp,Parch,Ticket,Cabin,Embarked,mms_Age,mms_Fare
0,1,no,3,"Braund, Mr. Owen Harris",male,1,0,A/5 21171,,S,0.384615,0.0
1,2,yes,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,1,0,PC 17599,C85,C,0.692308,1.0
2,3,yes,3,"Heikkinen, Miss. Laina",female,0,0,STON/O2. 3101282,,S,0.461538,0.010541
3,4,yes,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,1,0,113803,C123,S,0.634615,0.716034
4,5,no,3,"Allen, Mr. William Henry",male,0,0,373450,,S,0.634615,0.012493


In [8]:
#  MaxAbsScaler scales each feature by its maximum absolute value.
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MaxAbsScaler.html
# xi_scale = (xi)/(max(abs(x)))
# resulting distribution is in [-1, 1] range.
# sensitive to outliers, and cannot guarantee balanced scales in the presence of outliers.
# also, the outliers themselves are still present in the transformed data.

from sklearn.preprocessing import MaxAbsScaler

# Columns to rescale; every other column is carried over unchanged.
numericvars = ['Age', 'Fare']
mas = MaxAbsScaler()
# By default fit_transform returns a bare array, so rebuild the frame on df's
# own index: pd.concat(axis=1) aligns rows by index label, and a fresh 0..n-1
# index would misalign (and pad with NaN) if df were ever filtered or re-indexed.
dfnummas = pd.DataFrame(
    mas.fit_transform(df[numericvars]),
    columns=['mas_' + x for x in numericvars],
    index=df.index,
)
# Append the scaled columns, then drop the raw columns they replace.
dfnummas = pd.concat([df, dfnummas], axis=1)
dfnummas = dfnummas.drop(numericvars, axis=1)
dfnummas.head()

# these will be in [-1,1]
#dfnummas[['mas_Age']].min()
#dfnummas[['mas_Age']].max()
#dfnummas[['mas_Fare']].min()
#dfnummas[['mas_Fare']].max()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,SibSp,Parch,Ticket,Cabin,Embarked,mas_Age,mas_Fare
0,1,no,3,"Braund, Mr. Owen Harris",male,1,0,A/5 21171,,S,0.407407,0.101707
1,2,yes,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,1,0,PC 17599,C85,C,0.703704,1.0
2,3,yes,3,"Heikkinen, Miss. Laina",female,0,0,STON/O2. 3101282,,S,0.481481,0.111176
3,4,yes,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,1,0,113803,C123,S,0.648148,0.744915
4,5,no,3,"Allen, Mr. William Henry",male,0,0,373450,,S,0.648148,0.11293


In [9]:
# Robust Scaler
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html
# removes the median and scales the data according to the inter-quartile range (defaults to Q3-Q1)
# xi_scale = (xi - Q2(x))/(Q3(x) - Q1(x)) where Q1, Q2, and Q3 are the 25th, 50th and 75th percentiles
# robust to outliers, but the outliers themselves are still present in the transformed data.

from sklearn.preprocessing import RobustScaler

# Columns to rescale; every other column is carried over unchanged.
numericvars = ['Age', 'Fare']
rs = RobustScaler()
# By default fit_transform returns a bare array, so rebuild the frame on df's
# own index: pd.concat(axis=1) aligns rows by index label, and a fresh 0..n-1
# index would misalign (and pad with NaN) if df were ever filtered or re-indexed.
dfnumrs = pd.DataFrame(
    rs.fit_transform(df[numericvars]),
    columns=['rs_' + x for x in numericvars],
    index=df.index,
)
# Append the scaled columns, then drop the raw columns they replace.
dfnumrs = pd.concat([df, dfnumrs], axis=1)
dfnumrs = dfnumrs.drop(numericvars, axis=1)
dfnumrs.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,SibSp,Parch,Ticket,Cabin,Embarked,rs_Age,rs_Fare
0,1,no,3,"Braund, Mr. Owen Harris",male,1,0,A/5 21171,,S,-0.384615,-0.231405
1,2,yes,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,1,0,PC 17599,C85,C,0.846154,1.442121
2,3,yes,3,"Heikkinen, Miss. Laina",female,0,0,STON/O2. 3101282,,S,-0.076923,-0.213764
3,4,yes,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,1,0,113803,C123,S,0.615385,0.966896
4,5,no,3,"Allen, Mr. William Henry",male,0,0,373450,,S,0.615385,-0.210497


In [10]:
# Power Transformer
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PowerTransformer.html
# makes data more Gaussian-like.
# finds the optimal power parameter (lambda) to stabilize variance and minimize skewness through maximum likelihood estimation.
# by default, PowerTransformer also applies zero-mean, unit variance normalization to the transformed output. 
# supports the Box-Cox transform (can only be applied to strictly positive data) and the Yeo-Johnson transform (if there are negative values in data).

from sklearn.preprocessing import PowerTransformer

# Columns to transform; every other column is carried over unchanged.
numericvars = ['Age', 'Fare']
pt = PowerTransformer()
# By default fit_transform returns a bare array, so rebuild the frame on df's
# own index: pd.concat(axis=1) aligns rows by index label, and a fresh 0..n-1
# index would misalign (and pad with NaN) if df were ever filtered or re-indexed.
dfnumpt = pd.DataFrame(
    pt.fit_transform(df[numericvars]),
    columns=['pt_' + x for x in numericvars],
    index=df.index,
)
# Append the transformed columns, then drop the raw columns they replace.
dfnumpt = pd.concat([df, dfnumpt], axis=1)
dfnumpt = dfnumpt.drop(numericvars, axis=1)
dfnumpt.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,SibSp,Parch,Ticket,Cabin,Embarked,pt_Age,pt_Fare
0,1,no,3,"Braund, Mr. Owen Harris",male,1,0,A/5 21171,,S,-0.408439,-1.173828
1,2,yes,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,1,0,PC 17599,C85,C,0.705694,1.429066
2,3,yes,3,"Heikkinen, Miss. Laina",female,0,0,STON/O2. 3101282,,S,-0.12543,-1.038307
3,4,yes,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,1,0,113803,C123,S,0.500069,1.190662
4,5,no,3,"Allen, Mr. William Henry",male,0,0,373450,,S,0.500069,-1.01475


In [11]:
from sklearn.datasets import load_iris

# Switch the working frame from the Titanic sample to the iris measurements.
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [12]:
# (Sample Vector) Normalization: 
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Normalizer.html
# Normalization is the process of scaling individual *samples* (not features - i.e., operation is along rows!) to have unit norm. 
# This process can be useful if you plan to use a quadratic form such as the dot-product 
# or any other kernel to quantify the similarity of any pair of samples.
# l1: sum of abs values is 1
# l2: sum of square of values is 1

from sklearn.preprocessing import Normalizer

# Rescale each row (sample) to unit L2 norm and label the outputs with a norm_ prefix.
norm = Normalizer(norm='l2')
dfnorm = pd.DataFrame(
    data=norm.fit_transform(df),
    columns=['norm_' + col for col in df.columns],
)
dfnorm.head()

# dfnorm.apply(lambda x: abs(x)).sum(axis=1) # for l1 norm these will all be ones
# dfnorm.apply(lambda x: x*x).sum(axis=1) # for l2 norm these will all be ones

Unnamed: 0,norm_sepal length (cm),norm_sepal width (cm),norm_petal length (cm),norm_petal width (cm)
0,0.803773,0.551609,0.220644,0.031521
1,0.828133,0.50702,0.236609,0.033801
2,0.805333,0.548312,0.222752,0.034269
3,0.80003,0.539151,0.260879,0.034784
4,0.790965,0.569495,0.22147,0.031639


In [13]:
##### DISCRETIZATION (or quantization or binning)
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.KBinsDiscretizer.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Binarizer.html

In [14]:
from sklearn.datasets import load_iris

# Rebuild the iris feature frame as the input for the discretization cells.
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [15]:
# KBinsDiscretizer: bin continuous data into intervals
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.KBinsDiscretizer.html
from sklearn.preprocessing import KBinsDiscretizer

# Five quantile-based bins per feature, encoded as ordinal bin indices
# (read documentation for encode and strategy).
kbd = KBinsDiscretizer(strategy='quantile', encode='ordinal', n_bins=5)
dfkbd = pd.DataFrame(
    data=kbd.fit_transform(df),
    columns=['kbd_' + col for col in df.columns],
)

In [16]:
# Bin edges learned during fit: one array per feature, each holding n_bins + 1 edges.
kbd.bin_edges_

array([array([4.3 , 5.  , 5.6 , 6.1 , 6.52, 7.9 ]),
       array([2. , 2.7, 3. , 3.1, 3.4, 4.4]),
       array([1.  , 1.5 , 3.9 , 4.64, 5.32, 6.9 ]),
       array([0.1 , 0.2 , 1.16, 1.5 , 1.9 , 2.5 ])], dtype=object)

In [17]:
# Preview the ordinal bin index assigned to each measurement.
dfkbd.head()

Unnamed: 0,kbd_sepal length (cm),kbd_sepal width (cm),kbd_petal length (cm),kbd_petal width (cm)
0,1.0,4.0,0.0,1.0
1,0.0,2.0,0.0,1.0
2,0.0,3.0,0.0,1.0
3,0.0,3.0,1.0,1.0
4,1.0,4.0,0.0,1.0


In [18]:
from sklearn.datasets import load_iris

# Rebuild the iris feature frame as the input for the Binarizer cell.
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [19]:
# Binarizer: binarize data (set feature values to 0 or 1) according to a threshold
# Binarizer is similar to the KBinsDiscretizer when k = 2, and when the bin edge is at the value threshold.
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Binarizer.html
from sklearn.preprocessing import Binarizer

# Values strictly above the threshold become 1; values at or below it become 0.
bnr = Binarizer(threshold=4.9)
dfbnr = pd.DataFrame(
    data=bnr.fit_transform(df[['sepal length (cm)']]),
    columns=['bnr_sepal length (cm)'],
)
dfbnr.head()

Unnamed: 0,bnr_sepal length (cm)
0,1.0
1,0.0
2,0.0
3,0.0
4,1.0
