In [1]:
##### SCIKIT-LEARN or SKLEARN #####
#
#  - Python Machine Learning Module
#  - Simple and efficient tools for data mining and data analysis
#  - Accessible to everybody, and reusable in various contexts
#  - Built on NumPy, SciPy, and matplotlib
#  - Open source, commercially usable - BSD license
#
# https://scikit-learn.org/stable/index.html
# https://scikit-learn.org/stable/user_guide.html 
# https://scikit-learn.org/stable/modules/classes.html

In [2]:
##### PRE-PROCESSING DATA #####
#
# https://scikit-learn.org/stable/modules/preprocessing.html
#
# Encoding Categorical Variables:
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.get_dummies.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html
#
# Standardization, Scaling, Normalization:
# https://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MaxAbsScaler.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PowerTransformer.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Normalizer.html
#
# Discretization:
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.KBinsDiscretizer.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Binarizer.html
# 
# Missing Value Imputation:
# https://scikit-learn.org/stable/modules/impute.html
# https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html
# https://scikit-learn.org/stable/modules/generated/sklearn.impute.IterativeImputer.html
# https://scikit-learn.org/stable/modules/generated/sklearn.impute.MissingIndicator.html
# 
# Polynomial Features:
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html
#
# Custom Transformers:
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.FunctionTransformer.html
#
# Transforming Prediction Targets:
# https://scikit-learn.org/stable/modules/preprocessing_targets.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelBinarizer.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MultiLabelBinarizer.html
#

In [3]:
import pandas as pd
import numpy as np

In [4]:
# Load the Titanic sample file (path is relative to the notebook's working
# directory) and preview its first five rows.
titanic_csv = 'data/kaggleTitanic/sample.csv'
df = pd.read_csv(titanic_csv)
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,no,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,yes,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,yes,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,yes,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,no,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
##### MISSING VALUE IMPUTATION
# https://scikit-learn.org/stable/modules/impute.html
# https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html
# https://scikit-learn.org/stable/modules/generated/sklearn.impute.IterativeImputer.html

In [6]:
# Univariate feature imputation
# https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html
# Each column's missing values are filled either with a statistic computed from
# that same column (strategy='mean', 'median' or 'most_frequent') or with a
# provided constant (strategy='constant' together with fill_value).

from sklearn.impute import SimpleImputer

# Impute missing Age values with the column mean.
impage = SimpleImputer(missing_values=np.nan, strategy='mean')
# fit_transform returns a NumPy array by default, so the row labels are lost.
# Re-attach df's index: pd.concat(axis=1) aligns on index labels, and a fresh
# 0..n-1 index would misalign rows (and create NaNs) if df is ever filtered,
# sorted or otherwise re-indexed.
dfimpage = pd.DataFrame(impage.fit_transform(df[['Age']]),
                        columns=['imp_Age'], index=df.index)

# Impute missing Cabin values with the constant 'MISSING'.
impcabin = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='MISSING')
dfimpcabin = pd.DataFrame(impcabin.fit_transform(df[['Cabin']]),
                          columns=['imp_Cabin'], index=df.index)

# Put the imputed columns next to the original ones, then drop the originals.
dfimp = pd.concat([df, dfimpage, dfimpcabin], axis=1)
# Optional check of the rows that were affected (run before the drop below):
#dfimp[(dfimp['Age'].isna() | dfimp['Cabin'].isna())].head()
dfimp = dfimp.drop(['Age', 'Cabin'], axis=1)
# A bounded preview; ten rows are enough to include row 5, whose Age was imputed.
dfimp.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,SibSp,Parch,Ticket,Fare,Embarked,imp_Age,imp_Cabin
0,1,no,3,"Braund, Mr. Owen Harris",male,1,0,A/5 21171,7.25,S,22.0,MISSING
1,2,yes,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,1,0,PC 17599,71.2833,C,38.0,C85
2,3,yes,3,"Heikkinen, Miss. Laina",female,0,0,STON/O2. 3101282,7.925,S,26.0,MISSING
3,4,yes,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,1,0,113803,53.1,S,35.0,C123
4,5,no,3,"Allen, Mr. William Henry",male,0,0,373450,8.05,S,35.0,MISSING
5,6,no,3,"Moran, Mr. James",male,0,0,330877,8.4583,Q,28.111111,MISSING
6,7,no,1,"McCarthy, Mr. Timothy J",male,0,0,17463,51.8625,S,54.0,E46
7,8,no,3,"Palsson, Master. Gosta Leonard",male,3,1,349909,21.075,S,2.0,MISSING
8,9,yes,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,0,2,347742,11.1333,S,27.0,MISSING
9,10,yes,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,1,0,237736,30.0708,C,14.0,MISSING


In [7]:
# Multivariate feature imputation
# https://scikit-learn.org/stable/modules/generated/sklearn.impute.IterativeImputer.html
# NOTE!!!
#   This estimator is still experimental for now: the predictions and the API might
#    change without any deprecation cycle.
#    So to use it, you need to explicitly import enable_iterative_imputer.
# Each feature with missing values is modeled as a function of other features,
#   and that estimate is then used for imputation.
# It achieves this in an iterated round-robin fashion:
# At each step, a feature column is designated as output y,
#   and the other feature columns are treated as inputs X.
# A regressor is fit on (X, y) for known y.
#   Then, the regressor is used to predict the missing values of y.
# This is done for each feature in an iterative fashion,
#   and then is repeated for max_iter imputation rounds.
# The results of the final imputation round are returned.

# Imported only for its side effect: it must run before IterativeImputer is imported.
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer
numericvars = ['SibSp', 'Parch', 'Age']
# estimator=None selects the default estimator, BayesianRidge().
imp = IterativeImputer(estimator=None, max_iter=10, random_state=0)
# fit_transform returns a NumPy array by default; re-attach df's index so the
# axis=1 concat below (which aligns on index labels) pairs every imputed row
# with its original row even when df does not have a default 0..n-1 index.
dfimpnum = pd.DataFrame(imp.fit_transform(df[numericvars]),
                        columns=['imp_'+x for x in numericvars], index=df.index)
dfimp = pd.concat([df, dfimpnum], axis=1)
dfimp.head()

# Other estimators, such as RandomForestRegressor, can be used instead:
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.experimental import enable_iterative_imputer
# from sklearn.impute import IterativeImputer
# numericvars = ['SibSp', 'Parch', 'Age']
# imp = IterativeImputer(estimator=RandomForestRegressor(), max_iter=10, random_state=0)
# dfimpnum = pd.DataFrame(imp.fit_transform(df[numericvars]),
#                         columns=['imp_'+x for x in numericvars], index=df.index)
# dfimp = pd.concat([df, dfimpnum], axis=1)
# dfimp.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,imp_SibSp,imp_Parch,imp_Age
0,1,no,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1.0,0.0,22.0
1,2,yes,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1.0,0.0,38.0
2,3,yes,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0.0,0.0,26.0
3,4,yes,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1.0,0.0,35.0
4,5,no,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0.0,0.0,35.0


In [8]:
# Imputation discards the fact that a value was originally absent, and that
# fact can itself be informative.
# MissingIndicator turns a dataset into a boolean matrix marking where values
# were missing. With the settings used here the result has one column per
# feature that contains at least one missing value, so the toy data below
# (three features, two of them with NaNs) yields a two-column matrix.
from sklearn.impute import MissingIndicator
lst = [
    [1, 2, 20],
    [3, 6, 60],
    [4, 8, 80],
    [np.nan, 3, 30],
    [np.nan, np.nan, 70],
]
dff = pd.DataFrame(data=lst)
mi = MissingIndicator(missing_values=np.nan)
mi.fit_transform(dff)

array([[False, False],
       [False, False],
       [False, False],
       [ True, False],
       [ True,  True]])

In [9]:
##### POLYNOMIAL FEATURE GENERATION
# Generate a new feature matrix consisting of all polynomial combinations of the features with degree less than or equal to the specified degree.
# For example, if an input sample is two dimensional and of the form [a, b], the degree-2 polynomial features are [1, a, b, a^2, ab, b^2].
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html

In [10]:
from sklearn.datasets import load_iris
iris = load_iris()
# Keep only the two sepal measurements and give them the short names a / b,
# built as one chain so no frame is renamed in place.
sepal_to_short = {'sepal length (cm)':'a', 'sepal width (cm)':'b'}
df = (
    pd.DataFrame(data=iris.data, columns=iris.feature_names)
    .loc[:, list(sepal_to_short)]
    .rename(columns=sepal_to_short)
)
df.head()

Unnamed: 0,a,b
0,5.1,3.5
1,4.9,3.0
2,4.7,3.2
3,4.6,3.1
4,5.0,3.6


In [11]:
from sklearn.preprocessing import PolynomialFeatures
# Degree-2 expansion of the two-column frame [a, b]: bias term, a, b, a^2, ab, b^2.
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=True)
# The transformer's output is wrapped back into a DataFrame; the labels are
# written by hand in the same order as the expansion above.
polycols = ['bias', 'a', 'b', 'a^2', 'ab', 'b^2']
polyvalues = poly.fit_transform(df)
dfpoly = pd.DataFrame(polyvalues, columns=polycols)
dfpoly.head()

Unnamed: 0,bias,a,b,a^2,ab,b^2
0,1.0,5.1,3.5,26.01,17.85,12.25
1,1.0,4.9,3.0,24.01,14.7,9.0
2,1.0,4.7,3.2,22.09,15.04,10.24
3,1.0,4.6,3.1,21.16,14.26,9.61
4,1.0,5.0,3.6,25.0,18.0,12.96


In [12]:
##### CUSTOM TRANSFORMERS 
# A FunctionTransformer forwards its X (and optionally y) arguments to a user-defined function or function object and returns the result of this function. 
# This is useful for stateless transformations such as taking the log of frequencies, doing custom scaling, etc.
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.FunctionTransformer.html

In [13]:
from sklearn.datasets import load_iris
# Rebuild df with all four iris measurements (the polynomial example had
# narrowed it to the two sepal columns).
iris = load_iris()
df = pd.DataFrame(iris['data'], columns=iris['feature_names'])
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [14]:
from sklearn.preprocessing import FunctionTransformer
# Wrap np.log as a stateless transformer; validate=True asks scikit-learn to
# check the input before the function is applied.
ft = FunctionTransformer(func=np.log, validate=True)
# Label every transformed column with an 'ft_' prefix on its original name.
ftcols = ['ft_'+name for name in df.columns]
ftvalues = ft.fit_transform(df)
dfft = pd.DataFrame(ftvalues, columns=ftcols)
dfft.head()

Unnamed: 0,ft_sepal length (cm),ft_sepal width (cm),ft_petal length (cm),ft_petal width (cm)
0,1.629241,1.252763,0.336472,-1.609438
1,1.589235,1.098612,0.336472,-1.609438
2,1.547563,1.163151,0.262364,-1.609438
3,1.526056,1.131402,0.405465,-1.609438
4,1.609438,1.280934,0.336472,-1.609438
