In [1]:
##### SCIKIT-LEARN or SKLEARN #####
#
#  - Python Machine Learning Module
#  - Simple and efficient tools for data mining and data analysis
#  - Accessible to everybody, and reusable in various contexts
#  - Built on NumPy, SciPy, and matplotlib
#  - Open source, commercially usable - BSD license
#
# https://scikit-learn.org/stable/index.html
# https://scikit-learn.org/stable/user_guide.html 
# https://scikit-learn.org/stable/modules/classes.html

In [2]:
##### PRE-PROCESSING DATA #####
#
# https://scikit-learn.org/stable/modules/preprocessing.html
#
# Encoding Categorical Variables:
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html
# https://pandas.pydata.org/docs/reference/api/pandas.get_dummies.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html
#
# Standardization, Scaling, Normalization:
# https://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MaxAbsScaler.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PowerTransformer.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Normalizer.html
#
# Discretization:
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.KBinsDiscretizer.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Binarizer.html
# 
# Missing Value Imputation
# https://scikit-learn.org/stable/modules/impute.html
# https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html
# https://scikit-learn.org/stable/modules/generated/sklearn.impute.IterativeImputer.html
# https://scikit-learn.org/stable/modules/generated/sklearn.impute.MissingIndicator.html
# 
# Polynomial Features
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html
#
# Custom Transformers
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.FunctionTransformer.html
#
# Transforming Prediction Targets:
# https://scikit-learn.org/stable/modules/preprocessing_targets.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelBinarizer.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MultiLabelBinarizer.html
#

In [3]:
import pandas as pd
import numpy as np

In [4]:
##### TRANSFORMING PREDICTION TARGETS #####
# These are transformers that are not intended to be used on features, but only on supervised learning targets.
# https://scikit-learn.org/stable/modules/preprocessing_targets.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelBinarizer.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MultiLabelBinarizer.html

In [5]:
# Load the iris CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('data/uciIris/iris.data.csv')
# Preview a random 5% of the rows. random_state is pinned so the same rows are
# shown on every Restart & Run All instead of a different sample each time.
df.sample(frac=0.05, random_state=42)
# To list the distinct target labels instead, evaluate: df['iris species'].unique()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),iris species
30,4.8,3.1,1.6,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
47,4.6,3.2,1.4,0.2,Iris-setosa
79,5.7,2.6,3.5,1.0,Iris-versicolor
1,4.9,3.0,1.4,0.2,Iris-setosa
32,5.2,4.1,1.5,0.1,Iris-setosa
51,6.4,3.2,4.5,1.5,Iris-versicolor
124,6.7,3.3,5.7,2.1,Iris-virginica


In [6]:
# Label Encoding (the target-side counterpart of OrdinalEncoder for categorical features).
# LabelEncoder maps each distinct label to an integer between 0 and n_classes-1.
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# pd.concat(axis=1) aligns rows by index label, so the encoded column is built on
# df's own index. With a fresh 0..n-1 index the result would only be correct while
# df happens to carry the default index (e.g. it would break after filtering or shuffling df).
# The original label column is dropped and the encoded one is appended as the last column.
dfle = pd.concat(
    [
        df.drop(columns='iris species'),
        pd.DataFrame(
            le.fit_transform(df['iris species']),
            index=df.index,
            columns=['iris species LE'],
        ),
    ],
    axis=1,
)
# Preview a random 5% of the rows; random_state is pinned so the output is reproducible.
dfle.sample(frac=0.05, random_state=42)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),iris species LE
53,5.5,2.3,4.0,1.3,1
119,6.0,2.2,5.0,1.5,2
135,7.7,3.0,6.1,2.3,2
94,5.6,2.7,4.2,1.3,1
120,6.9,3.2,5.7,2.3,2
30,4.8,3.1,1.6,0.2,0
105,7.6,3.0,6.6,2.1,2
147,6.5,3.0,5.2,2.0,2


In [7]:
# Label Binarization (the target-side counterpart of OneHotEncoder for categorical features).
# Each class gets its own 0/1 indicator column, named '<class> LB'.
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelBinarizer.html
from sklearn.preprocessing import LabelBinarizer
lb = LabelBinarizer()
# pd.concat(axis=1) aligns rows by index label, so the indicator columns are built on
# df's own index rather than a fresh 0..n-1 index (which is only correct while df
# carries the default index).
# The data argument must stay first: fit_transform() has to run before lb.classes_
# is read for the column names, and call arguments are evaluated left to right.
dflb = pd.concat(
    [
        df.drop(columns='iris species'),
        pd.DataFrame(
            lb.fit_transform(df['iris species']),
            index=df.index,
            columns=[species + ' LB' for species in lb.classes_],
        ),
    ],
    axis=1,
)
# Preview a random 5% of the rows; random_state is pinned so the output is reproducible.
dflb.sample(frac=0.05, random_state=42)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),Iris-setosa LB,Iris-versicolor LB,Iris-virginica LB
135,7.7,3.0,6.1,2.3,0,0,1
138,6.0,3.0,4.8,1.8,0,0,1
85,6.0,3.4,4.5,1.6,0,1,0
92,5.8,2.6,4.0,1.2,0,1,0
58,6.6,2.9,4.6,1.3,0,1,0
118,7.7,2.6,6.9,2.3,0,0,1
129,7.2,3.0,5.8,1.6,0,0,1
120,6.9,3.2,5.7,2.3,0,0,1


In [8]:
# MultiLabelBinarizer: turns a list of label collections (sets or tuples) into a
# 0/1 indicator matrix with one column per distinct label.
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MultiLabelBinarizer.html
from sklearn.preprocessing import MultiLabelBinarizer

# Two samples: the first carries two labels, the second carries one.
labels = [('sci-fi', 'thriller'), ('comedy',)]
mlb = MultiLabelBinarizer()
# Fit and encode in one step; afterwards mlb.classes_ holds the learned labels,
# which are used as the column headers.
label_indicators = mlb.fit_transform(labels)
pd.DataFrame(data=label_indicators, columns=mlb.classes_)

Unnamed: 0,comedy,sci-fi,thriller
0,0,1,1
1,1,0,0
