In [191]:
import sklearn
sklearn.__version__

# scikit-learn: preprocessing, feature selection, and pipelines

# Preprocessing

## Ordinal Encoding

In [192]:
from sklearn.preprocessing import OrdinalEncoder

In [193]:
encoder = OrdinalEncoder()

In [194]:
X = [['B'], ['A'], ['C'], ['D']]

In [195]:
encoder.fit(X)

In [196]:
X_ = encoder.transform(X)

In [197]:
X_

In [198]:
encoder.categories_

In [199]:
X = [['A', 10], ['B', 9.5], ['C', 8], ['D', 16]]

In [200]:
encoder.fit(X)

In [201]:
X_ = encoder.transform(X)

In [202]:
X_

In [203]:
encoder.categories_

In [204]:
encoder.inverse_transform(X_)

## Label Encoder

In [205]:
from sklearn.preprocessing import LabelEncoder

In [206]:
encoder = LabelEncoder()

In [207]:
Y = ['bat', 'ant', 'cat', 'dog', 'ant']

In [208]:
encoder.fit(Y)

In [209]:
Y_ = encoder.transform(Y)

In [210]:
Y_

In [211]:
encoder.classes_

In [212]:
encoder.inverse_transform(Y_)

## One-hot Encoding

In [213]:
from sklearn.preprocessing import OneHotEncoder

In [214]:
encoder = OneHotEncoder()

In [215]:
X = [['ant'], ['bat'], ['cat'], ['bat'], ['ant'], ['cat']]

In [216]:
encoder.fit(X)

In [217]:
X_ = encoder.transform(X)

In [218]:
X_

In [219]:
# Show the sparse one-hot result as a regular ndarray; todense() would return
# the discouraged np.matrix type instead.
X_.toarray()

In [220]:
encoder.categories_

In [221]:
encoder.inverse_transform(X_)

In [222]:
import pandas as pd

In [223]:
df = pd.DataFrame(X, columns=['animal'])

In [224]:
df

In [225]:
pd.get_dummies(df)

In [226]:
pd.get_dummies(df, dtype=int)

In [227]:
df = pd.DataFrame([['male'], ['female'], ['female'], ['male']], columns=['gender'])

In [228]:
df_ = pd.get_dummies(df, dtype=int)

In [229]:
df_.columns

In [230]:
df_.drop(columns=df_.columns[0])

## MultiLabel Binarizer

In [231]:
from sklearn.preprocessing import MultiLabelBinarizer

In [232]:
encoder = MultiLabelBinarizer()

In [233]:
X = [['sci-fi', 'comedy'], 
     ['comedy'], 
     ['drama', 'romance'],  
     ['sci-fi', 'drama', 'action']]

In [234]:
encoder.fit(X)

In [235]:
X_ = encoder.transform(X)

In [236]:
X_

In [237]:
encoder.inverse_transform(X_)

In [238]:
encoder.classes_

## K Bins Discretizer

In [239]:
from sklearn.preprocessing import KBinsDiscretizer

In [240]:
X = [[10], [11], [12], [16], [21], [22], [35]]

In [241]:
encoder = KBinsDiscretizer(n_bins=3)

In [242]:
encoder.fit(X)

In [243]:
X_ = encoder.transform(X)

In [244]:
X_

In [245]:
# Show the sparse one-hot bin encoding as a regular ndarray; todense() would
# return the discouraged np.matrix type instead.
X_.toarray()

In [246]:
encoder.bin_edges_

In [247]:
encoder.inverse_transform(X_)

In [248]:
encoder = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='quantile')

In [249]:
encoder.fit_transform(X)

In [250]:
encoder = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')

In [251]:
encoder.fit_transform(X)



## MinMax Scaler

In [252]:
from sklearn.preprocessing import MinMaxScaler

In [253]:
X = [[1], [2], [3.9], [4], [5]]

In [254]:
scaler = MinMaxScaler()

In [255]:
scaler.fit(X)

In [256]:
X_ = scaler.transform(X)

In [257]:
X_

In [258]:
scaler.inverse_transform(X_)

## Normalization

In [259]:
from sklearn.preprocessing import Normalizer

In [260]:
normalizer = Normalizer()

In [261]:
X = [[4, 1, 2, 2],
     [1, 3, 9, 3],
     [5, 7, 5, 1]]

In [262]:
normalizer.fit(X)

In [263]:
X_ = normalizer.transform(X)

In [264]:
X_

In [265]:
# Normalizer does not implement inverse_transform (the per-row norms are not
# stored), so this call raises AttributeError. Catch it and show the message
# so that "Restart & Run All" can continue past this demonstration.
try:
    normalizer.inverse_transform(X_)
except AttributeError as err:
    print(f'{type(err).__name__}: {err}')

In [266]:
normalizer = Normalizer(norm='l1')

In [267]:
normalizer.fit(X)

In [268]:
X_ = normalizer.transform(X)

In [269]:
X_

## Standardization

In [270]:
from sklearn.preprocessing import StandardScaler

In [271]:
scaler = StandardScaler()

In [272]:
X = [[1, 2], [2, 3], [3, 4], [4, 5]]

In [273]:
scaler.fit(X)

In [274]:
X_ = scaler.transform(X)

In [275]:
X_

In [276]:
scaler.mean_, scaler.var_

In [277]:
scaler.inverse_transform(X_)

## Imputation

In [278]:
import numpy as np

In [279]:
X = [[np.nan, np.nan, 3], [4, 1, 6], [10,2,9],[10,2,9]]

In [280]:
from sklearn.impute import SimpleImputer

In [281]:
imputer = SimpleImputer(strategy='mean')

In [282]:
imputer.fit(X)

In [283]:
X_ = imputer.transform(X)

In [284]:
X_

In [285]:
# SimpleImputer can only undo imputation when it was built with
# add_indicator=True (it needs the missing-value mask). This imputer was not,
# so the call raises ValueError. Catch it and show the message so that
# "Restart & Run All" can continue past this demonstration.
try:
    imputer.inverse_transform(X_)
except ValueError as err:
    print(f'{type(err).__name__}: {err}')

In [286]:
imputer = SimpleImputer(strategy='mean', add_indicator=True)

In [287]:
imputer.fit(X)

In [288]:
X_ = imputer.transform(X)

In [289]:
imputer.inverse_transform(X_)

In [290]:
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit_transform(X)

In [291]:
imputer = SimpleImputer(strategy='median')
imputer.fit_transform(X)

In [292]:
imputer = SimpleImputer(strategy='constant', fill_value=-1)
imputer.fit_transform(X)

In [293]:
X = [['dog'], ['dog'], ['cat'], [np.nan]]

In [294]:
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit_transform(X)

In [295]:
imputer = SimpleImputer(strategy='constant')
imputer.fit_transform(X)

In [296]:
imputer = SimpleImputer(strategy='constant', fill_value='cat')
imputer.fit_transform(X)

## KNN Imputer

In [297]:
X = [[np.nan, np.nan, 3], [4, 1, 6], [10,2,9],[10,2,9]]

In [298]:
from sklearn.impute import KNNImputer

In [299]:
imputer = KNNImputer(n_neighbors=1)

In [300]:
imputer.fit(X)

In [301]:
X_ = imputer.transform(X)

In [302]:
X_

# Feature Selection

## Variance Threshold

In [303]:
from sklearn.feature_selection import VarianceThreshold

In [304]:
X = [[1, 2, 3], [5, 2, 4], [10, 2.2, 5]]
X = pd.DataFrame(X, columns=['A', 'B', 'C'])

In [305]:
X

In [306]:
# Variance cut-off th = p * (1 - p), which is the variance of a 0/1 (Bernoulli)
# variable that takes one of its values with probability p.
# VarianceThreshold uses th to discard low-variance columns.
p = 0.2
th = p * (1 - p)
selector = VarianceThreshold(threshold=th)

In [307]:
selector.fit(X)

In [308]:
selector.variances_

In [309]:
X_ = selector.transform(X)

In [310]:
X_

In [311]:
selector.get_feature_names_out()

## Univariate Selection: Classification

In [312]:
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, SelectPercentile, GenericUnivariateSelect, \
                                      chi2, f_classif, mutual_info_classif

In [313]:
X = load_iris(as_frame=True)

In [314]:
X.data

In [315]:
X.target

In [316]:
# Keep the 2 iris features with the highest mutual information with the target.
# mutual_info_classif is a randomised estimator, so its random_state is pinned
# to make the printed scores (and the selection) identical on every run.
# Drop-in alternatives for the score function: chi2, f_classif.
from functools import partial

score_func = partial(mutual_info_classif, random_state=0)

# Dedicated equivalent: SelectKBest(score_func=score_func, k=2)
selector = GenericUnivariateSelect(score_func=score_func, mode='k_best', param=2)
selector.fit(X.data, X.target)
print(selector.scores_)
print(selector.get_feature_names_out())

In [317]:
# Keep the top 30% of iris features ranked by their chi-squared score.
# Drop-in alternatives for the score function: f_classif, mutual_info_classif.
# Dedicated equivalent: SelectPercentile(score_func=score_func, percentile=30)
score_func = chi2
selector = GenericUnivariateSelect(
    score_func=score_func,
    mode='percentile',
    param=30,
).fit(X.data, X.target)
print(selector.scores_)
print(selector.get_feature_names_out())

In [318]:
# Family-wise-error mode: select features from the chi-squared p-values with
# param=0.05 as the error level.
# Drop-in alternatives for the score function: f_classif, mutual_info_classif.
score_func = chi2
selector = GenericUnivariateSelect(score_func=score_func, mode='fwe', param=0.05)
selector.fit(X.data, X.target)
for fitted_attr in (selector.scores_, selector.pvalues_, selector.get_feature_names_out()):
    print(fitted_attr)

## Univariate Selection: Regression

In [319]:
from sklearn.feature_selection import r_regression, f_regression, mutual_info_regression

In [320]:
from sklearn.datasets import load_diabetes

In [321]:
X = load_diabetes(as_frame=True)

In [322]:
X.data

In [323]:
X.target

In [324]:
# Keep the 5 diabetes features with the highest mutual information with the
# target. mutual_info_regression is a randomised estimator, so its random_state
# is pinned to make the printed scores (and the selection) reproducible.
# Drop-in alternatives for the score function: r_regression, f_regression.
from functools import partial

score_func = partial(mutual_info_regression, random_state=0)

selector = GenericUnivariateSelect(score_func=score_func, mode='k_best', param=5)
selector.fit(X.data, X.target)
print(selector.scores_)
print(selector.get_feature_names_out())

In [325]:
# Display the imported score function directly; eval() on a string literal is
# an unnecessary indirection that yields the same object.
r_regression

In [326]:
selector.get_support()

## Feature Importance

In [327]:
from sklearn.ensemble import RandomForestClassifier
# from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectFromModel

In [328]:
X = load_iris(as_frame=True)

In [329]:
# Seed the forest: without random_state the feature importances (and therefore
# the features picked by SelectFromModel below) can change from run to run.
clf = RandomForestClassifier(random_state=0)

In [330]:
clf.fit(X.data, X.target)

In [331]:
clf.feature_importances_

In [332]:
selector = SelectFromModel(clf, prefit=True)

In [333]:
X.data.columns[selector.get_support()]

## Sequential Selection: Forward

In [334]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.neighbors import KNeighborsClassifier

In [335]:
clf = KNeighborsClassifier(n_neighbors=3)

In [336]:
# Forward sequential selection with the KNN classifier as the scored estimator.
selector = SequentialFeatureSelector(
    clf,
    n_features_to_select='auto',
    direction='forward',
).fit(X.data, X.target)
print(selector.get_feature_names_out())

## Sequential Selection: Backward

In [337]:
# Backward sequential selection with the same KNN classifier as the scored estimator.
selector = SequentialFeatureSelector(
    clf,
    n_features_to_select='auto',
    direction='backward',
).fit(X.data, X.target)
print(selector.get_feature_names_out())

## Recursive Feature Elimination (RFE)

In [338]:
from sklearn.feature_selection import RFE

In [339]:
# Seed the forest so the importances that drive RFE, and hence the features it
# keeps, are the same on every run.
clf = RandomForestClassifier(random_state=0)

In [340]:
selector = RFE(clf)

In [341]:
selector.fit(X.data, X.target)

In [342]:
selector.get_feature_names_out()

## Recursive Feature Elimination with Cross-validation (RFECV)

![K-fold cross-validation diagram from the scikit-learn documentation](https://scikit-learn.org/stable/_images/grid_search_cross_validation.png)

In [343]:
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold

In [344]:
# RFECV: recursive feature elimination scored by accuracy over 5 stratified folds.
# The forest is seeded so the importances, and hence the printed feature names,
# are reproducible across runs.
clf = RandomForestClassifier(random_state=0)

selector = RFECV(clf, cv=StratifiedKFold(5), scoring='accuracy', min_features_to_select=1, n_jobs=2)
selector.fit(X.data, X.target)
print(selector.get_feature_names_out())

## Exhaustive (All combinations)

In [345]:
from itertools import combinations, chain

In [346]:
# Print every non-empty combination of the four positions 0..3, grouped by
# combination size (1 first, then 2, 3 and 4).
for size in range(1, 5):
    for combo in combinations(range(4), size):
        print(combo)

In [347]:
# Enumerate every non-empty subset of column positions. The size bound comes
# from n_features instead of a hard-coded 5, and list(...) materialises the
# lazy chain so the cell displays the subsets rather than an iterator repr.
n_features = X.data.shape[1]
list(chain.from_iterable(combinations(range(n_features), size) for size in range(1, n_features + 1)))

In [348]:
from sklearn.model_selection import cross_val_score

In [349]:
def EFS(estimator, X, Y, cv=5, verbose=False):
    """Exhaustive feature selection: cross-validate every non-empty column subset.

    Parameters
    ----------
    estimator : object
        Passed unchanged to ``cross_val_score`` for every subset.
    X : pandas.DataFrame
        Feature table; columns are picked by position with ``X.iloc``.
    Y : array-like
        Target passed unchanged to ``cross_val_score``.
    cv : int or cross-validation splitter, default=5
        Forwarded to ``cross_val_score``.
    verbose : bool, default=False
        When true, print the running index, mean score and column positions
        of every evaluated subset.

    Returns
    -------
    pandas.Index
        Labels of the columns in the subset with the highest mean score.
        On ties the subset evaluated first (the smaller one) is kept.

    Raises
    ------
    ValueError
        If ``X`` has no columns, or if no subset produced a comparable score
        (for example when every mean score is NaN).
    """
    n_features = X.shape[1]
    if n_features == 0:
        raise ValueError("X must have at least one column")

    # Subset sizes run from 1 up to n_features. The bound was previously the
    # literal 5, which silently skipped every subset larger than 4 columns.
    subsets = chain.from_iterable(
        combinations(range(n_features), size) for size in range(1, n_features + 1)
    )

    best_score = -np.inf
    best_subset = None
    for i, subset in enumerate(subsets):
        subset = list(subset)
        score = cross_val_score(estimator, X.iloc[:, subset], Y, cv=cv).mean()
        # Strict '>' keeps the earliest subset when scores tie.
        if score > best_score:
            best_score = score
            best_subset = subset
        if verbose:
            print(i, score, subset)

    if best_subset is None:
        raise ValueError("no feature subset produced a comparable score")
    return X.columns[best_subset]

In [350]:
clf = KNeighborsClassifier(n_neighbors=3)
selected = EFS(clf, X.data, X.target, cv=StratifiedKFold(5), verbose=True)
print(selected)

# Feature Union

In [351]:
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.pipeline import FeatureUnion, make_union
from sklearn.datasets import load_iris

In [352]:
X = load_iris(as_frame=True)

In [353]:
union = FeatureUnion([
    ('pca', PCA(n_components=2)),
    ('svd', TruncatedSVD(n_components=1))
])

# union = make_union(PCA(n_components=2), TruncatedSVD(n_components=1))

In [354]:
union

In [355]:
union.fit_transform(X['data']).shape

In [356]:
union.set_params(pca__n_components=1).fit_transform(X['data']).shape

# Pipeline

In [357]:
from sklearn.pipeline import Pipeline, make_pipeline

In [358]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC

In [359]:
pipe = Pipeline([
    ('scaler', MinMaxScaler()),
    ('union', union),
    ('svc', SVC())
])

# pipe = make_pipeline(MinMaxScaler(), SVC())

In [360]:
pipe

In [361]:
pipe.fit(X['data'], X['target'])

In [362]:
pipe.predict(X['data'])

In [363]:
pipe.score(X['data'], X['target'])

In [364]:
pipe.set_params(svc__kernel='poly')

In [365]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

In [366]:
# Pipeline: model-based feature selection -> min-max scaling -> PCA/SVD union -> SVC.
# The forest inside SelectFromModel is seeded so the selected features, and
# everything downstream of them, are reproducible across runs.
pipe = Pipeline([
    ('selector', SelectFromModel(RandomForestClassifier(random_state=0))),
    ('scaler', MinMaxScaler()),
    ('union', union),
    ('svc', SVC())
])

In [367]:
pipe

# Save and load

In [368]:
import joblib

In [369]:
# NOTE(review): `pipe` is serialised here before any fit() call on it, so
# 'pipe.joblib' holds an unfitted pipeline; it is only fitted after reloading.
joblib.dump(pipe, 'pipe.joblib')

In [370]:
pipe = joblib.load('pipe.joblib')

In [371]:
pipe.fit(X['data'], X['target'])

In [372]:
pipe.predict(X['data'])