In [1]:
import sklearn
sklearn.__version__

# scikit-learn: Preprocessing, Feature Selection, and Pipelines

# Preprocessing

## Ordinal Encoding

In [2]:
from sklearn.preprocessing import OrdinalEncoder

In [3]:
encoder = OrdinalEncoder()

In [4]:
X = [['B'], ['A'], ['C'], ['D']]

In [5]:
encoder.fit(X)

In [6]:
X_ = encoder.transform(X)

In [7]:
X_

In [8]:
encoder.categories_

In [9]:
X = [['A', 10], ['B', 9.5], ['C', 8], ['D', 16]]

In [10]:
encoder.fit(X)

In [11]:
X_ = encoder.transform(X)

In [12]:
X_

In [13]:
encoder.categories_

In [14]:
encoder.inverse_transform(X_)

# Label Encoder

In [15]:
from sklearn.preprocessing import LabelEncoder

In [16]:
encoder = LabelEncoder()

In [17]:
Y = ['bat', 'ant', 'cat', 'dog', 'ant']

In [18]:
encoder.fit(Y)

In [19]:
Y_ = encoder.transform(Y)

In [20]:
Y_

In [21]:
encoder.classes_

In [22]:
encoder.inverse_transform(Y_)

# One-hot Encoding

In [23]:
from sklearn.preprocessing import OneHotEncoder

In [24]:
encoder = OneHotEncoder()

In [25]:
X = [['ant'], ['bat'], ['cat'], ['bat'], ['ant'], ['cat']]

In [26]:
encoder.fit(X)

In [27]:
X_ = encoder.transform(X)

In [28]:
X_

In [29]:
X_.todense()

In [30]:
encoder.categories_

In [31]:
encoder.inverse_transform(X_)

In [32]:
import pandas as pd

In [33]:
df = pd.DataFrame(X, columns=['animal'])

In [34]:
df

In [35]:
pd.get_dummies(df)

In [36]:
pd.get_dummies(df, dtype=int)

In [37]:
df = pd.DataFrame([['male'], ['female'], ['female'], ['male']], columns=['gender'])

In [38]:
df_ = pd.get_dummies(df, dtype=int)

In [39]:
df_.columns

In [40]:
df_.drop(columns=df_.columns[0])

# MultiLabel Binarizer

In [41]:
from sklearn.preprocessing import MultiLabelBinarizer

In [42]:
encoder = MultiLabelBinarizer()

In [43]:
X = [['sci-fi', 'comedy'], 
     ['comedy'], 
     ['drama', 'romance'],  
     ['sci-fi', 'drama', 'action']]

In [44]:
encoder.fit(X)

In [45]:
X_ = encoder.transform(X)

In [46]:
X_

In [47]:
encoder.inverse_transform(X_)

In [48]:
encoder.classes_

# K Bins Discretizer

In [49]:
from sklearn.preprocessing import KBinsDiscretizer

In [50]:
X = [[10], [11], [12], [16], [21], [22], [35]]

In [51]:
encoder = KBinsDiscretizer(n_bins=3)

In [52]:
encoder.fit(X)

In [53]:
X_ = encoder.transform(X)

In [54]:
X_

In [55]:
X_.todense()

In [56]:
encoder.bin_edges_

In [57]:
encoder.inverse_transform(X_)

In [58]:
encoder = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='quantile')

In [59]:
encoder.fit_transform(X)

In [60]:
encoder = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')

In [61]:
encoder.fit_transform(X)

# MinMax Scaler

In [62]:
from sklearn.preprocessing import MinMaxScaler

In [63]:
X = [[1], [2], [3.9], [4], [5]]

In [64]:
scaler = MinMaxScaler()

In [65]:
scaler.fit(X)

In [66]:
X_ = scaler.transform(X)

In [67]:
X_

In [68]:
scaler.inverse_transform(X_)

# Normalization

In [69]:
from sklearn.preprocessing import Normalizer

In [70]:
normalizer = Normalizer()

In [71]:
X = [[4, 1, 2, 2],
     [1, 3, 9, 3],
     [5, 7, 5, 1]]

In [72]:
normalizer.fit(X)

In [73]:
X_ = normalizer.transform(X)

In [74]:
X_

In [75]:
# Normalizer rescales every row independently and does not keep the original
# row norms, so it provides no inverse_transform (calling it raises
# AttributeError). Show that explicitly instead of letting the cell fail.
hasattr(normalizer, 'inverse_transform')

In [None]:
normalizer = Normalizer(norm='l1')

In [None]:
normalizer.fit(X)

In [None]:
X_ = normalizer.transform(X)

In [None]:
X_

# Standardization

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
scaler = StandardScaler()

In [None]:
X = [[1, 2], [2, 3], [3, 4], [4, 5]]

In [76]:
scaler.fit(X)

In [77]:
X_ = scaler.transform(X)

In [78]:
X_

In [79]:
scaler.mean_, scaler.var_

In [80]:
scaler.inverse_transform(X_)

# Imputation

In [81]:
import numpy as np

In [82]:
X = [[np.nan, np.nan, 3], [4, 1, 6], [10,2,9],[10,2,9]]

In [83]:
from sklearn.impute import SimpleImputer

In [84]:
imputer = SimpleImputer(strategy='mean')

In [85]:
imputer.fit(X)

In [86]:
X_ = imputer.transform(X)

In [87]:
X_

In [88]:
# inverse_transform needs the missing-value indicator columns, so it only works
# when the imputer was built with add_indicator=True. Catch and display the
# resulting error so the notebook still runs top to bottom.
try:
    imputer.inverse_transform(X_)
except ValueError as err:
    print(err)

In [None]:
imputer = SimpleImputer(strategy='mean', add_indicator=True)

In [89]:
imputer.fit(X)

In [90]:
X_ = imputer.transform(X)

In [91]:
imputer.inverse_transform(X_)

In [92]:
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit_transform(X)

In [93]:
imputer = SimpleImputer(strategy='median')
imputer.fit_transform(X)

In [94]:
imputer = SimpleImputer(strategy='constant', fill_value=-1)
imputer.fit_transform(X)

In [95]:
X = [['dog'], ['dog'], ['cat'], [np.nan]]

In [96]:
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit_transform(X)

In [97]:
imputer = SimpleImputer(strategy='constant')
imputer.fit_transform(X)

In [98]:
imputer = SimpleImputer(strategy='constant', fill_value='cat')
imputer.fit_transform(X)

# KNN Imputer

In [99]:
X = [[np.nan, np.nan, 3], [4, 1, 6], [10,2,9],[10,2,9]]

In [100]:
from sklearn.impute import KNNImputer

In [101]:
imputer = KNNImputer(n_neighbors=1)

In [102]:
imputer.fit(X)

In [103]:
X_ = imputer.transform(X)

In [104]:
X_

# Feature Selection

## Variance Threshold

In [105]:
from sklearn.feature_selection import VarianceThreshold

In [106]:
X = [[1, 2, 3], [5, 2, 4], [10, 2.2, 5]]
X = pd.DataFrame(X, columns=['A', 'B', 'C'])

In [107]:
X

In [108]:
# Threshold expressed as p * (1 - p), the variance of a Bernoulli variable
# with p = 0.2, which evaluates to 0.16. Features whose variance falls below
# the threshold are removed by the selector.
# NOTE(review): columns A, B, C are not boolean, so here this acts as a plain
# numeric variance cut-off — confirm that is the intent.
p = 0.2
th = p * (1 - p)
selector = VarianceThreshold(threshold=th)

In [109]:
selector.fit(X)

In [110]:
selector.variances_

In [111]:
X_ = selector.transform(X)

In [112]:
X_

In [113]:
selector.get_feature_names_out()

# Univariate Selection (Classification)

In [114]:
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, SelectPercentile, GenericUnivariateSelect, \
                                      chi2, f_classif, mutual_info_classif

In [115]:
X = load_iris(as_frame=True)

In [116]:
X.data

In [117]:
X.target

In [118]:
# score_func = chi2
# score_func = f_classif
score_func = mutual_info_classif

# selector = SelectKBest(score_func=score_func, k=2)
selector = GenericUnivariateSelect(score_func=score_func, mode='k_best', param=2)
selector.fit(X.data, X.target)
print(selector.scores_)
print(selector.get_feature_names_out())

In [119]:
score_func = chi2
# score_func = f_classif
# score_func = mutual_info_classif

# selector = SelectPercentile(score_func=score_func, percentile=30)
selector = GenericUnivariateSelect(score_func=score_func, mode='percentile', param=30)
selector.fit(X.data, X.target)
print(selector.scores_)
print(selector.get_feature_names_out())

In [120]:
score_func = chi2
# score_func = f_classif
# score_func = mutual_info_classif

selector = GenericUnivariateSelect(score_func=score_func, mode='fwe', param=0.05)
selector.fit(X.data, X.target)
print(selector.scores_)
print(selector.pvalues_)
print(selector.get_feature_names_out())

# Univariate Selection (Regression)

In [121]:
from sklearn.feature_selection import r_regression, f_regression, mutual_info_regression

In [122]:
from sklearn.datasets import load_diabetes

In [123]:
X = load_diabetes(as_frame=True)

In [124]:
X.data

In [125]:
X.target

In [126]:
# score_func = r_regression
# score_func = f_regression
score_func = mutual_info_regression

selector = GenericUnivariateSelect(score_func=score_func, mode='k_best', param=5)
selector.fit(X.data, X.target)
print(selector.scores_)
print(selector.get_feature_names_out())

In [127]:
# Reference the scoring function directly; eval() on a constant string only
# performs the same name lookup in a roundabout way.
r_regression

In [128]:
selector.get_support()

# Feature Importance

In [129]:
from sklearn.ensemble import RandomForestClassifier
# from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectFromModel

In [130]:
X = load_iris(as_frame=True)

In [131]:
# Fixed seed so the fitted forest and its feature importances are reproducible.
clf = RandomForestClassifier(random_state=42)

In [132]:
clf.fit(X.data, X.target)

In [133]:
clf.feature_importances_

In [134]:
selector = SelectFromModel(clf, prefit=True)

In [135]:
X.data.columns[selector.get_support()]

# Forward

In [136]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.neighbors import KNeighborsClassifier

In [137]:
clf = KNeighborsClassifier(n_neighbors=3)

In [138]:
selector = SequentialFeatureSelector(clf, n_features_to_select='auto', direction='forward')
selector.fit(X.data, X.target)
print(selector.get_feature_names_out())

# Backward

In [139]:
selector = SequentialFeatureSelector(clf, n_features_to_select='auto', direction='backward')
selector.fit(X.data, X.target)
print(selector.get_feature_names_out())

# Recursive

In [140]:
from sklearn.feature_selection import RFE

In [141]:
# Fixed seed so recursive feature elimination ranks features the same way on every run.
clf = RandomForestClassifier(random_state=42)

In [142]:
selector = RFE(clf)

In [143]:
selector.fit(X.data, X.target)

In [144]:
selector.get_feature_names_out()

# Cross-validation (CV)

![k-fold cross-validation diagram from the scikit-learn documentation](https://scikit-learn.org/stable/_images/grid_search_cross_validation.png)

In [145]:
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold

In [146]:
# Fixed seed so the forest, and therefore the features RFECV keeps, is reproducible.
clf = RandomForestClassifier(random_state=42)

# Recursive feature elimination with the number of features chosen by
# stratified 5-fold cross-validated accuracy.
selector = RFECV(clf, cv=StratifiedKFold(5), scoring='accuracy', min_features_to_select=1, n_jobs=2)
selector.fit(X.data, X.target)
print(selector.get_feature_names_out())

# Exhaustive (All combinations)

In [147]:
from itertools import combinations, chain

In [148]:
# Print every non-empty combination of the four indices 0..3, grouped by
# subset size: single elements first, then pairs, triples and the full set.
for i in range(1, 5):
    for x in combinations([0, 1, 2, 3], i):
        print(x)

In [149]:
n_features = X.data.shape[1]
# Enumerate every non-empty subset of column indices (sizes 1..n_features).
# The upper bound follows n_features instead of a hard-coded 5, and the lazy
# chain is materialised so the cell shows the subsets rather than an iterator repr.
list(chain.from_iterable(combinations(range(n_features), i) for i in range(1, n_features + 1)))

In [150]:
from sklearn.model_selection import cross_val_score

In [151]:
def EFS(estimator, X, Y, cv=5, verbose=False):
    """Exhaustive feature selection by cross-validated score.

    Evaluates ``estimator`` on every non-empty subset of the columns of ``X``
    and returns the column labels of the best-scoring subset. The number of
    subsets is ``2 ** n_features - 1``, so this is only practical for a small
    number of features.

    Parameters
    ----------
    estimator : object
        Estimator passed unchanged to ``cross_val_score``.
    X : pandas.DataFrame
        Feature table; columns are selected positionally via ``X.iloc``.
    Y : array-like
        Target values passed unchanged to ``cross_val_score``.
    cv : int or cross-validation splitter, default=5
        Forwarded to ``cross_val_score``.
    verbose : bool, default=False
        When true, print the running index, mean score and column positions
        of every evaluated subset.

    Returns
    -------
    pandas.Index
        Labels of the columns in the best subset. On ties the subset that was
        evaluated first (the smaller one) is kept.

    Raises
    ------
    ValueError
        If no subset produced a comparable score (for example ``X`` has no
        columns, or every score is NaN).
    """
    n_features = X.shape[1]
    # Subset sizes run from 1 up to *all* features; the bound must follow
    # n_features, otherwise wider inputs are only partially searched.
    subsets = chain.from_iterable(
        combinations(range(n_features), size) for size in range(1, n_features + 1)
    )
    best_score = -np.inf
    best_subset = None
    for i, subset in enumerate(subsets):
        subset = list(subset)
        score = cross_val_score(estimator, X.iloc[:, subset], Y, cv=cv).mean()
        # Strict comparison keeps the earliest subset when scores tie.
        if score > best_score:
            best_score = score
            best_subset = subset
        if verbose:
            print(i, score, subset)
    if best_subset is None:
        raise ValueError("EFS could not score any feature subset")
    return X.columns[best_subset]

In [152]:
clf = KNeighborsClassifier(n_neighbors=3)
selected = EFS(clf, X.data, X.target, cv=StratifiedKFold(5), verbose=True)
print(selected)

# Feature Union

In [153]:
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.pipeline import FeatureUnion, make_union
from sklearn.datasets import load_iris

In [154]:
X = load_iris(as_frame=True)

In [155]:
# Apply both reducers to the same input and concatenate their outputs
# column-wise: 2 PCA components followed by 1 TruncatedSVD component.
union = FeatureUnion([
    ('pca', PCA(n_components=2)),
    ('svd', TruncatedSVD(n_components=1))
])

# union = make_union(PCA(n_components=2), TruncatedSVD(n_components=1))

In [156]:
union

In [157]:
union.fit_transform(X['data']).shape

In [158]:
union.set_params(pca__n_components=1).fit_transform(X['data']).shape

# Pipeline

In [159]:
from sklearn.pipeline import Pipeline, make_pipeline

In [160]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC

In [161]:
pipe = Pipeline([
    ('scaler', MinMaxScaler()),
    ('union', union),
    ('svc', SVC())
])

# pipe = make_pipeline(MinMaxScaler(), SVC())

In [162]:
pipe

In [163]:
pipe.fit(X['data'], X['target'])

In [164]:
pipe.predict(X['data'])

In [165]:
pipe.score(X['data'], X['target'])

In [166]:
pipe.set_params(svc__kernel='poly')

In [167]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

In [168]:
# Feature selection -> scaling -> feature union -> classifier.
# The forest driving SelectFromModel is seeded so the selected features
# (and everything downstream) are the same on every run.
pipe = Pipeline([
    ('selector', SelectFromModel(RandomForestClassifier(random_state=42))),
    ('scaler', MinMaxScaler()),
    ('union', union),
    ('svc', SVC())
])

In [169]:
pipe

# Save and Load

In [170]:
import joblib

In [171]:
# NOTE: `pipe` was re-created above and has not been fitted yet, so this file
# stores an unfitted pipeline; it is fitted only after being loaded back.
joblib.dump(pipe, 'pipe.joblib')

In [172]:
# joblib.load unpickles the file, which can execute arbitrary code — only load files you trust.
pipe = joblib.load('pipe.joblib')

In [173]:
pipe.fit(X['data'], X['target'])

In [174]:
pipe.predict(X['data'])