In [1]:
import sklearn
sklearn.__version__

'1.4.2'

# scikit-learn: Preprocessing, Feature Selection, and Pipelines

# Preprocessing

## Ordinal Encoding

In [2]:
from sklearn.preprocessing import OrdinalEncoder

In [3]:
encoder = OrdinalEncoder()

In [10]:
X = [['B'], ['A'], ['C'], ['D']]

In [11]:
encoder.fit(X)

In [12]:
X_ = encoder.transform(X)

In [13]:
X_

array([[1.],
       [0.],
       [2.],
       [3.]])

In [14]:
encoder.categories_

[array(['A', 'B', 'C', 'D'], dtype=object)]

In [15]:
X = [['A', 10], ['B', 9.5], ['C', 8], ['D', 16]]

In [16]:
encoder.fit(X)

In [17]:
X_ = encoder.transform(X)

In [18]:
X_

array([[0., 2.],
       [1., 1.],
       [2., 0.],
       [3., 3.]])

In [19]:
encoder.categories_

[array(['A', 'B', 'C', 'D'], dtype=object),
 array([8, 9.5, 10, 16], dtype=object)]

In [20]:
encoder.inverse_transform(X_)

array([['A', 10],
       ['B', 9.5],
       ['C', 8],
       ['D', 16]], dtype=object)

# Label Encoder

In [22]:
from sklearn.preprocessing import LabelEncoder

In [23]:
encoder = LabelEncoder()

In [36]:
Y = ['bat', 'ant', 'cat', 'dog', 'ant']

In [37]:
encoder.fit(Y)

In [38]:
Y_ = encoder.transform(Y)

In [39]:
Y_

array([1, 0, 2, 3, 0])

In [40]:
encoder.classes_

array(['ant', 'bat', 'cat', 'dog'], dtype='<U3')

In [41]:
encoder.inverse_transform(Y_)

array(['bat', 'ant', 'cat', 'dog', 'ant'], dtype='<U3')

# One-hot Encoding

In [42]:
from sklearn.preprocessing import OneHotEncoder

In [43]:
encoder = OneHotEncoder()

In [44]:
X = [['ant'], ['bat'], ['cat'], ['bat'], ['ant'], ['cat']]

In [45]:
encoder.fit(X)

In [46]:
X_ = encoder.transform(X)

In [47]:
X_

<6x3 sparse matrix of type '<class 'numpy.float64'>'
	with 6 stored elements in Compressed Sparse Row format>

In [48]:
X_.todense()

matrix([[1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.],
        [0., 1., 0.],
        [1., 0., 0.],
        [0., 0., 1.]])

In [49]:
encoder.categories_

[array(['ant', 'bat', 'cat'], dtype=object)]

In [50]:
encoder.inverse_transform(X_)

array([['ant'],
       ['bat'],
       ['cat'],
       ['bat'],
       ['ant'],
       ['cat']], dtype=object)

In [51]:
import pandas as pd

In [52]:
df = pd.DataFrame(X, columns=['animal'])

In [53]:
df

Unnamed: 0,animal
0,ant
1,bat
2,cat
3,bat
4,ant
5,cat


In [54]:
pd.get_dummies(df)

Unnamed: 0,animal_ant,animal_bat,animal_cat
0,True,False,False
1,False,True,False
2,False,False,True
3,False,True,False
4,True,False,False
5,False,False,True


In [55]:
pd.get_dummies(df, dtype=int)

Unnamed: 0,animal_ant,animal_bat,animal_cat
0,1,0,0
1,0,1,0
2,0,0,1
3,0,1,0
4,1,0,0
5,0,0,1


In [56]:
df = pd.DataFrame([['male'], ['female'], ['female'], ['male']], columns=['gender'])

In [58]:
df_ = pd.get_dummies(df, dtype=int)

In [59]:
df_.columns

Index(['gender_female', 'gender_male'], dtype='object')

In [60]:
df_.drop(columns=df_.columns[0])

Unnamed: 0,gender_male
0,1
1,0
2,0
3,1


# MultiLabel Binarizer

In [61]:
from sklearn.preprocessing import MultiLabelBinarizer

In [62]:
encoder = MultiLabelBinarizer()

In [63]:
X = [['sci-fi', 'comedy'], 
     ['comedy'], 
     ['drama', 'romance'],  
     ['sci-fi', 'drama', 'action']]

In [64]:
encoder.fit(X)

In [65]:
X_ = encoder.transform(X)

In [66]:
X_

array([[0, 1, 0, 0, 1],
       [0, 1, 0, 0, 0],
       [0, 0, 1, 1, 0],
       [1, 0, 1, 0, 1]])

In [67]:
encoder.inverse_transform(X_)

[('comedy', 'sci-fi'),
 ('comedy',),
 ('drama', 'romance'),
 ('action', 'drama', 'sci-fi')]

In [68]:
encoder.classes_

array(['action', 'comedy', 'drama', 'romance', 'sci-fi'], dtype=object)

# K Bins Discretizer

In [69]:
from sklearn.preprocessing import KBinsDiscretizer

In [70]:
X = [[10], [11], [12], [16], [21], [22], [35]]

In [71]:
encoder = KBinsDiscretizer(n_bins=3)

In [72]:
encoder.fit(X)

In [76]:
X_ = encoder.transform(X)

In [77]:
X_

<7x3 sparse matrix of type '<class 'numpy.float64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [78]:
X_.todense()

matrix([[1., 0., 0.],
        [1., 0., 0.],
        [0., 1., 0.],
        [0., 1., 0.],
        [0., 0., 1.],
        [0., 0., 1.],
        [0., 0., 1.]])

In [79]:
encoder.bin_edges_

array([array([10., 12., 21., 35.])], dtype=object)

In [80]:
encoder.inverse_transform(X_)

array([[11. ],
       [11. ],
       [16.5],
       [16.5],
       [28. ],
       [28. ],
       [28. ]])

In [81]:
encoder = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='quantile')

In [82]:
encoder.fit_transform(X)

array([[0.],
       [0.],
       [1.],
       [1.],
       [2.],
       [2.],
       [2.]])

In [83]:
encoder = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')

In [84]:
encoder.fit_transform(X)



array([[0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [2.]])

# MinMax Scaler

In [85]:
from sklearn.preprocessing import MinMaxScaler

In [91]:
X = [[1], [2], [3.9], [4], [5]]

In [92]:
scaler = MinMaxScaler()

In [93]:
scaler.fit(X)

In [94]:
X_ = scaler.transform(X)

In [95]:
X_

array([[0.   ],
       [0.25 ],
       [0.725],
       [0.75 ],
       [1.   ]])

In [97]:
scaler.inverse_transform(X_)

array([[1. ],
       [2. ],
       [3.9],
       [4. ],
       [5. ]])

# Normalization

In [98]:
from sklearn.preprocessing import Normalizer

In [99]:
normalizer = Normalizer()

In [100]:
X = [[4, 1, 2, 2],
     [1, 3, 9, 3],
     [5, 7, 5, 1]]

In [101]:
normalizer.fit(X)

In [102]:
X_ = normalizer.transform(X)

In [103]:
X_

array([[0.8, 0.2, 0.4, 0.4],
       [0.1, 0.3, 0.9, 0.3],
       [0.5, 0.7, 0.5, 0.1]])

In [104]:
# Normalizer has no inverse_transform method. Catch the resulting AttributeError and
# print it, so the demonstration stays visible without stopping "Restart & Run All".
try:
    normalizer.inverse_transform(X_)
except AttributeError as exc:
    print(f"{type(exc).__name__}: {exc}")

AttributeError: 'Normalizer' object has no attribute 'inverse_transform'

In [105]:
normalizer = Normalizer(norm='l1')

In [106]:
normalizer.fit(X)

In [107]:
X_ = normalizer.transform(X)

In [108]:
X_

array([[0.44444444, 0.11111111, 0.22222222, 0.22222222],
       [0.0625    , 0.1875    , 0.5625    , 0.1875    ],
       [0.27777778, 0.38888889, 0.27777778, 0.05555556]])

# Standardization

In [109]:
from sklearn.preprocessing import StandardScaler

In [110]:
scaler = StandardScaler()

In [111]:
X = [[1, 2], [2, 3], [3, 4], [4, 5]]

In [112]:
scaler.fit(X)

In [113]:
X_ = scaler.transform(X)

In [114]:
X_

array([[-1.34164079, -1.34164079],
       [-0.4472136 , -0.4472136 ],
       [ 0.4472136 ,  0.4472136 ],
       [ 1.34164079,  1.34164079]])

In [115]:
scaler.mean_, scaler.var_

(array([2.5, 3.5]), array([1.25, 1.25]))

In [116]:
scaler.inverse_transform(X_)

array([[1., 2.],
       [2., 3.],
       [3., 4.],
       [4., 5.]])

# Imputation

In [117]:
import numpy as np

In [118]:
X = [[np.nan, np.nan, 3], [4, 1, 6], [10,2,9],[10,2,9]]

In [119]:
from sklearn.impute import SimpleImputer

In [120]:
imputer = SimpleImputer(strategy='mean')

In [121]:
imputer.fit(X)

In [122]:
X_ = imputer.transform(X)

In [123]:
X_

array([[ 8.        ,  1.66666667,  3.        ],
       [ 4.        ,  1.        ,  6.        ],
       [10.        ,  2.        ,  9.        ],
       [10.        ,  2.        ,  9.        ]])

In [124]:
# This imputer was created without add_indicator=True, so inverse_transform raises a
# ValueError. Catch and print it, so the demonstration stays visible without stopping
# "Restart & Run All".
try:
    imputer.inverse_transform(X_)
except ValueError as exc:
    print(f"{type(exc).__name__}: {exc}")

ValueError: 'inverse_transform' works only when 'SimpleImputer' is instantiated with 'add_indicator=True'. Got 'add_indicator=False' instead.

In [125]:
imputer = SimpleImputer(strategy='mean', add_indicator=True)

In [126]:
imputer.fit(X)

In [127]:
X_ = imputer.transform(X)

In [128]:
imputer.inverse_transform(X_)

array([[nan, nan,  3.],
       [ 4.,  1.,  6.],
       [10.,  2.,  9.],
       [10.,  2.,  9.]])

In [130]:
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit_transform(X)

array([[10.,  2.,  3.],
       [ 4.,  1.,  6.],
       [10.,  2.,  9.],
       [10.,  2.,  9.]])

In [131]:
imputer = SimpleImputer(strategy='median')
imputer.fit_transform(X)

array([[10.,  2.,  3.],
       [ 4.,  1.,  6.],
       [10.,  2.,  9.],
       [10.,  2.,  9.]])

In [133]:
imputer = SimpleImputer(strategy='constant', fill_value=-1)
imputer.fit_transform(X)

array([[-1., -1.,  3.],
       [ 4.,  1.,  6.],
       [10.,  2.,  9.],
       [10.,  2.,  9.]])

In [134]:
X = [['dog'], ['dog'], ['cat'], [np.nan]]

In [135]:
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit_transform(X)

array([['dog'],
       ['dog'],
       ['cat'],
       ['dog']], dtype=object)

In [136]:
imputer = SimpleImputer(strategy='constant')
imputer.fit_transform(X)

array([['dog'],
       ['dog'],
       ['cat'],
       ['missing_value']], dtype=object)

In [138]:
imputer = SimpleImputer(strategy='constant', fill_value='cat')
imputer.fit_transform(X)

array([['dog'],
       ['dog'],
       ['cat'],
       ['cat']], dtype=object)

# KNN Imputer

In [139]:
X = [[np.nan, np.nan, 3], [4, 1, 6], [10,2,9],[10,2,9]]

In [140]:
from sklearn.impute import KNNImputer

In [141]:
imputer = KNNImputer(n_neighbors=1)

In [142]:
imputer.fit(X)

In [143]:
X_ = imputer.transform(X)

In [144]:
X_

array([[ 4.,  1.,  3.],
       [ 4.,  1.,  6.],
       [10.,  2.,  9.],
       [10.,  2.,  9.]])

# Feature Selection

## Variance Threshold

In [145]:
from sklearn.feature_selection import VarianceThreshold

In [165]:
X = [[1, 2, 3], [5, 2, 4], [10, 2.2, 5]]
X = pd.DataFrame(X, columns=['A', 'B', 'C'])

In [166]:
X

Unnamed: 0,A,B,C
0,1,2.0,3
1,5,2.0,4
2,10,2.2,5


In [172]:
# p * (1 - p) is the variance of a 0/1 feature that equals 1 with probability p.
# With p = 0.2 the threshold is 0.16. The columns of X are not boolean here, so the
# value simply acts as a numeric variance cutoff.
p = 0.2
th = p * (1 - p)
# Features whose variance does not exceed `th` are dropped by the selector.
selector = VarianceThreshold(threshold=th)

In [173]:
selector.fit(X)

In [174]:
selector.variances_

array([1.35555556e+01, 8.88888889e-03, 6.66666667e-01])

In [175]:
X_ = selector.transform(X)

In [176]:
X_

array([[ 1.,  3.],
       [ 5.,  4.],
       [10.,  5.]])

In [177]:
selector.get_feature_names_out()

array(['A', 'C'], dtype=object)

# Univariate Feature Selection (Classification)

In [188]:
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, SelectPercentile, GenericUnivariateSelect, \
                                      chi2, f_classif, mutual_info_classif

In [179]:
X = load_iris(as_frame=True)

In [180]:
X.data

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [181]:
X.target

0      0
1      0
2      0
3      0
4      0
      ..
145    2
146    2
147    2
148    2
149    2
Name: target, Length: 150, dtype: int32

In [185]:
# Choose one univariate scoring function; the alternatives are left commented out.
# score_func = chi2
# score_func = f_classif
# NOTE(review): mutual_info_classif accepts a random_state and none is set here, so the
# printed scores may differ slightly between runs — confirm whether a seed is wanted.
score_func = mutual_info_classif

# Dedicated selector for the same "keep the k best" behaviour:
# selector = SelectKBest(score_func=score_func, k=2)
# mode='k_best' with param=2 keeps the two highest-scoring features.
selector = GenericUnivariateSelect(score_func=score_func, mode='k_best', param=2)
selector.fit(X.data, X.target)
print(selector.scores_)
print(selector.get_feature_names_out())

[0.5104595  0.2716372  0.98820231 0.97273606]
['petal length (cm)' 'petal width (cm)']


In [189]:
score_func = chi2
# score_func = f_classif
# score_func = mutual_info_classif

# selector = SelectPercentile(score_func=score_func, percentile=30)
selector = GenericUnivariateSelect(score_func=score_func, mode='percentile', param=30)
selector.fit(X.data, X.target)
print(selector.scores_)
print(selector.get_feature_names_out())

[ 10.81782088   3.7107283  116.31261309  67.0483602 ]
['petal length (cm)']


In [193]:
score_func = chi2
# score_func = f_classif
# score_func = mutual_info_classif

selector = GenericUnivariateSelect(score_func=score_func, mode='fwe', param=0.05)
selector.fit(X.data, X.target)
print(selector.scores_)
print(selector.pvalues_)
print(selector.get_feature_names_out())

[ 10.81782088   3.7107283  116.31261309  67.0483602 ]
[4.47651499e-03 1.56395980e-01 5.53397228e-26 2.75824965e-15]
['sepal length (cm)' 'petal length (cm)' 'petal width (cm)']


# Univariate Feature Selection (Regression)

In [195]:
from sklearn.feature_selection import r_regression, f_regression, mutual_info_regression

In [196]:
from sklearn.datasets import load_diabetes

In [197]:
X = load_diabetes(as_frame=True)

In [198]:
X.data

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204
2,0.085299,0.050680,0.044451,-0.005670,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.025930
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641
...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018114,0.044485
439,0.041708,0.050680,-0.015906,0.017293,-0.037344,-0.013840,-0.024993,-0.011080,-0.046883,0.015491
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044529,-0.025930


In [212]:
X.target

0      151.0
1       75.0
2      141.0
3      206.0
4      135.0
       ...  
437    178.0
438    104.0
439    132.0
440    220.0
441     57.0
Name: target, Length: 442, dtype: float64

In [221]:
# Choose one univariate scoring function for a continuous target; the alternatives
# are left commented out.
# score_func = r_regression
# score_func = f_regression
# NOTE(review): mutual_info_regression accepts a random_state and none is set here, so
# the printed scores may differ slightly between runs — confirm whether a seed is wanted.
score_func = mutual_info_regression

# mode='k_best' with param=5 keeps the five highest-scoring features.
selector = GenericUnivariateSelect(score_func=score_func, mode='k_best', param=5)
selector.fit(X.data, X.target)
print(selector.scores_)
print(selector.get_feature_names_out())

[0.         0.02031831 0.17907887 0.06435746 0.06665894 0.01011581
 0.06270836 0.11229211 0.14669741 0.10881973]
['bmi' 's1' 's4' 's5' 's6']


In [223]:
# Display the scoring function's signature. Referencing the name directly gives the
# same object as eval() on the literal string, without using eval.
r_regression

<function sklearn.feature_selection._univariate_selection.r_regression(X, y, *, center=True, force_finite=True)>

In [224]:
selector.get_support()

array([False, False,  True, False,  True, False, False,  True,  True,
        True])

# Feature Importance

In [234]:
from sklearn.ensemble import RandomForestClassifier
# from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectFromModel

In [235]:
X = load_iris(as_frame=True)

In [236]:
clf = RandomForestClassifier()

In [237]:
clf.fit(X.data, X.target)

In [238]:
clf.feature_importances_

array([0.10428069, 0.02690182, 0.42391302, 0.44490447])

In [239]:
selector = SelectFromModel(clf, prefit=True)

In [242]:
X.data.columns[selector.get_support()]

Index(['petal length (cm)', 'petal width (cm)'], dtype='object')

# Forward Sequential Feature Selection

In [243]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.neighbors import KNeighborsClassifier

In [244]:
clf = KNeighborsClassifier(n_neighbors=3)

In [245]:
selector = SequentialFeatureSelector(clf, n_features_to_select='auto', direction='forward')
selector.fit(X.data, X.target)
print(selector.get_feature_names_out())

['sepal length (cm)' 'petal width (cm)']


# Backward Sequential Feature Selection

In [246]:
selector = SequentialFeatureSelector(clf, n_features_to_select='auto', direction='backward')
selector.fit(X.data, X.target)
print(selector.get_feature_names_out())

['petal length (cm)' 'petal width (cm)']


# Recursive Feature Elimination (RFE)

In [247]:
from sklearn.feature_selection import RFE

In [250]:
clf = RandomForestClassifier()

In [253]:
selector = RFE(clf)

In [254]:
selector.fit(X.data, X.target)

In [255]:
selector.get_feature_names_out()

array(['petal length (cm)', 'petal width (cm)'], dtype=object)

# Cross-validation (CV)

![](https://scikit-learn.org/stable/_images/grid_search_cross_validation.png)

In [258]:
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold

In [259]:
clf = RandomForestClassifier()

selector = RFECV(clf, cv=StratifiedKFold(5), scoring='accuracy', min_features_to_select=1, n_jobs=2)
selector.fit(X.data, X.target)
print(selector.get_feature_names_out())

['petal length (cm)' 'petal width (cm)']


# Exhaustive (All combinations)

In [260]:
from itertools import combinations, chain

In [266]:
# Print every non-empty combination of the indices 0..3, smallest combinations first.
for combo in chain.from_iterable(combinations([0, 1, 2, 3], size) for size in range(1, 5)):
    print(combo)

(0,)
(1,)
(2,)
(3,)
(0, 1)
(0, 2)
(0, 3)
(1, 2)
(1, 3)
(2, 3)
(0, 1, 2)
(0, 1, 3)
(0, 2, 3)
(1, 2, 3)
(0, 1, 2, 3)


In [267]:
n_features = X.data.shape[1]
# Lazily chain the index combinations of every size from 1 up to n_features.
# The upper bound follows the data instead of being hard-coded.
chain.from_iterable(combinations(range(n_features), i) for i in range(1, n_features + 1))

<itertools.chain at 0x1f1cd2b4340>

In [268]:
from sklearn.model_selection import cross_val_score

In [275]:
def EFS(estimator, X, Y, cv=5, verbose=False):
    """Exhaustive feature selection: return the column subset with the best mean CV score.

    Every non-empty combination of the columns of ``X`` is scored with
    ``cross_val_score`` and the subset with the highest mean score is kept.
    On ties the subset evaluated first wins (smaller subsets come first).
    ``2 ** n_features - 1`` subsets are evaluated, so this is only practical
    for a small number of columns.

    Parameters
    ----------
    estimator : estimator object
        Passed unchanged to ``cross_val_score``.
    X : pandas.DataFrame
        Feature table; columns are picked by position through ``X.iloc``.
    Y : array-like
        Target values, passed unchanged to ``cross_val_score``.
    cv : int or cross-validation splitter, default=5
        Forwarded to ``cross_val_score`` as ``cv``.
    verbose : bool, default=False
        If true, print the running index, mean score and column positions of
        every evaluated subset.

    Returns
    -------
    pandas.Index
        Labels of the columns in the best-scoring subset.

    Raises
    ------
    ValueError
        If no subset could be selected: ``X`` has no columns, or no subset
        produced a score greater than ``-inf`` (for example all scores NaN).
    """
    n_features = X.shape[1]
    # Subset sizes run up to n_features itself, so tables with more than four
    # columns are searched completely (the bound used to be fixed at 4).
    subsets = chain.from_iterable(
        combinations(range(n_features), size) for size in range(1, n_features + 1)
    )
    best_score = -np.inf
    best_subset = None
    for i, subset in enumerate(subsets):
        subset = list(subset)
        score = cross_val_score(estimator, X.iloc[:, subset], Y, cv=cv).mean()
        # Strict ">" keeps the earliest subset when scores tie.
        if score > best_score:
            best_score = score
            best_subset = subset
        if verbose:
            print(i, score, subset)
    if best_subset is None:
        raise ValueError(
            "EFS could not select a feature subset: X has no columns or no "
            "subset produced a valid score."
        )
    return X.columns[best_subset]

In [276]:
clf = KNeighborsClassifier(n_neighbors=3)
selected = EFS(clf, X.data, X.target, cv=StratifiedKFold(5), verbose=True)
print(selected)

0 0.6199999999999999 [0]
1 0.5066666666666666 [1]
2 0.9400000000000001 [2]
3 0.96 [3]
4 0.74 [0, 1]
5 0.9466666666666667 [0, 2]
6 0.9533333333333334 [0, 3]
7 0.9333333333333333 [1, 2]
8 0.9400000000000001 [1, 3]
9 0.9533333333333334 [2, 3]
10 0.9400000000000001 [0, 1, 2]
11 0.9466666666666667 [0, 1, 3]
12 0.9733333333333334 [0, 2, 3]
13 0.96 [1, 2, 3]
14 0.9666666666666668 [0, 1, 2, 3]
Index(['sepal length (cm)', 'petal length (cm)', 'petal width (cm)'], dtype='object')


# Feature Union

In [3]:
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.pipeline import FeatureUnion, make_union
from sklearn.datasets import load_iris

In [4]:
X = load_iris(as_frame=True)

In [9]:
union = FeatureUnion([
    ('pca', PCA(n_components=2)),
    ('svd', TruncatedSVD(n_components=1))
])

# union = make_union(PCA(n_components=2), TruncatedSVD(n_components=1))

In [10]:
union

In [11]:
union.fit_transform(X['data']).shape

(150, 3)

In [12]:
# Reduce the PCA branch to one component, so the union now yields 1 + 1 = 2 columns.
# NOTE(review): set_params changes `union` itself, so the pipelines built from `union`
# further down inherit this 1-component PCA — confirm that is intended.
union.set_params(pca__n_components=1).fit_transform(X['data']).shape

(150, 2)

# Pipeline

In [13]:
from sklearn.pipeline import Pipeline, make_pipeline

In [14]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC

In [21]:
# Chain scaling, the feature union defined earlier and an SVM classifier into a
# single estimator with explicitly named steps.
pipe = Pipeline([
    ('scaler', MinMaxScaler()),
    ('union', union),
    ('svc', SVC())
])

# Shorthand with the same three steps (make_pipeline picks the step names itself):
# pipe = make_pipeline(MinMaxScaler(), union, SVC())

In [22]:
pipe

In [23]:
pipe.fit(X['data'], X['target'])

In [24]:
pipe.predict(X['data'])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 1, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [25]:
pipe.score(X['data'], X['target'])

0.9466666666666667

In [26]:
pipe.set_params(svc__kernel='poly')

In [34]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

In [35]:
pipe = Pipeline([
    ('selector', SelectFromModel(RandomForestClassifier())),
    ('scaler', MinMaxScaler()),
    ('union', union),
    ('svc', SVC())
])

In [36]:
pipe

# Save and Load

In [30]:
import joblib

In [37]:
# NOTE: `pipe` was rebuilt just above and has not been fitted yet, so this stores an
# unfitted pipeline; it is only fitted after being loaded back.
joblib.dump(pipe, 'pipe.joblib')

['pipe.joblib']

In [38]:
pipe = joblib.load('pipe.joblib')

In [40]:
pipe.fit(X['data'], X['target'])

In [41]:
pipe.predict(X['data'])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])