In [2]:
# package imports
import pandas as pd
import numpy as np 
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every expression statement in a cell, not only the
# last one (IPython's default setting is "last_expr").
InteractiveShell.ast_node_interactivity = "all"

import matplotlib.pyplot as plt
# Render matplotlib figures directly in the notebook output area.
%matplotlib inline

# 数据集转换：https://scikit-learn.org/stable/modules/compose.html

## Pipeline: chain multiple estimators into one

In [6]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.decomposition import PCA
from sklearn.svm import SVC

# Chain dimensionality reduction and a classifier into a single estimator.
# make_pipeline names the steps automatically; the repr below shows them
# as 'pca' and 'svc'.
pipe = make_pipeline(
    PCA(n_components=5),
    SVC(),
)
pipe

Pipeline(memory=None,
         steps=[('pca',
                 PCA(copy=True, iterated_power='auto', n_components=5,
                     random_state=None, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('svc',
                 SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degree=3,
                     gamma='scale', kernel='rbf', max_iter=-1,
                     probability=False, random_state=None, shrinking=True,
                     tol=0.001, verbose=False))],
         verbose=False)

### 对pipeline进行网格搜索：https://scikit-learn.org/stable/auto_examples/compose/plot_compare_reduction.html#sphx-glr-auto-examples-compose-plot-compare-reduction-py

In [10]:
from sklearn.model_selection import GridSearchCV

# Keys follow the "<step name>__<parameter>" convention so the search can
# reach into the pipeline steps: 3 PCA sizes x 3 values of the SVC's C.
param_grid = {
    "pca__n_components": [2, 5, 10],
    "svc__C": [0.1, 10, 100],
}
grid_search = GridSearchCV(estimator=pipe, param_grid=param_grid)
grid_search

GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('pca',
                                        PCA(copy=True, iterated_power='auto',
                                            n_components=5, random_state=None,
                                            svd_solver='auto', tol=0.0,
                                            whiten=False)),
                                       ('svc',
                                        SVC(C=1.0, break_ties=False,
                                            cache_size=200, class_weight=None,
                                            coef0=0.0,
                                            decision_function_shape='ovr',
                                            degree=3, gamma='scale',
                                            kernel='rbf', max_iter=-1,
                                            probability=False,
                                            random_s

In [12]:
from sklearn.linear_model import LogisticRegression

# Whole steps can be searched over as well: a key that is just a step name
# replaces that step with each listed candidate, and the string
# 'passthrough' skips the step entirely.
param_grid = {
    "pca": ["passthrough", PCA(n_components=5), PCA(n_components=10)],
    "svc": [SVC(), LogisticRegression()],
    # Applied to whichever estimator currently occupies the 'svc' step.
    "svc__C": [0.1, 10, 100],
}
grid_search = GridSearchCV(estimator=pipe, param_grid=param_grid)
grid_search

GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('pca',
                                        PCA(copy=True, iterated_power='auto',
                                            n_components=5, random_state=None,
                                            svd_solver='auto', tol=0.0,
                                            whiten=False)),
                                       ('svc',
                                        SVC(C=1.0, break_ties=False,
                                            cache_size=200, class_weight=None,
                                            coef0=0.0,
                                            decision_function_shape='ovr',
                                            degree=3, gamma='scale',
                                            kernel='rbf', max_iter=-1,
                                            pro...
                                     shrinking=True, tol=0.001, 

### ColumnTransformer转换组合数据

In [18]:
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import OneHotEncoder

# Small frame with one categorical column ('name') and one numeric column ('year').
data = {
    'name': ['mw', 'xy', 'mw', 'mt', 'mw', 'xy'],
    'year': [2000, 2001, 2002, 2003, 2004, 2005],
}
df1 = pd.DataFrame(data)

# One-hot encode the column at position 0 ('name'); remainder="drop"
# discards every column that is not listed ('year').
preprocessor = make_column_transformer(
    (OneHotEncoder(), [0]),
    remainder="drop",
)
# NOTE: df1 is rebound here from the DataFrame to the transformed array.
df1 = preprocessor.fit_transform(df1)
df1

array([[0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [9]:
from sklearn.model_selection import KFold
# Toy data: 4 rows of 2 values each, with one target value per row.
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([1, 2, 3, 4])
# As many splits as rows, so each fold holds out exactly one row
# (see the TEST indices in the recorded output).
kf = KFold(n_splits=4)
kf.get_n_splits(X)

print(kf)

# kf.split(X) yields (train indices, test indices) pairs; enumerate adds the fold number.
for i,(train_index, test_index) in enumerate(kf.split(X)):
    # Bare expression: the recorded output shows the fold number being echoed.
    # This relies on the notebook's interactive display settings rather than
    # on print() -- NOTE(review): confirm that is intended.
    i
    print("TRAIN:", train_index, "TEST:", test_index)
    # Select the rows for this fold; after the loop only the last fold's
    # train/test arrays remain bound to these names.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

4

KFold(n_splits=4, random_state=None, shuffle=False)


0

TRAIN: [1 2 3] TEST: [0]


1

TRAIN: [0 2 3] TEST: [1]


2

TRAIN: [0 1 3] TEST: [2]


3

TRAIN: [0 1 2] TEST: [3]


In [16]:
# Four rows, two columns: the first column is constant, the second alternates.
X = np.array([
    [1, 2],
    [1, 4],
    [1, 2],
    [1, 4],
])
# Column-wise mean (axis=0 collapses the rows).
np.mean(X, axis=0)

array([1., 3.])