In [1]:
import os
import sys
import numpy as np
from scipy import sparse
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import mglearn
# Use the 'Malgun Gothic' font for all figure text
# (presumably so Korean labels render correctly — TODO confirm).
plt.rcParams['font.family'] = 'Malgun Gothic'
# Draw minus signs with the ASCII hyphen rather than the Unicode minus glyph
# (presumably because the selected font lacks that glyph — TODO confirm).
plt.rcParams['axes.unicode_minus'] = False
%config InlineBackend.figure_format = 'svg'
from mpl_toolkits.mplot3d import Axes3D, axes3d
from scipy.cluster import hierarchy
import seaborn as sns
import spacy
import nltk
from konlpy.tag import Okt
import graphviz
from sklearn.utils.fixes import loguniform

In [34]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.svm import SVC
from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.feature_selection import SelectPercentile, f_regression
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.decomposition import PCA

In [8]:
from sklearn.datasets import load_breast_cancer

In [9]:
# Load the breast-cancer dataset and carve out train/test partitions.
# The fixed seed keeps the split reproducible across runs.
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    random_state=0,
)

In [10]:
# Learn the scaling from the training split only, then apply that same
# fitted scaler to both splits (train first, then test).
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_test)
)

In [11]:
# Baseline: SVC with default hyperparameters, trained on the manually scaled data.
svm = SVC()
svm.fit(X_train_scaled, y_train)
# Score on the scaled test split; this is the value the cell displays.
svm.score(X_test_scaled, y_test)

0.972027972027972

In [12]:
# Same scaler + SVC combination as the manual workflow, but chained in a
# Pipeline with explicit step names, so the raw features are passed directly.
pipe = Pipeline(
    steps=[
        ("scaler", MinMaxScaler()),
        ("svm", SVC()),
    ]
)
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

0.972027972027972

In [13]:
# make_pipeline builds the same chain without hand-written step labels.
pipe = make_pipeline(MinMaxScaler(), SVC())
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

0.972027972027972

In [14]:
# Candidate values shared by both searched hyperparameters.
kind = [0.001, 0.01, 0.1, 1, 10, 100]
# Keys follow the "<step name>__<parameter>" convention for pipeline steps.
param_grid = dict(svc__C=kind, svc__gamma=kind)

In [20]:
# Search C and gamma with 5-fold cross-validation over the whole pipeline,
# so the scaler is part of what gets fitted inside the search.
pipe = make_pipeline(MinMaxScaler(), SVC())
grid = GridSearchCV(pipe, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)
# Displayed tuple: best CV score, best parameters, test score,
# test accuracy, and test ROC AUC computed from the decision function.
(
    grid.best_score_,
    grid.best_params_,
    grid.score(X_test, y_test),
    accuracy_score(y_test, grid.predict(X_test)),
    roc_auc_score(y_test, grid.decision_function(X_test)),
)

(0.9812311901504789,
 {'svc__C': 1, 'svc__gamma': 1},
 0.972027972027972,
 0.972027972027972,
 0.9976939203354298)

In [27]:
# Pure-noise regression problem: 100 samples x 10,000 features, with the
# target drawn separately from the same seeded generator.
rnd = np.random.RandomState(0)
X = rnd.normal(size=(100, 10000))
y = rnd.normal(size=(100,))
# Feature selection is a pipeline step, so cross-validation evaluates the
# selector and the Ridge model together.
pipe = make_pipeline(
    SelectPercentile(score_func=f_regression, percentile=5),
    Ridge(),
)
# Average of the five per-fold scores.
cross_val_score(pipe, X, y, cv=5).mean()

-0.2465542238495281

In [30]:
# Three-step pipeline with a repeated estimator class; inspect the
# auto-generated step names.
pipe = make_pipeline(
    StandardScaler(),
    PCA(n_components=2),
    StandardScaler(),
)
pipe.named_steps

{'standardscaler-1': StandardScaler(),
 'pca': PCA(n_components=2),
 'standardscaler-2': StandardScaler()}

In [33]:
# Fit the pipeline on the full feature matrix; no target is passed.
pipe.fit(cancer.data)
# Components learned by the PCA step.
pipe.named_steps.pca.components_

array([[ 0.21890244,  0.10372458,  0.22753729,  0.22099499,  0.14258969,
         0.23928535,  0.25840048,  0.26085376,  0.13816696,  0.06436335,
         0.20597878,  0.01742803,  0.21132592,  0.20286964,  0.01453145,
         0.17039345,  0.15358979,  0.1834174 ,  0.04249842,  0.10256832,
         0.22799663,  0.10446933,  0.23663968,  0.22487053,  0.12795256,
         0.21009588,  0.22876753,  0.25088597,  0.12290456,  0.13178394],
       [-0.23385713, -0.05970609, -0.21518136, -0.23107671,  0.18611302,
         0.15189161,  0.06016536, -0.0347675 ,  0.19034877,  0.36657547,
        -0.10555215,  0.08997968, -0.08945723, -0.15229263,  0.20443045,
         0.2327159 ,  0.19720728,  0.13032156,  0.183848  ,  0.28009203,
        -0.21986638, -0.0454673 , -0.19987843, -0.21935186,  0.17230435,
         0.14359317,  0.09796411, -0.00825724,  0.14188335,  0.27533947]])

In [35]:
# Re-split the data with a different seed. NOTE: this rebinds
# X_train/X_test/y_train/y_test, param_grid, pipe and grid.
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    random_state=4,
)
# Search the regularisation strength C of a standardised logistic regression.
param_grid = dict(logisticregression__C=[0.01, 0.1, 1, 10, 100])
pipe = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=1000),
)
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5).fit(X_train, y_train)

In [37]:
# Show the best pipeline found by the search next to its
# logistic-regression step.
(
    grid.best_estimator_,
    grid.best_estimator_.named_steps.logisticregression,
)

(Pipeline(steps=[('standardscaler', StandardScaler()),
                 ('logisticregression', LogisticRegression(C=1, max_iter=1000))]),
 LogisticRegression(C=1, max_iter=1000))

In [38]:
# Coefficients of the logistic-regression step inside the best pipeline.
grid.best_estimator_.named_steps.logisticregression.coef_

array([[-0.43570655, -0.34266946, -0.40809443, -0.5344574 , -0.14971847,
         0.61034122, -0.72634347, -0.78538827,  0.03886087,  0.27497198,
        -1.29780109,  0.04926005, -0.67336941, -0.93447426, -0.13939555,
         0.45032641, -0.13009864, -0.10144273,  0.43432027,  0.71596578,
        -1.09068862, -1.09463976, -0.85183755, -1.06406198, -0.74316099,
         0.07252425, -0.82323903, -0.65321239, -0.64379499, -0.42026013]])