# Clustering for Preprocessing

In [88]:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans

## Load the data

The digits dataset contains 1797 8x8 grayscale images with the digits 0 to 9.

In [50]:
X_digits, y_digits = load_digits(return_X_y=True)

In [51]:
X_digits.shape

(1797, 64)

In [52]:
X_train, X_test, y_train, y_test = train_test_split(X_digits, y_digits)

## Scale the data

In [53]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [54]:
X_test_scaled = scaler.transform(X_test)

## Fit a logistic regression model

In [58]:
log_reg = LogisticRegression()
log_reg.fit(X_train_scaled, y_train)

LogisticRegression()

In [79]:
log_reg.score(X_test_scaled, y_test)

0.9644444444444444

## Cluster the data before training the model

In [83]:
pipeline = Pipeline([
    ('kmeans', KMeans(n_clusters=50)),
    ('log_reg', LogisticRegression(max_iter=10000))
])

In [89]:
param_grid = dict(kmeans__n_clusters=range(2,150))
grid_clf = GridSearchCV(pipeline, param_grid, cv=3, verbose=2)
grid_clf.fit(X_train_scaled, y_train)

Fitting 3 folds for each of 148 candidates, totalling 444 fits
[CV] kmeans__n_clusters=2 ............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ............................. kmeans__n_clusters=2, total=   0.3s
[CV] kmeans__n_clusters=2 ............................................
[CV] ............................. kmeans__n_clusters=2, total=   0.2s
[CV] kmeans__n_clusters=2 ............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s


[CV] ............................. kmeans__n_clusters=2, total=   0.2s
[CV] kmeans__n_clusters=3 ............................................
[CV] ............................. kmeans__n_clusters=3, total=   0.3s
[CV] kmeans__n_clusters=3 ............................................
[CV] ............................. kmeans__n_clusters=3, total=   0.3s
[CV] kmeans__n_clusters=3 ............................................
[CV] ............................. kmeans__n_clusters=3, total=   0.3s
[CV] kmeans__n_clusters=4 ............................................
[CV] ............................. kmeans__n_clusters=4, total=   0.5s
[CV] kmeans__n_clusters=4 ............................................
[CV] ............................. kmeans__n_clusters=4, total=   0.4s
[CV] kmeans__n_clusters=4 ............................................
[CV] ............................. kmeans__n_clusters=4, total=   0.4s
[CV] kmeans__n_clusters=5 ............................................
[CV] .

[CV] ............................ kmeans__n_clusters=22, total=   0.8s
[CV] kmeans__n_clusters=22 ...........................................
[CV] ............................ kmeans__n_clusters=22, total=   2.7s
[CV] kmeans__n_clusters=22 ...........................................
[CV] ............................ kmeans__n_clusters=22, total=   2.1s
[CV] kmeans__n_clusters=23 ...........................................
[CV] ............................ kmeans__n_clusters=23, total=   0.8s
[CV] kmeans__n_clusters=23 ...........................................
[CV] ............................ kmeans__n_clusters=23, total=   2.1s
[CV] kmeans__n_clusters=23 ...........................................
[CV] ............................ kmeans__n_clusters=23, total=   0.9s
[CV] kmeans__n_clusters=24 ...........................................
[CV] ............................ kmeans__n_clusters=24, total=   0.7s
[CV] kmeans__n_clusters=24 ...........................................
[CV] .

[CV] ............................ kmeans__n_clusters=41, total=   1.5s
[CV] kmeans__n_clusters=41 ...........................................
[CV] ............................ kmeans__n_clusters=41, total=   1.0s
[CV] kmeans__n_clusters=42 ...........................................
[CV] ............................ kmeans__n_clusters=42, total=   1.0s
[CV] kmeans__n_clusters=42 ...........................................
[CV] ............................ kmeans__n_clusters=42, total=   1.5s
[CV] kmeans__n_clusters=42 ...........................................
[CV] ............................ kmeans__n_clusters=42, total=   1.0s
[CV] kmeans__n_clusters=43 ...........................................
[CV] ............................ kmeans__n_clusters=43, total=   1.0s
[CV] kmeans__n_clusters=43 ...........................................
[CV] ............................ kmeans__n_clusters=43, total=   2.0s
[CV] kmeans__n_clusters=43 ...........................................
[CV] .

[CV] ............................ kmeans__n_clusters=60, total=   1.1s
[CV] kmeans__n_clusters=61 ...........................................
[CV] ............................ kmeans__n_clusters=61, total=   1.1s
[CV] kmeans__n_clusters=61 ...........................................
[CV] ............................ kmeans__n_clusters=61, total=   1.9s
[CV] kmeans__n_clusters=61 ...........................................
[CV] ............................ kmeans__n_clusters=61, total=   1.2s
[CV] kmeans__n_clusters=62 ...........................................
[CV] ............................ kmeans__n_clusters=62, total=   1.2s
[CV] kmeans__n_clusters=62 ...........................................
[CV] ............................ kmeans__n_clusters=62, total=   1.9s
[CV] kmeans__n_clusters=62 ...........................................
[CV] ............................ kmeans__n_clusters=62, total=   1.3s
[CV] kmeans__n_clusters=63 ...........................................
[CV] .

[CV] ............................ kmeans__n_clusters=80, total=   1.2s
[CV] kmeans__n_clusters=80 ...........................................
[CV] ............................ kmeans__n_clusters=80, total=   2.1s
[CV] kmeans__n_clusters=80 ...........................................
[CV] ............................ kmeans__n_clusters=80, total=   1.2s
[CV] kmeans__n_clusters=81 ...........................................
[CV] ............................ kmeans__n_clusters=81, total=   1.2s
[CV] kmeans__n_clusters=81 ...........................................
[CV] ............................ kmeans__n_clusters=81, total=   1.1s
[CV] kmeans__n_clusters=81 ...........................................
[CV] ............................ kmeans__n_clusters=81, total=   1.2s
[CV] kmeans__n_clusters=82 ...........................................
[CV] ............................ kmeans__n_clusters=82, total=   1.2s
[CV] kmeans__n_clusters=82 ...........................................
[CV] .

[CV] ............................ kmeans__n_clusters=99, total=   2.0s
[CV] kmeans__n_clusters=99 ...........................................
[CV] ............................ kmeans__n_clusters=99, total=   1.3s
[CV] kmeans__n_clusters=100 ..........................................
[CV] ........................... kmeans__n_clusters=100, total=   1.3s
[CV] kmeans__n_clusters=100 ..........................................
[CV] ........................... kmeans__n_clusters=100, total=   2.3s
[CV] kmeans__n_clusters=100 ..........................................
[CV] ........................... kmeans__n_clusters=100, total=   1.3s
[CV] kmeans__n_clusters=101 ..........................................
[CV] ........................... kmeans__n_clusters=101, total=   1.2s
[CV] kmeans__n_clusters=101 ..........................................
[CV] ........................... kmeans__n_clusters=101, total=   1.2s
[CV] kmeans__n_clusters=101 ..........................................
[CV] .

[CV] ........................... kmeans__n_clusters=118, total=   1.4s
[CV] kmeans__n_clusters=119 ..........................................
[CV] ........................... kmeans__n_clusters=119, total=   1.4s
[CV] kmeans__n_clusters=119 ..........................................
[CV] ........................... kmeans__n_clusters=119, total=   1.3s
[CV] kmeans__n_clusters=119 ..........................................
[CV] ........................... kmeans__n_clusters=119, total=   1.4s
[CV] kmeans__n_clusters=120 ..........................................
[CV] ........................... kmeans__n_clusters=120, total=   1.4s
[CV] kmeans__n_clusters=120 ..........................................
[CV] ........................... kmeans__n_clusters=120, total=   1.3s
[CV] kmeans__n_clusters=120 ..........................................
[CV] ........................... kmeans__n_clusters=120, total=   1.5s
[CV] kmeans__n_clusters=121 ..........................................
[CV] .

[CV] ........................... kmeans__n_clusters=138, total=   1.6s
[CV] kmeans__n_clusters=138 ..........................................
[CV] ........................... kmeans__n_clusters=138, total=   1.5s
[CV] kmeans__n_clusters=138 ..........................................
[CV] ........................... kmeans__n_clusters=138, total=   1.7s
[CV] kmeans__n_clusters=139 ..........................................
[CV] ........................... kmeans__n_clusters=139, total=   1.6s
[CV] kmeans__n_clusters=139 ..........................................
[CV] ........................... kmeans__n_clusters=139, total=   1.5s
[CV] kmeans__n_clusters=139 ..........................................
[CV] ........................... kmeans__n_clusters=139, total=   1.7s
[CV] kmeans__n_clusters=140 ..........................................
[CV] ........................... kmeans__n_clusters=140, total=   1.6s
[CV] kmeans__n_clusters=140 ..........................................
[CV] .

[Parallel(n_jobs=1)]: Done 444 out of 444 | elapsed: 11.7min finished


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('kmeans', KMeans(n_clusters=99)),
                                       ('log_reg',
                                        LogisticRegression(max_iter=10000))]),
             param_grid={'kmeans__n_clusters': range(2, 150)}, verbose=2)

In [90]:
grid_clf.best_params_

{'kmeans__n_clusters': 147}

In [91]:
grid_clf.score(X_test_scaled, y_test)

0.9666666666666667