In [12]:
import numpy as np
from scipy.spatial.distance import euclidean
import scipy.misc as spm
from matplotlib import pyplot
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state
from sklearn.utils import check_array
import cv2
from skimage import filters, color

class KMeans(BaseEstimator, ClusterMixin):

    def __init__(self, n_clusters=3, max_iter=150, tol=1e-3,
                 verbose=0, random_state=None):

        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.tol = tol
        self.verbose = verbose
        self.random_state = random_state

    def _check_fit_data(self, X):
        X = check_array(X, dtype=np.float64)

        n_samples, n_features = X.shape
        if n_samples < self.n_clusters:
            raise ValueError("Number of samples="+str(n_samples)+" should be\
                greater than number of clusters="+str(self.n_clusters))
        return X

    def _check_test_data(self, X):
        X = check_array(X, dtype=np.float64)
                      
        n_samples, n_features = X.shape
        expected_n_features = self.centroids_.shape[1]
        if not expected_n_features == n_features:
            raise ValueError("Incorrect number of features")
        return X

    def _update_centroids(self, X):
        for j in np.arange(self.n_clusters):
            mask = self.labels_==j
            if np.sum(mask) == 0:
                raise ValueError("Empty Cluster")
            temp = X[mask]
            count = np.shape(temp)[0]
            self.centroids_[j] = np.sum(temp, axis=0)/count

    def _update_dist(self, X, dist):
        n_samples = X.shape[0]
        for j in np.arange(n_samples):
            for k in np.arange(self.n_clusters):
                cost = euclidean(X[j], self.centroids_[k])
                dist[j, k] = cost

    def fit(self, X, y=None, sample_weight=None):
        X = self._check_fit_data(X)

        if self.max_iter <= 0:
            raise ValueError("Maximum number of iterations must be greater \
                than zero")

        n_samples, n_features=X.shape

        rs = check_random_state(self.random_state)
        self.labels_ = np.zeros((n_samples))
        centroids_idx = rs.randint(n_samples, size=self.n_clusters)
        self.centroids_ = X[centroids_idx]

        dist = np.zeros((n_samples, self.n_clusters))

        for itr in np.arange(self.max_iter):
            dist.fill(0)           
            self._update_dist(X, dist)
            labels_old = self.labels_
            self.labels_ = dist.argmin(axis=1)
            
            n_same = np.sum(self.labels_ == labels_old)
            if 1-n_same/n_samples < self.tol:
                if self.verbose:
                    print("Converged at iteration "+ str(itr+1))
                break           
            self._update_centroids(X)
            
        self.X_fit_ = X
        return self

    def predict(self, X):
        X = self._check_test_data(X)
        n_samples = X.shape[0]
        dist = np.zeros((n_samples, self.n_clusters))
        self._update_dist(X, dist)
        return dist.argmin(axis=1)


def segmentImage(filename):
#     img = spm.lena()
#     img = spm.imresize(img, (100, 100))
    img = cv2.imread(filename)
    img = color.rgb2gray(img)
    height, width = img.shape
    clf = KMeans(n_clusters=4, max_iter=10, random_state=0, verbose=1)
    img_list = np.reshape(img, (height*width, 1))
    clf.fit(img_list)
    index = np.copy(clf.labels_)
    index = np.reshape(index, (height, width))
    axes = pyplot.gca()
    axes.imshow(index)
    pyplot.show(block=True)	


def clusterToyData():
    from sklearn.datasets import make_blobs
    X, y = make_blobs(n_samples=1000, centers=5, random_state=0)

    clf = KMeans(n_clusters=5, max_iter=100, random_state=0, verbose=1)
    print(clf.fit_predict(X)[:10])
    print(clf.predict(X[:10]))
    

In [None]:
# segmentImage('testset/0734.jpg')

In [102]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
import time


def timeit(method):
    def timed(*args, **kw):
        ts = time.time()
        result = method(*args, **kw)
        te = time.time()
        if 'log_time' in kw:
            name = kw.get('log_name', method.__name__.upper())
            kw['log_time'][name] = int((te - ts) * 1000)
        else:
            print('%r  %2.2f ms' % \
                  (method.__name__, (te - ts) * 1000))
        return result
    return timed


class clust():
    def _load_data(self, sklearn_load_ds):
        data = sklearn_load_ds
        X = pd.DataFrame(data.data)
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, data.target, test_size=0.3, random_state=42)
        
    def __init__(self, sklearn_load_ds):
        self._load_data(sklearn_load_ds)
    
    @timeit
    def classify(self, model=LogisticRegression(random_state=42)):
        model.fit(self.X_train, self.y_train)
        y_pred = model.predict(self.X_test)
        print('Accuracy: {}'.format(accuracy_score(self.y_test, y_pred)))

    @timeit
    def Kmeans(self, output='add'):
        n_clusters = len(np.unique(self.y_train))
        clf = KMeans(n_clusters = n_clusters, random_state=42)
        clf.fit(self.X_train)
        y_labels_train = clf.labels_
        y_labels_test = clf.predict(self.X_test)
        if output == 'add':
            self.X_train['km_clust'] = y_labels_train
            self.X_test['km_clust'] = y_labels_test
        elif output == 'replace':
            self.X_train = y_labels_train[:, np.newaxis]
            self.X_test = y_labels_test[:, np.newaxis]
        else:
            raise ValueError('output should be either add or replace')
        return self

In [109]:
import tensorflow as tf
from tensorflow.keras import layers
keras = tf.keras
import os

In [94]:

# models = os.popen("ls saved_models").read().split('\n')[:-1]
# tf_model = keras.models.load_model('saved_models/'+models[-1])


In [95]:

clust(load_digits()).Kmeans(output='replace').classify(model=tf_model)

ValueError: Found array with dim 5. Estimator expected <= 2.

In [84]:
len(load_digits().data

SyntaxError: unexpected EOF while parsing (<ipython-input-84-2e52a2f9ed79>, line 1)

In [85]:

data = load_digits()
X = []
for x in data.data:
    X.append(x.reshape(-1, 8, 8, 1))
np.array(X[0]).shape

(1, 8, 8, 1)

In [86]:
np.array(X).shape

(1797, 1, 8, 8, 1)

In [96]:
cl = clust(load_digits())

In [99]:
cl.X_train.shape

(1257, 1, 8, 8, 1)

In [103]:


clust(load_digits()).Kmeans(output='replace').classify(model=SVC())



'Kmeans'  237.90 ms
Accuracy: 0.7833333333333333
'classify'  55.02 ms




In [104]:
clust(load_digits()).classify()

Accuracy: 0.9537037037037037
'classify'  179.25 ms




In [110]:

from keras.datasets import mnist

class clust():
    def _load_data(self, sklearn_load_ds):
        (self.X_train,self.y_train),(self.X_test,self.y_test) = mnist.load_data()
            
        
    def __init__(self, sklearn_load_ds):
        self._load_data(sklearn_load_ds)

    
    def classify(self, model=LogisticRegression(random_state=42)):
        model.fit(self.X_train, self.y_train)
        y_pred = model.predict(self.X_test)
        print('Accuracy: {}'.format(accuracy_score(self.y_test, y_pred)))


    def Kmeans(self, output='add'):
        n_clusters = len(np.unique(self.y_train))
        clf = KMeans(n_clusters = n_clusters, random_state=42)
        clf.fit(self.X_train)
        y_labels_train = clf.labels_
        y_labels_test = clf.predict(self.X_test)
        if output == 'add':
            self.X_train['km_clust'] = y_labels_train
            self.X_test['km_clust'] = y_labels_test
        elif output == 'replace':
            self.X_train = y_labels_train[:, np.newaxis]
            self.X_test = y_labels_test[:, np.newaxis]
        else:
            raise ValueError('output should be either add or replace')
        return self


Using TensorFlow backend.


AttributeError: module 'tensorflow' has no attribute 'name_scope'

In [106]:
clust(load_digits()).Kmeans(output='replace').classify(model=tf_model)

NameError: name 'mnist' is not defined