
Commit

Merge ef93f15 into a30da80
benandow committed Jun 9, 2017
2 parents a30da80 + ef93f15 commit 2f3fc30
Showing 7 changed files with 212 additions and 30 deletions.
31 changes: 17 additions & 14 deletions kmodes/kmodes.py
@@ -4,6 +4,7 @@

# Author: 'Nico de Vos' <njdevos@gmail.com>
# License: MIT
# Added support for Ng et al.'s dissimilarity measure (Ben Andow <beandow@ncsu.edu>)

# pylint: disable=unused-argument,attribute-defined-outside-init

@@ -14,13 +15,13 @@
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils.validation import check_array

-from .util import get_max_value_key, encode_features, get_unique_rows, decode_centroids
+from .util import get_max_value_key, encode_features, get_unique_rows, decode_centroids, genMembshipArray
from .util.dissim import matching_dissim


def init_huang(X, n_clusters, dissim):
"""Initialize centroids according to method by Huang [1997]."""
-    nattrs = X.shape[1]
+    npoints, nattrs = X.shape
centroids = np.empty((n_clusters, nattrs), dtype='object')
# determine frequencies of attributes
for iattr in range(nattrs):
@@ -38,10 +39,11 @@ def init_huang(X, n_clusters, dissim):
# each with different dict ordering.
choices = sorted(choices)
centroids[:, iattr] = np.random.choice(choices, n_clusters)
membship = np.zeros((npoints, npoints), dtype=np.uint8)
# The previously chosen centroids could result in empty clusters,
# so set centroid to closest point in X.
for ik in range(n_clusters):
-        ndx = np.argsort(dissim(X, centroids[ik]))
+        ndx = np.argsort(dissim(X, centroids[ik], X=X, membship=membship))
# We want the centroid to be unique.
while np.all(X[ndx[0]] == centroids, axis=1).any():
ndx = np.delete(ndx, 0)
@@ -57,7 +59,7 @@ def init_cao(X, n_clusters, dissim):
"""
npoints, nattrs = X.shape
centroids = np.empty((n_clusters, nattrs), dtype='object')
-    # Method is base don determining density of points.
+    # Method is based on determining density of points.
dens = np.zeros(npoints)
for iattr in range(nattrs):
freq = defaultdict(int)
@@ -72,10 +74,11 @@ def init_cao(X, n_clusters, dissim):
if n_clusters > 1:
# For the remaining centroids, choose maximum dens * dissim to the
# (already assigned) centroid with the lowest dens * dissim.
membship = np.zeros((npoints, npoints), dtype=np.uint8)
for ik in range(1, n_clusters):
dd = np.empty((ik, npoints))
for ikk in range(ik):
-                dd[ikk] = dissim(X, centroids[ikk]) * dens
+                dd[ikk] = dissim(X, centroids[ikk], X=X, membship=membship) * dens
centroids[ik] = X[np.argmax(np.min(dd, axis=0))]

return centroids
@@ -113,7 +116,7 @@ def move_point_cat(point, ipoint, to_clust, from_clust, cl_attr_freq,
return cl_attr_freq, membship, centroids


-def _labels_cost(X, centroids, dissim):
+def _labels_cost(X, centroids, dissim, membship, Xinit=None):
"""Calculate labels and cost function given a matrix of points and
a list of centroids for the k-modes algorithm.
"""
@@ -124,7 +127,7 @@ def _labels_cost(X, centroids, dissim):
cost = 0.
labels = np.empty(npoints, dtype=np.uint8)
for ipoint, curpoint in enumerate(X):
-        diss = dissim(centroids, curpoint)
+        diss = dissim(centroids, curpoint, X=(X if Xinit is None else Xinit), membship=membship)
clust = np.argmin(diss)
labels[ipoint] = clust
cost += diss[clust]
@@ -136,7 +139,7 @@ def _k_modes_iter(X, centroids, cl_attr_freq, membship, dissim):
"""Single iteration of k-modes clustering algorithm"""
moves = 0
for ipoint, curpoint in enumerate(X):
-        clust = np.argmin(dissim(centroids, curpoint))
+        clust = np.argmin(dissim(centroids, curpoint, X=X, membship=membship))
if membship[clust, ipoint]:
# Point is already in its right place.
continue
@@ -231,7 +234,7 @@ def k_modes(X, n_clusters, max_iter, dissim, init, n_init, verbose):
for _ in range(n_clusters)]
for ipoint, curpoint in enumerate(X):
# Initial assignment to clusters
-            clust = np.argmin(dissim(centroids, curpoint))
+            clust = np.argmin(dissim(centroids, curpoint, X=X, membship=membship))
membship[clust, ipoint] = 1
# Count attribute values per cluster.
for iattr, curattr in enumerate(curpoint):
@@ -255,7 +258,7 @@ def k_modes(X, n_clusters, max_iter, dissim, init, n_init, verbose):
itr += 1
centroids, moves = _k_modes_iter(X, centroids, cl_attr_freq, membship, dissim)
# All points seen in this iteration
-            labels, ncost = _labels_cost(X, centroids, dissim)
+            labels, ncost = _labels_cost(X, centroids, dissim, membship)
converged = (moves == 0) or (ncost >= cost)
cost = ncost
if verbose:
@@ -273,7 +276,7 @@ def k_modes(X, n_clusters, max_iter, dissim, init, n_init, verbose):
print("Best run was number {}".format(best + 1))

return all_centroids[best], enc_map, all_labels[best], \
-        all_costs[best], all_n_iters[best]
+        all_costs[best], all_n_iters[best], X


class KModes(BaseEstimator, ClusterMixin):
@@ -291,7 +294,7 @@ class KModes(BaseEstimator, ClusterMixin):
single run.
cat_dissim : func, default: matching_dissim
-        Dissimilarity function used by the algorithm for categorical variables.
+        Dissimilarity function used by the kmodes algorithm for categorical variables.
Defaults to the matching dissimilarity function.
init : {'Huang', 'Cao', 'random' or an ndarray}, default: 'Cao'
@@ -360,7 +363,7 @@ def fit(self, X, y=None, **kwargs):
"""

self._enc_cluster_centroids, self._enc_map, self.labels_,\
-            self.cost_, self.n_iter_ = k_modes(X,
+            self.cost_, self.n_iter_, self.X = k_modes(X,
self.n_clusters,
self.max_iter,
self.cat_dissim,
@@ -393,7 +396,7 @@ def predict(self, X, **kwargs):
assert hasattr(self, '_enc_cluster_centroids'), "Model not yet fitted."
X = check_array(X, dtype=None)
X, _ = encode_features(X, enc_map=self._enc_map)
-        return _labels_cost(X, self._enc_cluster_centroids, self.cat_dissim)[0]
+        return _labels_cost(X, self._enc_cluster_centroids, self.cat_dissim, genMembshipArray(self.labels_), self.X)[0]

@property
def cluster_centroids_(self):
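The thread running through the kmodes.py changes above is that every categorical dissimilarity call now also receives the data matrix X and the cluster-membership matrix membship, which is what a frequency-weighted measure like Ng et al.'s needs. As a rough, illustrative sketch only (not the library's ng_dissim, which lives in kmodes/util/dissim.py and is not part of this diff), a measure with the widened signature could look like this for the (centroids, point) call pattern, assuming membship has one row per cluster; the real implementation must also cope with the initialization calls above, where membship is still all zeros:

    import numpy as np

    def ng_dissim_sketch(centroids, point, X=None, membship=None):
        """Illustrative Ng-style dissimilarity between each centroid and `point`.

        A mismatching attribute costs 1; a matching attribute costs
        1 - (# of cluster members sharing that value) / (cluster size).
        """
        costs = np.zeros(centroids.shape[0])
        for k, centroid in enumerate(centroids):
            members = X[membship[k] == 1]       # points currently assigned to cluster k
            n_k = max(members.shape[0], 1)      # guard against empty clusters
            for j, (pj, cj) in enumerate(zip(point, centroid)):
                if pj != cj:
                    costs[k] += 1.
                else:
                    costs[k] += 1. - np.sum(members[:, j] == cj) / float(n_k)
        return costs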
23 changes: 12 additions & 11 deletions kmodes/kprototypes.py
@@ -4,6 +4,7 @@

# Author: 'Nico de Vos' <njdevos@gmail.com>
# License: MIT
# Added support for Ng et al.'s dissimilarity measure (Ben Andow <beandow@ncsu.edu>)

# pylint: disable=super-on-old-class,unused-argument,attribute-defined-outside-init

@@ -14,7 +15,7 @@
from sklearn.utils.validation import check_array

from . import kmodes
-from .util import get_max_value_key, encode_features, get_unique_rows, decode_centroids
+from .util import get_max_value_key, encode_features, get_unique_rows, decode_centroids, genMembshipArray
from .util.dissim import matching_dissim, euclidean_dissim

# Number of tries we give the initialization methods to find non-empty
@@ -49,7 +50,7 @@ def _split_num_cat(X, categorical):
return Xnum, Xcat


-def _labels_cost(Xnum, Xcat, centroids, num_dissim, cat_dissim, gamma):
+def _labels_cost(Xnum, Xcat, centroids, num_dissim, cat_dissim, gamma, membship, XcatInit=None):
"""Calculate labels and cost function given a matrix of points and
a list of centroids for the k-prototypes algorithm.
"""
@@ -62,7 +63,7 @@ def _labels_cost(Xnum, Xcat, centroids, num_dissim, cat_dissim, gamma):
for ipoint in range(npoints):
# Numerical cost = sum of Euclidean distances
num_costs = num_dissim(centroids[0], Xnum[ipoint])
-        cat_costs = cat_dissim(centroids[1], Xcat[ipoint])
+        cat_costs = cat_dissim(centroids[1], Xcat[ipoint], X=(Xcat if XcatInit is None else XcatInit), membship=membship)
# Gamma relates the categorical cost to the numerical cost.
tot_costs = num_costs + gamma * cat_costs
clust = np.argmin(tot_costs)
@@ -79,7 +80,7 @@ def _k_prototypes_iter(Xnum, Xcat, centroids, cl_attr_sum, cl_attr_freq,
for ipoint in range(Xnum.shape[0]):
clust = np.argmin(
num_dissim(centroids[0], Xnum[ipoint]) +
-            gamma * cat_dissim(centroids[1], Xcat[ipoint])
+            gamma * cat_dissim(centroids[1], Xcat[ipoint], X=Xcat, membship=membship)
)
if membship[clust, ipoint]:
# Point is already in its right place.
@@ -246,7 +247,7 @@ def k_prototypes(X, categorical, n_clusters, max_iter, num_dissim, cat_dissim,
# Initial assignment to clusters
clust = np.argmin(
num_dissim(centroids[0], Xnum[ipoint]) +
-                gamma * cat_dissim(centroids[1], Xcat[ipoint])
+                gamma * cat_dissim(centroids[1], Xcat[ipoint], X=Xcat, membship=membship)
)
membship[clust, ipoint] = 1
# Count attribute values per cluster.
@@ -292,7 +293,7 @@ def k_prototypes(X, categorical, n_clusters, max_iter, num_dissim, cat_dissim,

# All points seen in this iteration
labels, ncost = _labels_cost(Xnum, Xcat, centroids,
-                                         num_dissim, cat_dissim, gamma)
+                                         num_dissim, cat_dissim, gamma, membship)
converged = (moves == 0) or (ncost >= cost)
cost = ncost
if verbose:
@@ -311,7 +312,7 @@ def k_prototypes(X, categorical, n_clusters, max_iter, num_dissim, cat_dissim,

# Note: return gamma in case it was automatically determined.
return all_centroids[best], enc_map, all_labels[best], \
-        all_costs[best], all_n_iters[best], gamma
+        all_costs[best], all_n_iters[best], gamma, Xcat


class KPrototypes(kmodes.KModes):
@@ -332,7 +333,7 @@ class KPrototypes(kmodes.KModes):
Defaults to the Euclidian dissimilarity function.
cat_dissim : func, default: matching_dissim
-        Dissimilarity function used by the algorithm for categorical variables.
+        Dissimilarity function used by the kmodes algorithm for categorical variables.
Defaults to the matching dissimilarity function.
n_init : int, default: 10
@@ -386,7 +387,7 @@ class KPrototypes(kmodes.KModes):
"""

def __init__(self, n_clusters=8, max_iter=100, num_dissim=euclidean_dissim,
-                 cat_dissim=matching_dissim, init='Huang', n_init=10, gamma=None,
+                 cat_dissim=matching_dissim, init='Huang', n_init=10, gamma=None,
verbose=0):

super(KPrototypes, self).__init__(n_clusters, max_iter, cat_dissim,
@@ -407,7 +408,7 @@ def fit(self, X, y=None, categorical=None):
# If self.gamma is None, gamma will be automatically determined from
# the data. The function below returns its value.
self._enc_cluster_centroids, self._enc_map, self.labels_, self.cost_,\
-            self.n_iter_, self.gamma = k_prototypes(X,
+            self.n_iter_, self.gamma, self.Xcat = k_prototypes(X,
categorical,
self.n_clusters,
self.max_iter,
@@ -439,7 +440,7 @@ def predict(self, X, categorical=None):
Xnum, Xcat = check_array(Xnum), check_array(Xcat, dtype=None)
Xcat, _ = encode_features(Xcat, enc_map=self._enc_map)
return _labels_cost(Xnum, Xcat, self._enc_cluster_centroids,
-                            self.num_dissim, self.cat_dissim, self.gamma)[0]
+                            self.num_dissim, self.cat_dissim, self.gamma, genMembshipArray(self.labels_), self.Xcat)[0]

@property
def cluster_centroids_(self):
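With the kprototypes.py changes above, the Ng measure plugs in through the existing cat_dissim hook. A minimal usage sketch on made-up mixed data (toy values, not the STOCKS fixture used by the tests below); gamma is still determined automatically from the numerical columns when not given:

    import numpy as np
    from kmodes import kprototypes
    from kmodes.util.dissim import ng_dissim

    # Made-up toy data: numeric price in column 0, categorical sector/country in columns 1-2.
    data = np.array([
        [12.0, 'tech', 'USA'],
        [11.5, 'tech', 'USA'],
        [13.2, 'tech', 'CA'],
        [98.0, 'oil',  'UK'],
        [97.1, 'oil',  'UK'],
        [95.8, 'oil',  'NO'],
    ], dtype=object)

    kproto = kprototypes.KPrototypes(n_clusters=2, init='Cao', cat_dissim=ng_dissim)
    labels = kproto.fit_predict(data, categorical=[1, 2])  # columns 1 and 2 are categorical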
48 changes: 47 additions & 1 deletion kmodes/tests/test_kmodes.py
@@ -9,7 +9,7 @@
from sklearn.utils.testing import assert_equal

from kmodes import kmodes

from kmodes.util.dissim import ng_dissim

SOYBEAN = np.array([
[4,0,2,1,1,1,0,1,0,2,1,1,0,2,2,0,0,0,1,0,3,1,1,1,0,0,0,0,4,0,0,0,0,0,0,'D1'],
@@ -198,3 +198,49 @@ def test_kmodes_nunique_nclusters(self):
np.testing.assert_array_equal(kmodes_cao.cluster_centroids_,
np.array([[0, 1],
[0, 2]]))

# Test cases for Ng dissimilarity measure (Ben Andow <beandow@ncsu.edu>)
def test_kmodes_huang_soybean_ng(self):
np.random.seed(42)
kmodes_huang = kmodes.KModes(n_clusters=4, n_init=2, init='Huang', verbose=2, cat_dissim=ng_dissim)
result = kmodes_huang.fit_predict(SOYBEAN)
expected = np.array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
assert_cluster_splits_equal(result, expected)
self.assertTrue(result.dtype == np.dtype(np.uint8))

def test_kmodes_cao_soybean_ng(self):
kmodes_cao = kmodes.KModes(n_clusters=4, init='Cao', verbose=2, cat_dissim=ng_dissim)
result = kmodes_cao.fit_predict(SOYBEAN)
expected = np.array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
assert_cluster_splits_equal(result, expected)
self.assertTrue(result.dtype == np.dtype(np.uint8))

def test_kmodes_predict_soybean_ng(self):
kmodes_cao = kmodes.KModes(n_clusters=4, init='Cao', verbose=2, cat_dissim=ng_dissim)
kmodes_cao = kmodes_cao.fit(SOYBEAN)
result = kmodes_cao.predict(SOYBEAN2)
expected = np.array([2, 1, 3, 0])
assert_cluster_splits_equal(result, expected)
self.assertTrue(result.dtype == np.dtype(np.uint8))

def test_kmodes_nunique_nclusters_ng(self):
data = np.array([
[0, 1],
[0, 1],
[0, 1],
[0, 2],
[0, 2],
[0, 2]
])
np.random.seed(42)
kmodes_cao = kmodes.KModes(n_clusters=6, init='Cao', verbose=2, cat_dissim=ng_dissim)
result = kmodes_cao.fit_predict(data, categorical=[1])
expected = np.array([0, 0, 0, 1, 1, 1])
assert_cluster_splits_equal(result, expected)
np.testing.assert_array_equal(kmodes_cao.cluster_centroids_,
np.array([[0, 1],
[0, 2]]))
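A consequence visible in test_kmodes_predict_soybean_ng above: because the Ng measure weights matches by within-cluster frequencies, prediction needs training-time state. That is why fit() now keeps the encoded training data on the model (self.X) and predict() rebuilds the membership matrix from labels_ via genMembshipArray. Assuming this test module's imports and fixtures, the round trip is simply:

    km = kmodes.KModes(n_clusters=4, init='Cao', cat_dissim=ng_dissim)
    km.fit(SOYBEAN)                 # encoded training data is retained as km.X
    labels = km.predict(SOYBEAN2)   # membership is rebuilt internally from km.labels_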
49 changes: 49 additions & 0 deletions kmodes/tests/test_kprototypes.py
@@ -10,6 +10,7 @@

from kmodes import kprototypes
from kmodes.tests.test_kmodes import assert_cluster_splits_equal
from kmodes.util.dissim import ng_dissim

STOCKS = np.array([
[738.5, 'tech', 'USA'],
@@ -244,3 +245,51 @@ def test_kprotoypes_no_categoricals(self):
kproto_cao = kprototypes.KPrototypes(n_clusters=6, init='Cao', verbose=2)
with self.assertRaises(NotImplementedError):
kproto_cao.fit(STOCKS, categorical=[])


# Test cases for Ng dissimilarity measure (Ben Andow <beandow@ncsu.edu>)
def test_kprotoypes_huang_stocks_ng(self):
np.random.seed(42)
kproto_huang = kprototypes.KPrototypes(n_clusters=4, n_init=1, init='Huang', verbose=2, cat_dissim=ng_dissim)
# Untrained model
with self.assertRaises(AssertionError):
kproto_huang.predict(STOCKS, categorical=[1, 2])
result = kproto_huang.fit_predict(STOCKS, categorical=[1, 2])
expected = np.array([0, 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1])
assert_cluster_splits_equal(result, expected)
self.assertTrue(result.dtype == np.dtype(np.uint8))

def test_kprotoypes_cao_stocks_ng(self):
np.random.seed(42)
kproto_cao = kprototypes.KPrototypes(n_clusters=4, init='Cao', verbose=2, cat_dissim=ng_dissim)
result = kproto_cao.fit_predict(STOCKS, categorical=[1, 2])
expected = np.array([2, 3, 3, 3, 3, 0, 0, 0, 0, 1, 1, 1])
assert_cluster_splits_equal(result, expected)
self.assertTrue(result.dtype == np.dtype(np.uint8))

def test_kprotoypes_predict_stocks_ng(self):
np.random.seed(42)
kproto_cao = kprototypes.KPrototypes(n_clusters=4, init='Cao', verbose=2, cat_dissim=ng_dissim)
kproto_cao = kproto_cao.fit(STOCKS, categorical=[1, 2])
result = kproto_cao.predict(STOCKS2, categorical=[1, 2])
expected = np.array([1, 1, 3, 1])
assert_cluster_splits_equal(result, expected)
self.assertTrue(result.dtype == np.dtype(np.uint8))

def test_kprotoypes_init_stocks_ng(self):
init_vals = [
np.array([[356.975],
[275.35],
[738.5],
[197.667]]),
np.array([[3, 2],
[0, 2],
[3, 2],
[2, 2]])
]
np.random.seed(42)
kproto_init = kprototypes.KPrototypes(n_clusters=4, init=init_vals, verbose=2, cat_dissim=ng_dissim)
result = kproto_init.fit_predict(STOCKS, categorical=[1, 2])
expected = np.array([2, 0, 0, 0, 0, 1, 1, 1, 1, 3, 3, 3])
assert_cluster_splits_equal(result, expected)
self.assertTrue(result.dtype == np.dtype(np.uint8))
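For reference, the combined cost behind these expectations (see the _labels_cost hunk in kprototypes.py above) is num_costs + gamma * cat_costs per centroid. A tiny worked example with made-up numbers:

    import numpy as np

    num_costs = np.array([2.0, 1.0])            # Euclidean part, one entry per centroid
    cat_costs = np.array([1.0, 4.0])            # categorical part, e.g. from ng_dissim
    gamma = 0.5                                 # relates the categorical cost to the numerical cost
    tot_costs = num_costs + gamma * cat_costs   # array([2.5, 3.0]); argmin picks centroid 0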
4 changes: 4 additions & 0 deletions kmodes/util/__init__.py
@@ -71,3 +71,7 @@ def decode_centroids(encoded, mapping):
def get_unique_rows(a):
"""Gets the unique rows in a numpy array."""
return np.vstack({tuple(row) for row in a})

def genMembshipArray(labels):
"""Generates the cluster membership array"""
return np.array([[1 if v == num else 0 for v in labels ] for num in range(0, np.amax(labels) + 1)])
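A quick worked example of the helper above: row k of the result has a 1 in every column whose point carries label k.

    >>> genMembshipArray(np.array([0, 1, 1, 0, 2]))
    array([[1, 0, 0, 1, 0],
           [0, 1, 1, 0, 0],
           [0, 0, 0, 0, 1]])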
