Skip to content

Commit

Permalink
extract init methods into file
Browse files Browse the repository at this point in the history
  • Loading branch information
nicodv committed Mar 7, 2021
1 parent f2a80ab commit b65184e
Show file tree
Hide file tree
Showing 3 changed files with 69 additions and 64 deletions.
66 changes: 4 additions & 62 deletions kmodes/kmodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from .util import get_max_value_key, encode_features, get_unique_rows, \
decode_centroids, pandas_to_numpy
from .util.dissim import matching_dissim, ng_dissim
from .util.init_methods import init_cao, init_huang


class KModes(BaseEstimator, ClusterMixin):
Expand Down Expand Up @@ -332,74 +333,15 @@ def _k_modes_iter(X, centroids, cl_attr_freq, membship, dissim, random_state):
choices = [ii for ii, ch in enumerate(membship[from_clust, :]) if ch]
rindx = random_state.choice(choices)

cl_attr_freq, membship, centroids = move_point_cat(
cl_attr_freq, membship, centroids = _move_point_cat(
X[rindx], rindx, old_clust, from_clust, cl_attr_freq, membship, centroids
)

return centroids, moves


def init_huang(X, n_clusters, dissim, random_state):
    """Initialize centroids according to method by Huang [1997].

    Every attribute of every centroid is first sampled from the values that
    attribute takes in X, so a value is drawn proportionally to how often it
    occurs. Each sampled centroid is then replaced by an actual row of X that
    is close to it, because a purely sampled centroid may not resemble any
    point and could end up with an empty cluster.

    Parameters
    ----------
    X : 2-D array
        Categorical data, one row per point and one column per attribute.
    n_clusters : int
        Number of centroids to create.
    dissim : callable
        Called as ``dissim(X, centroid)``; its result is passed to
        ``np.argsort`` to rank the rows of X from closest to farthest.
    random_state : object
        Must provide ``choice(population, size)`` (as a NumPy ``RandomState``
        does).

    Returns
    -------
    centroids : object array of shape (n_clusters, n_attrs)
        Each row is a row taken from X.
    """
    n_attrs = X.shape[1]
    centroids = np.empty((n_clusters, n_attrs), dtype='object')
    for iattr in range(n_attrs):
        # Sample centroids using the probabilities of attributes: the
        # population is the raw column, so each value appears as many times
        # as its frequency count.
        # Sorting gives the population a canonical order, which the original
        # author notes keeps results consistent between Python versions.
        choices = sorted(X[:, iattr])
        centroids[:, iattr] = random_state.choice(choices, n_clusters)

    # The previously chosen centroids could result in empty clusters,
    # so set each centroid to a close point in X.
    for ik in range(n_clusters):
        ndx = np.argsort(dissim(X, centroids[ik]))
        # We want the centroid to be unique, if possible: walk through the
        # candidates (closest first) and skip any that equals a row currently
        # in `centroids` -- this includes the sampled row `ik` itself. If
        # every candidate is rejected, the last one is used.
        # A cursor is used instead of repeatedly deleting the head of `ndx`,
        # which would copy the index array for every rejected candidate.
        last = ndx.shape[0] - 1
        pos = 0
        while pos < last and np.all(X[ndx[pos]] == centroids, axis=1).any():
            pos += 1
        centroids[ik] = X[ndx[pos]]

    return centroids


def init_cao(X, n_clusters, dissim):
    """Initialize centroids according to method by Cao et al. [2009].

    The first centroid is the point with the highest density, where a
    point's density is the mean relative frequency of its attribute values.
    Every further centroid is the point that maximises the smallest
    ``dissim * density`` score with respect to the centroids chosen so far.

    Note: needs one ``dissim`` call per centroid, i.e. roughly
    O(N * attr * n_clusters).

    Parameters
    ----------
    X : 2-D array
        Categorical data, one row per point and one column per attribute.
        Attribute values must be hashable.
    n_clusters : int
        Number of centroids to create.
    dissim : callable
        Called as ``dissim(X, centroid)``; must return one dissimilarity per
        row of X.

    Returns
    -------
    centroids : object array of shape (n_clusters, n_attrs)
        Each row is a row taken from X.
    """
    n_points, n_attrs = X.shape
    centroids = np.empty((n_clusters, n_attrs), dtype='object')

    # Method is based on determining density of points.
    dens = np.zeros(n_points)
    for iattr in range(n_attrs):
        column = X[:, iattr]
        freq = defaultdict(int)
        for val in column:
            freq[val] += 1
        counts = np.array([freq[val] for val in column], dtype=float)
        # Same arithmetic order as the scalar form
        # (count / n_points / n_attrs), just applied to the whole column.
        dens += counts / float(n_points) / float(n_attrs)

    # Choose initial centroids based on distance and density.
    centroids[0] = X[np.argmax(dens)]

    # For the remaining centroids, choose maximum dens * dissim to the
    # (already assigned) centroid with the lowest dens * dissim.
    # Only the newest centroid can change the per-point minimum, so a
    # running minimum is kept rather than rebuilding the whole
    # (n_centroids, n_points) table on every iteration.
    min_dd = None
    for ik in range(1, n_clusters):
        dd = np.asarray(dissim(X, centroids[ik - 1]) * dens, dtype=float)
        min_dd = dd if min_dd is None else np.minimum(min_dd, dd)
        centroids[ik] = X[np.argmax(min_dd)]

    return centroids


def move_point_cat(point, ipoint, to_clust, from_clust, cl_attr_freq,
membship, centroids):
def _move_point_cat(point, ipoint, to_clust, from_clust, cl_attr_freq,
membship, centroids):
"""Move point between clusters, categorical attributes."""
membship[to_clust, ipoint] = 1
membship[from_clust, ipoint] = 0
Expand Down
5 changes: 3 additions & 2 deletions kmodes/kprototypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from .util import get_max_value_key, encode_features, get_unique_rows, \
decode_centroids, pandas_to_numpy
from .util.dissim import matching_dissim, euclidean_dissim
from .util.init_methods import init_cao, init_huang

# Number of tries we give the initialization methods to find non-empty
# clusters before we switch to random initialization.
Expand Down Expand Up @@ -296,9 +297,9 @@ def k_prototypes_single(Xnum, Xcat, nnumattrs, ncatattrs, n_clusters, n_points,
if verbose:
print("Init: initializing centroids")
if isinstance(init, str) and init.lower() == 'huang':
centroids = kmodes.init_huang(Xcat, n_clusters, cat_dissim, random_state)
centroids = init_huang(Xcat, n_clusters, cat_dissim, random_state)
elif isinstance(init, str) and init.lower() == 'cao':
centroids = kmodes.init_cao(Xcat, n_clusters, cat_dissim)
centroids = init_cao(Xcat, n_clusters, cat_dissim)
elif isinstance(init, str) and init.lower() == 'random':
seeds = random_state.choice(range(n_points), n_clusters)
centroids = Xcat[seeds]
Expand Down
62 changes: 62 additions & 0 deletions kmodes/util/init_methods.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
from collections import defaultdict

import numpy as np


def init_huang(X, n_clusters, dissim, random_state):
    """Initialize centroids according to method by Huang [1997].

    Every attribute of every centroid is first sampled from the values that
    attribute takes in X, so a value is drawn proportionally to how often it
    occurs. Each sampled centroid is then replaced by an actual row of X that
    is close to it, because a purely sampled centroid may not resemble any
    point and could end up with an empty cluster.

    Parameters
    ----------
    X : 2-D array
        Categorical data, one row per point and one column per attribute.
    n_clusters : int
        Number of centroids to create.
    dissim : callable
        Called as ``dissim(X, centroid)``; its result is passed to
        ``np.argsort`` to rank the rows of X from closest to farthest.
    random_state : object
        Must provide ``choice(population, size)`` (as a NumPy ``RandomState``
        does).

    Returns
    -------
    centroids : object array of shape (n_clusters, n_attrs)
        Each row is a row taken from X.
    """
    n_attrs = X.shape[1]
    centroids = np.empty((n_clusters, n_attrs), dtype='object')
    for iattr in range(n_attrs):
        # Sample centroids using the probabilities of attributes: the
        # population is the raw column, so each value appears as many times
        # as its frequency count.
        # Sorting gives the population a canonical order, which the original
        # author notes keeps results consistent between Python versions.
        choices = sorted(X[:, iattr])
        centroids[:, iattr] = random_state.choice(choices, n_clusters)

    # The previously chosen centroids could result in empty clusters,
    # so set each centroid to a close point in X.
    for ik in range(n_clusters):
        ndx = np.argsort(dissim(X, centroids[ik]))
        # We want the centroid to be unique, if possible: walk through the
        # candidates (closest first) and skip any that equals a row currently
        # in `centroids` -- this includes the sampled row `ik` itself. If
        # every candidate is rejected, the last one is used.
        # A cursor is used instead of repeatedly deleting the head of `ndx`,
        # which would copy the index array for every rejected candidate.
        last = ndx.shape[0] - 1
        pos = 0
        while pos < last and np.all(X[ndx[pos]] == centroids, axis=1).any():
            pos += 1
        centroids[ik] = X[ndx[pos]]

    return centroids


def init_cao(X, n_clusters, dissim):
    """Initialize centroids according to method by Cao et al. [2009].

    The first centroid is the point with the highest density, where a
    point's density is the mean relative frequency of its attribute values.
    Every further centroid is the point that maximises the smallest
    ``dissim * density`` score with respect to the centroids chosen so far.

    Note: needs one ``dissim`` call per centroid, i.e. roughly
    O(N * attr * n_clusters).

    Parameters
    ----------
    X : 2-D array
        Categorical data, one row per point and one column per attribute.
        Attribute values must be hashable.
    n_clusters : int
        Number of centroids to create.
    dissim : callable
        Called as ``dissim(X, centroid)``; must return one dissimilarity per
        row of X.

    Returns
    -------
    centroids : object array of shape (n_clusters, n_attrs)
        Each row is a row taken from X.
    """
    n_points, n_attrs = X.shape
    centroids = np.empty((n_clusters, n_attrs), dtype='object')

    # Method is based on determining density of points.
    dens = np.zeros(n_points)
    for iattr in range(n_attrs):
        column = X[:, iattr]
        freq = defaultdict(int)
        for val in column:
            freq[val] += 1
        counts = np.array([freq[val] for val in column], dtype=float)
        # Same arithmetic order as the scalar form
        # (count / n_points / n_attrs), just applied to the whole column.
        dens += counts / float(n_points) / float(n_attrs)

    # Choose initial centroids based on distance and density.
    centroids[0] = X[np.argmax(dens)]

    # For the remaining centroids, choose maximum dens * dissim to the
    # (already assigned) centroid with the lowest dens * dissim.
    # Only the newest centroid can change the per-point minimum, so a
    # running minimum is kept rather than rebuilding the whole
    # (n_centroids, n_points) table on every iteration.
    min_dd = None
    for ik in range(1, n_clusters):
        dd = np.asarray(dissim(X, centroids[ik - 1]) * dens, dtype=float)
        min_dd = dd if min_dd is None else np.minimum(min_dd, dd)
        centroids[ik] = X[np.argmax(min_dd)]

    return centroids

0 comments on commit b65184e

Please sign in to comment.