extract init methods into file

nicodv · Mar 7, 2021 · b65184e · b65184e
1 parent f2a80ab
commit b65184e
Show file tree

Hide file tree

Showing 3 changed files with 69 additions and 64 deletions.
diff --git a/kmodes/kmodes.py b/kmodes/kmodes.py
@@ -16,6 +16,7 @@
 from .util import get_max_value_key, encode_features, get_unique_rows, \
     decode_centroids, pandas_to_numpy
 from .util.dissim import matching_dissim, ng_dissim
+from .util.init_methods import init_cao, init_huang
 
 
 class KModes(BaseEstimator, ClusterMixin):
@@ -332,74 +333,15 @@ def _k_modes_iter(X, centroids, cl_attr_freq, membship, dissim, random_state):
             choices = [ii for ii, ch in enumerate(membship[from_clust, :]) if ch]
             rindx = random_state.choice(choices)
 
-            cl_attr_freq, membship, centroids = move_point_cat(
+            cl_attr_freq, membship, centroids = _move_point_cat(
                 X[rindx], rindx, old_clust, from_clust, cl_attr_freq, membship, centroids
             )
 
     return centroids, moves
 
 
-def init_huang(X, n_clusters, dissim, random_state):
-    """Initialize centroids according to method by Huang [1997]."""
-    n_attrs = X.shape[1]
-    centroids = np.empty((n_clusters, n_attrs), dtype='object')
-    # determine frequencies of attributes
-    for iattr in range(n_attrs):
-        # Sample centroids using the probabilities of attributes.
-        # (I assume that's what's meant in the Huang [1998] paper; it works,
-        # at least)
-        # Note: sampling using population in static list with as many choices
-        # as frequency counts. Since the counts are small integers,
-        # memory consumption is low.
-        choices = X[:, iattr]
-        # So that we are consistent between Python versions,
-        # each with different dict ordering.
-        choices = sorted(choices)
-        centroids[:, iattr] = random_state.choice(choices, n_clusters)
-    # The previously chosen centroids could result in empty clusters,
-    # so set centroid to closest point in X.
-    for ik in range(n_clusters):
-        ndx = np.argsort(dissim(X, centroids[ik]))
-        # We want the centroid to be unique, if possible.
-        while np.all(X[ndx[0]] == centroids, axis=1).any() and ndx.shape[0] > 1:
-            ndx = np.delete(ndx, 0)
-        centroids[ik] = X[ndx[0]]
-
-    return centroids
-
-
-def init_cao(X, n_clusters, dissim):
-    """Initialize centroids according to method by Cao et al. [2009].
-
-    Note: O(N * attr * n_clusters**2), so watch out with large n_clusters
-    """
-    n_points, n_attrs = X.shape
-    centroids = np.empty((n_clusters, n_attrs), dtype='object')
-    # Method is based on determining density of points.
-    dens = np.zeros(n_points)
-    for iattr in range(n_attrs):
-        freq = defaultdict(int)
-        for val in X[:, iattr]:
-            freq[val] += 1
-        for ipoint in range(n_points):
-            dens[ipoint] += freq[X[ipoint, iattr]] / float(n_points) / float(n_attrs)
-
-    # Choose initial centroids based on distance and density.
-    centroids[0] = X[np.argmax(dens)]
-    if n_clusters > 1:
-        # For the remaining centroids, choose maximum dens * dissim to the
-        # (already assigned) centroid with the lowest dens * dissim.
-        for ik in range(1, n_clusters):
-            dd = np.empty((ik, n_points))
-            for ikk in range(ik):
-                dd[ikk] = dissim(X, centroids[ikk]) * dens
-            centroids[ik] = X[np.argmax(np.min(dd, axis=0))]
-
-    return centroids
-
-
-def move_point_cat(point, ipoint, to_clust, from_clust, cl_attr_freq,
-                   membship, centroids):
+def _move_point_cat(point, ipoint, to_clust, from_clust, cl_attr_freq,
+                    membship, centroids):
     """Move point between clusters, categorical attributes."""
     membship[to_clust, ipoint] = 1
     membship[from_clust, ipoint] = 0

diff --git a/kmodes/kprototypes.py b/kmodes/kprototypes.py
@@ -16,6 +16,7 @@
 from .util import get_max_value_key, encode_features, get_unique_rows, \
     decode_centroids, pandas_to_numpy
 from .util.dissim import matching_dissim, euclidean_dissim
+from .util.init_methods import init_cao, init_huang
 
 # Number of tries we give the initialization methods to find non-empty
 # clusters before we switch to random initialization.
@@ -296,9 +297,9 @@ def k_prototypes_single(Xnum, Xcat, nnumattrs, ncatattrs, n_clusters, n_points,
         if verbose:
             print("Init: initializing centroids")
         if isinstance(init, str) and init.lower() == 'huang':
-            centroids = kmodes.init_huang(Xcat, n_clusters, cat_dissim, random_state)
+            centroids = init_huang(Xcat, n_clusters, cat_dissim, random_state)
         elif isinstance(init, str) and init.lower() == 'cao':
-            centroids = kmodes.init_cao(Xcat, n_clusters, cat_dissim)
+            centroids = init_cao(Xcat, n_clusters, cat_dissim)
         elif isinstance(init, str) and init.lower() == 'random':
             seeds = random_state.choice(range(n_points), n_clusters)
             centroids = Xcat[seeds]

diff --git a/kmodes/util/init_methods.py b/kmodes/util/init_methods.py
@@ -0,0 +1,62 @@
+from collections import defaultdict
+
+import numpy as np
+
+
+def init_huang(X, n_clusters, dissim, random_state):
+    """Initialize centroids according to method by Huang [1997].
+
+    Each centroid attribute is sampled from the matching column of X; every
+    centroid is then replaced by the closest row of X according to dissim.
+    """
+    n_attrs = X.shape[1]
+    centroids = np.empty((n_clusters, n_attrs), dtype='object')
+    for iattr in range(n_attrs):
+        # Sample centroids using the probabilities of attributes: the whole
+        # column is the population, so a value occurs as often as its count.
+        # (I assume that's what's meant in the Huang [1998] paper.)
+        choices = X[:, iattr]
+        # NOTE(review): the sort presumably makes the draw independent of
+        # the row order of X -- confirm.
+        choices = sorted(choices)
+        centroids[:, iattr] = random_state.choice(choices, n_clusters)
+    # The previously chosen centroids could result in empty clusters,
+    # so set centroid to closest point in X.
+    for ik in range(n_clusters):
+        ndx = np.argsort(dissim(X, centroids[ik]))
+        # Prefer a row equal to no row of centroids (centroids[ik] included).
+        while np.all(X[ndx[0]] == centroids, axis=1).any() and ndx.shape[0] > 1:
+            ndx = np.delete(ndx, 0)
+        centroids[ik] = X[ndx[0]]
+
+    return centroids
+
+
+def init_cao(X, n_clusters, dissim):
+    """Initialize centroids according to method by Cao et al. [2009].
+
+    The first centroid is the row of X with the highest density; each next
+    one maximizes the minimum of dens * dissim over the chosen centroids.
+    Note: O(N * attr * n_clusters**2), so watch out with large n_clusters
+    """
+    n_points, n_attrs = X.shape
+    centroids = np.empty((n_clusters, n_attrs), dtype='object')
+    # Density of a point: mean relative frequency of its attribute values.
+    dens = np.zeros(n_points)
+    for iattr in range(n_attrs):
+        freq = defaultdict(int)
+        for val in X[:, iattr]:
+            freq[val] += 1
+        for ipoint in range(n_points):
+            dens[ipoint] += freq[X[ipoint, iattr]] / float(n_points) / float(n_attrs)
+
+    centroids[0] = X[np.argmax(dens)]
+    if n_clusters > 1:
+        for ik in range(1, n_clusters):
+            # Row ikk of dd: dens * dissim of every point to centroid ikk.
+            dd = np.empty((ik, n_points))
+            for ikk in range(ik):
+                dd[ikk] = dissim(X, centroids[ikk]) * dens
+            centroids[ik] = X[np.argmax(np.min(dd, axis=0))]
+
+    return centroids