
Commit

Merge ef93f15 into a30da80
benandow committed Jun 9, 2017
2 parents a30da80 + ef93f15 commit 2f3fc30
Showing 7 changed files with 212 additions and 30 deletions.
31 changes: 17 additions & 14 deletions kmodes/kmodes.py
@@ -4,6 +4,7 @@

# Author: 'Nico de Vos' <njdevos@gmail.com>
# License: MIT
# Added support for Ng et al.'s dissimilarity measure (Ben Andow <beandow@ncsu.edu>)

# pylint: disable=unused-argument,attribute-defined-outside-init

@@ -14,13 +15,13 @@
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils.validation import check_array

-from .util import get_max_value_key, encode_features, get_unique_rows, decode_centroids
+from .util import get_max_value_key, encode_features, get_unique_rows, decode_centroids, genMembshipArray
from .util.dissim import matching_dissim


def init_huang(X, n_clusters, dissim):
"""Initialize centroids according to method by Huang [1997]."""
-    nattrs = X.shape[1]
+    npoints, nattrs = X.shape
centroids = np.empty((n_clusters, nattrs), dtype='object')
# determine frequencies of attributes
for iattr in range(nattrs):
@@ -38,10 +39,11 @@ def init_huang(X, n_clusters, dissim):
# each with different dict ordering.
choices = sorted(choices)
centroids[:, iattr] = np.random.choice(choices, n_clusters)
membship = np.zeros((npoints, npoints), dtype=np.uint8)
# The previously chosen centroids could result in empty clusters,
# so set centroid to closest point in X.
for ik in range(n_clusters):
-        ndx = np.argsort(dissim(X, centroids[ik]))
+        ndx = np.argsort(dissim(X, centroids[ik], X=X, membship=membship))
# We want the centroid to be unique.
while np.all(X[ndx[0]] == centroids, axis=1).any():
ndx = np.delete(ndx, 0)
@@ -57,7 +59,7 @@ def init_cao(X, n_clusters, dissim):
"""
npoints, nattrs = X.shape
centroids = np.empty((n_clusters, nattrs), dtype='object')
-    # Method is base don determining density of points.
+    # Method is based on determining density of points.
dens = np.zeros(npoints)
for iattr in range(nattrs):
freq = defaultdict(int)
@@ -72,10 +74,11 @@ def init_cao(X, n_clusters, dissim):
if n_clusters > 1:
# For the remaining centroids, choose maximum dens * dissim to the
# (already assigned) centroid with the lowest dens * dissim.
membship = np.zeros((npoints, npoints), dtype=np.uint8)
for ik in range(1, n_clusters):
dd = np.empty((ik, npoints))
for ikk in range(ik):
-                dd[ikk] = dissim(X, centroids[ikk]) * dens
+                dd[ikk] = dissim(X, centroids[ikk], X=X, membship=membship) * dens
centroids[ik] = X[np.argmax(np.min(dd, axis=0))]

return centroids
@@ -113,7 +116,7 @@ def move_point_cat(point, ipoint, to_clust, from_clust, cl_attr_freq,
return cl_attr_freq, membship, centroids


-def _labels_cost(X, centroids, dissim):
+def _labels_cost(X, centroids, dissim, membship, Xinit=None):
"""Calculate labels and cost function given a matrix of points and
a list of centroids for the k-modes algorithm.
"""
@@ -124,7 +127,7 @@ def _labels_cost(X, centroids, dissim):
cost = 0.
labels = np.empty(npoints, dtype=np.uint8)
for ipoint, curpoint in enumerate(X):
-        diss = dissim(centroids, curpoint)
+        diss = dissim(centroids, curpoint, X=(X if Xinit is None else Xinit), membship=membship)
clust = np.argmin(diss)
labels[ipoint] = clust
cost += diss[clust]
@@ -136,7 +139,7 @@ def _k_modes_iter(X, centroids, cl_attr_freq, membship, dissim):
"""Single iteration of k-modes clustering algorithm"""
moves = 0
for ipoint, curpoint in enumerate(X):
-        clust = np.argmin(dissim(centroids, curpoint))
+        clust = np.argmin(dissim(centroids, curpoint, X=X, membship=membship))
if membship[clust, ipoint]:
# Point is already in its right place.
continue
@@ -231,7 +234,7 @@ def k_modes(X, n_clusters, max_iter, dissim, init, n_init, verbose):
for _ in range(n_clusters)]
for ipoint, curpoint in enumerate(X):
# Initial assignment to clusters
-            clust = np.argmin(dissim(centroids, curpoint))
+            clust = np.argmin(dissim(centroids, curpoint, X=X, membship=membship))
membship[clust, ipoint] = 1
# Count attribute values per cluster.
for iattr, curattr in enumerate(curpoint):
@@ -255,7 +258,7 @@ def k_modes(X, n_clusters, max_iter, dissim, init, n_init, verbose):
itr += 1
centroids, moves = _k_modes_iter(X, centroids, cl_attr_freq, membship, dissim)
# All points seen in this iteration
-            labels, ncost = _labels_cost(X, centroids, dissim)
+            labels, ncost = _labels_cost(X, centroids, dissim, membship)
converged = (moves == 0) or (ncost >= cost)
cost = ncost
if verbose:
@@ -273,7 +276,7 @@ def k_modes(X, n_clusters, max_iter, dissim, init, n_init, verbose):
print("Best run was number {}".format(best + 1))

return all_centroids[best], enc_map, all_labels[best], \
-        all_costs[best], all_n_iters[best]
+        all_costs[best], all_n_iters[best], X


class KModes(BaseEstimator, ClusterMixin):
@@ -291,7 +294,7 @@ class KModes(BaseEstimator, ClusterMixin):
single run.
cat_dissim : func, default: matching_dissim
-        Dissimilarity function used by the algorithm for categorical variables.
+        Dissimilarity function used by the kmodes algorithm for categorical variables.
Defaults to the matching dissimilarity function.
init : {'Huang', 'Cao', 'random' or an ndarray}, default: 'Cao'
@@ -360,7 +363,7 @@ def fit(self, X, y=None, **kwargs):
"""

self._enc_cluster_centroids, self._enc_map, self.labels_,\
-            self.cost_, self.n_iter_ = k_modes(X,
+            self.cost_, self.n_iter_, self.X = k_modes(X,
self.n_clusters,
self.max_iter,
self.cat_dissim,
@@ -393,7 +396,7 @@ def predict(self, X, **kwargs):
assert hasattr(self, '_enc_cluster_centroids'), "Model not yet fitted."
X = check_array(X, dtype=None)
X, _ = encode_features(X, enc_map=self._enc_map)
-        return _labels_cost(X, self._enc_cluster_centroids, self.cat_dissim)[0]
+        return _labels_cost(X, self._enc_cluster_centroids, self.cat_dissim, genMembshipArray(self.labels_), self.X)[0]

@property
def cluster_centroids_(self):
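The thread running through the kmodes.py changes above is that every categorical dissimilarity call now also receives the data matrix X and the cluster-membership matrix membship, which is what a frequency-weighted measure like Ng et al.'s needs. As a rough, illustrative sketch only (not the library's ng_dissim, which lives in kmodes/util/dissim.py and is not part of this diff), a measure with the widened signature could look like this for the (centroids, point) call pattern, assuming membship has one row per cluster; the real implementation must also cope with the initialization calls above, where membship is still all zeros:

    import numpy as np

    def ng_dissim_sketch(centroids, point, X=None, membship=None):
        """Illustrative Ng-style dissimilarity between each centroid and `point`.

        A mismatching attribute costs 1; a matching attribute costs
        1 - (# of cluster members sharing that value) / (cluster size).
        """
        costs = np.zeros(centroids.shape[0])
        for k, centroid in enumerate(centroids):
            members = X[membship[k] == 1]       # points currently assigned to cluster k
            n_k = max(members.shape[0], 1)      # guard against empty clusters
            for j, (pj, cj) in enumerate(zip(point, centroid)):
                if pj != cj:
                    costs[k] += 1.
                else:
                    costs[k] += 1. - np.sum(members[:, j] == cj) / float(n_k)
        return costs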
23 changes: 12 additions & 11 deletions kmodes/kprototypes.py
@@ -4,6 +4,7 @@

# Author: 'Nico de Vos' <njdevos@gmail.com>
# License: MIT
# Added support for Ng et al.'s dissimilarity measure (Ben Andow <beandow@ncsu.edu>)

# pylint: disable=super-on-old-class,unused-argument,attribute-defined-outside-init

@@ -14,7 +15,7 @@
from sklearn.utils.validation import check_array

from . import kmodes
-from .util import get_max_value_key, encode_features, get_unique_rows, decode_centroids
+from .util import get_max_value_key, encode_features, get_unique_rows, decode_centroids, genMembshipArray
from .util.dissim import matching_dissim, euclidean_dissim

# Number of tries we give the initialization methods to find non-empty
@@ -49,7 +50,7 @@ def _split_num_cat(X, categorical):
return Xnum, Xcat


-def _labels_cost(Xnum, Xcat, centroids, num_dissim, cat_dissim, gamma):
+def _labels_cost(Xnum, Xcat, centroids, num_dissim, cat_dissim, gamma, membship, XcatInit=None):
"""Calculate labels and cost function given a matrix of points and
a list of centroids for the k-prototypes algorithm.
"""
@@ -62,7 +63,7 @@ def _labels_cost(Xnum, Xcat, centroids, num_dissim, cat_dissim, gamma):
for ipoint in range(npoints):
# Numerical cost = sum of Euclidean distances
num_costs = num_dissim(centroids[0], Xnum[ipoint])
-        cat_costs = cat_dissim(centroids[1], Xcat[ipoint])
+        cat_costs = cat_dissim(centroids[1], Xcat[ipoint], X=(Xcat if XcatInit is None else XcatInit), membship=membship)
# Gamma relates the categorical cost to the numerical cost.
tot_costs = num_costs + gamma * cat_costs
clust = np.argmin(tot_costs)
@@ -79,7 +80,7 @@ def _k_prototypes_iter(Xnum, Xcat, centroids, cl_attr_sum, cl_attr_freq,
for ipoint in range(Xnum.shape[0]):
clust = np.argmin(
num_dissim(centroids[0], Xnum[ipoint]) +
-            gamma * cat_dissim(centroids[1], Xcat[ipoint])
+            gamma * cat_dissim(centroids[1], Xcat[ipoint], X=Xcat, membship=membship)
)
if membship[clust, ipoint]:
# Point is already in its right place.
@@ -246,7 +247,7 @@ def k_prototypes(X, categorical, n_clusters, max_iter, num_dissim, cat_dissim,
# Initial assignment to clusters
clust = np.argmin(
num_dissim(centroids[0], Xnum[ipoint]) +
-                gamma * cat_dissim(centroids[1], Xcat[ipoint])
+                gamma * cat_dissim(centroids[1], Xcat[ipoint], X=Xcat, membship=membship)
)
membship[clust, ipoint] = 1
# Count attribute values per cluster.
@@ -292,7 +293,7 @@ def k_prototypes(X, categorical, n_clusters, max_iter, num_dissim, cat_dissim,

# All points seen in this iteration
labels, ncost = _labels_cost(Xnum, Xcat, centroids,
-                                         num_dissim, cat_dissim, gamma)
+                                         num_dissim, cat_dissim, gamma, membship)
converged = (moves == 0) or (ncost >= cost)
cost = ncost
if verbose:
@@ -311,7 +312,7 @@ def k_prototypes(X, categorical, n_clusters, max_iter, num_dissim, cat_dissim,

# Note: return gamma in case it was automatically determined.
return all_centroids[best], enc_map, all_labels[best], \
-        all_costs[best], all_n_iters[best], gamma
+        all_costs[best], all_n_iters[best], gamma, Xcat


class KPrototypes(kmodes.KModes):
@@ -332,7 +333,7 @@ class KPrototypes(kmodes.KModes):
Defaults to the Euclidian dissimilarity function.
cat_dissim : func, default: matching_dissim
-        Dissimilarity function used by the algorithm for categorical variables.
+        Dissimilarity function used by the kmodes algorithm for categorical variables.
Defaults to the matching dissimilarity function.
n_init : int, default: 10
@@ -386,7 +387,7 @@ class KPrototypes(kmodes.KModes):
"""

def __init__(self, n_clusters=8, max_iter=100, num_dissim=euclidean_dissim,
-                 cat_dissim=matching_dissim, init='Huang', n_init=10, gamma=None,
+                 cat_dissim=matching_dissim, init='Huang', n_init=10, gamma=None,
verbose=0):

super(KPrototypes, self).__init__(n_clusters, max_iter, cat_dissim,
@@ -407,7 +408,7 @@ def fit(self, X, y=None, categorical=None):
# If self.gamma is None, gamma will be automatically determined from
# the data. The function below returns its value.
self._enc_cluster_centroids, self._enc_map, self.labels_, self.cost_,\
-            self.n_iter_, self.gamma = k_prototypes(X,
+            self.n_iter_, self.gamma, self.Xcat = k_prototypes(X,
categorical,
self.n_clusters,
self.max_iter,
@@ -439,7 +440,7 @@ def predict(self, X, categorical=None):
Xnum, Xcat = check_array(Xnum), check_array(Xcat, dtype=None)
Xcat, _ = encode_features(Xcat, enc_map=self._enc_map)
return _labels_cost(Xnum, Xcat, self._enc_cluster_centroids,
-                            self.num_dissim, self.cat_dissim, self.gamma)[0]
+                            self.num_dissim, self.cat_dissim, self.gamma, genMembshipArray(self.labels_), self.Xcat)[0]

@property
def cluster_centroids_(self):
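With the kprototypes.py changes above, the Ng measure plugs in through the existing cat_dissim hook. A minimal usage sketch on made-up mixed data (toy values, not the STOCKS fixture used by the tests below); gamma is still determined automatically from the numerical columns when not given:

    import numpy as np
    from kmodes import kprototypes
    from kmodes.util.dissim import ng_dissim

    # Made-up toy data: numeric price in column 0, categorical sector/country in columns 1-2.
    data = np.array([
        [12.0, 'tech', 'USA'],
        [11.5, 'tech', 'USA'],
        [13.2, 'tech', 'CA'],
        [98.0, 'oil',  'UK'],
        [97.1, 'oil',  'UK'],
        [95.8, 'oil',  'NO'],
    ], dtype=object)

    kproto = kprototypes.KPrototypes(n_clusters=2, init='Cao', cat_dissim=ng_dissim)
    labels = kproto.fit_predict(data, categorical=[1, 2])  # columns 1 and 2 are categorical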
48 changes: 47 additions & 1 deletion kmodes/tests/test_kmodes.py
@@ -9,7 +9,7 @@
from sklearn.utils.testing import assert_equal

from kmodes import kmodes

from kmodes.util.dissim import ng_dissim

SOYBEAN = np.array([
[4,0,2,1,1,1,0,1,0,2,1,1,0,2,2,0,0,0,1,0,3,1,1,1,0,0,0,0,4,0,0,0,0,0,0,'D1'],
@@ -198,3 +198,49 @@ def test_kmodes_nunique_nclusters(self):
np.testing.assert_array_equal(kmodes_cao.cluster_centroids_,
np.array([[0, 1],
[0, 2]]))

# Test cases for Ng dissimilarity measure (Ben Andow <beandow@ncsu.edu>)
def test_kmodes_huang_soybean_ng(self):
np.random.seed(42)
kmodes_huang = kmodes.KModes(n_clusters=4, n_init=2, init='Huang', verbose=2, cat_dissim=ng_dissim)
result = kmodes_huang.fit_predict(SOYBEAN)
expected = np.array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
assert_cluster_splits_equal(result, expected)
self.assertTrue(result.dtype == np.dtype(np.uint8))

def test_kmodes_cao_soybean_ng(self):
kmodes_cao = kmodes.KModes(n_clusters=4, init='Cao', verbose=2, cat_dissim=ng_dissim)
result = kmodes_cao.fit_predict(SOYBEAN)
expected = np.array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
assert_cluster_splits_equal(result, expected)
self.assertTrue(result.dtype == np.dtype(np.uint8))

def test_kmodes_predict_soybean_ng(self):
kmodes_cao = kmodes.KModes(n_clusters=4, init='Cao', verbose=2, cat_dissim=ng_dissim)
kmodes_cao = kmodes_cao.fit(SOYBEAN)
result = kmodes_cao.predict(SOYBEAN2)
expected = np.array([2, 1, 3, 0])
assert_cluster_splits_equal(result, expected)
self.assertTrue(result.dtype == np.dtype(np.uint8))

def test_kmodes_nunique_nclusters_ng(self):
data = np.array([
[0, 1],
[0, 1],
[0, 1],
[0, 2],
[0, 2],
[0, 2]
])
np.random.seed(42)
kmodes_cao = kmodes.KModes(n_clusters=6, init='Cao', verbose=2, cat_dissim=ng_dissim)
result = kmodes_cao.fit_predict(data, categorical=[1])
expected = np.array([0, 0, 0, 1, 1, 1])
assert_cluster_splits_equal(result, expected)
np.testing.assert_array_equal(kmodes_cao.cluster_centroids_,
np.array([[0, 1],
[0, 2]]))
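A consequence visible in test_kmodes_predict_soybean_ng above: because the Ng measure weights matches by within-cluster frequencies, prediction needs training-time state. That is why fit() now keeps the encoded training data on the model (self.X) and predict() rebuilds the membership matrix from labels_ via genMembshipArray. Assuming this test module's imports and fixtures, the round trip is simply:

    km = kmodes.KModes(n_clusters=4, init='Cao', cat_dissim=ng_dissim)
    km.fit(SOYBEAN)                 # encoded training data is retained as km.X
    labels = km.predict(SOYBEAN2)   # membership is rebuilt internally from km.labels_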
49 changes: 49 additions & 0 deletions kmodes/tests/test_kprototypes.py
@@ -10,6 +10,7 @@

from kmodes import kprototypes
from kmodes.tests.test_kmodes import assert_cluster_splits_equal
from kmodes.util.dissim import ng_dissim

STOCKS = np.array([
[738.5, 'tech', 'USA'],
@@ -244,3 +245,51 @@ def test_kprotoypes_no_categoricals(self):
kproto_cao = kprototypes.KPrototypes(n_clusters=6, init='Cao', verbose=2)
with self.assertRaises(NotImplementedError):
kproto_cao.fit(STOCKS, categorical=[])


# Test cases for Ng dissimilarity measure (Ben Andow <beandow@ncsu.edu>)
def test_kprotoypes_huang_stocks_ng(self):
np.random.seed(42)
kproto_huang = kprototypes.KPrototypes(n_clusters=4, n_init=1, init='Huang', verbose=2, cat_dissim=ng_dissim)
# Untrained model
with self.assertRaises(AssertionError):
kproto_huang.predict(STOCKS, categorical=[1, 2])
result = kproto_huang.fit_predict(STOCKS, categorical=[1, 2])
expected = np.array([0, 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1])
assert_cluster_splits_equal(result, expected)
self.assertTrue(result.dtype == np.dtype(np.uint8))

def test_kprotoypes_cao_stocks_ng(self):
np.random.seed(42)
kproto_cao = kprototypes.KPrototypes(n_clusters=4, init='Cao', verbose=2, cat_dissim=ng_dissim)
result = kproto_cao.fit_predict(STOCKS, categorical=[1, 2])
expected = np.array([2, 3, 3, 3, 3, 0, 0, 0, 0, 1, 1, 1])
assert_cluster_splits_equal(result, expected)
self.assertTrue(result.dtype == np.dtype(np.uint8))

def test_kprotoypes_predict_stocks_ng(self):
np.random.seed(42)
kproto_cao = kprototypes.KPrototypes(n_clusters=4, init='Cao', verbose=2, cat_dissim=ng_dissim)
kproto_cao = kproto_cao.fit(STOCKS, categorical=[1, 2])
result = kproto_cao.predict(STOCKS2, categorical=[1, 2])
expected = np.array([1, 1, 3, 1])
assert_cluster_splits_equal(result, expected)
self.assertTrue(result.dtype == np.dtype(np.uint8))

def test_kprotoypes_init_stocks_ng(self):
init_vals = [
np.array([[356.975],
[275.35],
[738.5],
[197.667]]),
np.array([[3, 2],
[0, 2],
[3, 2],
[2, 2]])
]
np.random.seed(42)
kproto_init = kprototypes.KPrototypes(n_clusters=4, init=init_vals, verbose=2, cat_dissim=ng_dissim)
result = kproto_init.fit_predict(STOCKS, categorical=[1, 2])
expected = np.array([2, 0, 0, 0, 0, 1, 1, 1, 1, 3, 3, 3])
assert_cluster_splits_equal(result, expected)
self.assertTrue(result.dtype == np.dtype(np.uint8))
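For reference, the combined cost behind these expectations (see the _labels_cost hunk in kprototypes.py above) is num_costs + gamma * cat_costs per centroid. A tiny worked example with made-up numbers:

    import numpy as np

    num_costs = np.array([2.0, 1.0])            # Euclidean part, one entry per centroid
    cat_costs = np.array([1.0, 4.0])            # categorical part, e.g. from ng_dissim
    gamma = 0.5                                 # relates the categorical cost to the numerical cost
    tot_costs = num_costs + gamma * cat_costs   # array([2.5, 3.0]); argmin picks centroid 0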
4 changes: 4 additions & 0 deletions kmodes/util/__init__.py
@@ -71,3 +71,7 @@ def decode_centroids(encoded, mapping):
def get_unique_rows(a):
"""Gets the unique rows in a numpy array."""
return np.vstack({tuple(row) for row in a})

def genMembshipArray(labels):
"""Generates the cluster membership array"""
return np.array([[1 if v == num else 0 for v in labels ] for num in range(0, np.amax(labels) + 1)])
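A quick worked example of the helper above: row k of the result has a 1 in every column whose point carries label k.

    >>> genMembshipArray(np.array([0, 1, 1, 0, 2]))
    array([[1, 0, 0, 1, 0],
           [0, 1, 1, 0, 0],
           [0, 0, 0, 0, 1]])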
