Merge 9fe5b6d into 2bc7fce

nicodv · Jul 26, 2019 · 576264c · 576264c
2 parents 2bc7fce + 9fe5b6d
commit 576264c
Show file tree

Hide file tree

Showing 3 changed files with 240 additions and 1 deletion.
diff --git a/kmodes/tests/test_kmodes.py b/kmodes/tests/test_kmodes.py
@@ -9,7 +9,7 @@
 from sklearn.utils.testing import assert_equal
 
 from kmodes.kmodes import KModes
-from kmodes.util.dissim import ng_dissim
+from kmodes.util.dissim import ng_dissim, jaccard_dissim_binary, jaccard_dissim_label
 
 
 SOYBEAN = np.array([
@@ -124,6 +124,102 @@
 # Drop target column
 SOYBEAN2 = SOYBEAN2[:, :35]
 
+# Test data: 85 rows x 9 categorical variables, label encoded as integers.
+TEST_DATA = np.array([
+    [2, 22, 14, 45,  2,  0,  1,  2,  5],
+    [2, 13, 13, 19,  2,  0,  1,  2,  5],
+    [3, 25,  4,  3,  0,  1,  2,  0,  4],
+    [2, 13, 15, 18,  0,  1,  2,  2,  3],
+    [3, 10,  4, 42,  0,  2,  1,  1,  2],
+    [2, 16, 21, 14,  0,  1,  2,  2,  2],
+    [2, 16, 19, 37,  0,  2,  1,  2,  2],
+    [2, 20,  9, 34,  0,  1,  2,  3,  5],
+    [2, 14, 21, 44,  0,  1,  2,  3,  2],
+    [2, 26,  5, 30,  0,  1,  2,  3,  3],
+    [3, 18, 17, 41,  3,  3,  3,  2,  0],
+    [2, 20,  1, 27,  3,  3,  3,  2,  0],
+    [3,  6,  8, 19,  0,  1,  2,  1,  2],
+    [2, 13,  8, 41,  3,  3,  3,  2,  0],
+    [2, 18, 17, 41,  3,  3,  3,  2,  0],
+    [2, 16, 19, 42,  0,  1,  2,  2,  5],
+    [7,  7,  5, 43,  0,  2,  1,  2,  2],
+    [2, 18, 17, 41,  3,  3,  3,  2,  0],
+    [3,  3,  5, 12,  3,  3,  3,  2,  0],
+    [2, 18, 17, 41,  3,  3,  3,  2,  0],
+    [7, 15, 19, 17,  0,  1,  2,  2,  2],
+    [1,  1, 15, 24,  0,  1,  2,  2,  2],
+    [2, 18, 17, 41,  3,  3,  3,  2,  0],
+    [2,  5,  7,  9,  0,  1,  2,  3,  5],
+    [2, 24,  6, 10,  0,  2,  1,  2,  2],
+    [2, 13, 16, 29,  0,  2,  1,  2,  2],
+    [3,  6,  8,  1,  0,  1,  2,  2,  5],
+    [2, 16, 15, 34,  0,  1,  2,  2,  1],
+    [0, 24, 14, 12,  3,  3,  3,  2,  0],
+    [3,  8, 21, 13,  3,  3,  3,  2,  0],
+    [2, 17, 15, 42,  3,  3,  3,  2,  0],
+    [2, 25, 18, 16,  3,  3,  3,  2,  0],
+    [2,  3, 15, 42,  3,  3,  3,  2,  0],
+    [6, 13, 15, 22,  3,  3,  3,  2,  0],
+    [3,  8, 18, 24,  1,  0,  2,  2,  5],
+    [7, 20, 15, 26,  1,  0,  2,  2,  1],
+    [2, 20,  7, 35,  0,  1,  2,  2,  5],
+    [2, 16, 12, 28,  0,  1,  2,  2,  5],
+    [2, 16,  5, 39,  0,  1,  2,  2,  2],
+    [3,  6, 11,  8,  0,  1,  2,  2,  2],
+    [7,  6, 15, 44,  1,  0,  2,  2,  4],
+    [2, 18, 17, 41,  3,  3,  3,  2,  0],
+    [2, 18, 17, 41,  3,  3,  3,  2,  0],
+    [2, 16,  7,  6,  3,  3,  3,  2,  0],
+    [1, 13,  2, 46,  3,  3,  3,  2,  0],
+    [0, 14,  5, 41,  3,  3,  3,  2,  0],
+    [2, 24, 19,  0,  3,  3,  3,  2,  0],
+    [2, 14,  3, 35,  3,  3,  3,  2,  0],
+    [6, 19,  7,  5,  0,  2,  1,  2,  2],
+    [5,  6, 11, 44,  3,  3,  3,  2,  0],
+    [7, 16, 21, 21,  3,  3,  3,  2,  0],
+    [2, 19,  7, 44,  3,  3,  3,  2,  0],
+    [2, 24, 18, 33,  1,  0,  2,  1,  4],
+    [2, 16,  8, 44,  0,  2,  1,  2,  1],
+    [3,  2,  5, 15,  0,  1,  2,  2,  2],
+    [2, 18, 17, 41,  3,  3,  3,  2,  0],
+    [2,  4, 15, 47,  0,  1,  2,  2,  2],
+    [7, 13, 15, 25,  0,  1,  2,  2,  1],
+    [1, 19, 10, 15,  3,  3,  3,  2,  0],
+    [2, 13,  5, 44,  0,  1,  2,  1,  2],
+    [5, 11, 18, 20,  3,  3,  3,  2,  0],
+    [7,  9,  5, 40,  0,  1,  2,  1,  4],
+    [3,  6, 16, 38,  3,  3,  3,  2,  0],
+    [2, 24, 22, 12,  0,  1,  2,  2,  3],
+    [5, 18, 17, 41,  3,  3,  3,  2,  0],
+    [2, 18, 17, 41,  3,  3,  3,  2,  0],
+    [2, 16, 15, 23,  0,  1,  2,  2,  5],
+    [2, 13,  0, 25,  1,  0,  2,  2,  2],
+    [2, 23, 15, 36,  3,  3,  3,  2,  0],
+    [2, 25, 10,  2,  1,  0,  2,  2,  5],
+    [2, 21,  7,  4,  1,  0,  2,  2,  1],
+    [1, 18, 17, 41,  3,  3,  3,  2,  0],
+    [2, 18, 17, 41,  3,  3,  3,  2,  0],
+    [6,  9,  1,  0,  3,  3,  3,  2,  0],
+    [1,  7, 20, 47,  3,  3,  3,  2,  0],
+    [2, 25, 10,  7,  0,  1,  2,  2,  2],
+    [7,  0,  4, 32,  1,  2,  0,  2,  5],
+    [1, 12, 12, 15,  0,  1,  2,  3,  3],
+    [2, 26, 15, 25,  0,  1,  2,  0,  5],
+    [2, 20, 15, 19,  0,  1,  2,  2,  1],
+    [4,  6,  9, 11,  2,  0,  1,  1,  4],
+    [2, 13, 15, 42,  0,  2,  1,  2,  2],
+    [3,  5, 21, 31,  0,  1,  2,  3,  5],
+    [2, 13, 19, 33,  0,  2,  1,  2,  2],
+    [1, 11, 10,  0,  0,  2,  1,  0,  2]
+])
+
+# Rows passed to fit_predict in
+# test_kmodes_predict_soybean_jaccard_dissim_label. The first and last rows
+# repeat the first and last rows of TEST_DATA.
+TEST_DATA_PREDICT = np.array([
+    [2, 22, 14, 45,  2,  0,  1,  2,  5],
+    [7, 13, 13, 19,  2,  0,  1,  2,  5],
+    [5, 18, 19, 33,  0,  2,  1,  2,  2],
+    [1, 11, 10,  0,  0,  2,  1,  0,  2]
+])
+
 
 def assert_cluster_splits_equal(array1, array2):
 
@@ -334,6 +430,78 @@ def test_kmodes_nunique_nclusters_ng(self):
                                       np.array([[0, 2],
                                                 [0, 1]]))
 
+    def test_kmodes_huang_soybean_jaccard_dissim_binary(self):
+        # Huang initialisation with the binary Jaccard dissimilarity.
+        model = KModes(
+            n_clusters=4,
+            n_init=2,
+            init='Huang',
+            verbose=2,
+            cat_dissim=jaccard_dissim_binary,
+            random_state=42,
+        )
+        # The measure only accepts 0/1 input, so cast through bool to int.
+        binary_data = SOYBEAN.astype(bool).astype(int)
+        labels = model.fit_predict(binary_data)
+        expected = np.array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 3, 1, 1, 3, 3, 1, 1, 1, 1, 3, 1, 1, 3, 1, 3, 3, 1, 3,
+                             3, 3, 1, 1, 3, 1, 3, 1, 1])
+        assert_cluster_splits_equal(labels, expected)
+        uint16 = np.dtype(np.uint16)
+        self.assertTrue(labels.dtype == uint16)
+
+    def test_kmodes_cao_soybean_jaccard_dissim_binary(self):
+        # Cao initialisation with the binary Jaccard dissimilarity.
+        model = KModes(
+            n_clusters=4,
+            n_init=2,
+            init='Cao',
+            verbose=2,
+            cat_dissim=jaccard_dissim_binary,
+            random_state=42,
+        )
+        # The measure only accepts 0/1 input, so cast through bool to int.
+        binary_data = SOYBEAN.astype(bool).astype(int)
+        labels = model.fit_predict(binary_data)
+        expected = np.array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+                             1, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0, 0])
+        assert_cluster_splits_equal(labels, expected)
+        uint16 = np.dtype(np.uint16)
+        self.assertTrue(labels.dtype == uint16)
+
+    def test_kmodes_predict_soybean_jaccard_dissim_binary(self):
+        # Fit on binarised SOYBEAN, then label binarised SOYBEAN2.
+        model = KModes(
+            n_clusters=4,
+            n_init=2,
+            init='Huang',
+            verbose=2,
+            cat_dissim=jaccard_dissim_binary,
+            random_state=42,
+        )
+        # The measure only accepts 0/1 input, for fitting and prediction alike.
+        train_data = SOYBEAN.astype(bool).astype(int)
+        model = model.fit(train_data)
+        predict_data = SOYBEAN2.astype(bool).astype(int)
+        # NOTE(review): fit_predict presumably re-fits on predict_data, which
+        # would discard the fit above; predict() may have been intended.
+        # Confirm the expected labels before changing the call.
+        labels = model.fit_predict(predict_data)
+        expected = np.array([0, 1, 2, 3])
+        assert_cluster_splits_equal(labels, expected)
+        uint16 = np.dtype(np.uint16)
+        self.assertTrue(labels.dtype == uint16)
+
+    def test_kmodes_huang_soybean_jaccard_dissim_label(self):
+        # Huang initialisation with the label-encoded Jaccard dissimilarity.
+        model = KModes(
+            n_clusters=4,
+            n_init=2,
+            init='Huang',
+            verbose=2,
+            cat_dissim=jaccard_dissim_label,
+            random_state=42,
+        )
+        labels = model.fit_predict(TEST_DATA)
+        expected = np.array([3, 3, 2, 1, 1, 3, 3, 3, 3, 3, 0, 2, 2, 0, 0, 3, 3, 0, 0,
+                             0, 2, 2, 0, 3, 2, 3, 2, 2, 0, 1, 1, 0, 1, 1, 0, 2, 3, 3,
+                             3, 2, 2, 0, 0, 2, 1, 0, 0, 0, 2, 3, 0, 0, 2, 3, 2, 0, 2,
+                             2, 2, 3, 0, 3, 2, 2, 0, 0, 3, 2, 1, 3, 2, 0, 0, 2, 2, 2,
+                             3, 2, 2, 2, 2, 1, 3, 2, 2])
+        assert_cluster_splits_equal(labels, expected)
+        uint16 = np.dtype(np.uint16)
+        self.assertTrue(labels.dtype == uint16)
+
+    def test_kmodes_cao_soybean_jaccard_dissim_label(self):
+        # Cao initialisation with the label-encoded Jaccard dissimilarity.
+        model = KModes(
+            n_clusters=4,
+            n_init=2,
+            init='Cao',
+            verbose=2,
+            cat_dissim=jaccard_dissim_label,
+            random_state=42,
+        )
+        labels = model.fit_predict(TEST_DATA)
+        expected = np.array([3, 3, 1, 0, 0, 1, 1, 3, 2, 3, 0, 3, 2, 0, 0, 3, 3, 0, 0, 0, 1, 1,
+                             0, 2, 1, 1, 2, 1, 0, 0, 0, 0, 0, 2, 0, 1, 3, 1, 1, 2, 2, 0, 0, 2,
+                             0, 0, 0, 0, 3, 2, 2, 2, 0, 1, 1, 0, 1, 1, 1, 3, 0, 3, 2, 0, 0, 0,
+                             1, 1, 0, 1, 1, 0, 0, 2, 2, 1, 3, 1, 1, 3, 1, 1, 3, 3, 1])
+        assert_cluster_splits_equal(labels, expected)
+        uint16 = np.dtype(np.uint16)
+        self.assertTrue(labels.dtype == uint16)
+
+    def test_kmodes_predict_soybean_jaccard_dissim_label(self):
+        # Fit on TEST_DATA, then label TEST_DATA_PREDICT.
+        model = KModes(
+            n_clusters=4,
+            n_init=2,
+            init='Huang',
+            verbose=2,
+            cat_dissim=jaccard_dissim_label,
+            random_state=42,
+        )
+        model = model.fit(TEST_DATA)
+        # NOTE(review): fit_predict presumably re-fits on TEST_DATA_PREDICT,
+        # which would discard the fit above; predict() may have been intended.
+        # Confirm the expected labels before changing the call.
+        labels = model.fit_predict(TEST_DATA_PREDICT)
+        expected = np.array([1, 0, 1, 2])
+        assert_cluster_splits_equal(labels, expected)
+        uint16 = np.dtype(np.uint16)
+        self.assertTrue(labels.dtype == uint16)
+
+
     def test_kmodes_ninit(self):
         kmodes = KModes(n_init=10, init='Huang')
         self.assertEqual(kmodes.n_init, 10)

diff --git a/kmodes/util/dissim.py b/kmodes/util/dissim.py
@@ -10,6 +10,29 @@ def matching_dissim(a, b, **_):
     return np.sum(a != b, axis=1)
 
 
+def jaccard_dissim_binary(a, b, **__):
+    """Jaccard dissimilarity function for binary encoded variables.
+
+    Every row of ``a`` is compared with ``b`` (broadcast against ``a``) and
+    ``1 - |row AND b| / |row OR b|`` is returned per row.
+
+    Parameters
+    ----------
+    a : array-like, 2-D, containing only 0 and 1
+    b : array-like, broadcastable against ``a``, containing only 0 and 1
+
+    Returns
+    -------
+    numpy.ndarray of float, one dissimilarity per row of ``a``. When both the
+    row and ``b`` are all zeros the dissimilarity is defined as 0 rather
+    than the NaN a 0/0 division would produce.
+
+    Raises
+    ------
+    ValueError
+        If ``a`` or ``b`` holds anything other than 0 and 1 (NaN included).
+    """
+    a = np.asarray(a)
+    b = np.asarray(b)
+    if not (((a == 0) | (a == 1)).all() and ((b == 0) | (b == 1)).all()):
+        raise ValueError("Missing or non Binary values detected in Binary columns.")
+    # Cast to bool so float-typed 0.0/1.0 input works as well; the bitwise
+    # ufuncs are not defined for floating point arrays.
+    a_set = a.astype(bool)
+    b_set = b.astype(bool)
+    intersect_len = np.sum(a_set & b_set, axis=1)
+    union_len = np.sum(a_set | b_set, axis=1)
+    # Start from similarity 1 so rows with an empty union (nothing set on
+    # either side) count as identical instead of dividing 0 by 0.
+    similarity = np.ones(union_len.shape, dtype=float)
+    np.divide(intersect_len, union_len, out=similarity, where=union_len != 0)
+    return 1 - similarity
+
+
+def jaccard_dissim_label(a, b, **__):
+    """Jaccard dissimilarity function for label encoded variables.
+
+    Each row of ``a`` and the whole of ``b`` are reduced to their sets of
+    distinct values; the result per row is ``1 - |row & b| / |row | b|``.
+    Values are compared regardless of the column they appear in.
+
+    Parameters
+    ----------
+    a : numpy.ndarray, iterated row by row
+    b : numpy.ndarray, flattened to a single set of labels
+
+    Returns
+    -------
+    numpy.ndarray of float, one dissimilarity per row of ``a``.
+
+    Raises
+    ------
+    ValueError
+        If ``a`` or ``b`` contains NaN, or if any union is empty.
+    """
+    if np.isnan(a.astype('float64')).any() or np.isnan(b.astype('float64')).any():
+        raise ValueError("Missing values detected in Numeric columns.")
+    # The label set of b does not depend on the row, so compute it once.
+    b_labels = np.unique(b)
+    intersect_len = np.empty(len(a), dtype=int)
+    union_len = np.empty(len(a), dtype=int)
+    for i, row in enumerate(a):
+        row_labels = np.unique(row)
+        # Both inputs are already de-duplicated and sorted by np.unique.
+        intersect_len[i] = len(np.intersect1d(row_labels, b_labels, assume_unique=True))
+        union_len[i] = len(row_labels) + len(b_labels) - intersect_len[i]
+    if (union_len == 0).any():
+        raise ValueError("Insufficient Number of data since union is 0")
+    return 1 - intersect_len / union_len
+
+
 def euclidean_dissim(a, b, **_):
     """Euclidean distance dissimilarity function"""
     if np.isnan(a).any() or np.isnan(b).any():

diff --git a/kmodes/util/tests/test_dissim.py b/kmodes/util/tests/test_dissim.py
@@ -8,6 +8,7 @@
 from sklearn.utils.testing import assert_equal, assert_array_equal
 
 from kmodes.util.dissim import matching_dissim, euclidean_dissim, ng_dissim
+from kmodes.util.dissim import jaccard_dissim_binary, jaccard_dissim_label
 
 
 class TestDissimilarityMeasures(unittest.TestCase):
@@ -25,6 +26,53 @@ def test_matching_dissim(self):
         b = np.array([['a', 'b', 'c', 'd'], ['d', 'c', 'b', 'a']])
         assert_array_equal(np.array([0, 4]), matching_dissim(a, b))
 
+    def test_jaccard_dissim_binary(self):
+        # three shared positions out of four set in either row -> 1 - 3/4
+        a = np.array([[0, 1, 1, 0, 1, 1]])
+        b = np.array([[0, 1, 1, 0, 1, 0]])
+        assert_equal(0.25, jaccard_dissim_binary(a, b))
+
+        # a missing value must be rejected (np.nan: the np.NaN alias was
+        # removed in NumPy 2.0)
+        a = np.array([[0, 1, 1, 0, 1, 1]])
+        b = np.array([[0, np.nan, 1, 0, 1, 0]])
+        with self.assertRaises(ValueError):
+            jaccard_dissim_binary(a, b)
+
+        # test where values are non binary but also not having NaN
+        a = np.array([[0, 1, 2, 0, 1, 2]])
+        b = np.array([[0, 1, 2, 0, 1, 0]])
+        with self.assertRaises(ValueError):
+            jaccard_dissim_binary(a, b)
+
+        # test for dissimilarity = 0: both sets are the same
+        a = np.array([[1, 1, 0, 1, 1, 0]])
+        b = np.array([[1, 1, 0, 1, 1, 0]])
+        assert_equal(0, jaccard_dissim_binary(a, b))
+
+        # test for dissimilarity = 1: the sets are disjoint
+        a = np.array([[0, 0, 1, 0, 0, 1]])
+        b = np.array([[1, 1, 0, 1, 1, 0]])
+        assert_equal(1, jaccard_dissim_binary(a, b))
+
+    def test_jaccard_dissim_label(self):
+        # label sets {0, 1, 2} and {0, 1, 2, 3} -> 1 - 3/4
+        a = np.array([[0, 1, 2, 0, 1, 2]])
+        b = np.array([[0, 1, 2, 0, 3, 0]])
+        assert_equal(0.25, jaccard_dissim_label(a, b))
+
+        # a missing value must be rejected (np.nan: the np.NaN alias was
+        # removed in NumPy 2.0)
+        a = np.array([[np.nan, 1, 2, 0, 1, 2]])
+        b = np.array([[0, 1, 2, 0, 1, 0]])
+        with self.assertRaises(ValueError):
+            jaccard_dissim_label(a, b)
+
+        # test for dissimilarity = 0: both sets are the same
+        a = np.array([[1, 2, 0, 3, 1, 0]])
+        b = np.array([[1, 2, 0, 3, 1, 0]])
+        assert_equal(0, jaccard_dissim_label(a, b))
+
+        # test for dissimilarity = 1: the sets are disjoint
+        a = np.array([[1, 2, 0, 3, 1, 0]])
+        b = np.array([[5, 4, 6, 7, 8, 9]])
+        assert_equal(1, jaccard_dissim_label(a, b))
+
+
     def test_euclidian_dissim(self):
         a = np.array([[0., 1., 2., 0., 1., 2.]])
         b = np.array([[3., 1., 3., 0., 1., 0.]])