Skip to content

Commit

Permalink
Merge 9fe5b6d into 2bc7fce
Browse files Browse the repository at this point in the history
  • Loading branch information
BikashPandey17 committed Jul 26, 2019
2 parents 2bc7fce + 9fe5b6d commit 576264c
Show file tree
Hide file tree
Showing 3 changed files with 240 additions and 1 deletion.
170 changes: 169 additions & 1 deletion kmodes/tests/test_kmodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from sklearn.utils.testing import assert_equal

from kmodes.kmodes import KModes
from kmodes.util.dissim import ng_dissim
from kmodes.util.dissim import ng_dissim, jaccard_dissim_binary, jaccard_dissim_label


SOYBEAN = np.array([
Expand Down Expand Up @@ -124,6 +124,102 @@
# Drop target column
SOYBEAN2 = SOYBEAN2[:, :35]

# Test data with categorical variables that have been label encoded
# (each category replaced by an integer code).
# Layout: 85 samples (rows) x 9 features (columns).
# Used as the training input of the jaccard_dissim_label clustering tests.
TEST_DATA = np.array([
[2, 22, 14, 45, 2, 0, 1, 2, 5],
[2, 13, 13, 19, 2, 0, 1, 2, 5],
[3, 25, 4, 3, 0, 1, 2, 0, 4],
[2, 13, 15, 18, 0, 1, 2, 2, 3],
[3, 10, 4, 42, 0, 2, 1, 1, 2],
[2, 16, 21, 14, 0, 1, 2, 2, 2],
[2, 16, 19, 37, 0, 2, 1, 2, 2],
[2, 20, 9, 34, 0, 1, 2, 3, 5],
[2, 14, 21, 44, 0, 1, 2, 3, 2],
[2, 26, 5, 30, 0, 1, 2, 3, 3],
[3, 18, 17, 41, 3, 3, 3, 2, 0],
[2, 20, 1, 27, 3, 3, 3, 2, 0],
[3, 6, 8, 19, 0, 1, 2, 1, 2],
[2, 13, 8, 41, 3, 3, 3, 2, 0],
[2, 18, 17, 41, 3, 3, 3, 2, 0],
[2, 16, 19, 42, 0, 1, 2, 2, 5],
[7, 7, 5, 43, 0, 2, 1, 2, 2],
[2, 18, 17, 41, 3, 3, 3, 2, 0],
[3, 3, 5, 12, 3, 3, 3, 2, 0],
[2, 18, 17, 41, 3, 3, 3, 2, 0],
[7, 15, 19, 17, 0, 1, 2, 2, 2],
[1, 1, 15, 24, 0, 1, 2, 2, 2],
[2, 18, 17, 41, 3, 3, 3, 2, 0],
[2, 5, 7, 9, 0, 1, 2, 3, 5],
[2, 24, 6, 10, 0, 2, 1, 2, 2],
[2, 13, 16, 29, 0, 2, 1, 2, 2],
[3, 6, 8, 1, 0, 1, 2, 2, 5],
[2, 16, 15, 34, 0, 1, 2, 2, 1],
[0, 24, 14, 12, 3, 3, 3, 2, 0],
[3, 8, 21, 13, 3, 3, 3, 2, 0],
[2, 17, 15, 42, 3, 3, 3, 2, 0],
[2, 25, 18, 16, 3, 3, 3, 2, 0],
[2, 3, 15, 42, 3, 3, 3, 2, 0],
[6, 13, 15, 22, 3, 3, 3, 2, 0],
[3, 8, 18, 24, 1, 0, 2, 2, 5],
[7, 20, 15, 26, 1, 0, 2, 2, 1],
[2, 20, 7, 35, 0, 1, 2, 2, 5],
[2, 16, 12, 28, 0, 1, 2, 2, 5],
[2, 16, 5, 39, 0, 1, 2, 2, 2],
[3, 6, 11, 8, 0, 1, 2, 2, 2],
[7, 6, 15, 44, 1, 0, 2, 2, 4],
[2, 18, 17, 41, 3, 3, 3, 2, 0],
[2, 18, 17, 41, 3, 3, 3, 2, 0],
[2, 16, 7, 6, 3, 3, 3, 2, 0],
[1, 13, 2, 46, 3, 3, 3, 2, 0],
[0, 14, 5, 41, 3, 3, 3, 2, 0],
[2, 24, 19, 0, 3, 3, 3, 2, 0],
[2, 14, 3, 35, 3, 3, 3, 2, 0],
[6, 19, 7, 5, 0, 2, 1, 2, 2],
[5, 6, 11, 44, 3, 3, 3, 2, 0],
[7, 16, 21, 21, 3, 3, 3, 2, 0],
[2, 19, 7, 44, 3, 3, 3, 2, 0],
[2, 24, 18, 33, 1, 0, 2, 1, 4],
[2, 16, 8, 44, 0, 2, 1, 2, 1],
[3, 2, 5, 15, 0, 1, 2, 2, 2],
[2, 18, 17, 41, 3, 3, 3, 2, 0],
[2, 4, 15, 47, 0, 1, 2, 2, 2],
[7, 13, 15, 25, 0, 1, 2, 2, 1],
[1, 19, 10, 15, 3, 3, 3, 2, 0],
[2, 13, 5, 44, 0, 1, 2, 1, 2],
[5, 11, 18, 20, 3, 3, 3, 2, 0],
[7, 9, 5, 40, 0, 1, 2, 1, 4],
[3, 6, 16, 38, 3, 3, 3, 2, 0],
[2, 24, 22, 12, 0, 1, 2, 2, 3],
[5, 18, 17, 41, 3, 3, 3, 2, 0],
[2, 18, 17, 41, 3, 3, 3, 2, 0],
[2, 16, 15, 23, 0, 1, 2, 2, 5],
[2, 13, 0, 25, 1, 0, 2, 2, 2],
[2, 23, 15, 36, 3, 3, 3, 2, 0],
[2, 25, 10, 2, 1, 0, 2, 2, 5],
[2, 21, 7, 4, 1, 0, 2, 2, 1],
[1, 18, 17, 41, 3, 3, 3, 2, 0],
[2, 18, 17, 41, 3, 3, 3, 2, 0],
[6, 9, 1, 0, 3, 3, 3, 2, 0],
[1, 7, 20, 47, 3, 3, 3, 2, 0],
[2, 25, 10, 7, 0, 1, 2, 2, 2],
[7, 0, 4, 32, 1, 2, 0, 2, 5],
[1, 12, 12, 15, 0, 1, 2, 3, 3],
[2, 26, 15, 25, 0, 1, 2, 0, 5],
[2, 20, 15, 19, 0, 1, 2, 2, 1],
[4, 6, 9, 11, 2, 0, 1, 1, 4],
[2, 13, 15, 42, 0, 2, 1, 2, 2],
[3, 5, 21, 31, 0, 1, 2, 3, 5],
[2, 13, 19, 33, 0, 2, 1, 2, 2],
[1, 11, 10, 0, 0, 2, 1, 0, 2]
])

# Four label-encoded samples with the same 9-column layout as TEST_DATA.
# The first and last rows are copies of the first and last rows of TEST_DATA;
# the two middle rows are not identical to any single TEST_DATA row pair-wise
# compared with its neighbours (first column differs from the look-alike rows).
TEST_DATA_PREDICT = np.array([
[2, 22, 14, 45, 2, 0, 1, 2, 5],
[7, 13, 13, 19, 2, 0, 1, 2, 5],
[5, 18, 19, 33, 0, 2, 1, 2, 2],
[1, 11, 10, 0, 0, 2, 1, 0, 2]
])


def assert_cluster_splits_equal(array1, array2):

Expand Down Expand Up @@ -334,6 +430,78 @@ def test_kmodes_nunique_nclusters_ng(self):
np.array([[0, 2],
[0, 1]]))

def test_kmodes_huang_soybean_jaccard_dissim_binary(self):
    """Cluster binarised SOYBEAN with Huang init and binary Jaccard dissimilarity."""
    model = KModes(
        n_clusters=4,
        n_init=2,
        init='Huang',
        verbose=2,
        cat_dissim=jaccard_dissim_binary,
        random_state=42,
    )
    # jaccard_dissim_binary rejects anything but 0/1, so map every
    # non-zero category code to 1 before clustering.
    binarised = SOYBEAN.astype(bool).astype(int)
    labels = model.fit_predict(binarised)
    reference = np.array(
        [2] * 10
        + [0] * 10
        + [3, 1, 1, 3, 3, 1, 1, 1, 1, 3, 1, 1, 3, 1, 3, 3, 1, 3]
        + [3, 3, 1, 1, 3, 1, 3, 1, 1]
    )
    # Only the partition matters, not the numeric cluster ids.
    assert_cluster_splits_equal(labels, reference)
    self.assertTrue(np.dtype(np.uint16) == labels.dtype)

def test_kmodes_cao_soybean_jaccard_dissim_binary(self):
    """Cluster binarised SOYBEAN with Cao init and binary Jaccard dissimilarity."""
    model = KModes(
        n_clusters=4,
        n_init=2,
        init='Cao',
        verbose=2,
        cat_dissim=jaccard_dissim_binary,
        random_state=42,
    )
    # jaccard_dissim_binary rejects anything but 0/1, so map every
    # non-zero category code to 1 before clustering.
    binarised = SOYBEAN.astype(bool).astype(int)
    labels = model.fit_predict(binarised)
    reference = np.array(
        [2] * 10
        + [1] * 10
        + [3, 3, 3, 3, 3, 0, 3, 3, 3, 3]
        + [0] * 17
    )

    # Only the partition matters, not the numeric cluster ids.
    assert_cluster_splits_equal(labels, reference)
    self.assertTrue(np.dtype(np.uint16) == labels.dtype)

def test_kmodes_predict_soybean_jaccard_dissim_binary(self):
    """Fit on binarised SOYBEAN, then label binarised SOYBEAN2 (binary Jaccard)."""
    model = KModes(
        n_clusters=4,
        n_init=2,
        init='Huang',
        verbose=2,
        cat_dissim=jaccard_dissim_binary,
        random_state=42,
    )
    # Training data must be 0/1 encoded for jaccard_dissim_binary.
    train = SOYBEAN.astype(bool).astype(int)
    model = model.fit(train)
    # The data to be labelled needs the same 0/1 encoding.
    holdout = SOYBEAN2.astype(bool).astype(int)
    # NOTE(review): this calls fit_predict, not predict, on the hold-out data.
    # Presumably that refits the model and makes the fit above irrelevant --
    # confirm whether predict() was intended (the expected labels below would
    # then have to be regenerated).
    labels = model.fit_predict(holdout)
    reference = np.array([0, 1, 2, 3])
    assert_cluster_splits_equal(labels, reference)
    self.assertTrue(np.dtype(np.uint16) == labels.dtype)

def test_kmodes_huang_soybean_jaccard_dissim_label(self):
    """Cluster label-encoded TEST_DATA with Huang init and label Jaccard dissimilarity."""
    model = KModes(
        n_clusters=4,
        n_init=2,
        init='Huang',
        verbose=2,
        cat_dissim=jaccard_dissim_label,
        random_state=42,
    )
    labels = model.fit_predict(TEST_DATA)
    # One expected cluster id per TEST_DATA row (85 in total).
    reference = np.array([
        3, 3, 2, 1, 1, 3, 3, 3, 3, 3, 0, 2, 2, 0, 0, 3, 3, 0, 0,
        0, 2, 2, 0, 3, 2, 3, 2, 2, 0, 1, 1, 0, 1, 1, 0, 2, 3, 3,
        3, 2, 2, 0, 0, 2, 1, 0, 0, 0, 2, 3, 0, 0, 2, 3, 2, 0, 2,
        2, 2, 3, 0, 3, 2, 2, 0, 0, 3, 2, 1, 3, 2, 0, 0, 2, 2, 2,
        3, 2, 2, 2, 2, 1, 3, 2, 2,
    ])
    # Only the partition matters, not the numeric cluster ids.
    assert_cluster_splits_equal(labels, reference)
    self.assertTrue(np.dtype(np.uint16) == labels.dtype)

def test_kmodes_cao_soybean_jaccard_dissim_label(self):
    """Cluster label-encoded TEST_DATA with Cao init and label Jaccard dissimilarity."""
    model = KModes(
        n_clusters=4,
        n_init=2,
        init='Cao',
        verbose=2,
        cat_dissim=jaccard_dissim_label,
        random_state=42,
    )
    labels = model.fit_predict(TEST_DATA)
    # One expected cluster id per TEST_DATA row (85 in total).
    reference = np.array([
        3, 3, 1, 0, 0, 1, 1, 3, 2, 3, 0, 3, 2, 0, 0, 3, 3, 0, 0, 0, 1, 1,
        0, 2, 1, 1, 2, 1, 0, 0, 0, 0, 0, 2, 0, 1, 3, 1, 1, 2, 2, 0, 0, 2,
        0, 0, 0, 0, 3, 2, 2, 2, 0, 1, 1, 0, 1, 1, 1, 3, 0, 3, 2, 0, 0, 0,
        1, 1, 0, 1, 1, 0, 0, 2, 2, 1, 3, 1, 1, 3, 1, 1, 3, 3, 1,
    ])

    # Only the partition matters, not the numeric cluster ids.
    assert_cluster_splits_equal(labels, reference)
    self.assertTrue(np.dtype(np.uint16) == labels.dtype)

def test_kmodes_predict_soybean_jaccard_dissim_label(self):
    """Fit on TEST_DATA, then label TEST_DATA_PREDICT (label Jaccard)."""
    model = KModes(
        n_clusters=4,
        n_init=2,
        init='Huang',
        verbose=2,
        cat_dissim=jaccard_dissim_label,
        random_state=42,
    )
    model = model.fit(TEST_DATA)
    # NOTE(review): this calls fit_predict, not predict, on the hold-out data.
    # Presumably that refits the model and makes the fit above irrelevant --
    # confirm whether predict() was intended (the expected labels below would
    # then have to be regenerated).
    labels = model.fit_predict(TEST_DATA_PREDICT)
    reference = np.array([1, 0, 1, 2])
    assert_cluster_splits_equal(labels, reference)
    self.assertTrue(np.dtype(np.uint16) == labels.dtype)


def test_kmodes_ninit(self):
kmodes = KModes(n_init=10, init='Huang')
self.assertEqual(kmodes.n_init, 10)
Expand Down
23 changes: 23 additions & 0 deletions kmodes/util/dissim.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,29 @@ def matching_dissim(a, b, **_):
return np.sum(a != b, axis=1)


def jaccard_dissim_binary(a, b, **__):
    """Jaccard dissimilarity function for binary encoded variables.

    For every row, the columns holding a 1 are treated as a set and the
    result is ``1 - |a AND b| / |a OR b|``.

    Parameters
    ----------
    a, b : array-like
        Arrays containing only the values 0 and 1. They are combined
        element-wise (NumPy broadcasting applies) and reduced over axis 1,
        so the broadcast result must be at least 2-D.
    **__
        Ignored; accepted so all dissimilarity functions share a signature.

    Returns
    -------
    numpy.ndarray
        One dissimilarity in ``[0, 1]`` per row.

    Raises
    ------
    ValueError
        If either input holds a value other than 0 or 1 (NaN included), or
        if some row has an empty union (no 1 in either input), for which
        the Jaccard ratio is undefined.
    """
    a = np.asarray(a)
    b = np.asarray(b)
    if not (((a == 0) | (a == 1)).all() and ((b == 0) | (b == 1)).all()):
        raise ValueError("Missing or non Binary values detected in Binary columns.")
    # Work on booleans: np.bitwise_and/or reject float arrays, which would
    # otherwise fail here even though they passed the 0/1 validation above.
    a_set = a.astype(bool)
    b_set = b.astype(bool)
    intersect_len = np.sum(a_set & b_set, axis=1)
    union_len = np.sum(a_set | b_set, axis=1)
    # Guard the division: an empty union would silently yield NaN.
    if (union_len == 0).any():
        raise ValueError("Insufficient Number of data since union is 0")
    return 1 - intersect_len / union_len


def jaccard_dissim_label(a, b, **__):
    """Jaccard dissimilarity function for label encoded variables.

    Each row of ``a`` is reduced to its set of distinct labels and compared
    with the set of distinct labels found anywhere in ``b``; the result is
    ``1 - |intersection| / |union|`` per row of ``a``.

    Parameters
    ----------
    a : numpy.ndarray
        2-D array of numeric labels, one sample per row.
    b : numpy.ndarray
        Numeric labels of the sample to compare against. Note that ``b`` is
        flattened into ONE label set, whatever its shape.
    **__
        Ignored; accepted so all dissimilarity functions share a signature.

    Returns
    -------
    numpy.ndarray
        One dissimilarity in ``[0, 1]`` per row of ``a``.

    Raises
    ------
    ValueError
        If ``a`` or ``b`` contains NaN, or if a union is empty (both the row
        and ``b`` hold no labels).
    """
    if np.isnan(a.astype('float64')).any() or np.isnan(b.astype('float64')).any():
        raise ValueError("Missing values detected in Numeric columns.")
    # The label set of b does not depend on the row, so compute it once.
    b_labels = np.unique(b)
    intersect_len = np.empty(len(a), dtype=int)
    union_len = np.empty(len(a), dtype=int)
    for i, row in enumerate(a):
        row_labels = np.unique(row)
        # Both operands are already de-duplicated, so skip the extra pass.
        intersect_len[i] = len(np.intersect1d(row_labels, b_labels, assume_unique=True))
        # Inclusion-exclusion: |A u B| = |A| + |B| - |A n B|
        union_len[i] = len(row_labels) + len(b_labels) - intersect_len[i]
    if (union_len == 0).any():
        raise ValueError("Insufficient Number of data since union is 0")
    return 1 - intersect_len / union_len


def euclidean_dissim(a, b, **_):
"""Euclidean distance dissimilarity function"""
if np.isnan(a).any() or np.isnan(b).any():
Expand Down
48 changes: 48 additions & 0 deletions kmodes/util/tests/test_dissim.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from sklearn.utils.testing import assert_equal, assert_array_equal

from kmodes.util.dissim import matching_dissim, euclidean_dissim, ng_dissim
from kmodes.util.dissim import jaccard_dissim_binary, jaccard_dissim_label


class TestDissimilarityMeasures(unittest.TestCase):
Expand All @@ -25,6 +26,53 @@ def test_matching_dissim(self):
b = np.array([['a', 'b', 'c', 'd'], ['d', 'c', 'b', 'a']])
assert_array_equal(np.array([0, 4]), matching_dissim(a, b))

def test_jaccard_dissim_binary(self):
    """Check jaccard_dissim_binary on valid, invalid and boundary inputs."""
    # 3 shared ones out of 4 in the union -> dissimilarity 1 - 3/4
    a = np.array([[0, 1, 1, 0, 1, 1]])
    b = np.array([[0, 1, 1, 0, 1, 0]])
    assert_equal(0.25, jaccard_dissim_binary(a, b))

    # test where a value is missing; np.nan is used because the np.NaN
    # alias no longer exists in NumPy 2.0
    a = np.array([[0, 1, 1, 0, 1, 1]])
    b = np.array([[0, np.nan, 1, 0, 1, 0]])
    with self.assertRaises(ValueError):
        jaccard_dissim_binary(a, b)

    # test where values are non binary but also not having NaN
    a = np.array([[0, 1, 2, 0, 1, 2]])
    b = np.array([[0, 1, 2, 0, 1, 0]])
    with self.assertRaises(ValueError):
        jaccard_dissim_binary(a, b)

    # test for dissimilarity = 0: both sets are the same
    a = np.array([[1, 1, 0, 1, 1, 0]])
    b = np.array([[1, 1, 0, 1, 1, 0]])
    assert_equal(0, jaccard_dissim_binary(a, b))

    # test for dissimilarity = 1: the sets are disjoint
    a = np.array([[0, 0, 1, 0, 0, 1]])
    b = np.array([[1, 1, 0, 1, 1, 0]])
    assert_equal(1, jaccard_dissim_binary(a, b))

def test_jaccard_dissim_label(self):
    """Check jaccard_dissim_label on valid, invalid and boundary inputs."""
    # label sets {0, 1, 2} and {0, 1, 2, 3}: 3 shared out of 4 -> 1 - 3/4
    a = np.array([[0, 1, 2, 0, 1, 2]])
    b = np.array([[0, 1, 2, 0, 3, 0]])
    assert_equal(0.25, jaccard_dissim_label(a, b))

    # test where a value is missing; np.nan is used because the np.NaN
    # alias no longer exists in NumPy 2.0
    a = np.array([[np.nan, 1, 2, 0, 1, 2]])
    b = np.array([[0, 1, 2, 0, 1, 0]])
    with self.assertRaises(ValueError):
        jaccard_dissim_label(a, b)

    # test for dissimilarity = 0: both sets are the same
    a = np.array([[1, 2, 0, 3, 1, 0]])
    b = np.array([[1, 2, 0, 3, 1, 0]])
    assert_equal(0, jaccard_dissim_label(a, b))

    # test for dissimilarity = 1: the sets are disjoint
    a = np.array([[1, 2, 0, 3, 1, 0]])
    b = np.array([[5, 4, 6, 7, 8, 9]])
    assert_equal(1, jaccard_dissim_label(a, b))


def test_euclidian_dissim(self):
a = np.array([[0., 1., 2., 0., 1., 2.]])
b = np.array([[3., 1., 3., 0., 1., 0.]])
Expand Down

0 comments on commit 576264c

Please sign in to comment.