From 4a0e5be19e138f33a0c4d5d9683f91ff2418434f Mon Sep 17 00:00:00 2001 From: kklein Date: Tue, 8 Mar 2022 11:04:32 +0100 Subject: [PATCH 1/5] Draft usage of sample weights. --- kmodes/kmodes.py | 6 ++--- kmodes/kprototypes.py | 53 ++++++++++++++++++++++++------------------- 2 files changed, 33 insertions(+), 26 deletions(-) diff --git a/kmodes/kmodes.py b/kmodes/kmodes.py index 1ff9e44..41395c8 100644 --- a/kmodes/kmodes.py +++ b/kmodes/kmodes.py @@ -361,7 +361,7 @@ def _k_modes_iter(X, centroids, cl_attr_freq, membship, dissim, random_state): def _move_point_cat(point, ipoint, to_clust, from_clust, cl_attr_freq, - membship, centroids): + membship, centroids, sample_weight=1): """Move point between clusters, categorical attributes.""" membship[to_clust, ipoint] = 1 membship[from_clust, ipoint] = 0 @@ -371,7 +371,7 @@ def _move_point_cat(point, ipoint, to_clust, from_clust, cl_attr_freq, from_attr_counts = cl_attr_freq[from_clust][iattr] # Increment the attribute count for the new "to" cluster - to_attr_counts[curattr] += 1 + to_attr_counts[curattr] += sample_weight current_attribute_value_freq = to_attr_counts[curattr] current_centroid_value = centroids[to_clust][iattr] @@ -381,7 +381,7 @@ def _move_point_cat(point, ipoint, to_clust, from_clust, cl_attr_freq, centroids[to_clust][iattr] = curattr # Decrement the attribute count for the old "from" cluster - from_attr_counts[curattr] -= 1 + from_attr_counts[curattr] -= sample_weight old_centroid_value = centroids[from_clust][iattr] if old_centroid_value == curattr: diff --git a/kmodes/kprototypes.py b/kmodes/kprototypes.py index 1a413ee..20888e3 100644 --- a/kmodes/kprototypes.py +++ b/kmodes/kprototypes.py @@ -116,12 +116,13 @@ class KPrototypes(kmodes.KModes): def __init__(self, n_clusters=8, max_iter=100, num_dissim=euclidean_dissim, cat_dissim=matching_dissim, init='Cao', n_init=10, gamma=None, - verbose=0, random_state=None, n_jobs=1): + verbose=0, random_state=None, n_jobs=1, sample_weights=None): 
super(KPrototypes, self).__init__(n_clusters, max_iter, cat_dissim, init, verbose=verbose, random_state=random_state, n_jobs=n_jobs) self.num_dissim = num_dissim + self.sample_weights = sample_weights self.gamma = gamma self.n_init = n_init if isinstance(self.init, list) and self.n_init > 1: @@ -162,7 +163,8 @@ def fit(self, X, y=None, categorical=None): self.n_init, self.verbose, random_state, - self.n_jobs + self.n_jobs, + self.sample_weights, ) return self @@ -194,7 +196,7 @@ def predict(self, X, categorical=None): Xnum, Xcat = check_array(Xnum), check_array(Xcat, dtype=None) Xcat, _ = encode_features(Xcat, enc_map=self._enc_map) return labels_cost(Xnum, Xcat, self._enc_cluster_centroids, - self.num_dissim, self.cat_dissim, self.gamma)[0] + self.num_dissim, self.cat_dissim, self.gamma, self.sample_weights)[0] @property def cluster_centroids_(self): @@ -207,7 +209,7 @@ def cluster_centroids_(self): "because the model is not yet fitted.") -def labels_cost(Xnum, Xcat, centroids, num_dissim, cat_dissim, gamma, membship=None): +def labels_cost(Xnum, Xcat, centroids, num_dissim, cat_dissim, gamma, membship=None, sample_weights=None): """Calculate labels and cost function given a matrix of points and a list of centroids for the k-prototypes algorithm. 
""" @@ -225,13 +227,16 @@ def labels_cost(Xnum, Xcat, centroids, num_dissim, cat_dissim, gamma, membship=N tot_costs = num_costs + gamma * cat_costs clust = np.argmin(tot_costs) labels[ipoint] = clust - cost += tot_costs[clust] + if sample_weights: + cost += tot_costs[clust] * sample_weights[ipoint] + else: + cost += tot_costs[clust] return labels, cost def k_prototypes(X, categorical, n_clusters, max_iter, num_dissim, cat_dissim, - gamma, init, n_init, verbose, random_state, n_jobs): + gamma, init, n_init, verbose, random_state, n_jobs, sample_weights=None): """k-prototypes algorithm""" random_state = check_random_state(random_state) if sparse.issparse(X): @@ -286,13 +291,13 @@ def k_prototypes(X, categorical, n_clusters, max_iter, num_dissim, cat_dissim, results.append(_k_prototypes_single(Xnum, Xcat, nnumattrs, ncatattrs, n_clusters, n_points, max_iter, num_dissim, cat_dissim, gamma, - init, init_no, verbose, seeds[init_no])) + init, init_no, verbose, seeds[init_no], sample_weights)) else: results = Parallel(n_jobs=n_jobs, verbose=0)( delayed(_k_prototypes_single)(Xnum, Xcat, nnumattrs, ncatattrs, n_clusters, n_points, max_iter, num_dissim, cat_dissim, gamma, - init, init_no, verbose, seed) + init, init_no, verbose, seed, sample_weights) for init_no, seed in enumerate(seeds)) all_centroids, all_labels, all_costs, all_n_iters, all_epoch_costs = zip(*results) @@ -307,7 +312,7 @@ def k_prototypes(X, categorical, n_clusters, max_iter, num_dissim, cat_dissim, def _k_prototypes_single(Xnum, Xcat, nnumattrs, ncatattrs, n_clusters, n_points, max_iter, num_dissim, cat_dissim, gamma, init, init_no, - verbose, random_state): + verbose, random_state, sample_weights=None): # For numerical part of initialization, we don't have a guarantee # that there is not an empty cluster, so we need to retry until # there is none. 
@@ -370,18 +375,19 @@ def _k_prototypes_single(Xnum, Xcat, nnumattrs, ncatattrs, n_clusters, n_points, cl_attr_freq = [[defaultdict(int) for _ in range(ncatattrs)] for _ in range(n_clusters)] for ipoint in range(n_points): + sample_weight = sample_weights[ipoint] if sample_weights is not None else 1 # Initial assignment to clusters clust = np.argmin( num_dissim(centroids[0], Xnum[ipoint]) + gamma * cat_dissim(centroids[1], Xcat[ipoint], X=Xcat, membship=membship) ) membship[clust, ipoint] = 1 - cl_memb_sum[clust] += 1 + cl_memb_sum[clust] += sample_weight # Count attribute values per cluster. for iattr, curattr in enumerate(Xnum[ipoint]): - cl_attr_sum[clust, iattr] += curattr + cl_attr_sum[clust, iattr] += curattr * sample_weight for iattr, curattr in enumerate(Xcat[ipoint]): - cl_attr_freq[clust][iattr][curattr] += 1 + cl_attr_freq[clust][iattr][curattr] += sample_weight # If no empty clusters, then consider initialization finalized. if membship.sum(axis=1).min() > 0: @@ -412,7 +418,7 @@ def _k_prototypes_single(Xnum, Xcat, nnumattrs, ncatattrs, n_clusters, n_points, converged = False _, cost = labels_cost(Xnum, Xcat, centroids, - num_dissim, cat_dissim, gamma, membship) + num_dissim, cat_dissim, gamma, membship, sample_weights) epoch_costs = [cost] while itr < max_iter and not converged: @@ -420,11 +426,11 @@ def _k_prototypes_single(Xnum, Xcat, nnumattrs, ncatattrs, n_clusters, n_points, centroids, moves = _k_prototypes_iter(Xnum, Xcat, centroids, cl_attr_sum, cl_memb_sum, cl_attr_freq, membship, num_dissim, cat_dissim, gamma, - random_state) + random_state, sample_weights) # All points seen in this iteration labels, ncost = labels_cost(Xnum, Xcat, centroids, - num_dissim, cat_dissim, gamma, membship) + num_dissim, cat_dissim, gamma, membship, sample_weights) converged = (moves == 0) or (ncost >= cost) epoch_costs.append(ncost) cost = ncost @@ -436,10 +442,11 @@ def _k_prototypes_single(Xnum, Xcat, nnumattrs, ncatattrs, n_clusters, n_points, def 
_k_prototypes_iter(Xnum, Xcat, centroids, cl_attr_sum, cl_memb_sum, cl_attr_freq, - membship, num_dissim, cat_dissim, gamma, random_state): + membship, num_dissim, cat_dissim, gamma, random_state, sample_weights): """Single iteration of the k-prototypes algorithm""" moves = 0 for ipoint in range(Xnum.shape[0]): + sample_weight = sample_weights[ipoint] if sample_weights is not None else 1 clust = np.argmin( num_dissim(centroids[0], Xnum[ipoint]) + gamma * cat_dissim(centroids[1], Xcat[ipoint], X=Xcat, membship=membship) @@ -455,12 +462,12 @@ def _k_prototypes_iter(Xnum, Xcat, centroids, cl_attr_sum, cl_memb_sum, cl_attr_ # Note that membship gets updated by kmodes.move_point_cat. # move_point_num only updates things specific to the k-means part. cl_attr_sum, cl_memb_sum = _move_point_num( - Xnum[ipoint], clust, old_clust, cl_attr_sum, cl_memb_sum + Xnum[ipoint], clust, old_clust, cl_attr_sum, cl_memb_sum, sample_weight ) # noinspection PyProtectedMember cl_attr_freq, membship, centroids[1] = kmodes._move_point_cat( Xcat[ipoint], ipoint, clust, old_clust, - cl_attr_freq, membship, centroids[1] + cl_attr_freq, membship, centroids[1], sample_weight ) # Update old and new centroids for numerical attributes using @@ -490,15 +497,15 @@ def _k_prototypes_iter(Xnum, Xcat, centroids, cl_attr_sum, cl_memb_sum, cl_attr_ return centroids, moves -def _move_point_num(point, to_clust, from_clust, cl_attr_sum, cl_memb_sum): +def _move_point_num(point, to_clust, from_clust, cl_attr_sum, cl_memb_sum, sample_weight=1): """Move point between clusters, numerical attributes.""" # Update sum of attributes in cluster. 
for iattr, curattr in enumerate(point): - cl_attr_sum[to_clust][iattr] += curattr - cl_attr_sum[from_clust][iattr] -= curattr + cl_attr_sum[to_clust][iattr] += curattr * sample_weight + cl_attr_sum[from_clust][iattr] -= curattr * sample_weight # Update sums of memberships in cluster - cl_memb_sum[to_clust] += 1 - cl_memb_sum[from_clust] -= 1 + cl_memb_sum[to_clust] += sample_weight + cl_memb_sum[from_clust] -= sample_weight return cl_attr_sum, cl_memb_sum From 59ad46f97bf65736f68c3af79eca84abecbd983f Mon Sep 17 00:00:00 2001 From: kklein Date: Tue, 8 Mar 2022 11:04:51 +0100 Subject: [PATCH 2/5] Add example for sample weights. --- examples/people.csv | 4 ++++ examples/people.py | 25 +++++++++++++++++++++++++ 2 files changed, 29 insertions(+) create mode 100644 examples/people.csv create mode 100644 examples/people.py diff --git a/examples/people.csv b/examples/people.csv new file mode 100644 index 0000000..8320b1a --- /dev/null +++ b/examples/people.csv @@ -0,0 +1,4 @@ +Marc, .2, -2, A +Martine, -.2, 0, A +Max, .2, .2, A +Maxine, .2, .1, A \ No newline at end of file diff --git a/examples/people.py b/examples/people.py new file mode 100644 index 0000000..84165db --- /dev/null +++ b/examples/people.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python + +import numpy as np +from kmodes.kprototypes import KPrototypes + +# people: name in the first column, feature columns after it +syms = np.genfromtxt('people.csv', dtype=str, delimiter=',')[:, 0] +X = np.genfromtxt('people.csv', dtype=object, delimiter=',')[:, 1:] +X[:, 0] = X[:, 0].astype(float) + +weights = [1] * 4 +weights[2] = 100 +weights[3] = 100 + +kproto = KPrototypes(n_clusters=3, init='Cao', verbose=2, sample_weights=weights) +clusters = kproto.fit_predict(X, categorical=[2]) + +# Print cluster centroids of the trained model. 
+print(kproto.cluster_centroids_) +# Print training statistics +print(kproto.cost_) +print(kproto.n_iter_) + +for s, c in zip(syms, clusters): + print(f"Symbol: {s}, cluster:{c}") From 957beed09eda3ddcba7c74fbe381278d5d22f6aa Mon Sep 17 00:00:00 2001 From: kklein Date: Thu, 10 Mar 2022 07:36:37 +0100 Subject: [PATCH 3/5] Explicitly check for Noneness. --- kmodes/kprototypes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kmodes/kprototypes.py b/kmodes/kprototypes.py index 20888e3..f2c9679 100644 --- a/kmodes/kprototypes.py +++ b/kmodes/kprototypes.py @@ -227,7 +227,7 @@ def labels_cost(Xnum, Xcat, centroids, num_dissim, cat_dissim, gamma, membship=N tot_costs = num_costs + gamma * cat_costs clust = np.argmin(tot_costs) labels[ipoint] = clust - if sample_weights: + if sample_weights is not None: cost += tot_costs[clust] * sample_weights[ipoint] else: cost += tot_costs[clust] return labels, cost def k_prototypes(X, categorical, n_clusters, max_iter, num_dissim, cat_dissim, gamma, init, n_init, verbose, random_state, n_jobs, sample_weights=None): """k-prototypes algorithm""" random_state = check_random_state(random_state) if sparse.issparse(X): From d0f4469414a7d8bedabc9d4d14a4cece95be105b Mon Sep 17 00:00:00 2001 From: kklein Date: Mon, 14 Mar 2022 12:48:10 +0100 Subject: [PATCH 4/5] Validate input and move sample_weights to function parameters. 
--- kmodes/kprototypes.py | 40 ++++++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/kmodes/kprototypes.py b/kmodes/kprototypes.py index f2c9679..8703be9 100644 --- a/kmodes/kprototypes.py +++ b/kmodes/kprototypes.py @@ -116,13 +116,12 @@ class KPrototypes(kmodes.KModes): def __init__(self, n_clusters=8, max_iter=100, num_dissim=euclidean_dissim, cat_dissim=matching_dissim, init='Cao', n_init=10, gamma=None, - verbose=0, random_state=None, n_jobs=1, sample_weights=None): + verbose=0, random_state=None, n_jobs=1): super(KPrototypes, self).__init__(n_clusters, max_iter, cat_dissim, init, verbose=verbose, random_state=random_state, n_jobs=n_jobs) self.num_dissim = num_dissim - self.sample_weights = sample_weights self.gamma = gamma self.n_init = n_init if isinstance(self.init, list) and self.n_init > 1: @@ -131,13 +130,17 @@ def __init__(self, n_clusters=8, max_iter=100, num_dissim=euclidean_dissim, "Setting n_init to 1.") self.n_init = 1 - def fit(self, X, y=None, categorical=None): + def fit(self, X, y=None, categorical=None, sample_weights=None): """Compute k-prototypes clustering. Parameters ---------- X : array-like, shape=[n_samples, n_features] categorical : Index of columns that contain categorical data + sample_weights : sequence, default: None + The weight that is assigned to each individual data point when + updating the centroids. + """ if categorical is not None: assert isinstance(categorical, (int, list, tuple)), f"The 'categorical' \ @@ -148,6 +151,8 @@ def fit(self, X, y=None, categorical=None): X = pandas_to_numpy(X) random_state = check_random_state(self.random_state) + _validate_sample_weights(sample_weights, n_samples=X.shape[0]) + # If self.gamma is None, gamma will be automatically determined from # the data. The function below returns its value. 
self._enc_cluster_centroids, self._enc_map, self.labels_, self.cost_, \ @@ -164,12 +169,12 @@ def fit(self, X, y=None, categorical=None): self.verbose, random_state, self.n_jobs, - self.sample_weights, + sample_weights, ) return self - def predict(self, X, categorical=None): + def predict(self, X, categorical=None, sample_weights=None): """Predict the closest cluster each sample in X belongs to. Parameters @@ -196,7 +201,7 @@ def predict(self, X, categorical=None): Xnum, Xcat = check_array(Xnum), check_array(Xcat, dtype=None) Xcat, _ = encode_features(Xcat, enc_map=self._enc_map) return labels_cost(Xnum, Xcat, self._enc_cluster_centroids, - self.num_dissim, self.cat_dissim, self.gamma, self.sample_weights)[0] + self.num_dissim, self.cat_dissim, self.gamma, sample_weights)[0] @property def cluster_centroids_(self): @@ -209,7 +214,8 @@ def cluster_centroids_(self): "because the model is not yet fitted.") -def labels_cost(Xnum, Xcat, centroids, num_dissim, cat_dissim, gamma, membship=None, sample_weights=None): +def labels_cost(Xnum, Xcat, centroids, num_dissim, cat_dissim, gamma, + membship=None, sample_weights=None): """Calculate labels and cost function given a matrix of points and a list of centroids for the k-prototypes algorithm. """ @@ -291,7 +297,8 @@ def k_prototypes(X, categorical, n_clusters, max_iter, num_dissim, cat_dissim, results.append(_k_prototypes_single(Xnum, Xcat, nnumattrs, ncatattrs, n_clusters, n_points, max_iter, num_dissim, cat_dissim, gamma, - init, init_no, verbose, seeds[init_no], sample_weights)) + init, init_no, verbose, seeds[init_no], + sample_weights)) else: results = Parallel(n_jobs=n_jobs, verbose=0)( delayed(_k_prototypes_single)(Xnum, Xcat, nnumattrs, ncatattrs, @@ -369,10 +376,10 @@ def _k_prototypes_single(Xnum, Xcat, nnumattrs, ncatattrs, n_clusters, n_points, # can do k-means on the numerical attributes. 
cl_attr_sum = np.zeros((n_clusters, nnumattrs), dtype=np.float64) # Same for the membership sum per cluster - cl_memb_sum = np.zeros(n_clusters, dtype=int) + cl_memb_sum = np.zeros(n_clusters, dtype=np.float64) # cl_attr_freq is a list of lists with dictionaries that contain # the frequencies of values per cluster and attribute. - cl_attr_freq = [[defaultdict(int) for _ in range(ncatattrs)] + cl_attr_freq = [[defaultdict(float) for _ in range(ncatattrs)] for _ in range(n_clusters)] for ipoint in range(n_points): sample_weight = sample_weights[ipoint] if sample_weights is not None else 1 @@ -520,3 +527,16 @@ def _split_num_cat(X, categorical): if ii not in categorical]]).astype(np.float64) Xcat = np.asanyarray(X[:, categorical]) return Xnum, Xcat + + +def _validate_sample_weights(sample_weights, n_samples): + if sample_weights is not None: + if len(sample_weights) != n_samples: + raise ValueError("Should have as many sample_weights as samples.") + if any( + not isinstance(weight, int) and not isinstance(weight, float) + for weight in sample_weights + ): + raise ValueError("sample_weights should either be int or floats.") + if any(sample < 0 for sample in sample_weights): + raise ValueError("sample_weights should be positive.") From 8068a91980b90ca3d4557b73b3799d338b09b29f Mon Sep 17 00:00:00 2001 From: kklein Date: Mon, 14 Mar 2022 12:48:23 +0100 Subject: [PATCH 5/5] Add tests for sample weights. 
--- kmodes/tests/test_kprototypes.py | 55 ++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/kmodes/tests/test_kprototypes.py b/kmodes/tests/test_kprototypes.py index 26d1d6d..05fc495 100644 --- a/kmodes/tests/test_kprototypes.py +++ b/kmodes/tests/test_kprototypes.py @@ -333,3 +333,58 @@ def test_kprototypes_ninit(self): self.assertEqual(kmodes.n_init, 10) kmodes = kprototypes.KPrototypes(n_init=10, init=[np.array([]), np.array([])]) self.assertEqual(kmodes.n_init, 1) + + def test_kprototypes_sample_weights_validation(self): + kproto = kprototypes.KPrototypes(n_clusters=4, init='Cao', verbose=2) + sample_weights_too_few = [1] * 11 + with self.assertRaises(ValueError): + kproto.fit_predict( + STOCKS, categorical=[1, 2], sample_weights=sample_weights_too_few + ) + sample_weights_negative = [-1] + [1] * 11 + with self.assertRaises(ValueError): + kproto.fit_predict( + STOCKS, categorical=[1, 2], sample_weights=sample_weights_negative + ) + sample_weights_non_numerical = [None] + [1] * 11 + with self.assertRaises(ValueError): + kproto.fit_predict( + STOCKS, categorical=[1, 2], sample_weights=sample_weights_non_numerical + ) + + def test_k_prototypes_sample_weights_all_but_one_zero(self): + """Test whether centroid collapses to single datapoint with non-zero weight.""" + kproto = kprototypes.KPrototypes(n_clusters=1, init='Cao', verbose=2) + n_samples = 2 + for indicator in range(n_samples): + sample_weights = np.zeros(n_samples) + sample_weights[indicator] = 1 + model = kproto.fit( + STOCKS[:n_samples, :], categorical=[1, 2], sample_weights=sample_weights + ) + self.assertTrue((model.cluster_centroids_[0, :] == STOCKS[indicator, :]).all()) + + def test_k_prototypes_sample_weights_unchanged(self): + """Test whether centroid definition remains unchanged when scaling uniformly.""" + categorical = [1, 2] + kproto_baseline = kprototypes.KPrototypes(n_clusters=3, init='Cao') + model_baseline = kproto_baseline.fit(STOCKS, categorical=categorical) + 
expected = set(tuple(row) for row in model_baseline.cluster_centroids_) + # The exact value of a weight shouldn't matter if equal for all samples. + for weight in [.5, .1, 1, 1., 2]: + sample_weights = [weight] * STOCKS.shape[0] + kproto_weighted = kprototypes.KPrototypes(n_clusters=3, init='Cao') + model_weighted = kproto_weighted.fit( + STOCKS, categorical=categorical, sample_weights=sample_weights + ) + factual = set(tuple(row) for row in model_weighted.cluster_centroids_) + # Centroids might be ordered differently. To compare the centroids, we first + # sort them. + tuple_pairs = zip(sorted(expected), sorted(factual)) + for tuple_expected, tuple_factual in tuple_pairs: + # Test numerical features for almost equality, categorical features for + # actual equality. + self.assertAlmostEqual(float(tuple_expected[0]), float(tuple_factual[0])) + for index in categorical: + self.assertTrue(tuple_expected[index] == tuple_factual[index]) +