From dbd423ec278a04eb9e12355a24f22c21f1a2619d Mon Sep 17 00:00:00 2001 From: Nico de Vos Date: Sat, 6 Mar 2021 21:57:56 -0800 Subject: [PATCH] move labels_cost to the top, as it's public; make some functions protected --- kmodes/kmodes.py | 44 ++++++++++++------------ kmodes/kprototypes.py | 79 ++++++++++++++++++++++--------------------- 2 files changed, 62 insertions(+), 61 deletions(-) diff --git a/kmodes/kmodes.py b/kmodes/kmodes.py index 76b526f..46ffbc5 100644 --- a/kmodes/kmodes.py +++ b/kmodes/kmodes.py @@ -124,7 +124,7 @@ def fit(self, X, y=None, **kwargs): random_state = check_random_state(self.random_state) self._enc_cluster_centroids, self._enc_map, self.labels_, self.cost_, \ - self.n_iter_, self.epoch_costs_ = k_modes( + self.n_iter_, self.epoch_costs_ = _k_modes( X, self.n_clusters, self.max_iter, @@ -180,7 +180,26 @@ def cluster_centroids_(self): "because the model is not yet fitted.") -def k_modes(X, n_clusters, max_iter, dissim, init, n_init, verbose, random_state, n_jobs): +def labels_cost(X, centroids, dissim, membship=None): + """Calculate labels and cost function given a matrix of points and + a list of centroids for the k-modes algorithm. + """ + + X = check_array(X) + + n_points = X.shape[0] + cost = 0. + labels = np.empty(n_points, dtype=np.uint16) + for ipoint, curpoint in enumerate(X): + diss = dissim(centroids, curpoint, X=X, membship=membship) + clust = np.argmin(diss) + labels[ipoint] = clust + cost += diss[clust] + + return labels, cost + + +def _k_modes(X, n_clusters, max_iter, dissim, init, n_init, verbose, random_state, n_jobs): """k-modes algorithm""" random_state = check_random_state(random_state) if sparse.issparse(X): @@ -322,7 +341,7 @@ def _k_modes_iter(X, centroids, cl_attr_freq, membship, dissim, random_state): moves += 1 old_clust = np.argwhere(membship[:, ipoint])[0][0] - cl_attr_freq, membship, centroids = move_point_cat( + cl_attr_freq, membship, centroids = _move_point_cat( curpoint, ipoint, clust, old_clust, cl_attr_freq, membship, centroids ) @@ -370,22 +389,3 @@ def _move_point_cat(point, ipoint, to_clust, from_clust, cl_attr_freq, centroids[from_clust][iattr] = get_max_value_key(from_attr_counts) return cl_attr_freq, membship, centroids - - -def labels_cost(X, centroids, dissim, membship=None): - """Calculate labels and cost function given a matrix of points and - a list of centroids for the k-modes algorithm. - """ - - X = check_array(X) - - n_points = X.shape[0] - cost = 0. - labels = np.empty(n_points, dtype=np.uint16) - for ipoint, curpoint in enumerate(X): - diss = dissim(centroids, curpoint, X=X, membship=membship) - clust = np.argmin(diss) - labels[ipoint] = clust - cost += diss[clust] - - return labels, cost diff --git a/kmodes/kprototypes.py b/kmodes/kprototypes.py index d13fb36..df5ae1e 100644 --- a/kmodes/kprototypes.py +++ b/kmodes/kprototypes.py @@ -150,7 +150,7 @@ def fit(self, X, y=None, categorical=None): # If self.gamma is None, gamma will be automatically determined from # the data. The function below returns its value. self._enc_cluster_centroids, self._enc_map, self.labels_, self.cost_, \ - self.n_iter_, self.epoch_costs_, self.gamma = k_prototypes( + self.n_iter_, self.epoch_costs_, self.gamma = _k_prototypes( X, categorical, self.n_clusters, @@ -208,8 +208,31 @@ def cluster_centroids_(self): "because the model is not yet fitted.") -def k_prototypes(X, categorical, n_clusters, max_iter, num_dissim, cat_dissim, - gamma, init, n_init, verbose, random_state, n_jobs): +def labels_cost(Xnum, Xcat, centroids, num_dissim, cat_dissim, gamma, membship=None): + """Calculate labels and cost function given a matrix of points and + a list of centroids for the k-prototypes algorithm. + """ + + n_points = Xnum.shape[0] + Xnum = check_array(Xnum) + + cost = 0. + labels = np.empty(n_points, dtype=np.uint16) + for ipoint in range(n_points): + # Numerical cost = sum of Euclidean distances + num_costs = num_dissim(centroids[0], Xnum[ipoint]) + cat_costs = cat_dissim(centroids[1], Xcat[ipoint], X=Xcat, membship=membship) + # Gamma relates the categorical cost to the numerical cost. + tot_costs = num_costs + gamma * cat_costs + clust = np.argmin(tot_costs) + labels[ipoint] = clust + cost += tot_costs[clust] + + return labels, cost + + +def _k_prototypes(X, categorical, n_clusters, max_iter, num_dissim, cat_dissim, + gamma, init, n_init, verbose, random_state, n_jobs): """k-prototypes algorithm""" random_state = check_random_state(random_state) if sparse.issparse(X): @@ -261,16 +284,16 @@ def k_prototypes(X, categorical, n_clusters, max_iter, num_dissim, cat_dissim, seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init) if n_jobs == 1: for init_no in range(n_init): - results.append(k_prototypes_single(Xnum, Xcat, nnumattrs, ncatattrs, - n_clusters, n_points, max_iter, - num_dissim, cat_dissim, gamma, - init, init_no, verbose, seeds[init_no])) + results.append(_k_prototypes_single(Xnum, Xcat, nnumattrs, ncatattrs, + n_clusters, n_points, max_iter, + num_dissim, cat_dissim, gamma, + init, init_no, verbose, seeds[init_no])) else: results = Parallel(n_jobs=n_jobs, verbose=0)( - delayed(k_prototypes_single)(Xnum, Xcat, nnumattrs, ncatattrs, - n_clusters, n_points, max_iter, - num_dissim, cat_dissim, gamma, - init, init_no, verbose, seed) + delayed(_k_prototypes_single)(Xnum, Xcat, nnumattrs, ncatattrs, + n_clusters, n_points, max_iter, + num_dissim, cat_dissim, gamma, + init, init_no, verbose, seed) for init_no, seed in enumerate(seeds)) all_centroids, all_labels, all_costs, all_n_iters, all_epoch_costs = zip(*results) @@ -283,9 +306,9 @@ def k_prototypes(X, categorical, n_clusters, max_iter, num_dissim, cat_dissim, all_n_iters[best], all_epoch_costs[best], gamma -def k_prototypes_single(Xnum, Xcat, nnumattrs, ncatattrs, n_clusters, n_points, - max_iter, num_dissim, cat_dissim, gamma, init, init_no, - verbose, random_state): +def _k_prototypes_single(Xnum, Xcat, nnumattrs, ncatattrs, n_clusters, n_points, + max_iter, num_dissim, cat_dissim, gamma, init, init_no, + verbose, random_state): # For numerical part of initialization, we don't have a guarantee # that there is not an empty cluster, so we need to retry until # there is none. @@ -435,7 +458,8 @@ def _k_prototypes_iter(Xnum, Xcat, centroids, cl_attr_sum, cl_memb_sum, cl_attr_ cl_attr_sum, cl_memb_sum = _move_point_num( Xnum[ipoint], clust, old_clust, cl_attr_sum, cl_memb_sum ) - cl_attr_freq, membship, centroids[1] = kmodes.move_point_cat( + # noinspection PyProtectedMember + cl_attr_freq, membship, centroids[1] = kmodes._move_point_cat( Xcat[ipoint], ipoint, clust, old_clust, cl_attr_freq, membship, centroids[1] ) @@ -459,7 +483,7 @@ def _k_prototypes_iter(Xnum, Xcat, centroids, cl_attr_sum, cl_memb_sum, cl_attr_ cl_attr_sum, cl_memb_sum = _move_point_num( Xnum[rindx], old_clust, from_clust, cl_attr_sum, cl_memb_sum ) - cl_attr_freq, membship, centroids[1] = kmodes.move_point_cat( + cl_attr_freq, membship, centroids[1] = kmodes._move_point_cat( Xcat[rindx], rindx, old_clust, from_clust, cl_attr_freq, membship, centroids[1] ) @@ -490,26 +514,3 @@ def _split_num_cat(X, categorical): if ii not in categorical]]).astype(np.float64) Xcat = np.asanyarray(X[:, categorical]) return Xnum, Xcat - - -def labels_cost(Xnum, Xcat, centroids, num_dissim, cat_dissim, gamma, membship=None): - """Calculate labels and cost function given a matrix of points and - a list of centroids for the k-prototypes algorithm. - """ - - n_points = Xnum.shape[0] - Xnum = check_array(Xnum) - - cost = 0. - labels = np.empty(n_points, dtype=np.uint16) - for ipoint in range(n_points): - # Numerical cost = sum of Euclidean distances - num_costs = num_dissim(centroids[0], Xnum[ipoint]) - cat_costs = cat_dissim(centroids[1], Xcat[ipoint], X=Xcat, membship=membship) - # Gamma relates the categorical cost to the numerical cost. - tot_costs = num_costs + gamma * cat_costs - clust = np.argmin(tot_costs) - labels[ipoint] = clust - cost += tot_costs[clust] - - return labels, cost