Skip to content

Commit

Permalink
move labels_cost to the top, as it's public; make some functions protected
Browse files Browse the repository at this point in the history
  • Loading branch information
nicodv committed Mar 7, 2021
1 parent b65184e commit dbd423e
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 61 deletions.
44 changes: 22 additions & 22 deletions kmodes/kmodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ def fit(self, X, y=None, **kwargs):

random_state = check_random_state(self.random_state)
self._enc_cluster_centroids, self._enc_map, self.labels_, self.cost_, \
self.n_iter_, self.epoch_costs_ = k_modes(
self.n_iter_, self.epoch_costs_ = _k_modes(
X,
self.n_clusters,
self.max_iter,
Expand Down Expand Up @@ -180,7 +180,26 @@ def cluster_centroids_(self):
"because the model is not yet fitted.")


def k_modes(X, n_clusters, max_iter, dissim, init, n_init, verbose, random_state, n_jobs):
def labels_cost(X, centroids, dissim, membship=None):
    """Assign every point to its closest centroid and total the cost
    for the k-modes algorithm.

    Parameters
    ----------
    X : array-like
        Matrix of points, one point per row. It is validated with
        ``check_array`` before use.
    centroids : list or array
        Cluster centroids, handed unchanged to ``dissim``.
    dissim : callable
        Called as ``dissim(centroids, point, X=X, membship=membship)``;
        must return an indexable holding one dissimilarity per centroid.
    membship : optional
        Forwarded untouched to ``dissim``.

    Returns
    -------
    labels : numpy.ndarray of dtype uint16
        Index of the selected centroid for each point.
    cost : float
        Sum, over all points, of the dissimilarity to the selected centroid.
    """
    X = check_array(X)

    n_rows = X.shape[0]
    assigned = np.empty(n_rows, dtype=np.uint16)
    total = 0.
    for row in range(n_rows):
        distances = dissim(centroids, X[row], X=X, membship=membship)
        # np.argmin resolves ties in favour of the lowest centroid index.
        nearest = np.argmin(distances)
        assigned[row] = nearest
        total += distances[nearest]

    return assigned, total


def _k_modes(X, n_clusters, max_iter, dissim, init, n_init, verbose, random_state, n_jobs):
"""k-modes algorithm"""
random_state = check_random_state(random_state)
if sparse.issparse(X):
Expand Down Expand Up @@ -322,7 +341,7 @@ def _k_modes_iter(X, centroids, cl_attr_freq, membship, dissim, random_state):
moves += 1
old_clust = np.argwhere(membship[:, ipoint])[0][0]

cl_attr_freq, membship, centroids = move_point_cat(
cl_attr_freq, membship, centroids = _move_point_cat(
curpoint, ipoint, clust, old_clust, cl_attr_freq, membship, centroids
)

Expand Down Expand Up @@ -370,22 +389,3 @@ def _move_point_cat(point, ipoint, to_clust, from_clust, cl_attr_freq,
centroids[from_clust][iattr] = get_max_value_key(from_attr_counts)

return cl_attr_freq, membship, centroids


def labels_cost(X, centroids, dissim, membship=None):
    """Assign every point to its closest centroid and total the cost
    for the k-modes algorithm.

    Parameters
    ----------
    X : array-like
        Matrix of points, one point per row. It is validated with
        ``check_array`` before use.
    centroids : list or array
        Cluster centroids, handed unchanged to ``dissim``.
    dissim : callable
        Called as ``dissim(centroids, point, X=X, membship=membship)``;
        must return an indexable holding one dissimilarity per centroid.
    membship : optional
        Forwarded untouched to ``dissim``.

    Returns
    -------
    labels : numpy.ndarray of dtype uint16
        Index of the selected centroid for each point.
    cost : float
        Sum, over all points, of the dissimilarity to the selected centroid.
    """
    X = check_array(X)

    n_rows = X.shape[0]
    assigned = np.empty(n_rows, dtype=np.uint16)
    total = 0.
    for row in range(n_rows):
        distances = dissim(centroids, X[row], X=X, membship=membship)
        # np.argmin resolves ties in favour of the lowest centroid index.
        nearest = np.argmin(distances)
        assigned[row] = nearest
        total += distances[nearest]

    return assigned, total
79 changes: 40 additions & 39 deletions kmodes/kprototypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ def fit(self, X, y=None, categorical=None):
# If self.gamma is None, gamma will be automatically determined from
# the data. The function below returns its value.
self._enc_cluster_centroids, self._enc_map, self.labels_, self.cost_, \
self.n_iter_, self.epoch_costs_, self.gamma = k_prototypes(
self.n_iter_, self.epoch_costs_, self.gamma = _k_prototypes(
X,
categorical,
self.n_clusters,
Expand Down Expand Up @@ -208,8 +208,31 @@ def cluster_centroids_(self):
"because the model is not yet fitted.")


def k_prototypes(X, categorical, n_clusters, max_iter, num_dissim, cat_dissim,
gamma, init, n_init, verbose, random_state, n_jobs):
def labels_cost(Xnum, Xcat, centroids, num_dissim, cat_dissim, gamma, membship=None):
    """Calculate labels and cost function given a matrix of points and
    a list of centroids for the k-prototypes algorithm.

    Parameters
    ----------
    Xnum : array-like
        Numerical attributes, one point per row. Validated with
        ``check_array`` before its shape is read.
    Xcat : array-like
        Categorical attributes, indexed by the same row number as ``Xnum``.
        It is used as passed in (no validation is done here).
    centroids : sequence
        ``centroids[0]`` is passed to ``num_dissim`` and ``centroids[1]``
        to ``cat_dissim``.
    num_dissim : callable
        Called as ``num_dissim(centroids[0], Xnum[i])``; returns one
        numerical dissimilarity per centroid.
    cat_dissim : callable
        Called as ``cat_dissim(centroids[1], Xcat[i], X=Xcat,
        membship=membship)``; returns one categorical dissimilarity per
        centroid.
    gamma : float
        Weight applied to the categorical cost before it is added to the
        numerical cost.
    membship : optional
        Forwarded untouched to ``cat_dissim``.

    Returns
    -------
    labels : numpy.ndarray of dtype uint16
        Index of the selected centroid for each point.
    cost : float
        Sum, over all points, of the combined cost to the selected centroid.
    """

    # Validate first: check_array converts array-likes (e.g. nested lists)
    # that have no ``shape`` attribute, so the shape must be read afterwards.
    Xnum = check_array(Xnum)
    n_points = Xnum.shape[0]

    cost = 0.
    labels = np.empty(n_points, dtype=np.uint16)
    for ipoint in range(n_points):
        # Numerical cost per centroid, as defined by num_dissim.
        num_costs = num_dissim(centroids[0], Xnum[ipoint])
        cat_costs = cat_dissim(centroids[1], Xcat[ipoint], X=Xcat, membship=membship)
        # Gamma relates the categorical cost to the numerical cost.
        tot_costs = num_costs + gamma * cat_costs
        # np.argmin resolves ties in favour of the lowest centroid index.
        clust = np.argmin(tot_costs)
        labels[ipoint] = clust
        cost += tot_costs[clust]

    return labels, cost


def _k_prototypes(X, categorical, n_clusters, max_iter, num_dissim, cat_dissim,
gamma, init, n_init, verbose, random_state, n_jobs):
"""k-prototypes algorithm"""
random_state = check_random_state(random_state)
if sparse.issparse(X):
Expand Down Expand Up @@ -261,16 +284,16 @@ def k_prototypes(X, categorical, n_clusters, max_iter, num_dissim, cat_dissim,
seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)
if n_jobs == 1:
for init_no in range(n_init):
results.append(k_prototypes_single(Xnum, Xcat, nnumattrs, ncatattrs,
n_clusters, n_points, max_iter,
num_dissim, cat_dissim, gamma,
init, init_no, verbose, seeds[init_no]))
results.append(_k_prototypes_single(Xnum, Xcat, nnumattrs, ncatattrs,
n_clusters, n_points, max_iter,
num_dissim, cat_dissim, gamma,
init, init_no, verbose, seeds[init_no]))
else:
results = Parallel(n_jobs=n_jobs, verbose=0)(
delayed(k_prototypes_single)(Xnum, Xcat, nnumattrs, ncatattrs,
n_clusters, n_points, max_iter,
num_dissim, cat_dissim, gamma,
init, init_no, verbose, seed)
delayed(_k_prototypes_single)(Xnum, Xcat, nnumattrs, ncatattrs,
n_clusters, n_points, max_iter,
num_dissim, cat_dissim, gamma,
init, init_no, verbose, seed)
for init_no, seed in enumerate(seeds))
all_centroids, all_labels, all_costs, all_n_iters, all_epoch_costs = zip(*results)

Expand All @@ -283,9 +306,9 @@ def k_prototypes(X, categorical, n_clusters, max_iter, num_dissim, cat_dissim,
all_n_iters[best], all_epoch_costs[best], gamma


def k_prototypes_single(Xnum, Xcat, nnumattrs, ncatattrs, n_clusters, n_points,
max_iter, num_dissim, cat_dissim, gamma, init, init_no,
verbose, random_state):
def _k_prototypes_single(Xnum, Xcat, nnumattrs, ncatattrs, n_clusters, n_points,
max_iter, num_dissim, cat_dissim, gamma, init, init_no,
verbose, random_state):
# For numerical part of initialization, we don't have a guarantee
# that there is not an empty cluster, so we need to retry until
# there is none.
Expand Down Expand Up @@ -435,7 +458,8 @@ def _k_prototypes_iter(Xnum, Xcat, centroids, cl_attr_sum, cl_memb_sum, cl_attr_
cl_attr_sum, cl_memb_sum = _move_point_num(
Xnum[ipoint], clust, old_clust, cl_attr_sum, cl_memb_sum
)
cl_attr_freq, membship, centroids[1] = kmodes.move_point_cat(
# noinspection PyProtectedMember
cl_attr_freq, membship, centroids[1] = kmodes._move_point_cat(
Xcat[ipoint], ipoint, clust, old_clust,
cl_attr_freq, membship, centroids[1]
)
Expand All @@ -459,7 +483,7 @@ def _k_prototypes_iter(Xnum, Xcat, centroids, cl_attr_sum, cl_memb_sum, cl_attr_
cl_attr_sum, cl_memb_sum = _move_point_num(
Xnum[rindx], old_clust, from_clust, cl_attr_sum, cl_memb_sum
)
cl_attr_freq, membship, centroids[1] = kmodes.move_point_cat(
cl_attr_freq, membship, centroids[1] = kmodes._move_point_cat(
Xcat[rindx], rindx, old_clust, from_clust,
cl_attr_freq, membship, centroids[1]
)
Expand Down Expand Up @@ -490,26 +514,3 @@ def _split_num_cat(X, categorical):
if ii not in categorical]]).astype(np.float64)
Xcat = np.asanyarray(X[:, categorical])
return Xnum, Xcat


def labels_cost(Xnum, Xcat, centroids, num_dissim, cat_dissim, gamma, membship=None):
    """Calculate labels and cost function given a matrix of points and
    a list of centroids for the k-prototypes algorithm.

    Parameters
    ----------
    Xnum : array-like
        Numerical attributes, one point per row. Validated with
        ``check_array`` before its shape is read.
    Xcat : array-like
        Categorical attributes, indexed by the same row number as ``Xnum``.
        It is used as passed in (no validation is done here).
    centroids : sequence
        ``centroids[0]`` is passed to ``num_dissim`` and ``centroids[1]``
        to ``cat_dissim``.
    num_dissim : callable
        Called as ``num_dissim(centroids[0], Xnum[i])``; returns one
        numerical dissimilarity per centroid.
    cat_dissim : callable
        Called as ``cat_dissim(centroids[1], Xcat[i], X=Xcat,
        membship=membship)``; returns one categorical dissimilarity per
        centroid.
    gamma : float
        Weight applied to the categorical cost before it is added to the
        numerical cost.
    membship : optional
        Forwarded untouched to ``cat_dissim``.

    Returns
    -------
    labels : numpy.ndarray of dtype uint16
        Index of the selected centroid for each point.
    cost : float
        Sum, over all points, of the combined cost to the selected centroid.
    """

    # Validate first: check_array converts array-likes (e.g. nested lists)
    # that have no ``shape`` attribute, so the shape must be read afterwards.
    Xnum = check_array(Xnum)
    n_points = Xnum.shape[0]

    cost = 0.
    labels = np.empty(n_points, dtype=np.uint16)
    for ipoint in range(n_points):
        # Numerical cost per centroid, as defined by num_dissim.
        num_costs = num_dissim(centroids[0], Xnum[ipoint])
        cat_costs = cat_dissim(centroids[1], Xcat[ipoint], X=Xcat, membship=membship)
        # Gamma relates the categorical cost to the numerical cost.
        tot_costs = num_costs + gamma * cat_costs
        # np.argmin resolves ties in favour of the lowest centroid index.
        clust = np.argmin(tot_costs)
        labels[ipoint] = clust
        cost += tot_costs[clust]

    return labels, cost

0 comments on commit dbd423e

Please sign in to comment.