Skip to content

Commit

Permalink
Merge 0e922ed into 370d64b
Browse files Browse the repository at this point in the history
  • Loading branch information
kklein committed Mar 14, 2022
2 parents 370d64b + 0e922ed commit 1599ca8
Show file tree
Hide file tree
Showing 5 changed files with 140 additions and 29 deletions.
4 changes: 4 additions & 0 deletions examples/people.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Marc, .2, -2, A
Martine, -.2, 0, A
Max, .2, .2, A
Maxine, .2, .1, A
25 changes: 25 additions & 0 deletions examples/people.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#!/usr/bin/env python
"""Weighted k-prototypes example on the small ``people.csv`` data set.

Each row of ``people.csv`` holds a name, two numerical attributes and one
categorical attribute. The last two rows are given a much larger sample
weight than the first two, so they should dominate the cluster centroids.
"""

import numpy as np
from kmodes.kprototypes import KPrototypes

# First column: names of the people; remaining columns: their attributes.
names = np.genfromtxt('people.csv', dtype=str, delimiter=',')[:, 0]
X = np.genfromtxt('people.csv', dtype=object, delimiter=',')[:, 1:]
# Columns 0 and 1 of X are numerical, column 2 is categorical.
X[:, :2] = X[:, :2].astype(float)

# One weight per row of the CSV file; rows 2 and 3 count 100 times as much
# as rows 0 and 1.
weights = [1, 1, 100, 100]

# Sample weights are an argument of fitting, not of the estimator itself.
kproto = KPrototypes(n_clusters=3, init='Cao', verbose=2)
clusters = kproto.fit_predict(X, categorical=[2], sample_weights=weights)

# Print cluster centroids of the trained model.
print(kproto.cluster_centroids_)
# Print training statistics
print(kproto.cost_)
print(kproto.n_iter_)

for s, c in zip(names, clusters):
    print(f"Symbol: {s}, cluster:{c}")
6 changes: 3 additions & 3 deletions kmodes/kmodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -361,7 +361,7 @@ def _k_modes_iter(X, centroids, cl_attr_freq, membship, dissim, random_state):


def _move_point_cat(point, ipoint, to_clust, from_clust, cl_attr_freq,
membship, centroids):
membship, centroids, sample_weight=1):
"""Move point between clusters, categorical attributes."""
membship[to_clust, ipoint] = 1
membship[from_clust, ipoint] = 0
Expand All @@ -371,7 +371,7 @@ def _move_point_cat(point, ipoint, to_clust, from_clust, cl_attr_freq,
from_attr_counts = cl_attr_freq[from_clust][iattr]

# Increment the attribute count for the new "to" cluster
to_attr_counts[curattr] += 1
to_attr_counts[curattr] += sample_weight

current_attribute_value_freq = to_attr_counts[curattr]
current_centroid_value = centroids[to_clust][iattr]
Expand All @@ -381,7 +381,7 @@ def _move_point_cat(point, ipoint, to_clust, from_clust, cl_attr_freq,
centroids[to_clust][iattr] = curattr

# Decrement the attribute count for the old "from" cluster
from_attr_counts[curattr] -= 1
from_attr_counts[curattr] -= sample_weight

old_centroid_value = centroids[from_clust][iattr]
if old_centroid_value == curattr:
Expand Down
79 changes: 53 additions & 26 deletions kmodes/kprototypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,13 +130,17 @@ def __init__(self, n_clusters=8, max_iter=100, num_dissim=euclidean_dissim,
"Setting n_init to 1.")
self.n_init = 1

def fit(self, X, y=None, categorical=None):
def fit(self, X, y=None, categorical=None, sample_weights=None):
"""Compute k-prototypes clustering.
Parameters
----------
X : array-like, shape=[n_samples, n_features]
categorical : Index of columns that contain categorical data
sample_weights : sequence, default: None
The weight that is assigned to each individual data point when
updating the centroids.
"""
if categorical is not None:
assert isinstance(categorical, (int, list, tuple)), f"The 'categorical' \
Expand All @@ -147,6 +151,8 @@ def fit(self, X, y=None, categorical=None):
X = pandas_to_numpy(X)

random_state = check_random_state(self.random_state)
_validate_sample_weights(sample_weights, n_samples=X.shape[0])

# If self.gamma is None, gamma will be automatically determined from
# the data. The function below returns its value.
self._enc_cluster_centroids, self._enc_map, self.labels_, self.cost_, \
Expand All @@ -162,12 +168,13 @@ def fit(self, X, y=None, categorical=None):
self.n_init,
self.verbose,
random_state,
self.n_jobs
self.n_jobs,
sample_weights,
)

return self

def predict(self, X, categorical=None):
def predict(self, X, categorical=None, sample_weights=None):
"""Predict the closest cluster each sample in X belongs to.
Parameters
Expand All @@ -194,7 +201,7 @@ def predict(self, X, categorical=None):
Xnum, Xcat = check_array(Xnum), check_array(Xcat, dtype=None)
Xcat, _ = encode_features(Xcat, enc_map=self._enc_map)
return labels_cost(Xnum, Xcat, self._enc_cluster_centroids,
self.num_dissim, self.cat_dissim, self.gamma)[0]
self.num_dissim, self.cat_dissim, self.gamma, sample_weights)[0]

@property
def cluster_centroids_(self):
Expand All @@ -207,7 +214,8 @@ def cluster_centroids_(self):
"because the model is not yet fitted.")


def labels_cost(Xnum, Xcat, centroids, num_dissim, cat_dissim, gamma, membship=None):
def labels_cost(Xnum, Xcat, centroids, num_dissim, cat_dissim, gamma,
membship=None, sample_weights=None):
"""Calculate labels and cost function given a matrix of points and
a list of centroids for the k-prototypes algorithm.
"""
Expand All @@ -225,13 +233,16 @@ def labels_cost(Xnum, Xcat, centroids, num_dissim, cat_dissim, gamma, membship=N
tot_costs = num_costs + gamma * cat_costs
clust = np.argmin(tot_costs)
labels[ipoint] = clust
cost += tot_costs[clust]
if sample_weights is not None:
cost += tot_costs[clust] * sample_weights[ipoint]
else:
cost += tot_costs[clust]

return labels, cost


def k_prototypes(X, categorical, n_clusters, max_iter, num_dissim, cat_dissim,
gamma, init, n_init, verbose, random_state, n_jobs):
gamma, init, n_init, verbose, random_state, n_jobs, sample_weights=None):
"""k-prototypes algorithm"""
random_state = check_random_state(random_state)
if sparse.issparse(X):
Expand Down Expand Up @@ -286,13 +297,14 @@ def k_prototypes(X, categorical, n_clusters, max_iter, num_dissim, cat_dissim,
results.append(_k_prototypes_single(Xnum, Xcat, nnumattrs, ncatattrs,
n_clusters, n_points, max_iter,
num_dissim, cat_dissim, gamma,
init, init_no, verbose, seeds[init_no]))
init, init_no, verbose, seeds[init_no],
sample_weights))
else:
results = Parallel(n_jobs=n_jobs, verbose=0)(
delayed(_k_prototypes_single)(Xnum, Xcat, nnumattrs, ncatattrs,
n_clusters, n_points, max_iter,
num_dissim, cat_dissim, gamma,
init, init_no, verbose, seed)
init, init_no, verbose, seed, sample_weights)
for init_no, seed in enumerate(seeds))
all_centroids, all_labels, all_costs, all_n_iters, all_epoch_costs = zip(*results)

Expand All @@ -307,7 +319,7 @@ def k_prototypes(X, categorical, n_clusters, max_iter, num_dissim, cat_dissim,

def _k_prototypes_single(Xnum, Xcat, nnumattrs, ncatattrs, n_clusters, n_points,
max_iter, num_dissim, cat_dissim, gamma, init, init_no,
verbose, random_state):
verbose, random_state, sample_weights=None):
# For numerical part of initialization, we don't have a guarantee
# that there is not an empty cluster, so we need to retry until
# there is none.
Expand Down Expand Up @@ -364,24 +376,25 @@ def _k_prototypes_single(Xnum, Xcat, nnumattrs, ncatattrs, n_clusters, n_points,
# can do k-means on the numerical attributes.
cl_attr_sum = np.zeros((n_clusters, nnumattrs), dtype=np.float64)
# Same for the membership sum per cluster
cl_memb_sum = np.zeros(n_clusters, dtype=int)
cl_memb_sum = np.zeros(n_clusters, dtype=np.float64)
# cl_attr_freq is a list of lists with dictionaries that contain
# the frequencies of values per cluster and attribute.
cl_attr_freq = [[defaultdict(int) for _ in range(ncatattrs)]
cl_attr_freq = [[defaultdict(float) for _ in range(ncatattrs)]
for _ in range(n_clusters)]
for ipoint in range(n_points):
sample_weight = sample_weights[ipoint] if sample_weights is not None else 1
# Initial assignment to clusters
clust = np.argmin(
num_dissim(centroids[0], Xnum[ipoint]) + gamma *
cat_dissim(centroids[1], Xcat[ipoint], X=Xcat, membship=membship)
)
membship[clust, ipoint] = 1
cl_memb_sum[clust] += 1
cl_memb_sum[clust] += sample_weight
# Count attribute values per cluster.
for iattr, curattr in enumerate(Xnum[ipoint]):
cl_attr_sum[clust, iattr] += curattr
cl_attr_sum[clust, iattr] += curattr * sample_weight
for iattr, curattr in enumerate(Xcat[ipoint]):
cl_attr_freq[clust][iattr][curattr] += 1
cl_attr_freq[clust][iattr][curattr] += sample_weight

# If no empty clusters, then consider initialization finalized.
if membship.sum(axis=1).min() > 0:
Expand Down Expand Up @@ -412,19 +425,19 @@ def _k_prototypes_single(Xnum, Xcat, nnumattrs, ncatattrs, n_clusters, n_points,
converged = False

_, cost = labels_cost(Xnum, Xcat, centroids,
num_dissim, cat_dissim, gamma, membship)
num_dissim, cat_dissim, gamma, membship, sample_weights)

epoch_costs = [cost]
while itr < max_iter and not converged:
itr += 1
centroids, cl_attr_sum, cl_memb_sum, cl_attr_freq, membship, moves = \
_k_prototypes_iter(Xnum, Xcat, centroids, cl_attr_sum, cl_memb_sum,
cl_attr_freq, membship, num_dissim, cat_dissim,
gamma, random_state)
gamma, random_state, sample_weights)

# All points seen in this iteration
labels, ncost = labels_cost(Xnum, Xcat, centroids,
num_dissim, cat_dissim, gamma, membship)
num_dissim, cat_dissim, gamma, membship, sample_weights)
converged = (moves == 0) or (ncost >= cost)
epoch_costs.append(ncost)
cost = ncost
Expand All @@ -436,10 +449,11 @@ def _k_prototypes_single(Xnum, Xcat, nnumattrs, ncatattrs, n_clusters, n_points,


def _k_prototypes_iter(Xnum, Xcat, centroids, cl_attr_sum, cl_memb_sum, cl_attr_freq,
membship, num_dissim, cat_dissim, gamma, random_state):
membship, num_dissim, cat_dissim, gamma, random_state, sample_weights):
"""Single iteration of the k-prototypes algorithm"""
moves = 0
for ipoint in range(Xnum.shape[0]):
sample_weight = sample_weights[ipoint] if sample_weights is not None else 1
clust = np.argmin(
num_dissim(centroids[0], Xnum[ipoint]) +
gamma * cat_dissim(centroids[1], Xcat[ipoint], X=Xcat, membship=membship)
Expand All @@ -455,12 +469,12 @@ def _k_prototypes_iter(Xnum, Xcat, centroids, cl_attr_sum, cl_memb_sum, cl_attr_
# Note that membship gets updated by kmodes.move_point_cat.
# move_point_num only updates things specific to the k-means part.
cl_attr_sum, cl_memb_sum = _move_point_num(
Xnum[ipoint], clust, old_clust, cl_attr_sum, cl_memb_sum
Xnum[ipoint], clust, old_clust, cl_attr_sum, cl_memb_sum, sample_weight
)
# noinspection PyProtectedMember
cl_attr_freq, membship, centroids[1] = kmodes._move_point_cat(
Xcat[ipoint], ipoint, clust, old_clust,
cl_attr_freq, membship, centroids[1]
cl_attr_freq, membship, centroids[1], sample_weight
)

# Update old and new centroids for numerical attributes using
Expand Down Expand Up @@ -490,15 +504,15 @@ def _k_prototypes_iter(Xnum, Xcat, centroids, cl_attr_sum, cl_memb_sum, cl_attr_
return centroids, cl_attr_sum, cl_memb_sum, cl_attr_freq, membship, moves


def _move_point_num(point, to_clust, from_clust, cl_attr_sum, cl_memb_sum):
def _move_point_num(point, to_clust, from_clust, cl_attr_sum, cl_memb_sum, sample_weight=1):
"""Move point between clusters, numerical attributes."""
# Update sum of attributes in cluster.
for iattr, curattr in enumerate(point):
cl_attr_sum[to_clust][iattr] += curattr
cl_attr_sum[from_clust][iattr] -= curattr
cl_attr_sum[to_clust][iattr] += curattr * sample_weight
cl_attr_sum[from_clust][iattr] -= curattr * sample_weight
# Update sums of memberships in cluster
cl_memb_sum[to_clust] += 1
cl_memb_sum[from_clust] -= 1
cl_memb_sum[to_clust] += sample_weight
cl_memb_sum[from_clust] -= sample_weight
return cl_attr_sum, cl_memb_sum


Expand All @@ -513,3 +527,16 @@ def _split_num_cat(X, categorical):
if ii not in categorical]]).astype(np.float64)
Xcat = np.asanyarray(X[:, categorical])
return Xnum, Xcat


def _validate_sample_weights(sample_weights, n_samples):
if sample_weights is not None:
if len(sample_weights) != n_samples:
raise ValueError("Should have as many sample_weights as samples.")
if any(
not isinstance(weight, int) and not isinstance(weight, float)
for weight in sample_weights
):
raise ValueError("sample_weights should either be int or floats.")
if any(sample < 0 for sample in sample_weights):
raise ValueError("sample_weights should be positive.")
55 changes: 55 additions & 0 deletions kmodes/tests/test_kprototypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,3 +333,58 @@ def test_kprototypes_ninit(self):
self.assertEqual(kmodes.n_init, 10)
kmodes = kprototypes.KPrototypes(n_init=10, init=[np.array([]), np.array([])])
self.assertEqual(kmodes.n_init, 1)

def test_kprototypes_sample_weights_validation(self):
    """Invalid ``sample_weights`` must make ``fit_predict`` raise ``ValueError``."""
    kproto = kprototypes.KPrototypes(n_clusters=4, init='Cao', verbose=2)
    invalid_sample_weights = (
        # Too few weights -- assumes STOCKS has more than 11 rows.
        [1] * 11,
        # Contains a negative weight.
        [-1] + [1] * 11,
        # Contains a non-numerical weight.
        [None] + [1] * 11,
    )
    for sample_weights in invalid_sample_weights:
        with self.assertRaises(ValueError):
            kproto.fit_predict(
                STOCKS, categorical=[1, 2], sample_weights=sample_weights
            )

def test_k_prototypes_sample_weights_all_but_one_zero(self):
    """Check that the single centroid equals the only sample with non-zero weight."""
    n_samples = 2
    kproto = kprototypes.KPrototypes(n_clusters=1, init='Cao', verbose=2)
    # Row i of the identity matrix is a weight vector that is 1 for sample i
    # and 0 for every other sample.
    for indicator, sample_weights in enumerate(np.eye(n_samples)):
        model = kproto.fit(
            STOCKS[:n_samples, :], categorical=[1, 2], sample_weights=sample_weights
        )
        centroid = model.cluster_centroids_[0, :]
        self.assertTrue((centroid == STOCKS[indicator, :]).all())

def test_k_prototypes_sample_weights_unchanged(self):
    """Test whether centroid definition remains unchanged when scaling uniformly."""
    categorical = [1, 2]
    kproto_baseline = kprototypes.KPrototypes(n_clusters=3, init='Cao')
    model_baseline = kproto_baseline.fit(STOCKS, categorical=categorical)
    expected = {tuple(row) for row in model_baseline.cluster_centroids_}
    # The exact value of a weight shouldn't matter if equal for all samples.
    for weight in [.5, .1, 1, 1., 2]:
        sample_weights = [weight] * STOCKS.shape[0]
        kproto_weighted = kprototypes.KPrototypes(n_clusters=3, init='Cao')
        model_weighted = kproto_weighted.fit(
            STOCKS, categorical=categorical, sample_weights=sample_weights
        )
        factual = {tuple(row) for row in model_weighted.cluster_centroids_}
        # zip() below stops at the shorter input, so a missing or extra
        # centroid would otherwise go unnoticed.
        self.assertEqual(len(expected), len(factual))
        # Centroids might be ordered differently. To compare the centroids, we first
        # sort them.
        tuple_pairs = zip(sorted(expected), sorted(factual))
        for tuple_expected, tuple_factual in tuple_pairs:
            # Test numerical features for almost equality, categorical features for
            # actual equality.
            self.assertAlmostEqual(float(tuple_expected[0]), float(tuple_factual[0]))
            for index in categorical:
                self.assertEqual(tuple_expected[index], tuple_factual[index])

0 comments on commit 1599ca8

Please sign in to comment.