Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft implementation of sample_weight for kmodes. #174

Merged
merged 3 commits into from
Mar 30, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 44 additions & 16 deletions kmodes/kmodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,16 +113,22 @@ def __init__(self, n_clusters=8, max_iter=100, cat_dissim=matching_dissim,
"Setting n_init to 1.")
self.n_init = 1

def fit(self, X, y=None, **kwargs):
def fit(self, X, y=None, sample_weight=None, **kwargs):
"""Compute k-modes clustering.

Parameters
----------
X : array-like, shape=[n_samples, n_features]

sample_weight : sequence, default: None
The weight that is assigned to each individual data point when
updating the centroids.
"""
X = pandas_to_numpy(X)

random_state = check_random_state(self.random_state)
_validate_sample_weight(sample_weight, n_samples=X.shape[0])

self._enc_cluster_centroids, self._enc_map, self.labels_, self.cost_, \
self.n_iter_, self.epoch_costs_ = k_modes(
X,
Expand All @@ -134,6 +140,7 @@ def fit(self, X, y=None, **kwargs):
self.verbose,
random_state,
self.n_jobs,
sample_weight
)
return self

Expand Down Expand Up @@ -179,7 +186,7 @@ def cluster_centroids_(self):
"because the model is not yet fitted.")


def labels_cost(X, centroids, dissim, membship=None):
def labels_cost(X, centroids, dissim, membship=None, sample_weight=None):
"""Calculate labels and cost function given a matrix of points and
a list of centroids for the k-modes algorithm.
"""
Expand All @@ -190,15 +197,17 @@ def labels_cost(X, centroids, dissim, membship=None):
cost = 0.
labels = np.empty(n_points, dtype=np.uint16)
for ipoint, curpoint in enumerate(X):
weight = sample_weight[ipoint] if sample_weight is not None else 1
diss = dissim(centroids, curpoint, X=X, membship=membship)
clust = np.argmin(diss)
labels[ipoint] = clust
cost += diss[clust]
cost += diss[clust] * weight

return labels, cost


def k_modes(X, n_clusters, max_iter, dissim, init, n_init, verbose, random_state, n_jobs):
def k_modes(X, n_clusters, max_iter, dissim, init, n_init, verbose, random_state, n_jobs,
sample_weight=None):
"""k-modes algorithm"""
random_state = check_random_state(random_state)
if sparse.issparse(X):
Expand Down Expand Up @@ -229,13 +238,13 @@ def k_modes(X, n_clusters, max_iter, dissim, init, n_init, verbose, random_state
if n_jobs == 1:
for init_no in range(n_init):
results.append(_k_modes_single(
X, n_clusters, n_points, n_attrs, max_iter,
dissim, init, init_no, verbose, seeds[init_no]
X, n_clusters, n_points, n_attrs, max_iter, dissim, init, init_no,
verbose, seeds[init_no], sample_weight
))
else:
results = Parallel(n_jobs=n_jobs, verbose=0)(
delayed(_k_modes_single)(X, n_clusters, n_points, n_attrs, max_iter,
dissim, init, init_no, verbose, seed)
dissim, init, init_no, verbose, seed, sample_weight)
for init_no, seed in enumerate(seeds))
all_centroids, all_labels, all_costs, all_n_iters, all_epoch_costs = zip(*results)

Expand All @@ -248,7 +257,7 @@ def k_modes(X, n_clusters, max_iter, dissim, init, n_init, verbose, random_state


def _k_modes_single(X, n_clusters, n_points, n_attrs, max_iter, dissim, init, init_no,
verbose, random_state):
verbose, random_state, sample_weight=None):
random_state = check_random_state(random_state)
# _____ INIT _____
if verbose:
Expand Down Expand Up @@ -282,12 +291,13 @@ def _k_modes_single(X, n_clusters, n_points, n_attrs, max_iter, dissim, init, in
cl_attr_freq = [[defaultdict(int) for _ in range(n_attrs)]
for _ in range(n_clusters)]
for ipoint, curpoint in enumerate(X):
weight = sample_weight[ipoint] if sample_weight is not None else 1
# Initial assignment to clusters
clust = np.argmin(dissim(centroids, curpoint, X=X, membship=membship))
membship[clust, ipoint] = 1
# Count attribute values per cluster.
for iattr, curattr in enumerate(curpoint):
cl_attr_freq[clust][iattr][curattr] += 1
cl_attr_freq[clust][iattr][curattr] += weight
# Perform an initial centroid update.
for ik in range(n_clusters):
for iattr in range(n_attrs):
Expand All @@ -304,7 +314,7 @@ def _k_modes_single(X, n_clusters, n_points, n_attrs, max_iter, dissim, init, in
labels = None
converged = False

_, cost = labels_cost(X, centroids, dissim, membship)
_, cost = labels_cost(X, centroids, dissim, membship, sample_weight)

epoch_costs = [cost]
while itr < max_iter and not converged:
Expand All @@ -315,10 +325,11 @@ def _k_modes_single(X, n_clusters, n_points, n_attrs, max_iter, dissim, init, in
cl_attr_freq,
membship,
dissim,
random_state
random_state,
sample_weight
)
# All points seen in this iteration
labels, ncost = labels_cost(X, centroids, dissim, membship)
labels, ncost = labels_cost(X, centroids, dissim, membship, sample_weight)
converged = (moves == 0) or (ncost >= cost)
epoch_costs.append(ncost)
cost = ncost
Expand All @@ -329,10 +340,12 @@ def _k_modes_single(X, n_clusters, n_points, n_attrs, max_iter, dissim, init, in
return centroids, labels, cost, itr, epoch_costs


def _k_modes_iter(X, centroids, cl_attr_freq, membship, dissim, random_state):
def _k_modes_iter(X, centroids, cl_attr_freq, membship, dissim, random_state,
sample_weight):
"""Single iteration of k-modes clustering algorithm"""
moves = 0
for ipoint, curpoint in enumerate(X):
weight = sample_weight[ipoint] if sample_weight is not None else 1
clust = np.argmin(dissim(centroids, curpoint, X=X, membship=membship))
if membship[clust, ipoint]:
# Point is already in its right place.
Expand All @@ -343,7 +356,8 @@ def _k_modes_iter(X, centroids, cl_attr_freq, membship, dissim, random_state):
old_clust = np.argwhere(membship[:, ipoint])[0][0]

cl_attr_freq, membship, centroids = _move_point_cat(
curpoint, ipoint, clust, old_clust, cl_attr_freq, membship, centroids
curpoint, ipoint, clust, old_clust, cl_attr_freq, membship, centroids,
weight
)

# In case of an empty cluster, reinitialize with a random point
Expand All @@ -354,14 +368,15 @@ def _k_modes_iter(X, centroids, cl_attr_freq, membship, dissim, random_state):
rindx = random_state.choice(choices)

cl_attr_freq, membship, centroids = _move_point_cat(
X[rindx], rindx, old_clust, from_clust, cl_attr_freq, membship, centroids
X[rindx], rindx, old_clust, from_clust, cl_attr_freq, membship,
centroids, weight
)

return centroids, cl_attr_freq, membship, moves


def _move_point_cat(point, ipoint, to_clust, from_clust, cl_attr_freq,
membship, centroids, sample_weight=1):
membship, centroids, sample_weight):
"""Move point between clusters, categorical attributes."""
membship[to_clust, ipoint] = 1
membship[from_clust, ipoint] = 0
Expand Down Expand Up @@ -390,3 +405,16 @@ def _move_point_cat(point, ipoint, to_clust, from_clust, cl_attr_freq,
centroids[from_clust][iattr] = get_max_value_key(from_attr_counts)

return cl_attr_freq, membship, centroids


def _validate_sample_weight(sample_weight, n_samples):
    """Validate the optional per-sample weights before clustering.

    Parameters
    ----------
    sample_weight : sequence or None
        One weight per sample. ``None`` means "no weighting" and is accepted
        without further checks.
    n_samples : int
        Number of samples the weights have to match.

    Raises
    ------
    ValueError
        If the number of weights differs from ``n_samples``, if any weight is
        not a real number, or if any weight is negative.
    """
    # Function-scope import so this helper does not depend on the module's
    # import block.
    from numbers import Real

    if sample_weight is None:
        return
    if len(sample_weight) != n_samples:
        raise ValueError("sample_weight should be of equal size as samples.")
    # ``Real`` covers the builtin int/float and also NumPy integer and floating
    # scalars (e.g. np.int64, np.float32), which are not subclasses of the
    # builtins and were previously rejected although they are valid weights.
    if any(not isinstance(weight, Real) for weight in sample_weight):
        raise ValueError("sample_weight elements should either be int or floats.")
    # Zero is a legal weight; only strictly negative values are rejected. The
    # message text is kept unchanged because callers and tests match on it.
    if any(weight < 0 for weight in sample_weight):
        raise ValueError("sample_weight elements should be positive.")
22 changes: 5 additions & 17 deletions kmodes/kprototypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ def fit(self, X, y=None, categorical=None, sample_weight=None):
X = pandas_to_numpy(X)

random_state = check_random_state(self.random_state)
_validate_sample_weight(sample_weight, n_samples=X.shape[0])
kmodes._validate_sample_weight(sample_weight, n_samples=X.shape[0])

# If self.gamma is None, gamma will be automatically determined from
# the data. The function below returns its value.
Expand Down Expand Up @@ -495,17 +495,18 @@ def _k_prototypes_iter(Xnum, Xcat, centroids, cl_attr_sum, cl_memb_sum, cl_attr_
rindx = random_state.choice(choices)

cl_attr_sum, cl_memb_sum = _move_point_num(
Xnum[rindx], old_clust, from_clust, cl_attr_sum, cl_memb_sum
Xnum[rindx], old_clust, from_clust, cl_attr_sum, cl_memb_sum,
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice find, small oversight from last PR

weight
)
cl_attr_freq, membship, centroids[1] = kmodes._move_point_cat(
Xcat[rindx], rindx, old_clust, from_clust,
cl_attr_freq, membship, centroids[1]
cl_attr_freq, membship, centroids[1], weight
)

return centroids, cl_attr_sum, cl_memb_sum, cl_attr_freq, membship, moves


def _move_point_num(point, to_clust, from_clust, cl_attr_sum, cl_memb_sum, sample_weight=1):
def _move_point_num(point, to_clust, from_clust, cl_attr_sum, cl_memb_sum, sample_weight):
"""Move point between clusters, numerical attributes."""
# Update sum of attributes in cluster.
for iattr, curattr in enumerate(point):
Expand All @@ -528,16 +529,3 @@ def _split_num_cat(X, categorical):
if ii not in categorical]]).astype(np.float64)
Xcat = np.asanyarray(X[:, categorical])
return Xnum, Xcat


def _validate_sample_weight(sample_weight, n_samples):
    """Validate the optional per-sample weights before clustering.

    Parameters
    ----------
    sample_weight : sequence or None
        One weight per sample. ``None`` means "no weighting" and is accepted
        without further checks.
    n_samples : int
        Number of samples the weights have to match.

    Raises
    ------
    ValueError
        If the number of weights differs from ``n_samples``, if any weight is
        not a real number, or if any weight is negative.
    """
    # Function-scope import so this helper does not depend on the module's
    # import block.
    from numbers import Real

    if sample_weight is None:
        return
    if len(sample_weight) != n_samples:
        raise ValueError("sample_weight should be of equal size as samples.")
    # ``Real`` covers the builtin int/float and also NumPy integer and floating
    # scalars (e.g. np.int64, np.float32), which are not subclasses of the
    # builtins and were previously rejected although they are valid weights.
    if any(not isinstance(weight, Real) for weight in sample_weight):
        raise ValueError("sample_weight elements should either be int or floats.")
    # Zero is a legal weight; only strictly negative values are rejected. The
    # message text is kept unchanged because callers and tests match on it.
    if any(weight < 0 for weight in sample_weight):
        raise ValueError("sample_weight elements should be positive.")
46 changes: 46 additions & 0 deletions kmodes/tests/test_kmodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -521,3 +521,49 @@ def test_kmodes_epoch_costs(self):
kmodes = KModes(n_clusters=4, init='Cao', random_state=42)
kmodes.fit(SOYBEAN)
self.assertEqual(kmodes.epoch_costs_, [206.0, 204.0, 199.0, 199.0])

def test_kmodes_sample_weights_validation(self):
    """Check that malformed sample weights are rejected with a ValueError."""
    n_samples = SOYBEAN.shape[0]
    kmodes = KModes(n_clusters=4, init='Cao', random_state=42)
    # Each case pairs an invalid weight vector with the message it must trigger:
    # one weight too few, one negative weight, one non-numerical weight.
    invalid_cases = (
        (
            [1] * (n_samples - 1),
            "sample_weight should be of equal size as samples.",
        ),
        (
            [-1] + [1] * (n_samples - 1),
            "sample_weight elements should be positive.",
        ),
        (
            [None] + [1] * (n_samples - 1),
            "sample_weight elements should either be int or floats.",
        ),
    )
    for bad_weights, expected_message in invalid_cases:
        with self.assertRaisesRegex(ValueError, expected_message):
            kmodes.fit_predict(SOYBEAN, sample_weight=bad_weights)

def test_kmodes_sample_weights_all_but_one_zero(self):
    """Test whether centroid collapses to single datapoint with non-zero weight."""
    n_samples = 10
    subset = TEST_DATA[:n_samples, :]
    kmodes = KModes(n_clusters=1, init='Cao', random_state=42)
    # Row i of the identity matrix is a weight vector that is 1 for sample i
    # and 0 for every other sample.
    for indicator, sample_weight in enumerate(np.eye(n_samples)):
        model = kmodes.fit(subset, sample_weight=sample_weight)
        centroid = model.cluster_centroids_[0, :]
        matches = centroid == TEST_DATA[indicator, :]
        self.assertTrue(matches.all())

def test_k_modes_sample_weight_unchanged(self):
    """Test whether centroid definition remains unchanged when scaling uniformly."""
    kmodes_baseline = KModes(n_clusters=4, init='Cao', random_state=42)
    model_baseline = kmodes_baseline.fit(SOYBEAN)
    expected = {tuple(row) for row in model_baseline.cluster_centroids_}
    # The exact value of a weight shouldn't matter if equal for all samples.
    for weight in [.5, 1, 1., 2]:
        with self.subTest(weight=weight):
            sample_weight = [weight] * SOYBEAN.shape[0]
            kmodes_weighted = KModes(n_clusters=4, init='Cao', random_state=42)
            model_weighted = kmodes_weighted.fit(SOYBEAN, sample_weight=sample_weight)
            factual = {tuple(row) for row in model_weighted.cluster_centroids_}
            # Centroids might be ordered differently, so compare them as sets.
            # A direct set comparison also catches a differing number of
            # distinct centroids, which pairing the sorted sets with zip()
            # would silently truncate away, and it fails with a readable diff
            # instead of the TypeError assertAlmostEqual raises on tuples.
            self.assertEqual(expected, factual)
8 changes: 5 additions & 3 deletions kmodes/tests/test_kprototypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,7 +354,7 @@ def test_kprototypes_sample_weights_validation(self):

def test_k_prototypes_sample_weight_all_but_one_zero(self):
"""Test whether centroid collapses to single datapoint with non-zero weight."""
kproto = kprototypes.KPrototypes(n_clusters=1, init='Cao', verbose=2)
kproto = kprototypes.KPrototypes(n_clusters=1, init='Cao', random_state=42)
n_samples = 2
for indicator in range(n_samples):
sample_weight = np.zeros(n_samples)
Expand All @@ -367,13 +367,15 @@ def test_k_prototypes_sample_weight_all_but_one_zero(self):
def test_k_prototypes_sample_weight_unchanged(self):
"""Test whether centroid definition remains unchanged when scaling uniformly."""
categorical = [1, 2]
kproto_baseline = kprototypes.KPrototypes(n_clusters=3, init='Cao')
kproto_baseline = kprototypes.KPrototypes(n_clusters=3, init='Cao', random_state=42)
model_baseline = kproto_baseline.fit(STOCKS, categorical=categorical)
expected = set(tuple(row) for row in model_baseline.cluster_centroids_)
# The exact value of a weight shouldn't matter if equal for all samples.
for weight in [.5, .1, 1, 1., 2]:
sample_weight = [weight] * STOCKS.shape[0]
kproto_weighted = kprototypes.KPrototypes(n_clusters=3, init='Cao')
kproto_weighted = kprototypes.KPrototypes(
n_clusters=3, init='Cao', random_state=42
)
model_weighted = kproto_weighted.fit(
STOCKS, categorical=categorical, sample_weight=sample_weight
)
Expand Down