From 2ecaa1ff628162724137e925d9bea01b34a80dd0 Mon Sep 17 00:00:00 2001 From: Lukasz Kolodziejczyk Date: Mon, 16 Dec 2024 09:57:48 +0100 Subject: [PATCH 1/8] introduce noise for KDEs --- mostlyai/qa/_accuracy.py | 16 ++++------------ mostlyai/qa/_similarity.py | 7 ++----- 2 files changed, 6 insertions(+), 17 deletions(-) diff --git a/mostlyai/qa/_accuracy.py b/mostlyai/qa/_accuracy.py index cbba4d8..43d6c15 100644 --- a/mostlyai/qa/_accuracy.py +++ b/mostlyai/qa/_accuracy.py @@ -250,18 +250,10 @@ def calculate_numeric_uni_kdes(df: pd.DataFrame, trn_kdes: dict[str, pd.Series] # estimate gaussian kernels series_vals = series.dropna().to_numpy("float") - if len(series_vals) > 1: - try: - series_kde = scipy.stats.gaussian_kde(series_vals) - val_y = series_kde(val_x.to_numpy("float")) - val_y = (val_y / (val_y.sum() + 1e-30)).round(5) - except np.linalg.LinAlgError: - # handle `singular matrix` error that can occur for constants - val_y = [1] * len(val_x) - elif len(series_vals) == 1: - val_y = [1] * len(val_x) - else: - val_y = [np.nan] * len(val_x) + series_vals = series_vals + np.random.normal(0, 1e-5, size=series.shape) + series_kde = scipy.stats.gaussian_kde(series_vals) + val_y = series_kde(val_x.to_numpy("float")) + val_y = (val_y / (val_y.sum() + 1e-30)).round(5) col_kdes[col] = pd.Series(val_y, index=val_x, name=col) if trn_kdes is not None: diff --git a/mostlyai/qa/_similarity.py b/mostlyai/qa/_similarity.py index 0dd2df5..38da2d3 100644 --- a/mostlyai/qa/_similarity.py +++ b/mostlyai/qa/_similarity.py @@ -131,11 +131,8 @@ def make_contour_and_centroid_traces( # estimate gaussian kernels data = data.T - try: - Z = scipy.stats.gaussian_kde(data)(np.vstack([X.ravel(), Y.ravel()])).reshape(X.shape) - except Exception as e: - _LOG.warning(f"gaussian_kde failed, using ones instead: {e}") - Z = np.ones_like(X) + data = data + np.random.normal(0, 1e-5, size=data.shape) + Z = scipy.stats.gaussian_kde(data)(np.vstack([X.ravel(), Y.ravel()])).reshape(X.shape) # make contour contour = go.Contour( From fe37b6392009e86e5c0b5169e0b0f9244143e917 Mon Sep 17 00:00:00 2001 From: Lukasz Kolodziejczyk Date: Mon, 16 Dec 2024 10:00:54 +0100 Subject: [PATCH 2/8] smaller noise --- mostlyai/qa/_accuracy.py | 2 +- mostlyai/qa/_similarity.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mostlyai/qa/_accuracy.py b/mostlyai/qa/_accuracy.py index 43d6c15..9ee22ce 100644 --- a/mostlyai/qa/_accuracy.py +++ b/mostlyai/qa/_accuracy.py @@ -250,7 +250,7 @@ def calculate_numeric_uni_kdes(df: pd.DataFrame, trn_kdes: dict[str, pd.Series] # estimate gaussian kernels series_vals = series.dropna().to_numpy("float") - series_vals = series_vals + np.random.normal(0, 1e-5, size=series.shape) + series_vals = series_vals + np.random.normal(0, 1e-10, size=series.shape) series_kde = scipy.stats.gaussian_kde(series_vals) val_y = series_kde(val_x.to_numpy("float")) val_y = (val_y / (val_y.sum() + 1e-30)).round(5) diff --git a/mostlyai/qa/_similarity.py b/mostlyai/qa/_similarity.py index 38da2d3..0b026cf 100644 --- a/mostlyai/qa/_similarity.py +++ b/mostlyai/qa/_similarity.py @@ -131,7 +131,7 @@ def make_contour_and_centroid_traces( # estimate gaussian kernels data = data.T - data = data + np.random.normal(0, 1e-5, size=data.shape) + data = data + np.random.normal(0, 1e-10, size=data.shape) Z = scipy.stats.gaussian_kde(data)(np.vstack([X.ravel(), Y.ravel()])).reshape(X.shape) # make contour From 8b77216d16a98c146b8bbdf590a2d02b2bbf71dd Mon Sep 17 00:00:00 2001 From: Lukasz Kolodziejczyk Date: Mon, 16 Dec 2024 10:01:33 +0100 Subject: [PATCH 3/8] wip --- mostlyai/qa/_accuracy.py | 2 +- mostlyai/qa/_similarity.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mostlyai/qa/_accuracy.py b/mostlyai/qa/_accuracy.py index 9ee22ce..43a401e 100644 --- a/mostlyai/qa/_accuracy.py +++ b/mostlyai/qa/_accuracy.py @@ -250,7 +250,7 @@ def calculate_numeric_uni_kdes(df: pd.DataFrame, trn_kdes: dict[str, pd.Series] # estimate gaussian kernels series_vals = series.dropna().to_numpy("float") - series_vals = series_vals + np.random.normal(0, 1e-10, size=series.shape) + series_vals += np.random.normal(0, 1e-10, size=series_vals.shape) series_kde = scipy.stats.gaussian_kde(series_vals) val_y = series_kde(val_x.to_numpy("float")) val_y = (val_y / (val_y.sum() + 1e-30)).round(5) diff --git a/mostlyai/qa/_similarity.py b/mostlyai/qa/_similarity.py index 0b026cf..2e2c23b 100644 --- a/mostlyai/qa/_similarity.py +++ b/mostlyai/qa/_similarity.py @@ -131,7 +131,7 @@ def make_contour_and_centroid_traces( # estimate gaussian kernels data = data.T - data = data + np.random.normal(0, 1e-10, size=data.shape) + data += np.random.normal(0, 1e-10, size=data.shape) Z = scipy.stats.gaussian_kde(data)(np.vstack([X.ravel(), Y.ravel()])).reshape(X.shape) # make contour From 1e46aee21939695c0b47175a33073704d0dbf7cf Mon Sep 17 00:00:00 2001 From: Lukasz Kolodziejczyk Date: Mon, 16 Dec 2024 11:18:36 +0100 Subject: [PATCH 4/8] wip --- mostlyai/qa/_accuracy.py | 4 +++- mostlyai/qa/_similarity.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/mostlyai/qa/_accuracy.py b/mostlyai/qa/_accuracy.py index 43a401e..12372fd 100644 --- a/mostlyai/qa/_accuracy.py +++ b/mostlyai/qa/_accuracy.py @@ -250,7 +250,9 @@ def calculate_numeric_uni_kdes(df: pd.DataFrame, trn_kdes: dict[str, pd.Series] # estimate gaussian kernels series_vals = series.dropna().to_numpy("float") - series_vals += np.random.normal(0, 1e-10, size=series_vals.shape) + # avoid singular matrix error by adding 0.1% noise + noise = _min * 1e-3 if (_min := np.min(series_vals)) != 0 else 1e-3 + series_vals += np.random.normal(0, noise, size=series_vals.shape) series_kde = scipy.stats.gaussian_kde(series_vals) val_y = series_kde(val_x.to_numpy("float")) val_y = (val_y / (val_y.sum() + 1e-30)).round(5) diff --git a/mostlyai/qa/_similarity.py b/mostlyai/qa/_similarity.py index 2e2c23b..8f6c98d 100644 --- a/mostlyai/qa/_similarity.py +++ b/mostlyai/qa/_similarity.py @@ -131,7 +131,9 @@ def make_contour_and_centroid_traces( # estimate gaussian kernels data = data.T - data += np.random.normal(0, 1e-10, size=data.shape) + # avoid singular matrix error by adding 0.1% noise + noise = _min * 1e-3 if (_min := np.min(data)) == 0 else 1e-3 + data += np.random.normal(0, noise, size=data.shape) Z = scipy.stats.gaussian_kde(data)(np.vstack([X.ravel(), Y.ravel()])).reshape(X.shape) # make contour From 6e5ec191ce813c17da6101299b71d1c089c652ec Mon Sep 17 00:00:00 2001 From: Lukasz Kolodziejczyk Date: Mon, 16 Dec 2024 11:30:28 +0100 Subject: [PATCH 5/8] wip --- mostlyai/qa/_accuracy.py | 2 +- mostlyai/qa/_similarity.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mostlyai/qa/_accuracy.py b/mostlyai/qa/_accuracy.py index 12372fd..0cdb013 100644 --- a/mostlyai/qa/_accuracy.py +++ b/mostlyai/qa/_accuracy.py @@ -251,7 +251,7 @@ def calculate_numeric_uni_kdes(df: pd.DataFrame, trn_kdes: dict[str, pd.Series] # estimate gaussian kernels series_vals = series.dropna().to_numpy("float") # avoid singular matrix error by adding 0.1% noise - noise = _min * 1e-3 if (_min := np.min(series_vals)) != 0 else 1e-3 + noise = np.abs(_min * 1e-3 if (_min := np.min(series_vals)) == 0 else 1e-3) series_vals += np.random.normal(0, noise, size=series_vals.shape) series_kde = scipy.stats.gaussian_kde(series_vals) val_y = series_kde(val_x.to_numpy("float")) diff --git a/mostlyai/qa/_similarity.py b/mostlyai/qa/_similarity.py index 8f6c98d..95f7aa2 100644 --- a/mostlyai/qa/_similarity.py +++ b/mostlyai/qa/_similarity.py @@ -132,7 +132,7 @@ def make_contour_and_centroid_traces( # estimate gaussian kernels data = data.T # avoid singular matrix error by adding 0.1% noise - noise = _min * 1e-3 if (_min := np.min(data)) == 0 else 1e-3 + noise = np.abs(_min * 1e-3 if (_min := np.min(data)) == 0 else 1e-3) data += np.random.normal(0, noise, size=data.shape) Z = scipy.stats.gaussian_kde(data)(np.vstack([X.ravel(), Y.ravel()])).reshape(X.shape) From 50fdc5223ca4842bc9883353925ecd59a3a86129 Mon Sep 17 00:00:00 2001 From: Lukasz Kolodziejczyk Date: Mon, 16 Dec 2024 11:35:50 +0100 Subject: [PATCH 6/8] wip --- mostlyai/qa/_accuracy.py | 8 ++++++-- mostlyai/qa/_similarity.py | 6 +++++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/mostlyai/qa/_accuracy.py b/mostlyai/qa/_accuracy.py index 0cdb013..ba01f10 100644 --- a/mostlyai/qa/_accuracy.py +++ b/mostlyai/qa/_accuracy.py @@ -253,8 +253,12 @@ def calculate_numeric_uni_kdes(df: pd.DataFrame, trn_kdes: dict[str, pd.Series] # avoid singular matrix error by adding 0.1% noise noise = np.abs(_min * 1e-3 if (_min := np.min(series_vals)) == 0 else 1e-3) series_vals += np.random.normal(0, noise, size=series_vals.shape) - series_kde = scipy.stats.gaussian_kde(series_vals) - val_y = series_kde(val_x.to_numpy("float")) + try: + series_kde = scipy.stats.gaussian_kde(series_vals) + val_y = series_kde(val_x.to_numpy("float")) + except Exception as e: + _LOG.warning(f"gaussian_kde failed, using ones instead: {e}") + val_y = [1] * len(val_x) val_y = (val_y / (val_y.sum() + 1e-30)).round(5) col_kdes[col] = pd.Series(val_y, index=val_x, name=col) diff --git a/mostlyai/qa/_similarity.py b/mostlyai/qa/_similarity.py index 95f7aa2..0fc5afe 100644 --- a/mostlyai/qa/_similarity.py +++ b/mostlyai/qa/_similarity.py @@ -134,7 +134,11 @@ def make_contour_and_centroid_traces( # avoid singular matrix error by adding 0.1% noise noise = np.abs(_min * 1e-3 if (_min := np.min(data)) == 0 else 1e-3) data += np.random.normal(0, noise, size=data.shape) - Z = scipy.stats.gaussian_kde(data)(np.vstack([X.ravel(), Y.ravel()])).reshape(X.shape) + try: + Z = scipy.stats.gaussian_kde(data)(np.vstack([X.ravel(), Y.ravel()])).reshape(X.shape) + except Exception as e: + _LOG.warning(f"gaussian_kde failed, using ones instead: {e}") + Z = np.ones_like(X) # make contour contour = go.Contour( From 099700ec64095fd1d3fff53766be48d829072766 Mon Sep 17 00:00:00 2001 From: Lukasz Kolodziejczyk Date: Mon, 16 Dec 2024 12:04:14 +0100 Subject: [PATCH 7/8] wip --- mostlyai/qa/_accuracy.py | 4 ++-- mostlyai/qa/_similarity.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mostlyai/qa/_accuracy.py b/mostlyai/qa/_accuracy.py index ba01f10..f5e3695 100644 --- a/mostlyai/qa/_accuracy.py +++ b/mostlyai/qa/_accuracy.py @@ -251,15 +251,15 @@ def calculate_numeric_uni_kdes(df: pd.DataFrame, trn_kdes: dict[str, pd.Series] # estimate gaussian kernels series_vals = series.dropna().to_numpy("float") # avoid singular matrix error by adding 0.1% noise - noise = np.abs(_min * 1e-3 if (_min := np.min(series_vals)) == 0 else 1e-3) + noise = np.abs(_min * 1e-3 if (_min := np.min(series_vals)) != 0 else 1e-3) series_vals += np.random.normal(0, noise, size=series_vals.shape) try: series_kde = scipy.stats.gaussian_kde(series_vals) val_y = series_kde(val_x.to_numpy("float")) + val_y = (val_y / (val_y.sum() + 1e-30)).round(5) except Exception as e: _LOG.warning(f"gaussian_kde failed, using ones instead: {e}") val_y = [1] * len(val_x) - val_y = (val_y / (val_y.sum() + 1e-30)).round(5) col_kdes[col] = pd.Series(val_y, index=val_x, name=col) if trn_kdes is not None: diff --git a/mostlyai/qa/_similarity.py b/mostlyai/qa/_similarity.py index 0fc5afe..6688ff4 100644 --- a/mostlyai/qa/_similarity.py +++ b/mostlyai/qa/_similarity.py @@ -132,7 +132,7 @@ def make_contour_and_centroid_traces( # estimate gaussian kernels data = data.T # avoid singular matrix error by adding 0.1% noise - noise = np.abs(_min * 1e-3 if (_min := np.min(data)) == 0 else 1e-3) + noise = np.abs(_min * 1e-3 if (_min := np.min(data)) != 0 else 1e-3) data += np.random.normal(0, noise, size=data.shape) try: Z = scipy.stats.gaussian_kde(data)(np.vstack([X.ravel(), Y.ravel()])).reshape(X.shape) From 5a4c2d0a2765f6df6be319d1912b68907b759bd3 Mon Sep 17 00:00:00 2001 From: Lukasz Kolodziejczyk Date: Tue, 17 Dec 2024 05:01:11 +0100 Subject: [PATCH 8/8] misc --- mostlyai/qa/_accuracy.py | 6 +++--- mostlyai/qa/_similarity.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/mostlyai/qa/_accuracy.py b/mostlyai/qa/_accuracy.py index f5e3695..ea4795a 100644 --- a/mostlyai/qa/_accuracy.py +++ b/mostlyai/qa/_accuracy.py @@ -250,9 +250,9 @@ def calculate_numeric_uni_kdes(df: pd.DataFrame, trn_kdes: dict[str, pd.Series] # estimate gaussian kernels series_vals = series.dropna().to_numpy("float") - # avoid singular matrix error by adding 0.1% noise - noise = np.abs(_min * 1e-3 if (_min := np.min(series_vals)) != 0 else 1e-3) - series_vals += np.random.normal(0, noise, size=series_vals.shape) + # avoid singular matrix error by adding some noise + noise = np.abs(minimum * 1e-3 if (minimum := np.min(series_vals)) != 0 else 1e-18) + series_vals += np.random.normal(loc=0, scale=noise, size=series_vals.shape) try: series_kde = scipy.stats.gaussian_kde(series_vals) val_y = series_kde(val_x.to_numpy("float")) diff --git a/mostlyai/qa/_similarity.py b/mostlyai/qa/_similarity.py index 6688ff4..8ec86f6 100644 --- a/mostlyai/qa/_similarity.py +++ b/mostlyai/qa/_similarity.py @@ -131,9 +131,9 @@ def make_contour_and_centroid_traces( # estimate gaussian kernels data = data.T - # avoid singular matrix error by adding 0.1% noise - noise = np.abs(_min * 1e-3 if (_min := np.min(data)) != 0 else 1e-3) - data += np.random.normal(0, noise, size=data.shape) + # avoid singular matrix error by adding some noise + noise = np.abs(minimum * 1e-3 if (minimum := np.min(data)) != 0 else 1e-18) + data += np.random.normal(loc=0, scale=noise, size=data.shape) try: Z = scipy.stats.gaussian_kde(data)(np.vstack([X.ravel(), Y.ravel()])).reshape(X.shape) except Exception as e: