32 changes: 9 additions & 23 deletions mostlyai/qa/_distances.py
@@ -13,11 +13,12 @@
 # limitations under the License.
 
 import logging
-import platform
 import time
 import numpy as np
 import networkx as nx
 import xxhash
+from sklearn.neighbors import NearestNeighbors
+from joblib import cpu_count
 
 from mostlyai.qa._common import (
     CHARTS_COLORS,
@@ -42,22 +43,9 @@ def calculate_dcrs_nndrs(
     t0 = time.time()
     data = data[data[:, 0].argsort()]  # sort data by first dimension to enforce deterministic results
 
-    if platform.system() == "Linux":
-        # use FAISS on Linux for best performance
-        import faiss  # type: ignore
-
-        index = faiss.IndexFlatL2(data.shape[1])
-        index.add(data)
-        dcrs, _ = index.search(query, 2)
-        dcrs = np.sqrt(dcrs)  # FAISS returns squared distances
-    else:
-        # use sklearn as a fallback on non-Linux systems to avoid segfaults; these occurred when using QA as part of SDK
-        from sklearn.neighbors import NearestNeighbors  # type: ignore
-        from joblib import cpu_count  # type: ignore
-
-        index = NearestNeighbors(n_neighbors=2, algorithm="auto", metric="l2", n_jobs=min(16, max(1, cpu_count() - 1)))
-        index.fit(data)
-        dcrs, _ = index.kneighbors(query)
+    index = NearestNeighbors(n_neighbors=2, algorithm="auto", metric="l2", n_jobs=min(16, max(1, cpu_count() - 1)))
+    index.fit(data)
+    dcrs, _ = index.kneighbors(query)
     dcr = dcrs[:, 0]
     nndr = (dcrs[:, 0] + 1e-8) / (dcrs[:, 1] + 1e-8)
     _LOG.info(f"calculated DCRs for {data.shape=} and {query.shape=} in {time.time() - t0:.2f}s")
@@ -85,14 +73,12 @@ def calculate_distances(
     groups = []
     # check all columns together
     groups += [np.arange(ori_embeds.shape[1])]
-    # check subsets of correlated columns together
+    # check 3 correlated subsets of columns
    if ori_embeds.shape[1] > 10:
-        k = max(3, ori_embeds.shape[1] // 10)
-        groups += split_columns_into_correlated_groups(ori_embeds, k=k)
-    # check random subsets of columns
+        groups += split_columns_into_correlated_groups(ori_embeds, k=3)
+    # check 3 random subsets of columns
     if ori_embeds.shape[1] > 10:
-        k = max(3, ori_embeds.shape[1] // 10)
-        groups += split_columns_into_random_groups(ori_embeds, k=k)
+        groups += split_columns_into_random_groups(ori_embeds, k=3)
Comment on lines +78 to +81
Copilot AI (May 13, 2025):
[nitpick] Hardcoding the groups count to 3 for correlated and random groups may not suit all datasets; consider whether a dynamic calculation based on the number of features could yield more robust grouping.
     dcr_share = 0.0
     nndr_ratio = 1.0
     for columns in groups:
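Following up on the Copilot comment on the hardcoded k=3: a minimal sketch of how the group count could instead scale with the number of embedding columns, mirroring the max(3, n_features // 10) heuristic that this PR removes. The helper name choose_group_count is hypothetical and not part of this PR.

def choose_group_count(n_features: int, features_per_group: int = 10, min_groups: int = 3) -> int:
    # scale the number of column subsets with the feature count
    return max(min_groups, n_features // features_per_group)

# usage sketch inside calculate_distances:
# if ori_embeds.shape[1] > 10:
#     k = choose_group_count(ori_embeds.shape[1])
#     groups += split_columns_into_correlated_groups(ori_embeds, k=k)
#     groups += split_columns_into_random_groups(ori_embeds, k=k)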
8 changes: 5 additions & 3 deletions mostlyai/qa/_sampling.py
@@ -300,11 +300,13 @@ def prepare_data_for_embeddings(
 
     # cap to Q95 sequence length of original to avoid excessive samples per group distorting results
     if tgt_context_key is not None:
+        cap_sequence_length = 100
         q95_sequence_length = trn_tgt_data.groupby(key).size().quantile(0.95)
-        syn_tgt_data = syn_tgt_data.groupby(key).sample(frac=1).groupby(key).head(n=q95_sequence_length)
-        trn_tgt_data = trn_tgt_data.groupby(key).sample(frac=1).groupby(key).head(n=q95_sequence_length)
+        max_sequence_length = min(q95_sequence_length, cap_sequence_length)
Comment on lines +303 to +305
Copilot AI (May 13, 2025):
[nitpick] Consider defining cap_sequence_length as a module-level constant if similar caps might be used elsewhere, to improve maintainability and ease future adjustments.
+        syn_tgt_data = syn_tgt_data.groupby(key).sample(frac=1).groupby(key).head(n=max_sequence_length)
+        trn_tgt_data = trn_tgt_data.groupby(key).sample(frac=1).groupby(key).head(n=max_sequence_length)
         hol_tgt_data = (
-            hol_tgt_data.groupby(key).sample(frac=1).groupby(key).head(n=q95_sequence_length) if hol else None
+            hol_tgt_data.groupby(key).sample(frac=1).groupby(key).head(n=max_sequence_length) if hol else None
         )
 
     # drop key from data as its not relevant for embeddings
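Picking up the reviewer's suggestion above: a sketch of how the cap could be hoisted to a module-level constant in _sampling.py. The constant and helper names are assumptions for illustration, not code from this PR.

MAX_SEQUENCE_LENGTH_CAP = 100  # hard cap on per-group sequence length used for embeddings

def _capped_sequence_length(q95_sequence_length: float) -> int:
    # combine the empirical Q95 sequence length with the hard cap
    return int(min(q95_sequence_length, MAX_SEQUENCE_LENGTH_CAP))

# usage sketch inside prepare_data_for_embeddings:
# max_sequence_length = _capped_sequence_length(q95_sequence_length)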
9 changes: 9 additions & 0 deletions mostlyai/qa/_similarity.py
@@ -69,6 +69,10 @@ def calculate_mean_auc(embeds1, embeds2):
     for a ML model to discriminate between two embedding arrays.
     """
 
+    # limit the number of samples to 10000
+    embeds1 = embeds1[:10000]
+    embeds2 = embeds2[:10000]
+
     # create labels for the data
     labels1 = np.zeros(embeds1.shape[0])
     labels2 = np.ones(embeds2.shape[0])
Expand Down Expand Up @@ -195,6 +199,11 @@ def plot_store_similarity_contours(
if trn_embeds.shape[1] < 3:
return

# limit the number of samples to 10000
syn_embeds = syn_embeds[:10000]
trn_embeds = trn_embeds[:10000]
hol_embeds = hol_embeds[:10000] if hol_embeds is not None else None

# perform PCA on trn embeddings
pca_model = PCA(n_components=3)
pca_model.fit(trn_embeds)
18 changes: 11 additions & 7 deletions mostlyai/qa/reporting.py
@@ -181,6 +181,8 @@ def report(
         check_min_sample_size(trn_sample_size, 90, "training")
         if hol_tgt_data is not None:
             check_min_sample_size(hol_sample_size, 10, "holdout")
Copilot AI (May 13, 2025):
While handling empty target data for training and synthetic datasets, consider also checking hol_tgt_data (if available) for empty columns to maintain consistency in error handling.
Suggested change:
-            check_min_sample_size(hol_sample_size, 10, "holdout")
+            check_min_sample_size(hol_sample_size, 10, "holdout")
+            if hol_tgt_data.shape[1] == 0:
+                raise PrerequisiteNotMetError("Holdout data has no columns.")
+        if trn_tgt_data.shape[1] == 0 or syn_tgt_data.shape[1] == 0:
+            raise PrerequisiteNotMetError("Provided data has no columns.")
     except PrerequisiteNotMetError as err:
         _LOG.info(err)
         statistics.mark_early_exit()
@@ -205,7 +207,6 @@ def report(
     else:
         setup = "1:1"
 
-    _LOG.info("prepare training data for accuracy")
     trn = prepare_data_for_accuracy(
         df_tgt=trn_tgt_data,
         df_ctx=trn_ctx_data,
@@ -214,8 +215,8 @@ def report(
         max_sample_size=max_sample_size_accuracy,
         setup=setup,
     )
+    _LOG.info(f"prepared training data for accuracy: {trn.shape}")
     if hol_tgt_data is not None:
-        _LOG.info("prepare holdout data for accuracy")
         hol = prepare_data_for_accuracy(
             df_tgt=hol_tgt_data,
             df_ctx=hol_ctx_data,
@@ -225,13 +226,13 @@ def report(
             setup=setup,
             ori_dtypes=trn.dtypes.to_dict(),
         )
+        _LOG.info(f"prepared holdout data for accuracy: {hol.shape}")
         ori = pd.concat([trn, hol], axis=0, ignore_index=True)
     else:
         hol = None
         ori = trn
     progress.update(completed=5, total=100)
 
-    _LOG.info("prepare synthetic data for accuracy")
     syn = prepare_data_for_accuracy(
         df_tgt=syn_tgt_data,
         df_ctx=syn_ctx_data,
@@ -241,29 +242,29 @@ def report(
         setup=setup,
         ori_dtypes=trn.dtypes.to_dict(),
     )
+    _LOG.info(f"prepared synthetic data for accuracy: {syn.shape}")
     progress.update(completed=10, total=100)
 
     # do coherence analysis only if there are non-fk columns in the target data
     do_coherence = setup == "1:N" and len(trn_tgt_data.columns) > 1
     if do_coherence:
-        _LOG.info("prepare original data for coherence started")
         ori_coh, ori_coh_bins = prepare_data_for_coherence(
             df_tgt=pd.concat([trn_tgt_data, hol_tgt_data]) if hol_tgt_data is not None else trn_tgt_data,
             tgt_context_key=tgt_context_key,
             max_sample_size=max_sample_size_coherence,
         )
-        _LOG.info("prepare synthetic data for coherence started")
+        _LOG.info(f"prepared original data for coherence: {ori_coh.shape}")
         syn_coh, _ = prepare_data_for_coherence(
            df_tgt=syn_tgt_data,
             tgt_context_key=tgt_context_key,
             bins=ori_coh_bins,
             max_sample_size=max_sample_size_coherence,
         )
-        _LOG.info("store bins used for training data for coherence")
+        _LOG.info(f"prepared synthetic data for coherence: {syn_coh.shape}")
         statistics.store_coherence_bins(bins=ori_coh_bins)
+        _LOG.info("stored bins used for training data for coherence")
         progress.update(completed=15, total=100)
 
-    _LOG.info("calculate embeddings")
     syn_embeds, trn_embeds, hol_embeds = prepare_data_for_embeddings(
         syn_tgt_data=syn_tgt_data,
         trn_tgt_data=trn_tgt_data,
@@ -275,6 +276,9 @@
         tgt_context_key=tgt_context_key,
         max_sample_size=max_sample_size_embeddings,
     )
+    _LOG.info(
+        f"calculated embeddings: syn={syn_embeds.shape}, trn={trn_embeds.shape}, hol={hol_embeds.shape if hol_embeds is not None else None}"
+    )
     progress.update(completed=20, total=100)
 
     ## 1. ACCURACY ##
1 change: 0 additions & 1 deletion pyproject.toml
@@ -39,7 +39,6 @@ dependencies = [
     "accelerate>=1.5.0",
     "torch>=2.6.0",
     "xxhash>=3.5.0",
-    "faiss-cpu>=1.7.0",
 ]
 
 [project.urls]