# Experiment: user prediction

Cluster Membership Hit

1- Is the correct user among those active in the predicted cluster for a new QID?

Ranking Quality

2- Where does the correct user rank among users from the predicted cluster?

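Metrics: Hits@1, Hits@3, Hits@10 — the fraction of evaluated test rows whose true user appears among the top 1, 3, or 10 ranked users of the predicted cluster.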

- Step 1: Train-Test Split with User Constraint

In [None]:
from sklearn.model_selection import train_test_split

# Split on question IDs rather than on rows, so every row of a given QID lands
# on the same side of the split.
all_qids = df_valid['qid'].unique()
train_qids, test_qids = train_test_split(all_qids, test_size=0.2, random_state=42)

# Materialise independent copies of the two partitions.
in_train = df_valid['qid'].isin(train_qids)
in_test = df_valid['qid'].isin(test_qids)
df_train = df_valid.loc[in_train].copy()
df_test = df_valid.loc[in_test].copy()

# User constraint: a test row is kept only when its user also occurs in the
# training partition. The filter works per row, so a QID remains in the test
# set as long as at least one of its rows survives.
train_users = set(df_train['username'])
user_seen_in_train = df_test['username'].isin(train_users)
df_test = df_test.loc[user_seen_in_train].copy()

print(f"Train QIDs: {len(train_qids)}, Test QIDs: {df_test['qid'].nunique()}, Valid Test Rows: {len(df_test)}")

Train QIDs: 5291, Test QIDs: 1259, Valid Test Rows: 1475


## Train Cluster Model on 80% of QIDs

In [None]:
# Use only training QIDs that actually have an embedding.
# The surviving QIDs are kept in a list because row i of `train_embeddings`,
# row i of `train_umap` and `hdb.labels_[i]` all belong to
# `train_qids_embedded[i]`. Any later mapping of cluster labels back to QIDs
# must go through this list: `train_qids` comes out of train_test_split
# shuffled, so it does not follow the row order of the DataFrame.
train_qids_embedded = [qid for qid in train_qids if qid in qid_to_embedding]
train_embeddings = np.stack([qid_to_embedding[qid] for qid in train_qids_embedded])

# Step 1: Fit UMAP (reduce the embeddings to 50 dimensions before clustering)
umap_model = umap.UMAP(n_components=50, random_state=42)
train_umap = umap_model.fit_transform(train_embeddings)

# Step 2: Fit HDBSCAN
# prediction_data=True is requested at fit time so the fitted model can be
# handed to hdbscan's approximate_predict for unseen points.
hdb = hdbscan.HDBSCAN(min_cluster_size=10, prediction_data=True)
hdb.fit(train_umap)

  warn(


0,1,2
,min_cluster_size,10
,min_samples,
,cluster_selection_epsilon,0.0
,max_cluster_size,0
,metric,'euclidean'
,alpha,1.0
,p,
,algorithm,'best'
,leaf_size,40
,memory,Memory(location=None)


In [None]:
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

# Labels assigned by HDBSCAN to the training points; -1 marks noise.
train_labels = hdb.labels_

# Score only the points that were placed in a cluster.
valid_mask = train_labels != -1
X_valid, labels_valid = train_umap[valid_mask], train_labels[valid_mask]

# Internal validity indices, all computed in the UMAP-projected space.
silhouette = silhouette_score(X_valid, labels_valid)
ch_score = calinski_harabasz_score(X_valid, labels_valid)
db_score = davies_bouldin_score(X_valid, labels_valid)

# Report
print("\n📊 Clustering Quality Metrics (Training Data Only)")
print(f"Silhouette Score:        {silhouette:.3f}")
print(f"Calinski-Harabasz Index: {ch_score:.2f}")
print(f"Davies-Bouldin Index:    {db_score:.3f}")
print(f"Clusters Found (excl. noise): {len(set(labels_valid))}")


📊 Clustering Quality Metrics (Training Data Only)
Silhouette Score:        0.539
Calinski-Harabasz Index: 9772.66
Davies-Bouldin Index:    0.571
Clusters Found (excl. noise): 62


In [None]:
from hdbscan.prediction import approximate_predict

# Only test QIDs that have an embedding can be projected and assigned a cluster.
test_qids_eval = [qid for qid in df_test['qid'].unique() if qid in qid_to_embedding]
test_embeddings = np.stack([qid_to_embedding[qid] for qid in test_qids_eval])

# Project with the UMAP model fitted on the training QIDs, then let HDBSCAN
# assign each projected point to one of the training clusters (-1 = noise).
test_umap = umap_model.transform(test_embeddings)
test_cluster_labels, _ = approximate_predict(hdb, test_umap)

# Map test QIDs to predicted clusters.
# QIDs without an embedding are absent from the mapping and come out as NaN.
qid_to_pred_cluster = dict(zip(test_qids_eval, test_cluster_labels))
df_test['predicted_cluster'] = df_test['qid'].map(qid_to_pred_cluster)

# Drop rows where no cluster could be predicted: either the QID had no
# embedding (NaN) or HDBSCAN labelled it as noise (-1). `NaN != -1` is True,
# so the NaN case has to be excluded explicitly.
has_cluster = df_test['predicted_cluster'].notna() & (df_test['predicted_cluster'] != -1)
df_test = df_test[has_cluster].copy()
# With the NaNs gone the labels can be stored as plain integers again.
df_test['predicted_cluster'] = df_test['predicted_cluster'].astype(int)

print(f"✅ Test QIDs with cluster assignments: {df_test['qid'].nunique()}")
print(f"✅ Test records after filtering: {len(df_test)}")

✅ Test QIDs with cluster assignments: 778
✅ Test records after filtering: 922


In [None]:
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Precompute: cluster → (qid, user) pairs (from training)
#
# `hdb.labels_` follows the row order HDBSCAN was fitted on: the QIDs of
# `train_qids` that have an embedding, in `train_qids` order. The labels must
# be paired with exactly that sequence. `df_train['qid'].unique()` is in
# DataFrame order (train_test_split shuffles the QIDs), so zipping it with the
# labels would attach clusters to the wrong questions.
train_qids_used = [qid for qid in train_qids if qid in qid_to_embedding]
train_labels = hdb.labels_
if len(train_qids_used) != len(train_labels):
    raise ValueError(
        f"{len(train_qids_used)} embedded training QIDs but {len(train_labels)} "
        "cluster labels; re-fit the cluster model before evaluating."
    )

# Noise points (label -1) belong to no cluster and are left out.
qid_to_cluster_train = {
    qid: int(label)
    for qid, label in zip(train_qids_used, train_labels)
    if label != -1
}

# One (qid, user) pair per training row whose QID was placed in a cluster.
cluster_to_qid_user = defaultdict(list)
for qid, user in zip(df_train['qid'], df_train['username']):
    cluster = qid_to_cluster_train.get(qid)
    if cluster is not None:
        cluster_to_qid_user[cluster].append((qid, user))

# Per-cluster lookups that do not depend on the test question, built once:
#   - the embedding matrix of the cluster's training rows (row i ↔ pair i)
#   - the users ranked by how many training rows they have in the cluster
cluster_to_matrix = {}
cluster_to_freq_ranking = {}
for cluster, pairs in cluster_to_qid_user.items():
    cluster_to_matrix[cluster] = np.stack([qid_to_embedding[q] for q, _ in pairs])
    user_freq = defaultdict(int)
    for _, user in pairs:
        user_freq[user] += 1
    # sorted() is stable, so users with equal counts keep first-appearance order.
    cluster_to_freq_ranking[cluster] = sorted(user_freq, key=user_freq.get, reverse=True)

# ------------------------------
# Evaluation Loop
# ------------------------------
K_VALUES = (1, 3, 10)
hits_freq = {k: 0 for k in K_VALUES}
hits_sim = {k: 0 for k in K_VALUES}
membership_hit = 0
total = 0

for true_user, qid, cluster in zip(
    df_test['username'], df_test['qid'], df_test['predicted_cluster']
):
    # Clusters without training rows (noise -1, NaN, ...) have no entry, and a
    # test QID without an embedding cannot be compared: skip both.
    pairs = cluster_to_qid_user.get(cluster)
    if not pairs or qid not in qid_to_embedding:
        continue

    total += 1

    # One batched call: similarity of the test question to every training row
    # of the predicted cluster.
    sims = cosine_similarity(
        qid_to_embedding[qid].reshape(1, -1),
        cluster_to_matrix[cluster],
    )[0]

    # Group the similarities by the user behind each training row.
    user_sims = defaultdict(list)
    for (_, user), sim in zip(pairs, sims):
        user_sims[user].append(sim)

    # Membership: is the true user active in the predicted cluster at all?
    if true_user in user_sims:
        membership_hit += 1

    # Frequency ranking (precomputed per cluster)
    freq_ranking = cluster_to_freq_ranking[cluster]
    # Similarity ranking: users ordered by their mean similarity to the question
    sim_ranking = sorted(user_sims, key=lambda u: np.mean(user_sims[u]), reverse=True)

    for k in K_VALUES:
        if true_user in freq_ranking[:k]:
            hits_freq[k] += 1
        if true_user in sim_ranking[:k]:
            hits_sim[k] += 1

if total == 0:
    # Without this check the rate computations below fail with a bare
    # ZeroDivisionError that hides the real cause.
    raise ValueError(
        "No test row could be evaluated: none has both an embedding and a "
        "predicted cluster that contains training rows."
    )

# ------------------------------
# Print Results
# ------------------------------
print("\n📈 Evaluation Results (Frequency-Based Ranking)")
for k in K_VALUES:
    print(f"Hits@{k}: {hits_freq[k] / total:.3f}")

print("\n📈 Evaluation Results (Cosine Similarity Ranking)")
for k in K_VALUES:
    print(f"Hits@{k}: {hits_sim[k] / total:.3f}")

print(f"\n🔍 Membership Hit Rate: {membership_hit / total:.3f}")
print(f"🔢 Total evaluated examples: {total}")


📈 Evaluation Results (Frequency-Based Ranking)
Hits@1: 0.192
Hits@3: 0.371
Hits@10: 0.568

📈 Evaluation Results (Cosine Similarity Ranking)
Hits@1: 0.110
Hits@3: 0.190
Hits@10: 0.386

🔍 Membership Hit Rate: 0.745
🔢 Total evaluated examples: 922
