In [16]:
import itertools
import re
import numpy as np
import pandas as pd
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_mutual_info_score, mean_absolute_error, mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from scipy.optimize import linear_sum_assignment
from nltk.translate.meteor_score import meteor_score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score
import matplotlib.pyplot as plt
from sacrebleu.metrics import CHRF
from k_means_constrained import KMeansConstrained
from tqdm import tqdm
from scipy.stats import gaussian_kde
import os
from collections import Counter

# Load the cached dataframe. The vector columns are assumed to be stored as
# "[v1, v2, ...]" strings, so the first/last character is stripped and the
# remainder parsed as comma-separated floats.
df = pd.read_csv("data/wiki_df.csv")
for _vec_col in ('x', 'yv', 'zv'):
    df[_vec_col] = [np.fromstring(raw[1:-1], sep=',') for raw in df[_vec_col]]

# Stack the per-row 'zv' vectors into one 2-D matrix: one row per sample.
X = np.vstack(df['zv'].to_numpy())

# Density-based clustering with cosine distance; DBSCAN marks noise as -1.
db = DBSCAN(metric='cosine', eps=0.36, min_samples=12)
db.fit(X)
labels = db.labels_
df = df.assign(cluster=labels)

# Discard noise rows, then keep the ids of clusters with at least 12 members.
df_valid = df.loc[df['cluster'].ne(-1)].copy()
cluster_sizes = df_valid['cluster'].value_counts()
eligible = cluster_sizes.loc[cluster_sizes.ge(12)].index
print(cluster_sizes)

  from .autonotebook import tqdm as notebook_tqdm


cluster
1     211
6     150
5     129
9     113
26     86
2      76
7      50
12     49
8      48
13     39
4      39
29     38
11     31
31     29
3      28
30     27
14     26
23     25
24     22
16     22
21     21
10     21
33     20
18     17
32     15
15     14
27     13
25     13
19     13
28     13
35     13
22     12
37     12
34     12
20     12
17     12
0      12
36     10
Name: count, dtype: int64


In [None]:
from collections import Counter

# Stopwords ignored when searching for each cluster's dominant word.
stop_words = {
    'on','in','of','to','for','with','a','an','the','and','or','but',
    'is','are','be','as','by','at','from','that','this','these','those',
    # add more as needed...
}


def _content_words(text):
    """Return the lowercase, purely alphabetic, non-stopword tokens of `text`.

    `text` is split on commas and then on whitespace. Non-string values
    (e.g. NaN from a missing cell) yield an empty list instead of raising.
    """
    if not isinstance(text, str):
        return []
    return [
        w.lower()
        for shard in text.split(',')
        for w in shard.strip().split()
        if w.isalpha() and w.lower() not in stop_words
    ]


pruned_clusters = []
for cl in eligible:
    sub = df_valid[df_valid['cluster'] == cl]
    # One token list per row, aligned with `sub`'s index.
    word_lists = sub['z'].apply(_content_words)
    # Count token occurrences across the whole cluster.
    word_counts = Counter(w for words in word_lists for w in words)
    if not word_counts:
        # no valid words in this cluster
        continue

    most_common_word, count = word_counts.most_common(1)[0]
    print(f"Most common non-stopword in cluster {cl}: {most_common_word} (count: {count})")

    # Keep only the rows whose tokens include the dominant word.
    mask = word_lists.apply(lambda words: most_common_word in words)
    pruned = sub[mask]
    if len(pruned) >= 12:
        pruned_clusters.append(pruned)

# Concatenate the surviving clusters. pd.concat raises ValueError on an empty
# list, so fall back to an empty frame with the same columns when nothing survived.
if pruned_clusters:
    df_pruned = pd.concat(pruned_clusters, ignore_index=True)
else:
    df_pruned = df_valid.iloc[0:0].reset_index(drop=True)

# Re-compute eligibility on the pruned rows.
# NOTE(review): this threshold (30) is stricter than the per-cluster filter
# above (12) — confirm the difference is intentional.
pruned_counts   = df_pruned['cluster'].value_counts()
eligible_pruned = pruned_counts[pruned_counts >= 30].index
eligible_pruned

In [19]:
import os
import re
import itertools
import numpy as np
import pandas as pd

from collections import Counter
from tqdm import tqdm

import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde

from sklearn.cluster import DBSCAN, KMeans
from sklearn.neighbors import KNeighborsRegressor
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from sacrebleu.metrics import CHRF

from k_means_constrained import KMeansConstrained


### Columns of this dataset:
# - 'image_emb': image embeddings
# - 'caption': text captions
# - 'caption_emb': text embeddings
# - 'img_id': unique image identifier

# ─────────────────────────────────────────────────────────────────────────────
# 1) Load & initial TF–IDF + DBSCAN on captions to get “clusters” for sampling
# ─────────────────────────────────────────────────────────────────────────────
# Read the dataset, drop the unused metadata columns, and pair every caption
# with its embedding so that a single explode yields one row per pair.
df = (
    pd.read_parquet("data/flickr30k.parquet")
    .reset_index(drop=True)
    .drop(columns=['sentids', 'split', 'filename'])
    .assign(
        _pair=lambda frame: frame.apply(
            lambda row: list(zip(row['caption'], row['caption_embs'])),
            axis=1,
        )
    )
    .explode('_pair')
    .reset_index(drop=True)
)

# Unpack each (caption, embedding) pair into its own column, then remove the
# helper column and the original per-row embedding list.
df[['caption', 'caption_emb']] = pd.DataFrame(df['_pair'].tolist(), index=df.index)
df = df.drop(columns=['caption_embs', '_pair'])
# Optional subsample for quick experiments:
# df = df[:100000]

def tfidf_encode(captions, max_df=0.9, min_df=3, stop_words='english'):
    """Fit a TF-IDF vectorizer on `captions` and encode them.

    Parameters
    ----------
    captions : iterable of str
        Texts to fit on and transform.
    max_df, min_df, stop_words
        Passed through unchanged to `TfidfVectorizer`.

    Returns
    -------
    tuple
        `(matrix, vectorizer)`: the result of `fit_transform(captions)` and
        the fitted `TfidfVectorizer` instance.
    """
    vectorizer = TfidfVectorizer(
        stop_words=stop_words,
        min_df=min_df,
        max_df=max_df,
    )
    return vectorizer.fit_transform(captions), vectorizer

# Encode the captions with TF-IDF and cluster the encoded rows with DBSCAN
# (Euclidean distance); DBSCAN marks noise points with the label -1.
X_tfidf, vect = tfidf_encode(df['caption'])
db = DBSCAN(metric='euclidean', eps=0.6, min_samples=12)
db.fit(X_tfidf)
df['cluster'] = db.labels_

# Discard noise rows and keep the ids of clusters with at least 25 captions.
df_valid = df.loc[df['cluster'].ne(-1)].copy()
cluster_sz = df_valid['cluster'].value_counts()
eligible = cluster_sz.loc[cluster_sz.ge(25)].index
# Optional: skip the first two eligible clusters.
# eligible = eligible[2:]
print("Eligible clusters:")
print(cluster_sz.loc[eligible].sort_values(ascending=False))

Eligible clusters:
cluster
2     432
6     331
1     278
9     136
3     106
31     90
33     72
10     56
77     55
46     53
22     43
66     40
0      37
30     36
27     35
13     34
36     34
42     34
26     32
16     30
32     30
20     27
75     26
91     25
57     25
60     25
65     25
Name: count, dtype: int64
