In [1]:
import os
from topic_gen.data.data import Data
import pandas as pd
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment before
# the data layer is instantiated.
# NOTE(review): presumably Data() reads its connection settings from the
# environment — confirm; if so, this call must stay ahead of Data().
load_dotenv()


# Project data-access object; the only thing this notebook uses it for is
# query_bookmarks().
inject = Data()

# Hard-coded id of the user whose bookmarks are analysed in this section.
user_id = "8c525b6b-27c9-4379-a454-2c3b3a781124"

# Fetch that user's bookmarks. The result is used as a pandas DataFrame by the
# cells below (groupby, boolean indexing), which read the columns bookmark_id,
# url, title, description, language, created_at, updated_at, tag_id, key, name.
# NOTE(review): the printed output repeats bookmark_id across rows, so this
# looks like one row per bookmark/tag pair — confirm against query_bookmarks.
df = inject.query_bookmarks(user_id)

# Plain-text dump of the frame (pandas truncates the middle rows).
print(df)

                               bookmark_id  \
0     a4f37ecc-172a-48e4-8570-8d059398d77d   
1     a4f37ecc-172a-48e4-8570-8d059398d77d   
2     a4f37ecc-172a-48e4-8570-8d059398d77d   
3     06ba62ab-870f-432a-87a6-a5be07f7b2e6   
4     06ba62ab-870f-432a-87a6-a5be07f7b2e6   
...                                    ...   
2187  6edcd6df-68d6-4a11-9c8e-c9558175118a   
2188  6edcd6df-68d6-4a11-9c8e-c9558175118a   
2189  6edcd6df-68d6-4a11-9c8e-c9558175118a   
2190  6edcd6df-68d6-4a11-9c8e-c9558175118a   
2191  6edcd6df-68d6-4a11-9c8e-c9558175118a   

                                                    url  \
0                https://www.latent.space/p/2025-papers   
1                https://www.latent.space/p/2025-papers   
2                https://www.latent.space/p/2025-papers   
3     https://www.interconnects.ai/p/papers-im-readi...   
4     https://www.interconnects.ai/p/papers-im-readi...   
...                                                 ...   
2187  https://magazine.sebastianra

In [None]:
# Collapse the flat rows into one row per (tag_id, key, name). Each tag row
# carries its bookmarks as a list of record dicts in a "bookmarks" column.
bookmark_columns = [
    "bookmark_id",
    "title",
    "description",
    "language",
    "created_at",
    "updated_at",
]

records_per_tag = df.groupby(["tag_id", "key", "name"]).apply(
    lambda tag_rows: tag_rows[bookmark_columns].to_dict(orient="records")
)
grouped_df = records_per_tag.reset_index(name="bookmarks")

print(grouped_df)

In [None]:
# Keep only rows that have a title, a url and a tag key.
# "Missing" means NaN/None here: empty strings are NOT removed by this step.
# The result is a new frame; df itself is left untouched.
validated_df = df.dropna(subset=["title", "url", "key"])

In [None]:
# Number of validated rows per tag. Grouping on (tag_id, key, name) keeps the
# readable key/name next to each count.
rows_per_tag = validated_df.groupby(["tag_id", "key", "name"]).size()
bookmark_counts = rows_per_tag.reset_index(name="bookmark_count")

# Only tags with strictly more than 5 bookmarks are kept.
is_frequent_tag = bookmark_counts["bookmark_count"] > 5
filtered_tags = bookmark_counts.loc[is_frequent_tag]

# Restrict the validated rows to those frequent tags.
has_frequent_tag = validated_df["tag_id"].isin(filtered_tags["tag_id"])
filtered_df = validated_df.loc[has_frequent_tag]

# One row per (tag_id, key) pair: this is the input to the clustering step.
tag_keys_df = filtered_df.loc[:, ["tag_id", "key"]].drop_duplicates()

print(tag_keys_df["key"].unique())



### Clustering


In [None]:
# CLUSTERING
# Cluster the distinct tag keys with the project's agglomerative clusterer.
from topic_gen.clustering.cluster import AgglomerativeCluster, KMeansCluster


# distance_threshold=1.2 is passed straight to the project class.
# NOTE(review): presumably the linkage distance at which clusters stop being
# merged — confirm against AgglomerativeCluster.
cluster = AgglomerativeCluster(distance_threshold=1.2)
# Alternative left disabled: k-means with a fixed number of clusters.
# kmeans_cluster = KMeansCluster(n_clusters=10)

# kmeans_cluster_df = kmeans_cluster.fit(filtered_df)

# fit() receives the unique (tag_id, key) pairs. The cells below read the
# columns "cluster", "key" and "tag_id" from its result.
cluster_df = cluster.fit(tag_keys_df)
print(cluster_df)




In [None]:
# List the tag keys assigned to each cluster, visiting clusters in the order
# their ids first appear in cluster_df.
cluster_labels = cluster_df["cluster"]
for cluster_id in cluster_labels.unique():
    member_rows = cluster_labels == cluster_id
    keys = cluster_df.loc[member_rows, "key"].tolist()
    print(f"Cluster {cluster_id}:")
    print(keys, len(keys))
    print()

In [None]:
# Left-join the bookmark rows onto the clustered tags via tag_id, so every
# bookmark row carries its tag's cluster. Both inputs have a "key" column,
# which the merge suffixes to key_x (from cluster_df) and key_y (from
# filtered_df).
joined_df = pd.merge(
    left=cluster_df,
    right=filtered_df,
    how="left",
    on="tag_id",
)
print(joined_df.head(10))

In [None]:
# The merge left two copies of the tag key. Drop the one that came from
# cluster_df (key_x) and restore the plain name "key" for the other (key_y).
cleaned_df = (
    joined_df
    .drop(columns=["key_x"])
    .rename(columns={"key_y": "key"})
)

In [None]:
# For every cluster, print its distinct tag keys and its distinct bookmarks.
bookmark_columns = [
    "bookmark_id",
    "title",
    "description",
    "language",
    "created_at",
    "updated_at",
]

for cluster_id in cleaned_df["cluster"].unique():
    # Filter once per cluster and reuse the slice for both keys and bookmarks.
    bookmarks_df = cleaned_df[cleaned_df["cluster"] == cluster_id]
    keys = bookmarks_df["key"].unique().tolist()

    # One row per bookmark. A groupby object is not used here for two reasons:
    # printing it shows only an opaque object repr, and groupby silently drops
    # rows whose grouping values are NaN (e.g. a missing description), which
    # would under-count the bookmarks.
    bookmarks = bookmarks_df[bookmark_columns].drop_duplicates()

    print(f"Cluster {cluster_id}:")
    print(keys, len(keys))
    print(bookmarks, len(bookmarks))
    print()

In [None]:
# Closer look at a single cluster (id 4): its bookmark titles and tag keys.
in_cluster_4 = cleaned_df["cluster"] == 4
clus_4_df = cleaned_df.loc[in_cluster_4]

# Distinct bookmarks of the cluster, then their distinct titles.
clus_bookmarks_df = clus_4_df.loc[
    :,
    ["bookmark_id", "title", "description", "language", "created_at", "updated_at"],
].drop_duplicates()
clus_titles = clus_bookmarks_df["title"].unique().tolist()

# Distinct (tag_id, key) pairs of the cluster, then their distinct keys.
clus_tags_df = clus_4_df.loc[:, ["tag_id", "key"]].drop_duplicates()
clus_tags = clus_tags_df["key"].unique().tolist()

print(clus_titles, len(clus_titles))
print(clus_tags, len(clus_tags))

In [None]:
# Join the tag_keys_df with the filtered_df on tag_id
# joined_df = pd.merge(tag_keys_df, filtered_df, on="tag_id", how="left")

In [None]:
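# NOT NEEDED: exploratory only; no later cell visible in this notebook uses the results of this cell.

# Find groups of near-duplicate tag keys and, within each group, keep only the key with the highest bookmark count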
from difflib import SequenceMatcher
import numpy as np

def similarity(a, b):
    """Return the case-insensitive similarity ratio of two strings.

    Both inputs are lower-cased and compared with
    ``difflib.SequenceMatcher`` (no junk filter). The result is that
    matcher's ``ratio()``.
    """
    left, right = a.lower(), b.lower()
    matcher = SequenceMatcher(a=left, b=right)
    return matcher.ratio()

def find_similar_keys(df, threshold=0.8):
    """Group keys whose names are similar to one another.

    ``df`` must have ``key`` and ``bookmark_count`` columns. Counts are summed
    per key and the keys are visited from the highest total to the lowest.
    Each key not yet assigned to a group becomes an anchor, and every other
    unassigned key whose ``similarity`` to the anchor is at least
    ``threshold`` joins the anchor's group. A key belongs to at most one
    group, and candidates are compared with the anchor only, not with each
    other.

    Returns a tuple ``(similar_groups, key_counts)``: the groups that have
    more than one member (each a list of keys, anchor first) and the per-key
    totals sorted by ``bookmark_count`` descending. No key is dropped here.
    """
    # Per-key totals, largest first.
    totals = df.groupby('key')['bookmark_count'].sum().reset_index()
    key_counts = totals.sort_values('bookmark_count', ascending=False)

    ordered_keys = key_counts['key'].tolist()
    assigned = set()
    similar_groups = []

    for anchor in ordered_keys:
        if anchor in assigned:
            continue
        assigned.add(anchor)
        members = [anchor]

        # The anchor is already in `assigned`, so it is never compared with
        # itself below.
        for candidate in ordered_keys:
            if candidate in assigned:
                continue
            if similarity(anchor, candidate) >= threshold:
                members.append(candidate)
                assigned.add(candidate)

        # Singletons are not reported.
        if len(members) > 1:
            similar_groups.append(members)

    return similar_groups, key_counts

# Group near-duplicate keys (similarity ratio of at least 0.8).
similar_groups, key_counts = find_similar_keys(bookmark_counts, threshold=0.8)

# Report every group together with the bookmark total of each of its keys.
print("Similar key groups found:")
for i, group in enumerate(similar_groups):
    group_counts = key_counts[key_counts['key'].isin(group)]
    print(f"Group {i+1}: {group}")
    print(f"Counts: {dict(zip(group_counts['key'], group_counts['bookmark_count']))}")
    print()

# Within each group the key with the largest total survives; every other
# member of the group is marked for removal.
keys_to_keep = set()
keys_to_drop = set()

for group in similar_groups:
    group_counts = key_counts[key_counts['key'].isin(group)]
    top_row = group_counts['bookmark_count'].idxmax()
    best_key = group_counts.loc[top_row, 'key']
    keys_to_keep.add(best_key)
    keys_to_drop.update(key for key in group if key != best_key)

print(f"Keys to keep: {keys_to_keep}")
print(f"Keys to drop: {keys_to_drop}")

# Remove the rows of the dropped keys from the per-tag counts.
is_dropped_key = bookmark_counts['key'].isin(keys_to_drop)
filtered_bookmark_counts = bookmark_counts[~is_dropped_key]

print(f"\nOriginal rows: {len(bookmark_counts)}")
print(f"Filtered rows: {len(filtered_bookmark_counts)}")
print(f"Rows removed: {len(bookmark_counts) - len(filtered_bookmark_counts)}")

print("\nFiltered bookmark counts:")
print(filtered_bookmark_counts.head(10))


In [1]:
from topic_gen.topic_gen import TopicGen
from dotenv import load_dotenv

# Load variables from a local .env file before the pipeline object is built.
# NOTE(review): presumably TopicGen() reads credentials/settings from the
# environment — confirm; if so, this call must stay ahead of TopicGen().
load_dotenv()

# Hard-coded id of the user whose topics are generated.
user_id = "8c525b6b-27c9-4379-a454-2c3b3a781124"

# The user id is both set as an attribute on the instance and passed to
# ingest().
# NOTE(review): unclear from here whether both are required — confirm.
topic_gen = TopicGen()
topic_gen.user_id = user_id

# Result of the ingest step. The cells below read the columns title,
# description, score, cluster, topic, key, tag_id, bookmark_id, language,
# created_at and updated_at from it.
out_df = topic_gen.ingest(user_id)
# print(out_df)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Print every distinct title once, with the description and score taken from
# the first row that carries that title.
#
# drop_duplicates keeps the first row per title, so the frame is scanned once
# instead of being re-filtered twice for every title. It also handles a
# missing title: filtering with `== NaN` matches no rows, so indexing the
# empty result with [0] would raise IndexError.
first_row_per_title = out_df.drop_duplicates(subset="title")

for title, description, score in zip(
    first_row_per_title["title"],
    first_row_per_title["description"],
    first_row_per_title["score"],
):
    print(f"Title: {title}")
    print(f"Description: {description} Score: {score}")

Title: The 2025 AI Engineering Reading List
Description: Explore the cutting-edge developments in artificial intelligence agents, focusing on their reasoning capabilities, programming integration, and the evolution of language models that push the boundaries of machine intelligence. Score: 90.0
Title: Recent reasoning research: GRPO tweaks, base model RL, and data curation
Description: Explore the cutting-edge developments in artificial intelligence agents, focusing on their reasoning capabilities, programming integration, and the evolution of language models that push the boundaries of machine intelligence. Score: 90.0
Title: Introducing ChatGPT
Description: Explore the cutting-edge developments in artificial intelligence agents, focusing on their reasoning capabilities, programming integration, and the evolution of language models that push the boundaries of machine intelligence. Score: 90.0
Title: OpenAI Realtime API: The Missing Manual
Description: Explore the cutting-edge developm

In [None]:
# Build a fresh TopicGen for the same user and hand it the ingest result.
# _persist_topics is underscore-prefixed, i.e. private by convention, and is
# called directly here.
# NOTE(review): presumably this writes the generated topics to storage, so
# re-running the cell may repeat that write — confirm before Run All.
topic_gen = TopicGen()
topic_gen.user_id = user_id
topic_gen._persist_topics(out_df)

## Output

In [None]:
# Build one plain-dict record per cluster: its id, its topic title, its
# distinct tags and its distinct bookmarks.
bookmark_columns = [
    "bookmark_id",
    "title",
    "description",
    "language",
    "created_at",
    "updated_at",
]

cluster_ids = out_df["cluster"].unique()

out = []
for cluster_id in cluster_ids:
    cluster_df = out_df[out_df["cluster"] == cluster_id]
    # Every row of a cluster is expected to carry the same topic; take the first.
    title = cluster_df["topic"].unique()[0]
    data = {
        # int()/str() turn the values into built-in Python types.
        "cluster_id": int(cluster_id),
        "title": str(title),
        "tags": cluster_df[["key", "tag_id"]]
        .drop_duplicates()
        .to_dict(orient="records"),
        # De-duplicate bookmarks the same way as tags. A bookmark that carries
        # several tags of this cluster occupies several rows and would
        # otherwise be emitted once per tag.
        "bookmarks": cluster_df[bookmark_columns]
        .drop_duplicates()
        .to_dict(orient="records"),
    }
    out.append(data)

print(out)