In [1]:
import os
from topic_gen.data.data import Data
import pandas as pd
from dotenv import load_dotenv

load_dotenv()


inject = Data()

user_id = "8c525b6b-27c9-4379-a454-2c3b3a781124"

df = inject.query_bookmarks(user_id)

print(df)

                               bookmark_id  \
0     a4f37ecc-172a-48e4-8570-8d059398d77d   
1     a4f37ecc-172a-48e4-8570-8d059398d77d   
2     a4f37ecc-172a-48e4-8570-8d059398d77d   
3     06ba62ab-870f-432a-87a6-a5be07f7b2e6   
4     06ba62ab-870f-432a-87a6-a5be07f7b2e6   
...                                    ...   
2187  6edcd6df-68d6-4a11-9c8e-c9558175118a   
2188  6edcd6df-68d6-4a11-9c8e-c9558175118a   
2189  6edcd6df-68d6-4a11-9c8e-c9558175118a   
2190  6edcd6df-68d6-4a11-9c8e-c9558175118a   
2191  6edcd6df-68d6-4a11-9c8e-c9558175118a   

                                                    url  \
0                https://www.latent.space/p/2025-papers   
1                https://www.latent.space/p/2025-papers   
2                https://www.latent.space/p/2025-papers   
3     https://www.interconnects.ai/p/papers-im-readi...   
4     https://www.interconnects.ai/p/papers-im-readi...   
...                                                 ...   
2187  https://magazine.sebastianra

In [None]:
grouped_df = (
    df.groupby(["tag_id", "key", "name"]).apply(lambda g: g[[
        "bookmark_id", "title", "description", "language", "created_at", "updated_at",
    ]].to_dict(orient="records"))
    .reset_index(name="bookmarks")
)

print(grouped_df)

In [None]:
validated_df = df.copy()

# Remove rows with empty title
validated_df = validated_df[validated_df["title"].notna()]

# Remove rows with empty url
validated_df = validated_df[validated_df["url"].notna()]

# Remove rows with empty tag key
validated_df = validated_df[validated_df["key"].notna()]

In [None]:
# Group by tag_id, key, and name, then count bookmarks for each tag_id
bookmark_counts = validated_df.groupby(["tag_id", "key", "name"]).size().reset_index(name="bookmark_count")

# Filter tags with more than 5 bookmarks
filtered_tags = bookmark_counts[bookmark_counts["bookmark_count"] > 5]

# print(filtered_tags)

# Filter the original dataframe to only include bookmarks with the filtered tags
filtered_df = validated_df[validated_df["tag_id"].isin(filtered_tags["tag_id"])]

# print(filtered_df)

tag_keys_df = filtered_df[["tag_id", "key"]].drop_duplicates()

print(tag_keys_df["key"].unique())



### Clustering


In [None]:
# CLUSTERING
from topic_gen.clustering.cluster import AgglomerativeCluster, KMeansCluster


cluster = AgglomerativeCluster(distance_threshold=1.2)
# kmeans_cluster = KMeansCluster(n_clusters=10)

# kmeans_cluster_df = kmeans_cluster.fit(filtered_df)

cluster_df = cluster.fit(tag_keys_df)
print(cluster_df)




In [None]:
# Print keys belonging to each cluster
for cluster_id in cluster_df["cluster"].unique():
    print(f"Cluster {cluster_id}:")
    keys = cluster_df[cluster_df["cluster"] == cluster_id]["key"].tolist()
    print(keys, len(keys))
    print()

In [None]:
joined_df = pd.merge(cluster_df, filtered_df, on="tag_id", how="left")
print(joined_df.head(10))

In [None]:
cleaned_df = joined_df.drop(columns=["key_x"])
cleaned_df = cleaned_df.rename(columns={"key_y": "key"})

In [None]:
# Print keys belonging to each cluster
for cluster_id in cleaned_df["cluster"].unique():
    print(f"Cluster {cluster_id}:")
    keys_df = cleaned_df[cleaned_df["cluster"] == cluster_id]["key"]
    keys = keys_df.unique().tolist()
    bookmarks_df = cleaned_df[cleaned_df["cluster"] == cluster_id]
    bookmarks = bookmarks_df.groupby(["bookmark_id","title", "description", "language", "created_at", "updated_at"])
    print(keys, len(keys))
    print(bookmarks, len(bookmarks))
    print()

In [None]:
clus_4_df = cleaned_df[cleaned_df["cluster"] == 4]

clus_tags_df = clus_4_df[["tag_id", "key"]].drop_duplicates()
clus_tags = clus_tags_df["key"].unique().tolist()

clus_bookmarks_df = clus_4_df[["bookmark_id","title", "description", "language", "created_at", "updated_at"]].drop_duplicates()
clus_titles = clus_bookmarks_df["title"].unique().tolist()

print(clus_titles, len(clus_titles))
print(clus_tags, len(clus_tags))

In [None]:
# Join the tag_keys_df with the filtered_df on tag_id
# joined_df = pd.merge(tag_keys_df, filtered_df, on="tag_id", how="left")

In [None]:
# NOT NEEDED

# Find non-unique keys and drop rows with least bookmark count
from difflib import SequenceMatcher
import numpy as np

def similarity(a, b):
    """Calculate similarity between two strings"""
    return SequenceMatcher(None, a.lower(), b.lower()).ratio()

def find_similar_keys(df, threshold=0.8):
    """Find groups of similar keys and keep only the one with highest bookmark count"""
    
    # Get unique keys with their counts
    key_counts = df.groupby('key')['bookmark_count'].sum().reset_index()
    key_counts = key_counts.sort_values('bookmark_count', ascending=False)
    
    # Find similar key groups
    similar_groups = []
    processed_keys = set()
    
    for i, row1 in key_counts.iterrows():
        if row1['key'] in processed_keys:
            continue
            
        similar_keys = [row1['key']]
        processed_keys.add(row1['key'])
        
        for j, row2 in key_counts.iterrows():
            if i != j and row2['key'] not in processed_keys:
                if similarity(row1['key'], row2['key']) >= threshold:
                    similar_keys.append(row2['key'])
                    processed_keys.add(row2['key'])
        
        if len(similar_keys) > 1:
            similar_groups.append(similar_keys)
    
    return similar_groups, key_counts

# Find similar key groups
similar_groups, key_counts = find_similar_keys(bookmark_counts, threshold=0.8)

print("Similar key groups found:")
for i, group in enumerate(similar_groups):
    print(f"Group {i+1}: {group}")
    
    # Show counts for each key in the group
    group_counts = key_counts[key_counts['key'].isin(group)]
    print(f"Counts: {dict(zip(group_counts['key'], group_counts['bookmark_count']))}")
    print()

# Create a list of keys to keep (highest count from each similar group)
keys_to_keep = set()
keys_to_drop = set()

for group in similar_groups:
    group_counts = key_counts[key_counts['key'].isin(group)]
    # Keep the key with highest count
    best_key = group_counts.loc[group_counts['bookmark_count'].idxmax(), 'key']
    keys_to_keep.add(best_key)
    
    # Mark others for dropping
    for key in group:
        if key != best_key:
            keys_to_drop.add(key)

print(f"Keys to keep: {keys_to_keep}")
print(f"Keys to drop: {keys_to_drop}")

# Filter the dataframe to remove rows with keys that should be dropped
filtered_bookmark_counts = bookmark_counts[~bookmark_counts['key'].isin(keys_to_drop)]

print(f"\nOriginal rows: {len(bookmark_counts)}")
print(f"Filtered rows: {len(filtered_bookmark_counts)}")
print(f"Rows removed: {len(bookmark_counts) - len(filtered_bookmark_counts)}")

print("\nFiltered bookmark counts:")
print(filtered_bookmark_counts.head(10))


In [1]:
import logging
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
logging.debug("test")

DEBUG:root:test


In [5]:
from topic_gen.topic_gen import TopicGen
from dotenv import load_dotenv

load_dotenv()

user_id = "8c525b6b-27c9-4379-a454-2c3b3a781124"

topic_gen = TopicGen()
topic_gen.user_id = user_id

out_df = topic_gen.ingest(user_id)
# print(out_df)

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
DEBUG:urllib3.connectionpool:Resetting dropped connection: huggingface.co
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/modules.json HTTP/1.1" 307 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /api/resolve-cache/models/sentence-transformers/all-MiniLM-L6-v2/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/modules.json HTTP/1.1" 200 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/config_sentence_transformers.json HTTP/1.1" 307 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /api/resolve-cache/models/sentence-transformers/all-MiniLM-L6-v2/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/config_sentence_transformers.json HTTP/1.1" 200 0
DEBUG:urllib3.connectionp

In [10]:
bookmarks_count = out_df["bookmark_id"].unique()
print(len(bookmarks_count))

tags_count = out_df["tag_id"].unique()
print(tags_count)

24
['73bc60ad-9083-40a8-897c-801ccabdaddd'
 'a5eef93b-1922-4dfd-bfb0-b00039c0511d'
 '50c2b3d1-3be8-4d7d-987e-1a37873c73b8'
 '747690b6-9d6c-432c-b4e4-d2c603c27a20'
 '565d4ce1-41b5-4e9d-9d82-cfe4452ea983'
 '5b7533ea-c1e1-4f22-bfbd-282670e92fa5'
 'caf39826-fd71-4ad3-a47d-ffa2b06a259b'
 '0ef6e82c-8d89-475f-93d8-caea97b995d7'
 'a7077fc1-36e4-494c-b5db-e5dc57b0bf87'
 'c94429a6-0170-4eb3-a677-1f98084ecd85'
 'acb92764-fa4e-4f81-ab92-a325e8535413'
 '6815a3f0-2a4f-4780-b48c-45b812bdfae4'
 'f39825a6-65fb-471b-b230-6290b902d8c2'
 '947989c6-490d-432e-9fe6-3e1e6763ad74'
 'c9d9f291-4671-4019-8de9-ee0e3b45aabe'
 '1b5dc062-e565-4848-b0a1-db419c8ad443'
 '2f6ea15d-1c59-439d-8a18-949228ce7e91'
 'a5ef0c95-492d-4d26-93bc-70107f73a15b'
 '3ceb26a9-130e-4cbe-a7b7-ba87e6ed67cd'
 '159c5322-ceeb-4da4-b176-1e37caf1f3a8'
 '6ef4a6cb-5fc9-4c09-8f32-f2ab489c3074'
 '4a27b78e-5b5e-4d3f-84bd-9714bde391a6'
 '06225a15-a62f-4ee4-ba1a-58bde57006ae'
 'c923abda-7a9c-4fef-9282-26c53a643585'
 '7cd06b48-14ca-4228-90e9-6c3c4a1801c

In [3]:
df_2 = topic_gen.filter_data(out_df, 2)

DEBUG:topic_gen.topic_gen:Filtered dataframe size: 82, dimensions: (82, 11)


In [7]:
df = out_df.copy()

bookmark_counts = (
    df.groupby(["tag_id", "key", "name"])
    .size()
    .reset_index(name="bookmark_count")
)
print(bookmark_counts)
# Filter tags with more than 5 bookmarks
filtered_tags = bookmark_counts[
    bookmark_counts["bookmark_count"] > 2
]
# Filter the original dataframe to only include bookmarks with the filtered tags
df = df[df["tag_id"].isin(filtered_tags["tag_id"])]
logger.debug(f"Filtered dataframe size: {len(df)}, dimensions: {df.shape}")

DEBUG:root:Filtered dataframe size: 82, dimensions: (82, 11)


                                  tag_id                            key  \
0   06225a15-a62f-4ee4-ba1a-58bde57006ae        plant-disease-pathogens   
1   0ef6e82c-8d89-475f-93d8-caea97b995d7             new-plant-diseases   
2   159c5322-ceeb-4da4-b176-1e37caf1f3a8           plant-disease-basics   
3   1b5dc062-e565-4848-b0a1-db419c8ad443                   leaf-disease   
4   1c57d77f-d4ba-4e46-89ab-e4dc54c9750e       plant-disease-management   
5   2acc2b8a-24c0-440c-bd12-0a78ce812d67                    soil-health   
6   2f6ea15d-1c59-439d-8a18-949228ce7e91                disease-control   
7   3ceb26a9-130e-4cbe-a7b7-ba87e6ed67cd                 treatment-tips   
8   4a27b78e-5b5e-4d3f-84bd-9714bde391a6    good-agricultural-practices   
9   50c2b3d1-3be8-4d7d-987e-1a37873c73b8                  plant-disease   
10  565d4ce1-41b5-4e9d-9d82-cfe4452ea983                computer-vision   
11  5b7533ea-c1e1-4f22-bfbd-282670e92fa5                  deep-learning   
12  6815a3f0-2a4f-4780-b4

In [5]:
cluster_df = topic_gen.cluster_data(df_2)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

DEBUG:topic_gen.topic_gen:Clustered dataframe size: 82, dimensions: (82, 12)


In [6]:
topics_df = topic_gen.generate_topics(cluster_df)

DEBUG:topic_gen.topic_gen:Generating topics for 6 clusters
DEBUG:topic_gen.topic_gen:Using 4 workers for topic generation
DEBUG:openai._base_client:Request options: {'method': 'post', 'url': '/chat/completions', 'headers': {'X-Stainless-Raw-Response': 'true'}, 'files': None, 'idempotency_key': 'stainless-python-retry-db26bb0d-c762-4c76-986f-d43647f70ff1', 'json_data': {'messages': [{'content': '\nYou are tasked with generating a cohesive and engaging topic based on a provided list of keywords and article titles. \nYour goal is to synthesize the information to create a topic that encapsulates the overarching theme or subject matter implied by the inputs. \n\n**Instructions:**\n\n1. **Input Format:**\n   - **Keywords:** A list of relevant keywords (e.g., [keyword1, keyword2, keyword3, ...]).\n   - **Article Titles:** A list of article titles (e.g., ["Title 1", "Title 2", "Title 3", ...]).\n\n2. **Output Requirements:**\n   - Generate a single topic that clearly reflects the common theme 

Generating topic for cluster                                  tag_id  cluster  \
0  50c2b3d1-3be8-4d7d-987e-1a37873c73b8        1   
1  50c2b3d1-3be8-4d7d-987e-1a37873c73b8        1   
2  50c2b3d1-3be8-4d7d-987e-1a37873c73b8        1   
3  50c2b3d1-3be8-4d7d-987e-1a37873c73b8        1   
4  50c2b3d1-3be8-4d7d-987e-1a37873c73b8        1   
5  50c2b3d1-3be8-4d7d-987e-1a37873c73b8        1   
6  50c2b3d1-3be8-4d7d-987e-1a37873c73b8        1   
7  50c2b3d1-3be8-4d7d-987e-1a37873c73b8        1   
8  50c2b3d1-3be8-4d7d-987e-1a37873c73b8        1   
9  50c2b3d1-3be8-4d7d-987e-1a37873c73b8        1   

                            bookmark_id  \
0  42c86434-e7c4-4118-98d2-e617dc4c33de   
1  1583f068-55a3-4a02-bcd1-bba7c8cba08f   
2  b044cde3-a210-430b-900f-5081cf9255ff   
3  85c6cad1-834e-4274-85f6-095be7b74c5e   
4  837d5e10-a243-421a-a1a6-7a12ea323a66   
5  f99e02e9-f2ac-4c60-9352-c6555ee8ef6d   
6  41c8c794-5b05-4007-a4ab-b6e586b714b7   
7  ae92c39b-fedd-4b0e-8473-fd18d95962bc   
8  bcbf4126

DEBUG:httpcore.connection:start_tls.complete return_value=<httpcore._backends.sync.SyncStream object at 0x00000150B85858E0>
DEBUG:httpcore.http11:send_request_headers.started request=<Request [b'POST']>
DEBUG:httpcore.connection:start_tls.complete return_value=<httpcore._backends.sync.SyncStream object at 0x00000150B85639B0>
DEBUG:httpcore.connection:start_tls.complete return_value=<httpcore._backends.sync.SyncStream object at 0x00000150B836EC90>
DEBUG:httpcore.http11:send_request_headers.complete
DEBUG:httpcore.connection:start_tls.complete return_value=<httpcore._backends.sync.SyncStream object at 0x00000150B8585F70>
DEBUG:httpcore.http11:send_request_headers.started request=<Request [b'POST']>
DEBUG:httpcore.http11:send_request_headers.started request=<Request [b'POST']>
DEBUG:httpcore.http11:send_request_body.started request=<Request [b'POST']>
DEBUG:httpcore.http11:send_request_headers.started request=<Request [b'POST']>
DEBUG:httpcore.http11:send_request_headers.complete
DEBUG:ht

ResponseFormat(topic='Harnessing AI for Advanced Plant Disease Detection', description='Explore how artificial intelligence and deep learning are revolutionizing the detection and diagnosis of plant diseases, enhancing agricultural practices and food security.', score=90)
Generating topic for cluster                                   tag_id  cluster  \
68  6ef4a6cb-5fc9-4c09-8f32-f2ab489c3074        5   
69  6ef4a6cb-5fc9-4c09-8f32-f2ab489c3074        5   
70  6ef4a6cb-5fc9-4c09-8f32-f2ab489c3074        5   
71  6ef4a6cb-5fc9-4c09-8f32-f2ab489c3074        5   
72  4a27b78e-5b5e-4d3f-84bd-9714bde391a6        5   
73  4a27b78e-5b5e-4d3f-84bd-9714bde391a6        5   
74  4a27b78e-5b5e-4d3f-84bd-9714bde391a6        5   
75  4a27b78e-5b5e-4d3f-84bd-9714bde391a6        5   
76  4a27b78e-5b5e-4d3f-84bd-9714bde391a6        5   

                             bookmark_id  \
68  234a84c2-c693-4883-8c09-7f8d15515a6d   
69  e71d8fbf-f3b3-449c-9b59-2aee37a0592b   
70  cf589edb-6353-4f59-ab6e-bf47461

DEBUG:httpcore.http11:receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'Date', b'Sat, 01 Nov 2025 08:06:47 GMT'), (b'Content-Type', b'application/json'), (b'Transfer-Encoding', b'chunked'), (b'Connection', b'keep-alive'), (b'access-control-expose-headers', b'X-Request-ID'), (b'openai-organization', b'user-il8cyjlyvrqffsixlqyafac8'), (b'openai-processing-ms', b'1545'), (b'openai-project', b'proj_sXNDftCX8ISqcvsvAc4uSxkW'), (b'openai-version', b'2020-10-01'), (b'x-envoy-upstream-service-time', b'1575'), (b'x-ratelimit-limit-requests', b'10000'), (b'x-ratelimit-limit-tokens', b'200000'), (b'x-ratelimit-remaining-requests', b'9995'), (b'x-ratelimit-remaining-tokens', b'198999'), (b'x-ratelimit-reset-requests', b'39.599s'), (b'x-ratelimit-reset-tokens', b'300ms'), (b'x-request-id', b'req_95d9fae35b4c4e93a2b07b97363d7a66'), (b'x-openai-proxy-wasm', b'v0.1'), (b'cf-cache-status', b'DYNAMIC'), (b'Strict-Transport-Security', b'max-age=31536000; includeSubDomains; prel

ResponseFormat(topic='Advancements in Computer Vision for Plant Disease Detection', description='Explore the latest advancements in computer vision technologies aimed at enhancing the detection and diagnosis of plant diseases. This topic delves into innovative techniques and models that leverage deep learning to improve accuracy and efficiency in agricultural practices.', score=90)


DEBUG:httpcore.http11:receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'Date', b'Sat, 01 Nov 2025 08:06:48 GMT'), (b'Content-Type', b'application/json'), (b'Transfer-Encoding', b'chunked'), (b'Connection', b'keep-alive'), (b'access-control-expose-headers', b'X-Request-ID'), (b'openai-organization', b'user-il8cyjlyvrqffsixlqyafac8'), (b'openai-processing-ms', b'1247'), (b'openai-project', b'proj_sXNDftCX8ISqcvsvAc4uSxkW'), (b'openai-version', b'2020-10-01'), (b'x-envoy-upstream-service-time', b'1273'), (b'x-ratelimit-limit-requests', b'10000'), (b'x-ratelimit-limit-tokens', b'200000'), (b'x-ratelimit-remaining-requests', b'9992'), (b'x-ratelimit-remaining-tokens', b'198293'), (b'x-ratelimit-reset-requests', b'1m4.627s'), (b'x-ratelimit-reset-tokens', b'511ms'), (b'x-request-id', b'req_ad129162a9264e23882a5e74c987790a'), (b'x-openai-proxy-wasm', b'v0.1'), (b'cf-cache-status', b'DYNAMIC'), (b'Strict-Transport-Security', b'max-age=31536000; includeSubDomains; pre

ResponseFormat(topic='Innovative Approaches to Sustainable Agricultural Practices', description='Delve into the latest advancements and principles of sustainable agricultural practices, focusing on good agricultural practices that promote environmental health and food safety.', score=85)
ResponseFormat(topic='Enhancing Soil Health Through Crop Rotation Techniques', description='Discover how crop rotation techniques can significantly improve soil health, boost agricultural productivity, and promote sustainable farming practices.', score=88)


DEBUG:httpcore.http11:receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'Date', b'Sat, 01 Nov 2025 08:06:48 GMT'), (b'Content-Type', b'application/json'), (b'Transfer-Encoding', b'chunked'), (b'Connection', b'keep-alive'), (b'access-control-expose-headers', b'X-Request-ID'), (b'openai-organization', b'user-il8cyjlyvrqffsixlqyafac8'), (b'openai-processing-ms', b'1654'), (b'openai-project', b'proj_sXNDftCX8ISqcvsvAc4uSxkW'), (b'openai-version', b'2020-10-01'), (b'x-envoy-upstream-service-time', b'1676'), (b'x-ratelimit-limit-requests', b'10000'), (b'x-ratelimit-limit-tokens', b'200000'), (b'x-ratelimit-remaining-requests', b'9992'), (b'x-ratelimit-remaining-tokens', b'198873'), (b'x-ratelimit-reset-requests', b'1m4.72s'), (b'x-ratelimit-reset-tokens', b'338ms'), (b'x-request-id', b'req_45b182b550924a7baaa492744b8fc6cf'), (b'x-openai-proxy-wasm', b'v0.1'), (b'cf-cache-status', b'DYNAMIC'), (b'Strict-Transport-Security', b'max-age=31536000; includeSubDomains; prel

ResponseFormat(topic='Advancements in Plant Disease Detection and Management', description='Explore the latest advancements in identifying and managing plant diseases, focusing on innovative techniques and technologies that enhance detection and treatment methods. This topic delves into the intersection of agriculture and technology, showcasing how new approaches can improve crop health and yield.', score=90)


In [1]:
from topic_gen.topic_gen import TopicGen
from dotenv import load_dotenv

import logging
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
logging.debug("test")

load_dotenv()

user_id = "8c525b6b-27c9-4379-a454-2c3b3a781124"

topic_gen = TopicGen()
topic_gen.user_id = user_id

out_df = topic_gen.generate(user_id)

DEBUG:root:test
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/modules.json HTTP/1.1" 307 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /api/resolve-cache/models/sentence-transformers/all-MiniLM-L6-v2/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/modules.json HTTP/1.1" 200 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/config_sentence_transformers.json HTTP/1.1" 307 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /api/resolve-cache/models/sentence-transformers/all-MiniLM-L6-v2/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/config_sentence_transformers.json HTTP/1.1" 200 0


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

DEBUG:topic_gen.topic_gen:Clustered dataframe size: 82, dimensions: (82, 12)
DEBUG:topic_gen.topic_gen:Generating topics for 6 clusters
DEBUG:topic_gen.topic_gen:Using 4 workers for topic generation
DEBUG:openai._base_client:Request options: {'method': 'post', 'url': '/chat/completions', 'headers': {'X-Stainless-Raw-Response': 'true'}, 'files': None, 'idempotency_key': 'stainless-python-retry-7e0db212-e953-4026-8042-fcb3224ea450', 'json_data': {'messages': [{'content': '\nYou are tasked with generating a cohesive and engaging topic based on a provided list of keywords and article titles. \nYour goal is to synthesize the information to create a topic that encapsulates the overarching theme or subject matter implied by the inputs. \n\n**Instructions:**\n\n1. **Input Format:**\n   - **Keywords:** A list of relevant keywords (e.g., [keyword1, keyword2, keyword3, ...]).\n   - **Article Titles:** A list of article titles (e.g., ["Title 1", "Title 2", "Title 3", ...]).\n\n2. **Output Requirem

Generating topic for cluster                                  tag_id  cluster  \
0  50c2b3d1-3be8-4d7d-987e-1a37873c73b8        1   
1  50c2b3d1-3be8-4d7d-987e-1a37873c73b8        1   
2  50c2b3d1-3be8-4d7d-987e-1a37873c73b8        1   
3  50c2b3d1-3be8-4d7d-987e-1a37873c73b8        1   
4  50c2b3d1-3be8-4d7d-987e-1a37873c73b8        1   
5  50c2b3d1-3be8-4d7d-987e-1a37873c73b8        1   
6  50c2b3d1-3be8-4d7d-987e-1a37873c73b8        1   
7  50c2b3d1-3be8-4d7d-987e-1a37873c73b8        1   
8  50c2b3d1-3be8-4d7d-987e-1a37873c73b8        1   
9  50c2b3d1-3be8-4d7d-987e-1a37873c73b8        1   

                            bookmark_id  \
0  42c86434-e7c4-4118-98d2-e617dc4c33de   
1  1583f068-55a3-4a02-bcd1-bba7c8cba08f   
2  b044cde3-a210-430b-900f-5081cf9255ff   
3  85c6cad1-834e-4274-85f6-095be7b74c5e   
4  837d5e10-a243-421a-a1a6-7a12ea323a66   
5  f99e02e9-f2ac-4c60-9352-c6555ee8ef6d   
6  41c8c794-5b05-4007-a4ab-b6e586b714b7   
7  ae92c39b-fedd-4b0e-8473-fd18d95962bc   
8  bcbf4126

DEBUG:httpcore.connection:connect_tcp.complete return_value=<httpcore._backends.sync.SyncStream object at 0x0000028382459430>
DEBUG:httpcore.connection:connect_tcp.complete return_value=<httpcore._backends.sync.SyncStream object at 0x00000283813BFEC0>
DEBUG:httpcore.connection:connect_tcp.complete return_value=<httpcore._backends.sync.SyncStream object at 0x0000028382458CB0>
DEBUG:httpcore.connection:connect_tcp.complete return_value=<httpcore._backends.sync.SyncStream object at 0x0000028382459A60>
DEBUG:httpcore.connection:start_tls.started ssl_context=<ssl.SSLContext object at 0x0000028380225C50> server_hostname='api.openai.com' timeout=None
DEBUG:httpcore.connection:start_tls.started ssl_context=<ssl.SSLContext object at 0x0000028380225C50> server_hostname='api.openai.com' timeout=None
DEBUG:httpcore.connection:start_tls.started ssl_context=<ssl.SSLContext object at 0x0000028380225C50> server_hostname='api.openai.com' timeout=None
DEBUG:httpcore.connection:start_tls.started ssl_cont

ResponseFormat(topic='Comprehensive Approaches to Plant Disease Control', description='This topic delves into comprehensive approaches for managing plant diseases, emphasizing the importance of understanding pathogens, their transmission, and the implementation of good agricultural practices.', score=85)
Generating topic for cluster                                   tag_id  cluster  \
68  6ef4a6cb-5fc9-4c09-8f32-f2ab489c3074        5   
69  6ef4a6cb-5fc9-4c09-8f32-f2ab489c3074        5   
70  6ef4a6cb-5fc9-4c09-8f32-f2ab489c3074        5   
71  6ef4a6cb-5fc9-4c09-8f32-f2ab489c3074        5   
72  4a27b78e-5b5e-4d3f-84bd-9714bde391a6        5   
73  4a27b78e-5b5e-4d3f-84bd-9714bde391a6        5   
74  4a27b78e-5b5e-4d3f-84bd-9714bde391a6        5   
75  4a27b78e-5b5e-4d3f-84bd-9714bde391a6        5   
76  4a27b78e-5b5e-4d3f-84bd-9714bde391a6        5   

                             bookmark_id  \
68  234a84c2-c693-4883-8c09-7f8d15515a6d   
69  e71d8fbf-f3b3-449c-9b59-2aee37a0592b   
70

DEBUG:httpcore.http11:receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'Date', b'Sat, 01 Nov 2025 13:56:35 GMT'), (b'Content-Type', b'application/json'), (b'Transfer-Encoding', b'chunked'), (b'Connection', b'keep-alive'), (b'access-control-expose-headers', b'X-Request-ID'), (b'openai-organization', b'user-il8cyjlyvrqffsixlqyafac8'), (b'openai-processing-ms', b'1634'), (b'openai-project', b'proj_sXNDftCX8ISqcvsvAc4uSxkW'), (b'openai-version', b'2020-10-01'), (b'x-envoy-upstream-service-time', b'1667'), (b'x-ratelimit-limit-requests', b'10000'), (b'x-ratelimit-limit-tokens', b'200000'), (b'x-ratelimit-remaining-requests', b'9994'), (b'x-ratelimit-remaining-tokens', b'198537'), (b'x-ratelimit-reset-requests', b'47.772s'), (b'x-ratelimit-reset-tokens', b'438ms'), (b'x-request-id', b'req_a88ffbf28f1d438fbe2a722f91924800'), (b'x-openai-proxy-wasm', b'v0.1'), (b'cf-cache-status', b'DYNAMIC'), (b'Strict-Transport-Security', b'max-age=31536000; includeSubDomains; prel

ResponseFormat(topic='Innovative Techniques in Computer Vision for Plant Disease Detection', description='This topic explores cutting-edge techniques in computer vision that revolutionize the detection and diagnosis of plant diseases, highlighting the integration of deep learning methods to enhance accuracy and efficiency in agriculture.', score=92)
Generating topic for cluster                                   tag_id  cluster  \
77  838a5b7f-31e8-44d9-9e6c-b054a2d506f6        3   
78  838a5b7f-31e8-44d9-9e6c-b054a2d506f6        3   
79  838a5b7f-31e8-44d9-9e6c-b054a2d506f6        3   
80  838a5b7f-31e8-44d9-9e6c-b054a2d506f6        3   
81  838a5b7f-31e8-44d9-9e6c-b054a2d506f6        3   

                             bookmark_id  \
77  bc0b8bcd-44ad-4b26-a459-db769874188e   
78  7c8d78fc-e09c-4ef5-a980-06018ce8ea51   
79  53faa573-9e79-4b5c-8fbd-f5f43df0faa2   
80  1ecc0421-9b35-4c49-a281-8d441e5f625c   
81  e71d8fbf-f3b3-449c-9b59-2aee37a0592b   

                                   

DEBUG:httpcore.http11:receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'Date', b'Sat, 01 Nov 2025 13:56:35 GMT'), (b'Content-Type', b'application/json'), (b'Transfer-Encoding', b'chunked'), (b'Connection', b'keep-alive'), (b'access-control-expose-headers', b'X-Request-ID'), (b'openai-organization', b'user-il8cyjlyvrqffsixlqyafac8'), (b'openai-processing-ms', b'2085'), (b'openai-project', b'proj_sXNDftCX8ISqcvsvAc4uSxkW'), (b'openai-version', b'2020-10-01'), (b'x-envoy-upstream-service-time', b'2262'), (b'x-ratelimit-limit-requests', b'10000'), (b'x-ratelimit-limit-tokens', b'200000'), (b'x-ratelimit-remaining-requests', b'9995'), (b'x-ratelimit-remaining-tokens', b'198873'), (b'x-ratelimit-reset-requests', b'39.261s'), (b'x-ratelimit-reset-tokens', b'338ms'), (b'x-request-id', b'req_0a8972c36cae485eb4d1d6d02ec92b57'), (b'x-openai-proxy-wasm', b'v0.1'), (b'cf-cache-status', b'DYNAMIC'), (b'Strict-Transport-Security', b'max-age=31536000; includeSubDomains; prel

ResponseFormat(topic='Advanced Techniques for Identifying and Managing Plant Diseases', description='Explore cutting-edge methods and technologies for detecting and managing various plant diseases, emphasizing the role of artificial intelligence and deep learning in enhancing agricultural practices.', score=90)
ResponseFormat(topic='Advancements in AI for Detecting Plant Diseases', description='This topic explores the latest advancements in artificial intelligence and deep learning techniques for accurately identifying and diagnosing plant diseases, highlighting innovative approaches and practical applications in agriculture.', score=90)


DEBUG:httpcore.http11:receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'Date', b'Sat, 01 Nov 2025 13:56:36 GMT'), (b'Content-Type', b'application/json'), (b'Transfer-Encoding', b'chunked'), (b'Connection', b'keep-alive'), (b'access-control-expose-headers', b'X-Request-ID'), (b'openai-organization', b'user-il8cyjlyvrqffsixlqyafac8'), (b'openai-processing-ms', b'1438'), (b'openai-project', b'proj_sXNDftCX8ISqcvsvAc4uSxkW'), (b'openai-version', b'2020-10-01'), (b'x-envoy-upstream-service-time', b'1536'), (b'x-ratelimit-limit-requests', b'10000'), (b'x-ratelimit-limit-tokens', b'200000'), (b'x-ratelimit-remaining-requests', b'9991'), (b'x-ratelimit-remaining-tokens', b'198930'), (b'x-ratelimit-reset-requests', b'1m12.101s'), (b'x-ratelimit-reset-tokens', b'321ms'), (b'x-request-id', b'req_5e2f7538055843b29490d49f84865333'), (b'x-openai-proxy-wasm', b'v0.1'), (b'cf-cache-status', b'DYNAMIC'), (b'Strict-Transport-Security', b'max-age=31536000; includeSubDomains; pr

ResponseFormat(topic='Innovative Techniques for Sustainable Agricultural Practices', description='This topic explores innovative techniques that enhance sustainable agricultural practices, focusing on the principles of good agricultural practices to promote safety, efficiency, and environmental stewardship in farming.', score=88)


DEBUG:httpcore.http11:receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'Date', b'Sat, 01 Nov 2025 13:56:37 GMT'), (b'Content-Type', b'application/json'), (b'Transfer-Encoding', b'chunked'), (b'Connection', b'keep-alive'), (b'access-control-expose-headers', b'X-Request-ID'), (b'openai-organization', b'user-il8cyjlyvrqffsixlqyafac8'), (b'openai-processing-ms', b'1241'), (b'openai-project', b'proj_sXNDftCX8ISqcvsvAc4uSxkW'), (b'openai-version', b'2020-10-01'), (b'x-envoy-upstream-service-time', b'1257'), (b'x-ratelimit-limit-requests', b'10000'), (b'x-ratelimit-limit-tokens', b'200000'), (b'x-ratelimit-remaining-requests', b'9990'), (b'x-ratelimit-remaining-tokens', b'198790'), (b'x-ratelimit-reset-requests', b'1m19.561s'), (b'x-ratelimit-reset-tokens', b'363ms'), (b'x-request-id', b'req_dce84a0a08ae4ae0b0af9fed245b48fc'), (b'x-openai-proxy-wasm', b'v0.1'), (b'cf-cache-status', b'DYNAMIC'), (b'Strict-Transport-Security', b'max-age=31536000; includeSubDomains; pr

ResponseFormat(topic='The Role of Crop Rotation in Sustainable Agriculture', description='This topic examines the vital role of crop rotation in promoting sustainable agricultural practices, enhancing soil health, and improving crop yields through diverse planting strategies.', score=88)


DEBUG:topic_gen.data.data:Inserting topics for user 8c525b6b-27c9-4379-a454-2c3b3a781124
DEBUG:topic_gen.data.data:Inserting topics for user 8c525b6b-27c9-4379-a454-2c3b3a781124
DEBUG:topic_gen.data.data:Session started
DEBUG:topic_gen.data.data:Inserting topics for user 8c525b6b-27c9-4379-a454-2c3b3a781124
DEBUG:topic_gen.data.data:Inserting topics for user 8c525b6b-27c9-4379-a454-2c3b3a781124
DEBUG:topic_gen.data.data:Session started
DEBUG:topic_gen.data.data:Session started
DEBUG:topic_gen.data.data:Session started
DEBUG:topic_gen.data.data:Session committed
DEBUG:topic_gen.data.data:Session closed
DEBUG:topic_gen.data.data:Topics inserted successfully for user 8c525b6b-27c9-4379-a454-2c3b3a781124
DEBUG:topic_gen.data.data:Inserting topics for user 8c525b6b-27c9-4379-a454-2c3b3a781124
DEBUG:topic_gen.data.data:Session started
DEBUG:topic_gen.data.data:Session committed
DEBUG:topic_gen.data.data:Session closed
DEBUG:topic_gen.data.data:Topics inserted successfully for user 8c525b6b-2

In [7]:
# print(out_df["title"].unique())

# print description, score of unique titles
for title in topics_df["title"].unique():
    print(f"Title: {title}")
    print(
        f"Description: {topics_df[topics_df['title'] == title]['description'].unique()[0]} Score: {topics_df[topics_df['title'] == title]['score'].unique()[0]}"
    )

Title: Plant disease detection using hybrid model based on convolutional autoencoder and convolutional neural network
Description: Explore the latest advancements in identifying and managing plant diseases, focusing on innovative techniques and technologies that enhance detection and treatment methods. This topic delves into the intersection of agriculture and technology, showcasing how new approaches can improve crop health and yield. Score: 90.0
Title: Plant disease detection and classification techniques: a comparative study of the performances - Journal of Big Data
Description: Explore the latest advancements in identifying and managing plant diseases, focusing on innovative techniques and technologies that enhance detection and treatment methods. This topic delves into the intersection of agriculture and technology, showcasing how new approaches can improve crop health and yield. Score: 90.0
Title: Plant Leaf Disease Detection, Classification, and Diagnosis Using Computer Vision a

In [None]:
topic_gen = TopicGen()
topic_gen.user_id = user_id
topic_gen._persist_topics(out_df)

## Output

In [None]:
cluster_ids = out_df["cluster"].unique()

out = []
for cluster_id in cluster_ids:
    cluster_df = out_df[out_df["cluster"] == cluster_id]
    title = cluster_df["topic"].unique()[0]
    data = {
        "cluster_id": int(cluster_id),
        "title": str(title),
        "tags": cluster_df[["key", "tag_id"]]
        .drop_duplicates()
        .to_dict(orient="records"),
        "bookmarks": cluster_df[
            [
                "bookmark_id",
                "title",
                "description",
                "language",
                "created_at",
                "updated_at",
            ]
        ].to_dict(orient="records"),
    }
    out.append(data)

print(out)

In [19]:
import numpy as np

all_keys = out_df["key"].unique()

sorted_keys = np.sort(all_keys)
cluster_key =  np.array2string(sorted_keys, separator=".", prefix='', suffix='')

print(str(cluster_key))

['agricultural-practices'.'artificial-intelligence'.'computer-vision'.
 'crop-rotation'.'deep-learning'.'detection-techniques'.'disease-control'.
 'good-agricultural-practices'.'leaf-disease'.'new-plant-diseases'.
 'plant-disease'.'plant-disease-basics']
