In [None]:
def _parse_topic_json(text):
    """Return the JSON object embedded in *text*, or None if none decodes.

    Two candidates are tried in order: the contents of the first markdown
    code fence (or the whole text when there is no fence), then the span
    between the first '{' and the last '}' of the whole text.
    """
    fenced = text
    if '```json' in text:
        fenced = text.split('```json')[1].split('```')[0]
    elif '```' in text:
        fenced = text.split('```')[1]

    candidates = [fenced.strip()]

    # Second chance for replies that wrap the object in prose or use a
    # fence tag other than lowercase "json".
    start, end = text.find('{'), text.rfind('}')
    if 0 <= start < end:
        candidates.append(text[start:end + 1])

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        # json.loads may also yield a list, string or number; only an
        # object matches the format requested in the prompt.
        if isinstance(parsed, dict):
            return parsed
    return None


def generate_cluster_topic(cluster_data):
    """Generate a topic name for a cluster based on its top posts.

    Args:
        cluster_data: Mapping with a 'top_20_posts' iterable of post dicts
            (each providing 'title', 'subreddit' and 'score') and a
            'total_posts_in_cluster' value that is interpolated into the
            prompt.

    Returns:
        A dict that always contains 'topic_name', 'description' and
        'key_themes'. When no JSON object can be decoded from the reply,
        'topic_name' is "Unknown", 'description' holds the raw reply text
        and 'key_themes' is empty.

    Relies on a module-level ``model`` whose ``generate_content(prompt)``
    returns an object with a ``text`` attribute.
    """
    # Only these three fields of each post are sent to the model
    posts_summary = [
        {
            'title': post['title'],
            'subreddit': post['subreddit'],
            'score': post['score'],
        }
        for post in cluster_data['top_20_posts']
    ]

    prompt = f"""Analyze the following top 20 posts from a cluster and generate:
1. A concise cluster topic name (3-5 words)
2. A brief description (1-2 sentences) explaining what this cluster is about
3. Key themes (list 3-5 main themes)

Cluster contains {cluster_data['total_posts_in_cluster']} total posts.

Top 20 posts:
{json.dumps(posts_summary, indent=2)}

Respond in JSON format:
{{
    "topic_name": "...",
    "description": "...",
    "key_themes": ["...", "...", "..."]
}}
"""

    response = model.generate_content(prompt)
    raw_text = response.text

    topic = _parse_topic_json(raw_text)
    if topic is None:
        return {
            "topic_name": "Unknown",
            "description": raw_text,
            "key_themes": []
        }

    # Callers index 'topic_name' and 'description' directly, so guarantee
    # every expected key even when the model omits one.
    topic.setdefault("topic_name", "Unknown")
    topic.setdefault("description", "")
    topic.setdefault("key_themes", [])
    return topic

In [None]:
# Collect the per-cluster JSON exports from the input directory,
# ordered by path so the processing order is stable between runs
cluster_dir = Path('top20percluster')
cluster_files = list(cluster_dir.glob('cluster_*_top20.json'))
cluster_files.sort()

# Report what was found, one file name per line
print(f"Found {len(cluster_files)} cluster files:")
for f in cluster_files:
    print(f"  - {f.name}")

In [None]:
# Generate topics for each cluster, keyed by the cluster id stored in its file
cluster_topics = {}

for cluster_file in cluster_files:
    print(f"\nProcessing {cluster_file.name}...")

    # Load cluster data. The encoding is pinned to UTF-8 (the JSON standard)
    # so non-ASCII post text does not depend on the platform's locale default.
    with open(cluster_file, 'r', encoding='utf-8') as f:
        cluster_data = json.load(f)

    cluster_id = cluster_data['cluster_id']

    # Generate topic using Gemini
    topic_info = generate_cluster_topic(cluster_data)

    # Topic fields are merged after the bookkeeping fields
    cluster_topics[cluster_id] = {
        'cluster_id': cluster_id,
        'total_posts': cluster_data['total_posts_in_cluster'],
        **topic_info
    }

    print(f"  Cluster {cluster_id}: {topic_info['topic_name']}")
    print(f"  Description: {topic_info['description']}")

    # The themes come straight from the model's JSON, so tolerate a null
    # value or non-string entries instead of crashing in str.join.
    themes = topic_info.get('key_themes') or []
    print(f"  Key themes: {', '.join(map(str, themes))}")

In [None]:
# Persist the collected topic information as indented JSON
output_file = 'cluster_topics_summary.json'

with open(output_file, 'w') as f:
    f.write(json.dumps(cluster_topics, indent=2))

print(f"\nCluster topics saved to {output_file}")

# Echo a human-readable report, ordered by cluster id
print(f"\n{'=' * 60}")
print("CLUSTER TOPICS SUMMARY")
print('=' * 60)

for cluster_id in sorted(cluster_topics):
    info = cluster_topics[cluster_id]
    themes = info.get('key_themes', [])
    print(f"\nCluster {cluster_id}: {info['topic_name']}")
    print(f"  Posts: {info['total_posts']}")
    print(f"  Description: {info['description']}")
    print(f"  Themes: {', '.join(themes)}")