In [10]:
import pandas as pd

In [22]:
# define manually!
CLUSTER_TOPICS = {
        0: 'ai: agi',
        1: 'ai: safety',
        2: 'ai: model capabilities',
        3: 'rationality: mental health & productivity', 
        4: 'economics: macroeconomics',
        5: 'nuclear preparedness',
        6: 'philosophy: consciousness',
        7: 'ai: neural network circuits', 
        8: 'ai: alignment',
        9: 'rationality: abstract world problems',
        10: 'rationality: concrete world problems',
        11: 'ai: mech interp',
        12: 'community: personal stories',
        13: 'community: lesswrong auditing',
        14: 'ai: reinforcement learning',
        15: 'ai: model training & optimization',
        16: 'bio: covid',
        17: 'ai: tech giants',
        18: 'bio: evolution',
        19: 'economics: decision theory',
        20: 'philosophy: simulations',
        21: 'ai: model reasoning',
        22: 'community: media and media threads',
        23: 'ai: research',
        24: 'rationality: forecasting',
        25: 'miscelleanous',
        26: 'rationality: social problems',
        27: 'ai: alignment problems', 
        28: 'philosophy: moral philosophy',
        29: 'community: effective altruism',
        30: 'ai: scaling',
        31: 'ai: probability theory', 
        32: 'economics: decision theory',
        33: 'economics: game theory',
        34: 'ai: data & information',
        35: 'rationality: parenting',
    }

In [23]:
"""
Append topic columns (cluster index + label) to LW CSV files 
using clustering results.
"""
clustering_results_csv = 'results/blog_lda_results_36.csv'

print(f"Loading clustering results from {clustering_results_csv}...")
results_df = pd.read_csv(clustering_results_csv)
print(f"Loaded {len(results_df)} clustered posts")

updated_files = 0
total_matches = 0

# Add human-readable topic labels
results_df["topic_label"] = results_df["dominant_topic"].map(CLUSTER_TOPICS)

# Process each LW CSV file separately
for file_path, group in results_df.groupby("file"):
    print(f"\nProcessing {file_path}...")

    try:
        # Extract just the filename and build correct path
        import os
        filename = os.path.basename(file_path)  # Gets just "2025-01.csv"
        correct_file_path = f'lw_csv_cleaned/{file_path.split("lw_csv_cleaned/")[1]}'  # Gets "lw_csv_cleaned/2025/2025-01.csv"
        
        # Load the original file with correct path
        df = pd.read_csv(correct_file_path)
        original_count = len(df)

        merge_data = group[["title", "dominant_topic", "topic_label"]]
        merged = df.merge(merge_data, how="left", on="title")

        # Rename cluster column
        merged.rename(columns={"dominant_topic": "topic_cluster_id"}, inplace=True)

        # Fill none values
        merged["topic_cluster_id"] = merged["topic_cluster_id"].fillna(-1).astype(int)
        merged["topic_label"] = merged["topic_label"].fillna("No Topic")
        
        mask = (merged["topic_label"] == "No Topic") & (
            merged["cleaned_htmlBody"].str[:50].str.lower().str.contains("meetup", na=False)
            | merged["title"].str.lower().str.contains("meetup", na=False)
        )
        merged.loc[mask, "topic_label"] = "community: meetup"
        merged.loc[mask, "topic_cluster_id"] = -2  # special cluster id for rule-based label

        # Save back with proper filename
        output_path = f'lw_csv_cleaned_topic/{file_path.split("lw_csv_cleaned/")[1]}'
        
        # Make sure output directory exists
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        
        merged.to_csv(output_path, index=False)

        matches = (merged["topic_cluster_id"] != -1).sum()
        updated_files += 1
        total_matches += matches

        print(f"  SUCCESS: {matches}/{original_count} posts matched")

    except Exception as e:
        print(f'Error processing {file_path}: {e}')
        print(f'Error type: {type(e).__name__}')

Loading clustering results from results/blog_lda_results_36.csv...
Loaded 31405 clustered posts

Processing x-risk-data/lw_csv_cleaned/2016/2016-01.csv...
  SUCCESS: 132/122 posts matched

Processing x-risk-data/lw_csv_cleaned/2016/2016-02.csv...
  SUCCESS: 104/105 posts matched

Processing x-risk-data/lw_csv_cleaned/2016/2016-03.csv...
  SUCCESS: 101/101 posts matched

Processing x-risk-data/lw_csv_cleaned/2016/2016-04.csv...
  SUCCESS: 107/107 posts matched

Processing x-risk-data/lw_csv_cleaned/2016/2016-05.csv...
  SUCCESS: 79/79 posts matched

Processing x-risk-data/lw_csv_cleaned/2016/2016-06.csv...
  SUCCESS: 103/104 posts matched

Processing x-risk-data/lw_csv_cleaned/2016/2016-07.csv...
  SUCCESS: 91/93 posts matched

Processing x-risk-data/lw_csv_cleaned/2016/2016-08.csv...
  SUCCESS: 81/87 posts matched

Processing x-risk-data/lw_csv_cleaned/2016/2016-09.csv...
  SUCCESS: 127/114 posts matched

Processing x-risk-data/lw_csv_cleaned/2016/2016-10.csv...
  SUCCESS: 106/107 post