In [1]:
import pandas as pd

In [None]:
# Human-readable label for each cluster id -- define manually!
# The keys are the values expected in the clustering results' "cluster"
# column; ids that are missing here get no label when the mapping is applied.
CLUSTER_TOPICS = {
    0: "lw community",
    1: "ssc meetups",
    2: "petrov day",
    3: "(toy) language models",
    4: "ai alignment",
    5: "lw weekly summary",
    6: "ai risk & governance",
}

In [4]:
"""
Append topic columns (cluster index + label) to LW CSV files 
using clustering results.
"""
clustering_results_csv = 'blog_clustering_results.csv'

print(f"Loading clustering results from {clustering_results_csv}...")
results_df = pd.read_csv(clustering_results_csv)
print(f"Loaded {len(results_df)} clustered posts")

updated_files = 0
total_matches = 0

# Add human-readable topic labels
results_df["topic_label"] = results_df["cluster"].map(CLUSTER_TOPICS)

# Process each LW CSV file separately
for file_path, group in results_df.groupby("file"):
    print(f"\nProcessing {file_path}...")

    try:
        # Load the original file
        df = pd.read_csv(file_path)
        original_count = len(df)

        merged = df.merge(
            group[["text", "cluster", "topic_label"]],
            how="left",
            left_on="title",
            right_on="text"
        )

        merged.drop(columns=["text"], inplace=True)

        # Rename cluster column
        merged.rename(columns={"cluster": "topic_cluster_id"}, inplace=True)

        # Fill missing values for unmatched rows
        merged["topic_cluster_id"] = merged["topic_cluster_id"].fillna(-1).astype(int)
        merged["topic_label"] = merged["topic_label"].fillna("No Topic")

        # Save back
        merged.to_csv(file_path, index=False)

        matches = (merged["topic_cluster_id"] != -1).sum()
        updated_files += 1
        total_matches += matches

        print(f"  Updated: {matches}/{original_count} posts matched")
        cluster_counts = merged[merged["topic_cluster_id"] != -1]["topic_cluster_id"].value_counts()
        if len(cluster_counts) > 0:
            print(f"  Topics: {dict(cluster_counts)}")

    except Exception as e:
        print(f"  Error processing {file_path}: {e}")

print("\n" + "=" * 50)
print("SUMMARY:")
print(f"Files updated: {updated_files}")
print(f"Total matches: {total_matches}")
print("Done! Each CSV now has 'topic_cluster_id' and 'topic_label' columns")

Loading clustering results from blog_clustering_results.csv...
Loaded 31078 clustered posts

Processing lw_csv/2016/2016-01.csv...
  Updated: 0/122 posts matched

Processing lw_csv/2016/2016-02.csv...
  Updated: 0/105 posts matched

Processing lw_csv/2016/2016-03.csv...
  Updated: 0/101 posts matched

Processing lw_csv/2016/2016-04.csv...
  Updated: 0/107 posts matched

Processing lw_csv/2016/2016-05.csv...
  Updated: 0/79 posts matched

Processing lw_csv/2016/2016-06.csv...
  Updated: 0/104 posts matched

Processing lw_csv/2016/2016-07.csv...
  Updated: 0/93 posts matched

Processing lw_csv/2016/2016-08.csv...
  Updated: 0/87 posts matched

Processing lw_csv/2016/2016-09.csv...
  Updated: 0/114 posts matched

Processing lw_csv/2016/2016-10.csv...
  Updated: 0/107 posts matched

Processing lw_csv/2016/2016-11.csv...
  Updated: 0/134 posts matched

Processing lw_csv/2016/2016-12.csv...
  Updated: 0/163 posts matched

Processing lw_csv/2017/2017-01.csv...
  Updated: 0/182 posts matched

