In [31]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import matplotlib.colors as cls
import seaborn as sns
import numpy as np
from pathlib import Path
from urllib.parse import urlparse
import glob
from collections import Counter
import re
import warnings
import requests
import time
warnings.filterwarnings('ignore')

# Definitions

In [2]:
"""=== NATURE FORMAT CONFIGURATION ==="""
# Set these once at the beginning for all plots in the notebook

# Font settings
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = ['Arial', 'Helvetica', 'DejaVu Sans']  # Fallback fonts
plt.rcParams['font.size'] = 8
plt.rcParams['axes.labelsize'] = 8
plt.rcParams['axes.titlesize'] = 9
plt.rcParams['xtick.labelsize'] = 7
plt.rcParams['ytick.labelsize'] = 7
plt.rcParams['legend.fontsize'] = 7
plt.rcParams['legend.title_fontsize'] = 7
plt.rcParams['figure.titlesize'] = 9

# Line and spine settings
plt.rcParams['axes.linewidth'] = 0.5
plt.rcParams['lines.linewidth'] = 1.0
plt.rcParams['patch.linewidth'] = 0.5
plt.rcParams['xtick.major.width'] = 0.5
plt.rcParams['ytick.major.width'] = 0.5

# Grid settings
plt.rcParams['grid.linewidth'] = 0.5
plt.rcParams['grid.alpha'] = 0.3

plt.rcParams['figure.dpi'] = 150
plt.rcParams['savefig.dpi'] = 600
plt.rcParams['savefig.bbox'] = 'tight'
plt.rcParams['savefig.pad_inches'] = 0.05

# Remove top and right spines by default
plt.rcParams['axes.spines.top'] = False
plt.rcParams['axes.spines.right'] = False

# Nature figure widths (in inches) - these stay the same
SINGLE_COL = 3.5   # 89 mm
ONE_HALF_COL = 4.7  # 120 mm
DOUBLE_COL = 7.2   # 183 mm

print("✓ Nature format configuration loaded")
print(f"  Single column width: {SINGLE_COL}\"")
print(f"  1.5 column width: {ONE_HALF_COL}\"")
print(f"  Double column width: {DOUBLE_COL}\"")

✓ Nature format configuration loaded
  Single column width: 3.5"
  1.5 column width: 4.7"
  Double column width: 7.2"


# Load CSVs

In [3]:
"""Load all CSV files from the folder structure"""
print("Loading data from all CSV files...")

# Load LessWrong data
forum = 'lesswrong'
lw_files = glob.glob(f"../src/processed_data/{forum}/03_with_topics/**/*.csv", recursive=True)
print(f"Found {len(lw_files)} LessWrong CSV files")

lw_data = []
for file in lw_files:
    try:
        df = pd.read_csv(file)
        # Extract year and month from filename
        parts = Path(file).stem.split('-')
        if len(parts) >= 2:
            df['year'] = int(parts[0])
            df['month'] = int(parts[1])
        lw_data.append(df)
    except Exception as e:
        print(f"Error loading {file}: {e}")

if not lw_data:
    print("No LessWrong data found! Check your file paths.")
else:
    lesswrong_df = pd.concat(lw_data, ignore_index=True)
    print(f"Loaded {len(lesswrong_df)} total LessWrong posts from {len(lw_data)} files")
    
    # Clean and prepare LessWrong data
    lesswrong_df['postedAt'] = pd.to_datetime(lesswrong_df['postedAt'], errors='coerce')
    lesswrong_df['baseScore'] = pd.to_numeric(lesswrong_df['baseScore'], errors='coerce').fillna(0)
    lesswrong_df['commentCount'] = pd.to_numeric(lesswrong_df['commentCount'], errors='coerce').fillna(0)

# Load Alignment Forum data
forum = 'alignment_forum'
af_files = glob.glob(f"../src/processed_data/{forum}/03_with_topics/**/*.csv", recursive=True)
print(f"Found {len(af_files)} Alignment Forum CSV files")

af_data = []
for file in af_files:
    try:
        df = pd.read_csv(file)
        # Extract year and month from filename
        parts = Path(file).stem.split('-')
        if len(parts) >= 2:
            df['year'] = int(parts[0])
            df['month'] = int(parts[1])
        af_data.append(df)
    except Exception as e:
        print(f"Error loading {file}: {e}")

if not af_data:
    print("No Alignment Forum data found! Check your file paths.")
else:
    alignment_forum_df = pd.concat(af_data, ignore_index=True)
    print(f"Loaded {len(alignment_forum_df)} total Alignment Forum posts from {len(af_data)} files")
    
    # Clean and prepare Alignment Forum data
    alignment_forum_df['postedAt'] = pd.to_datetime(alignment_forum_df['postedAt'], errors='coerce')
    alignment_forum_df['baseScore'] = pd.to_numeric(alignment_forum_df['baseScore'], errors='coerce').fillna(0)
    alignment_forum_df['commentCount'] = pd.to_numeric(alignment_forum_df['commentCount'], errors='coerce').fillna(0)

# Optional: Create combined dataframe with platform identifier
combined_df = pd.concat([
    lesswrong_df.assign(platform='LessWrong'),
    alignment_forum_df.assign(platform='Alignment Forum')
], ignore_index=True)
print(f"\nCombined total: {len(combined_df)} posts across both platforms")

Loading data from all CSV files...
Found 224 LessWrong CSV files
Loaded 45981 total LessWrong posts from 224 files
Found 126 Alignment Forum CSV files
Loaded 4230 total Alignment Forum posts from 126 files

Combined total: 50211 posts across both platforms


---
# Forum Nodes
---

In [None]:
def truncate_colormap(cmap, min_val=0.0, max_val=1.0, n=256):
    """Helper function to truncate colormap."""
    new_cmap = LinearSegmentedColormap.from_list(
        f'trunc({cmap.name},{min_val:.2f},{max_val:.2f})',
        cmap(np.linspace(min_val, max_val, n))
    )
    return new_cmap

## Topic Analysis

In [None]:
"""=== TOPIC ANALYSIS ==="""

# Plot for both platforms
for platform_name, platform_df in [("LessWrong", lesswrong_df), ("Alignment Forum", alignment_forum_df)]:
    
    print(f"\n--- Analyzing {platform_name} ---")
    
    # Filter out Misc topics
    filtered_df = platform_df[platform_df["topic_label"] != 'Misc: No Topic'].copy()
    
    if len(filtered_df) == 0:
        print(f"No topics found for {platform_name}")
        continue
    
    # Get topic counts
    topic_counts = filtered_df["topic_label"].value_counts()
    
    # --- Figure: Topic distribution pie chart (Nature format) ---
    fig, ax = plt.subplots(figsize=(3.5, 3.5))  # Single column width (89mm)
    
    wedges, texts, autotexts = ax.pie(
        topic_counts.values,
        labels=topic_counts.index,
        autopct='%1.1f%%',
        startangle=90,
        counterclock=False,
        colors=okabe_ito_palette[:len(topic_counts)],
        wedgeprops={'linewidth': 0.5, 'edgecolor': 'white'},
        textprops={'fontsize': 6}  # Small font for labels
    )
    
    # Format percentage text
    for autotext in autotexts:
        autotext.set_color('white')
        autotext.set_fontsize(6)
        autotext.set_weight('bold')
    
    # Format labels
    for text in texts:
        text.set_fontsize(6)
    
    ax.set_title(f"{platform_name}: Topic Distribution", fontsize=8, pad=10)
    
    plt.tight_layout()
    
    plt.show()
    
    print(f"Topics found: {len(topic_counts)}")
    print(f"Total posts: {len(filtered_df)}")

# --- Combined plot for both platforms (Nature double-column format) ---
print("\n--- Creating combined comparison ---")

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(7.2, 3.5))  # Double column width

for ax, (platform_name, platform_df) in zip([ax1, ax2], [("LessWrong", lesswrong_df), ("Alignment Forum", alignment_forum_df)]):
    filtered_df = platform_df[platform_df["topic_label"] != 'Misc: No Topic'].copy()
    
    if len(filtered_df) == 0:
        ax.text(0.5, 0.5, f'No topics for {platform_name}', 
                ha='center', va='center', fontsize=7)
        ax.set_title(f"{platform_name}", fontsize=8)
        continue
    
    topic_counts = filtered_df["topic_label"].value_counts()
    
    wedges, texts, autotexts = ax.pie(
        topic_counts.values,
        labels=topic_counts.index,
        autopct='%1.1f%%',
        startangle=90,
        counterclock=False,
        wedgeprops={'linewidth': 0.5, 'edgecolor': 'white'},
        textprops={'fontsize': 6}
    )
    
    # Format percentage text
    for autotext in autotexts:
        autotext.set_color('white')
        autotext.set_fontsize(6)
        autotext.set_weight('bold')
    
    # Format labels
    for text in texts:
        text.set_fontsize(6)
    
    ax.set_title(f"{platform_name}", fontsize=8, pad=10)

# Add figure labels (a, b) as Nature requires
fig.text(0.02, 0.98, 'a', fontsize=10, fontweight='bold', va='top')
fig.text(0.52, 0.98, 'b', fontsize=10, fontweight='bold', va='top')

plt.tight_layout()

# Save in multiple formats
plt.show()

### Validation

In [None]:
"""=== MANUAL TOPIC VALIDATION ==="""

import numpy as np
from IPython.display import display, HTML
import random

# Configuration
TOTAL_VALIDATION_SAMPLES = 200  # Adjust based on how many you want to validate
RANDOM_SEED = 42  # For reproducibility

# Combine both platforms
combined_df = pd.concat([
    lesswrong_df.assign(platform='LessWrong'),
    alignment_forum_df.assign(platform='Alignment Forum')
], ignore_index=True)

# Filter out Misc topics
validation_df = combined_df[combined_df["topic_label"] != 'Misc: No Topic'].copy()

print(f"Total posts for validation: {len(validation_df)}")
print(f"\n=== Topic Distribution ===")

# Get topic counts and proportions
topic_counts = validation_df["topic_label"].value_counts()
topic_proportions = topic_counts / len(validation_df)

for topic, count in topic_counts.items():
    proportion = count / len(validation_df)
    n_samples = int(TOTAL_VALIDATION_SAMPLES * proportion)
    print(f"{topic}: {count} posts ({proportion*100:.1f}%) → {n_samples} validation samples")

# Stratified sampling
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

validation_samples = []

for topic in topic_counts.index:
    topic_df = validation_df[validation_df["topic_label"] == topic]
    
    # Calculate number of samples for this topic (proportional)
    n_samples = max(1, int(TOTAL_VALIDATION_SAMPLES * len(topic_df) / len(validation_df)))
    
    # Sample randomly
    if len(topic_df) >= n_samples:
        sampled = topic_df.sample(n=n_samples, random_state=RANDOM_SEED)
    else:
        sampled = topic_df  # If topic has fewer posts than needed samples
    
    validation_samples.append(sampled)

validation_set = pd.concat(validation_samples, ignore_index=True)

# Shuffle the validation set
validation_set = validation_set.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)
validation_set = validation_set.drop(columns=['htmlBody','slug,pageUrl','postedAt','baseScore',
                                              'voteCount','commentCount','meta','question','user.username',
                                              'user.slug','user.displayName','extracted_links','pageUrl','slug',
                                              'user_gender','extracted_dois','topic_cluster_id','year','month'], errors='ignore')

# Truncate text column to 500 chars if it exists
if 'text' in validation_set.columns:
    validation_set['text'] = validation_set['cleaned_htmlBody'].astype(str).apply(lambda x: x[:500] + "..." if len(x) > 500 else x)

print(f"\n=== Validation Set Created ===")
print(f"Total validation samples: {len(validation_set)}")
print(f"\nSamples per topic:")
print(validation_set["topic_label"].value_counts().sort_index())

# Add columns for manual validation
validation_set['validation_correct'] = ''  # To be filled: 'yes', 'no', 'unsure'
validation_set['validation_notes'] = ''
validation_set['validation_alternative_topic'] = ''  # If incorrect, what should it be?

# Save to CSV for validation
output_file = 'topic_validation_set.csv'
validation_set.to_csv(output_file, index=False)
print(f"\n✓ Saved validation set to: {output_file}")

# Display first few examples
print("\n=== First 5 Validation Examples ===\n")
for idx, row in validation_set.head().iterrows():
    print(f"{'='*80}")
    print(f"Sample {idx + 1}")
    print(f"Platform: {row['platform']}")
    print(f"Assigned Topic: {row['topic_label']}")
    print(f"Title: {row.get('title', 'N/A')}")
    print(f"\nContent Preview (first 500 chars):")
    content = str(row.get('content', row.get('text', 'N/A')))
    print(content[:500] + "..." if len(content) > 500 else content)
    print()

# Create a more readable HTML version for Jupyter
def create_validation_html(df, max_samples=20):
    html = """
    <style>
        .validation-card {
            border: 2px solid #333;
            padding: 15px;
            margin: 20px 0;
            background-color: #f9f9f9;
            border-radius: 5px;
        }
        .topic-label {
            background-color: #4CAF50;
            color: white;
            padding: 5px 10px;
            border-radius: 3px;
            display: inline-block;
            font-weight: bold;
        }
        .platform {
            background-color: #2196F3;
            color: white;
            padding: 5px 10px;
            border-radius: 3px;
            display: inline-block;
            margin-left: 10px;
        }
        .content-preview {
            background-color: white;
            padding: 10px;
            margin-top: 10px;
            border-left: 3px solid #ddd;
            font-family: monospace;
            font-size: 0.9em;
        }
    </style>
    """
    
    for idx, row in df.head(max_samples).iterrows():
        content = str(row.get('content', row.get('text', 'N/A')))
        content_preview = content[:800] + "..." if len(content) > 800 else content
        
        html += f"""
        <div class="validation-card">
            <h3>Sample {idx + 1}</h3>
            <span class="topic-label">{row['topic_label']}</span>
            <span class="platform">{row['platform']}</span>
            <h4>{row.get('title', 'No Title')}</h4>
            <div class="content-preview">{content_preview}</div>
        </div>
        """
    
    return html

print("\n=== Interactive Preview (first 20 samples) ===")
display(HTML(create_validation_html(validation_set, max_samples=20)))

print("\n" + "="*80)
print("NEXT STEPS:")
print("="*80)
print("1. Open 'topic_validation_set.csv' in Excel/Google Sheets")
print("2. For each row, fill in:")
print("   - validation_correct: 'yes', 'no', or 'unsure'")
print("   - validation_notes: Any comments")
print("   - validation_alternative_topic: If 'no', what should the topic be?")
print("3. Save the file and load it back to calculate inter-rater reliability")
print("\nOptional: Share the CSV with another rater for inter-rater reliability!")

## Topic Analysis with Engagement Metrics

In [None]:
"""=== TOPIC ANALYSIS WITH ENGAGEMENT METRICS ==="""

# Analyze both platforms
for platform_name, platform_df in [("LessWrong", lesswrong_df), ("Alignment Forum", alignment_forum_df)]:
    
    print(f"\n--- {platform_name}: Topic Engagement Analysis ---")
    
    safe_name = platform_name.lower().replace(" ", "_")
        
    if len(filtered_df) == 0:
        print(f"No topics found for {platform_name}")
        continue
    
    # Calculate engagement metrics by topic
    topic_engagement = filtered_df.groupby('topic_label').agg({
        'baseScore': ['sum', 'mean', 'median'],
        'commentCount': ['sum', 'mean', 'median'],
        '_id': 'count'  # Number of posts
    }).round(1)
    
    # Flatten column names
    topic_engagement.columns = ['_'.join(col).strip() for col in topic_engagement.columns.values]
    topic_engagement = topic_engagement.rename(columns={'_id_count': 'post_count'})
    
    # Sort by total score
    topic_engagement = topic_engagement.sort_values('baseScore_sum', ascending=False)
    
    print("\nTopic Engagement Metrics:")
    print(topic_engagement)
    
    # 1. Total engagement by topic (stacked bar: score + comments)
    fig, ax = plt.subplots(figsize=(DOUBLE_COL, 3.5))
    
    x_pos = np.arange(len(topic_engagement))
    
    # Normalize to make comparable (score and comments on different scales)
    # Use score as primary metric
    ax.barh(x_pos, topic_engagement['baseScore_sum'], 
            color='#2a9d8f', edgecolor='white', linewidth=0.5,
            label='Total Score')
    
    ax.set_yticks(x_pos)
    ax.set_yticklabels(topic_engagement.index)
    ax.set_xlabel('Total Base Score')
    ax.set_title(f'{platform_name}: Total Engagement by Topic', pad=10)
    ax.invert_yaxis()
    ax.grid(axis='x')
    ax.legend(loc='lower right')
    
    plt.tight_layout()
    plt.show()
    plt.close()
    
    # 2. Average engagement per post by topic
    fig, ax = plt.subplots(figsize=(DOUBLE_COL, 3.5))
    
    x_pos = np.arange(len(topic_engagement))
    sorted_by_mean = topic_engagement.sort_values('baseScore_mean', ascending=False)
    
    ax.barh(x_pos, sorted_by_mean['baseScore_mean'], 
            color='#e76f51', edgecolor='white', linewidth=0.5)
    
    ax.set_yticks(x_pos)
    ax.set_yticklabels(sorted_by_mean.index)
    ax.set_xlabel('Average Base Score per Post')
    ax.set_title(f'{platform_name}: Average Engagement by Topic', pad=10)
    ax.invert_yaxis()
    ax.grid(axis='x')
    
    plt.tight_layout()
    plt.show()
    plt.close()
    
    # 3. Bubble chart: Posts vs Engagement (size = comment count)
    fig, ax = plt.subplots(figsize=(DOUBLE_COL, 3.5))
    
    scatter = ax.scatter(
        topic_engagement['post_count'],
        topic_engagement['baseScore_mean'],
        s=topic_engagement['commentCount_mean'] * 10,  # Size by avg comments
        c=np.arange(len(topic_engagement)),  # Color by topic
        cmap='viridis',
        alpha=0.6,
        edgecolors='white',
        linewidth=0.5
    )
    
    # Add topic labels
    for idx, row in topic_engagement.iterrows():
        ax.annotate(idx, 
                   (row['post_count'], row['baseScore_mean']),
                   fontsize=6,
                   ha='center',
                   va='center')
    
    ax.set_xlabel('Number of Posts')
    ax.set_ylabel('Average Base Score')
    ax.set_title(f'{platform_name}: Topic Size vs Engagement\n(bubble size = avg comments)', 
                 pad=10)
    ax.grid(True, alpha=0.3)
    
    # Add legend for bubble size
    sizes = [10, 50, 100]
    labels = ['10 comments', '50 comments', '100 comments']
    legend_elements = [plt.scatter([], [], s=s*10, c='gray', alpha=0.6, 
                                   edgecolors='white', linewidth=0.5) 
                      for s in sizes]
    ax.legend(legend_elements, labels, 
             title='Avg Comments', 
             loc='lower right',
             frameon=True,
             edgecolor='black',
             fancybox=False)
    
    plt.tight_layout()
    plt.show()
    plt.close()
    
    # 4. Engagement comparison: Score vs Comments
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(DOUBLE_COL, 3))
    
    x_pos = np.arange(len(topic_engagement))
    
    # Left: Total scores
    ax1.barh(x_pos, topic_engagement['baseScore_sum'], 
            color='#2a9d8f', edgecolor='white', linewidth=0.5)
    ax1.set_yticks(x_pos)
    ax1.set_yticklabels(topic_engagement.index, fontsize=6)
    ax1.set_xlabel('Total Base Score')
    ax1.set_title('By Score', pad=10)
    ax1.invert_yaxis()
    ax1.grid(axis='x')
    
    # Right: Total comments
    ax2.barh(x_pos, topic_engagement['commentCount_sum'], 
            color='#e9c46a', edgecolor='white', linewidth=0.5)
    ax2.set_yticks(x_pos)
    ax2.set_yticklabels([])  # Hide labels on right side
    ax2.set_xlabel('Total Comments')
    ax2.set_title('By Comments', pad=10)
    ax2.invert_yaxis()
    ax2.grid(axis='x')
    
    fig.suptitle(f'{platform_name}: Topic Engagement Comparison', 
                fontsize=9, y=1.02)
    
    plt.tight_layout()
    plt.show()
    plt.close()

print("\n=== Topic engagement analysis complete ===")

## Gender Distribution

In [None]:
"""Analyze gender distribution"""
print("\n=== GENDER ANALYSIS ===")

for platform_name, platform_df in [("LessWrong", lesswrong_df), ("Alignment Forum", alignment_forum_df), ("Combined", combined_df)]:
    
    print(f"\n--- {platform_name} ---")
    
    gender_counts = platform_df['user_gender'].value_counts()
    print(f"\nGender Distribution:")
    for gender, count in gender_counts.items():
        percentage = (count / len(platform_df)) * 100
        print(f"{gender.title()}: {count:,} ({percentage:.1f}%)")
    
    # Gender posts pie chart - use SINGLE_COL for pie charts
    fig, ax = plt.subplots(figsize=(SINGLE_COL, SINGLE_COL))
    wedges, texts, autotexts = ax.pie(
        gender_counts.values, 
        labels=gender_counts.index, 
        autopct='%1.1f%%', 
        startangle=90,
        colors=["#e76f51", "#2a9d8f", "#e9c46a"], 
        wedgeprops={'linewidth': 0.5, 'edgecolor': 'white'},
        counterclock=False
    )
    for autotext in autotexts:
        autotext.set_color('white')
        autotext.set_fontweight('bold')
    
    ax.set_title(f'{platform_name}: Gender Distribution of Posts', pad=10)
    safe_name = platform_name.lower().replace(" ", "_")
    plt.show()
    plt.close()
    
    # Gender users pie chart
    unique_users = platform_df[['user.username', 'user_gender']].drop_duplicates()
    user_gender_counts = unique_users['user_gender'].value_counts()
    
    print(f"\nUnique Users by Gender:")
    for gender, count in user_gender_counts.items():
        percentage = (count / len(unique_users)) * 100
        print(f"{gender.title()}: {count:,} ({percentage:.1f}%)")
    
    fig, ax = plt.subplots(figsize=(SINGLE_COL, SINGLE_COL))
    wedges, texts, autotexts = ax.pie(
        user_gender_counts.values, 
        autopct='%1.1f%%', 
        startangle=90,
        colors=["#e76f51", "#2a9d8f", "#e9c46a"], 
        labels=user_gender_counts.index,
        wedgeprops={'linewidth': 0.5, 'edgecolor': 'white'},
        counterclock=False
    )
    for autotext in autotexts:
        autotext.set_color('white')
        autotext.set_fontweight('bold')
    
    ax.set_title(f'{platform_name}: Gender Distribution of Users', pad=10)
    plt.show()
    plt.close()
    
    # Gender distribution by topics - use DOUBLE_COL for bar charts
    filtered_df = platform_df[platform_df["topic_label"] != 'Misc: No Topic'].copy()
    
    if len(filtered_df) > 0:
        gender_topic = filtered_df.groupby(['topic_label', 'user_gender']).size().unstack(fill_value=0)
        gender_topic['total'] = gender_topic.sum(axis=1)
        
        gender_cols = {}
        if 'gf' in gender_topic.columns:
            gender_cols['female'] = 'gf'
        elif 'female' in gender_topic.columns:
            gender_cols['female'] = 'female'
            
        if 'gm' in gender_topic.columns:
            gender_cols['male'] = 'gm'
        elif 'male' in gender_topic.columns:
            gender_cols['male'] = 'male'
            
        if '-' in gender_topic.columns:
            gender_cols['unknown'] = '-'
        elif 'unknown' in gender_topic.columns:
            gender_cols['unknown'] = 'unknown'
        
        gender_perc_data = {}
        for display_name, col_name in gender_cols.items():
            if col_name in gender_topic.columns:
                gender_perc_data[display_name] = (gender_topic[col_name] * 100) / gender_topic['total']
        
        gender_perc = pd.DataFrame(gender_perc_data)
        
        fig, ax = plt.subplots(figsize=(DOUBLE_COL, 3.5))
        gender_perc.plot(kind="bar", ax=ax,
                         color=["#2a9d8f", "#e76f51", "#e9c46a"],
                         stacked=True,
                         width=0.8,
                         edgecolor='white')
        
        ax.set_title(f'{platform_name}: Gender Distribution by Topic', pad=10)
        ax.set_ylabel('Percentage of Posts (%)')
        ax.set_xlabel('Topic')
        ax.legend(title='Gender', bbox_to_anchor=(1.02, 1), loc='upper left', 
                  frameon=True, edgecolor='black', fancybox=False)
        ax.set_xticklabels(gender_perc.index, rotation=45, ha='right')
        ax.grid(axis='y')
        
        plt.tight_layout()
        plt.show()
        plt.close()

print("\n=== Gender analysis complete ===")

### Validation

In [None]:
"""=== MANUAL GENDER VALIDATION ==="""

import numpy as np
from IPython.display import display, HTML
import random

# Configuration
TOTAL_VALIDATION_SAMPLES = 200  # Adjust based on how many you want to validate
RANDOM_SEED = 42  # For reproducibility

# Combine both platforms
combined_df = pd.concat([
    lesswrong_df.assign(platform='LessWrong'),
    alignment_forum_df.assign(platform='Alignment Forum')
], ignore_index=True)

print(f"Total posts for validation: {len(combined_df)}")
print(f"\n=== Gender Distribution ===")

# Get gender counts and proportions
gender_counts = combined_df["user_gender"].value_counts()
gender_proportions = gender_counts / len(combined_df)

for gender, count in gender_counts.items():
    proportion = count / len(combined_df)
    n_samples = int(TOTAL_VALIDATION_SAMPLES * proportion)
    print(f"{gender}: {count} posts ({proportion*100:.1f}%) → {n_samples} validation samples")

# Stratified sampling by gender
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

validation_samples = []

for gender in gender_counts.index:
    gender_df = combined_df[combined_df["user_gender"] == gender]
    
    # Calculate number of samples for this gender (proportional)
    n_samples = max(1, int(TOTAL_VALIDATION_SAMPLES * len(gender_df) / len(combined_df)))
    
    # Sample randomly
    if len(gender_df) >= n_samples:
        sampled = gender_df.sample(n=n_samples, random_state=RANDOM_SEED)
    else:
        sampled = gender_df
    
    validation_samples.append(sampled)

validation_set = pd.concat(validation_samples, ignore_index=True)

# Shuffle the validation set
validation_set = validation_set.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

# Keep only essential columns for validation
columns_to_keep = ['user.username', 'user_gender', 'platform', 'title', 
                   'text', 'content']
validation_set = validation_set[[col for col in columns_to_keep if col in validation_set.columns]]

# Drop htmlBody if it exists
validation_set = validation_set.drop(columns=['htmlBody'], errors='ignore')

# Truncate text columns to 500 chars
if 'text' in validation_set.columns:
    validation_set['text'] = validation_set['text'].astype(str).apply(lambda x: x[:500] + "..." if len(x) > 500 else x)
if 'content' in validation_set.columns:
    validation_set['content'] = validation_set['content'].astype(str).apply(lambda x: x[:500] + "..." if len(x) > 500 else x)

print(f"\n=== Validation Set Created ===")
print(f"Total validation samples: {len(validation_set)}")
print(f"\nSamples per gender:")
print(validation_set["user_gender"].value_counts().sort_index())

# Add columns for manual validation
validation_set['validation_correct'] = ''  # To be filled: 'yes', 'no', 'unsure'
validation_set['validation_notes'] = ''
validation_set['validation_alternative_gender'] = ''  # If incorrect, what should it be?

# Save to CSV for validation
output_file = 'gender_validation_set.csv'
validation_set.to_csv(output_file, index=False)
print(f"\n✓ Saved validation set to: {output_file}")

# Display first few examples
print("\n=== First 5 Validation Examples ===\n")
for idx, row in validation_set.head().iterrows():
    print(f"{'='*80}")
    print(f"Sample {idx + 1}")
    print(f"Platform: {row['platform']}")
    print(f"Username: {row['user.username']}")
    print(f"Assigned Gender: {row['user_gender']}")
    print(f"Title: {row.get('title', 'N/A')}")
    print(f"\nContent Preview (first 500 chars):")
    content = str(row.get('content', row.get('text', 'N/A')))
    print(content[:500] + "..." if len(content) > 500 else content)
    print()

# Create a more readable HTML version for Jupyter
def create_gender_validation_html(df, max_samples=20):
    html = """
    <style>
        .validation-card {
            border: 2px solid #333;
            padding: 15px;
            margin: 20px 0;
            background-color: #f9f9f9;
            border-radius: 5px;
        }
        .gender-label {
            background-color: #e76f51;
            color: white;
            padding: 5px 10px;
            border-radius: 3px;
            display: inline-block;
            font-weight: bold;
        }
        .platform {
            background-color: #2196F3;
            color: white;
            padding: 5px 10px;
            border-radius: 3px;
            display: inline-block;
            margin-left: 10px;
        }
        .username {
            background-color: #9c27b0;
            color: white;
            padding: 5px 10px;
            border-radius: 3px;
            display: inline-block;
            margin-left: 10px;
        }
        .content-preview {
            background-color: white;
            padding: 10px;
            margin-top: 10px;
            border-left: 3px solid #ddd;
            font-family: monospace;
            font-size: 0.9em;
        }
    </style>
    """
    
    for idx, row in df.head(max_samples).iterrows():
        content = str(row.get('content', row.get('text', 'N/A')))
        content_preview = content[:800] + "..." if len(content) > 800 else content
        
        html += f"""
        <div class="validation-card">
            <h3>Sample {idx + 1}</h3>
            <span class="gender-label">{row['user_gender']}</span>
            <span class="platform">{row['platform']}</span>
            <span class="username">{row['user.username']}</span>
            <h4>{row.get('title', 'No Title')}</h4>
            <div class="content-preview">{content_preview}</div>
        </div>
        """
    
    return html

print("\n=== Interactive Preview (first 20 samples) ===")
display(HTML(create_gender_validation_html(validation_set, max_samples=20)))

print("\n" + "="*80)
print("NEXT STEPS:")
print("="*80)
print("1. Open 'gender_validation_set.csv' in Excel/Google Sheets")
print("2. For each row, fill in:")
print("   - validation_correct: 'yes', 'no', or 'unsure'")
print("   - validation_notes: Any comments")
print("   - validation_alternative_gender: If 'no', what should the gender be?")
print("3. Save the file and load it back to calculate inter-rater reliability")
print("\nOptional: Share the CSV with another rater for inter-rater reliability!")

## Author Analysis

In [None]:
"""Analyze author activity patterns"""
print("\n=== AUTHOR ANALYSIS ===")

# Analyze both platforms separately and combined
for platform_name, platform_df in [("LessWrong", lesswrong_df), ("Alignment Forum", alignment_forum_df), ("Combined", combined_df)]:
    
    print(f"\n--- {platform_name} ---")
    
    author_counts = platform_df['user.username'].value_counts()
    print(f"\nTotal unique authors: {len(author_counts):,}")
    print(f"Authors with only 1 post: {(author_counts == 1).sum():,} ({((author_counts == 1).sum() / len(author_counts) * 100):.1f}%)")
    print(f"Authors with 10+ posts: {(author_counts >= 10).sum():,}")
    print(f"Authors with 50+ posts: {(author_counts >= 50).sum():,}")
    
    print(f"\nTop 10 Most Active Authors:")
    for author, count in author_counts.head(10).items():
        print(f"  {author}: {count} posts")
    
    safe_name = platform_name.lower().replace(" ", "_")
    
    # Author activity distribution (bar chart)
    post_ranges = ['1', '2-5', '6-10', '11-25', '26-50', '50+']
    counts = [
        (author_counts == 1).sum(),
        ((author_counts >= 2) & (author_counts <= 5)).sum(),
        ((author_counts >= 6) & (author_counts <= 10)).sum(),
        ((author_counts >= 11) & (author_counts <= 25)).sum(),
        ((author_counts >= 26) & (author_counts <= 50)).sum(),
        (author_counts >= 50).sum()
    ]
    
    fig, ax = plt.subplots(figsize=(SINGLE_COL, 2.5))
    bars = ax.bar(post_ranges, counts, color='#2a9d8f', edgecolor='white', linewidth=0.5)
    ax.set_title(f'{platform_name}: Author Activity Distribution', pad=10)
    ax.set_ylabel('Number of Authors')
    ax.set_xlabel('Posts per Author')
    ax.set_xticklabels(post_ranges, rotation=45, ha='right')
    ax.grid(axis='y')
    
    # Add value labels on top of bars
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., height,
                f'{int(height):,}',
                ha='center', va='bottom', fontsize=6)
    
    plt.tight_layout()
    plt.show()
    plt.close()
    
    # Posts per author histogram (log scale)
    fig, ax = plt.subplots(figsize=(SINGLE_COL, 2.5))
    ax.hist(author_counts.values, bins=50, color='#e76f51', 
            edgecolor='white', alpha=0.8, linewidth=0.5)
    ax.set_xlabel('Posts per Author')
    ax.set_ylabel('Number of Authors (log scale)')
    ax.set_title(f'{platform_name}: Distribution of Posts per Author', pad=10)
    ax.set_yscale('log')
    ax.grid(axis='y', which='both')  # Show grid for both major and minor ticks on log scale
    
    plt.tight_layout()
    plt.show()
    plt.close()

print("\n=== Author analysis complete ===")

## Temporal Trends

In [None]:
"""Analyze temporal trends"""
print("\n=== TEMPORAL TRENDS ===")

import seaborn as sns

# Analyze both platforms separately and combined
for platform_name, platform_df in [("LessWrong", lesswrong_df), ("Alignment Forum", alignment_forum_df), ("Combined", combined_df)]:
    
    print(f"\n--- {platform_name} ---")
    
    # Posts by year
    yearly_posts = platform_df.groupby('year').size()
    print(f"\nPosts by Year:")
    for year, count in yearly_posts.items():
        print(f"  {year}: {count:,} posts")
    
    safe_name = platform_name.lower().replace(" ", "_")
    
    # Line plot of posts over time
    fig, ax = plt.subplots(figsize=(DOUBLE_COL, 2.5))
    ax.plot(yearly_posts.index, yearly_posts.values, 
            marker='o', linewidth=1.5, markersize=4, 
            color='#2a9d8f')
    ax.set_title(f'{platform_name}: Posts by Year', pad=10)
    ax.set_xlabel('Year')
    ax.set_ylabel('Number of Posts')
    ax.grid(axis='y')
    
    plt.tight_layout()
    plt.show()
    plt.close()
    
    # Heatmap of posts by year and month
    if 'month' in platform_df.columns:
        heatmap_data = platform_df.groupby(['year', 'month']).size().unstack(fill_value=0)
        
        # Ensure all months 1-12 are present
        for month in range(1, 13):
            if month not in heatmap_data.columns:
                heatmap_data[month] = 0
        heatmap_data = heatmap_data[sorted(heatmap_data.columns)]
        
        fig, ax = plt.subplots(figsize=(DOUBLE_COL, len(heatmap_data) * 0.3 + 1))
        
        # Use Nature-appropriate colormap
        sns.heatmap(heatmap_data, annot=True, fmt='d', 
                    cmap='YlOrRd', 
                    ax=ax,
                    cbar_kws={'label': 'Number of Posts'},
                    linewidths=0.5,
                    linecolor='white')
        
        ax.set_title(f'{platform_name}: Posts by Year and Month', pad=10)
        ax.set_xlabel('Month')
        ax.set_ylabel('Year')
        
        # Set month labels
        month_labels = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 
                       'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
        ax.set_xticklabels(month_labels, rotation=45, ha='right')
        
        plt.tight_layout()
        plt.show()
        plt.close()
    
    # Monthly activity (aggregated across all years)
    if 'month' in platform_df.columns:
        monthly_posts = platform_df.groupby('month').size()
        
        fig, ax = plt.subplots(figsize=(DOUBLE_COL, 2.5))
        bars = ax.bar(monthly_posts.index, monthly_posts.values, 
                      color='#e76f51', edgecolor='white', linewidth=0.5)
        ax.set_title(f'{platform_name}: Posts by Month (All Years)', pad=10)
        ax.set_xlabel('Month')
        ax.set_ylabel('Number of Posts')
        ax.set_xticks(range(1, 13))
        ax.set_xticklabels(['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 
                           'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'], 
                          rotation=45, ha='right')
        ax.grid(axis='y')
        
        plt.tight_layout()
        plt.show()
        plt.close()

print("\n=== Temporal trends analysis complete ===")

## Links

In [None]:
"""Analyze extracted links"""
print("\n=== LINK ANALYSIS ===")

import re
from urllib.parse import urlparse

def extract_domain(url):
    """Extract domain from URL."""
    try:
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url
        parsed = urlparse(url)
        domain = parsed.netloc.lower()
        # Remove www. prefix for cleaner grouping
        if domain.startswith('www.'):
            domain = domain[4:]
        return domain
    except:
        return 'invalid_url'

def analyze_links(platform_df, platform_name):
    """Analyze links for a given platform."""
    print(f"\n--- {platform_name} ---")
    
    safe_name = platform_name.lower().replace(" ", "_")
    
    # Extract all links
    all_links = []
    for entry in platform_df['extracted_links'].dropna():
        if isinstance(entry, str):
            links = [link.strip() for link in entry.split(';') if link.strip()]
            all_links.extend(links)
        elif isinstance(entry, list):
            all_links.extend(entry)
        else:
            links = [link.strip() for link in str(entry).split(';') if link.strip()]
            all_links.extend(links)
    
    # Final cleaning
    clean_links = [l for l in all_links if l]
    
    if not clean_links:
        print(f"No links found for {platform_name}")
        return
    
    print(f"Total links extracted: {len(clean_links):,}")
    print(f"Posts with links: {platform_df['extracted_links'].notna().sum():,}")
    
    # Pattern analysis
    doi_pattern = r'10\.\d{4,9}/[^\s;<>"]+'
    arxiv_pattern = r'(arxiv\.org/)'
    
    doi_links = [link for link in clean_links if re.search(doi_pattern, link, re.IGNORECASE)]
    arxiv_links = [link for link in clean_links if re.search(arxiv_pattern, link, re.IGNORECASE)]
    lesswrong_links = [link for link in clean_links if 'lesswrong' in link.lower()]
    
    # Extract domains
    domains = [extract_domain(link) for link in clean_links]
    domain_counts = pd.Series(domains).value_counts()
    
    # Category counts
    youtube_count = len([d for d in domains if 'youtube' in d or 'youtu.be' in d])
    wikipedia_count = len([d for d in domains if 'wikipedia' in d])
    github_count = len([d for d in domains if 'github' in d])
    ea_forum_count = len([d for d in domains if 'forum.effectivealtruism' in d])
    alignment_forum_count = len([d for d in domains if 'alignmentforum' in d])
    
    print(f"\n=== LINK CATEGORIES ===")
    print(f"DOI/Academic: {len(doi_links):,} ({len(doi_links)/len(clean_links)*100:.1f}%)")
    print(f"ArXiv: {len(arxiv_links):,} ({len(arxiv_links)/len(clean_links)*100:.1f}%)")
    print(f"LessWrong: {len(lesswrong_links):,} ({len(lesswrong_links)/len(clean_links)*100:.1f}%)")
    print(f"YouTube: {youtube_count:,} ({youtube_count/len(clean_links)*100:.1f}%)")
    print(f"Wikipedia: {wikipedia_count:,} ({wikipedia_count/len(clean_links)*100:.1f}%)")
    print(f"GitHub: {github_count:,} ({github_count/len(clean_links)*100:.1f}%)")
    
    print(f"\n=== TOP 15 DOMAINS ===")
    for domain, count in domain_counts.head(15).items():
        percentage = count/len(clean_links)*100
        print(f"  {domain}: {count:,} ({percentage:.1f}%)")
    
    # Visualizations
    
    # 1. Top domains bar chart
    fig, ax = plt.subplots(figsize=(SINGLE_COL, 3))
    top_domains = domain_counts.head(10)
    bars = ax.barh(range(len(top_domains)), top_domains.values, 
                   color='#2a9d8f', edgecolor='white', linewidth=0.5)
    ax.set_yticks(range(len(top_domains)))
    ax.set_yticklabels(top_domains.index)
    ax.set_xlabel('Number of Links')
    ax.set_title(f'{platform_name}: Top 10 Most Linked Domains', pad=10)
    ax.invert_yaxis()
    ax.grid(axis='x')
    
    plt.tight_layout()
    plt.show()
    plt.close()
    
    # 2. Link categories pie chart
    categories = {
        'DOI/Academic': len(doi_links),
        'ArXiv': len(arxiv_links),
        'LessWrong': len(lesswrong_links),
        'YouTube': youtube_count,
        'Wikipedia': wikipedia_count,
        'EA Forum': ea_forum_count,
        'Alignment Forum': alignment_forum_count,
        'GitHub': github_count,
    }
    
    # Calculate "Other" category
    other_count = len(clean_links) - sum(categories.values())
    if other_count > 0:
        categories['Other'] = other_count
    
    # Filter out zero-count categories
    categories = {k: v for k, v in categories.items() if v > 0}
    
    fig, ax = plt.subplots(figsize=(SINGLE_COL, SINGLE_COL))
    wedges, texts, autotexts = ax.pie(
        categories.values(), 
        labels=categories.keys(), 
        autopct='%1.1f%%', 
        startangle=90,
        colors=['#2a9d8f', '#e76f51', '#e9c46a', '#264653', '#f4a261', 
                '#e63946', '#457b9d', '#a8dadc', '#cccccc'],
        wedgeprops={'linewidth': 0.5, 'edgecolor': 'white'},
        counterclock=False
    )
    
    for autotext in autotexts:
        autotext.set_color('white')
        autotext.set_fontweight('bold')
    
    ax.set_title(f'{platform_name}: Link Categories', pad=10)
    
    plt.tight_layout()
    plt.show()
    plt.close()
    
    # 3. Academic vs Non-Academic
    academic_count = len(doi_links) + len(arxiv_links)
    non_academic_count = len(clean_links) - academic_count
    
    fig, ax = plt.subplots(figsize=(SINGLE_COL, SINGLE_COL))
    wedges, texts, autotexts = ax.pie(
        [academic_count, non_academic_count],
        labels=['Academic\n(DOI/ArXiv)', 'Non-Academic'],
        autopct='%1.1f%%',
        startangle=90,
        colors=['#2a9d8f', '#e76f51'],
        wedgeprops={'linewidth': 0.5, 'edgecolor': 'white'},
        counterclock=False
    )
    
    for autotext in autotexts:
        autotext.set_color('white')
        autotext.set_fontweight('bold')
    
    ax.set_title(f'{platform_name}: Academic vs Non-Academic Links', pad=10)
    
    plt.tight_layout()
    plt.show()
    plt.close()
    
    # Sample links for verification
    print(f"\n=== SAMPLE LINKS ===")
    if doi_links:
        print(f"Sample DOI links: {doi_links[:2]}")
    if arxiv_links:
        print(f"Sample ArXiv links: {arxiv_links[:2]}")
    if lesswrong_links:
        print(f"Sample LessWrong links: {lesswrong_links[:2]}")

# Run analysis for each platform
for platform_name, platform_df in [("LessWrong", lesswrong_df), ("Alignment Forum", alignment_forum_df), ("Combined", combined_df)]:
    if 'extracted_links' in platform_df.columns:
        analyze_links(platform_df, platform_name)
    else:
        print(f"\n--- {platform_name} ---")
        print("No 'extracted_links' column found")

print("\n=== Link analysis complete ===")

## Summary

In [None]:
"""Generate comprehensive summary reports"""

def generate_summary_report(platform_df, platform_name):
    """Generate a detailed summary report for a platform."""
    
    print("\n" + "="*70)
    print(f"          {platform_name.upper()} DATA SUMMARY REPORT")
    print("="*70)
    
    # Overall Statistics
    print(f"\n📊 OVERALL STATISTICS:")
    print(f"   Total Posts: {len(platform_df):,}")
    print(f"   Date Range: {platform_df['year'].min()}-{platform_df['year'].max()}")
    print(f"   Unique Authors: {platform_df['user.username'].nunique():,}")
    if 'topic_label' in platform_df.columns:
        unique_topics = platform_df[platform_df['topic_label'] != 'Misc: No Topic']['topic_label'].nunique()
        print(f"   Unique Topics: {unique_topics}")
    
    # Author Insights
    print(f"\n👥 AUTHOR INSIGHTS:")
    author_counts = platform_df['user.username'].value_counts()
    print(f"   One-time contributors: {(author_counts == 1).sum():,} ({((author_counts == 1).sum() / len(author_counts) * 100):.1f}%)")
    print(f"   Regular authors (2-9 posts): {((author_counts >= 2) & (author_counts < 10)).sum():,}")
    print(f"   Prolific authors (10-49 posts): {((author_counts >= 10) & (author_counts < 50)).sum():,}")
    print(f"   Super authors (50+ posts): {(author_counts >= 50).sum():,}")
    print(f"   Most prolific: {author_counts.index[0]} with {author_counts.iloc[0]:,} posts")
    
    # Show top 5 authors
    print(f"   Top 5 authors:")
    for i, (author, count) in enumerate(author_counts.head(5).items(), 1):
        print(f"      {i}. {author}: {count:,} posts")
    
    # Topic Insights
    if 'topic_label' in platform_df.columns:
        print(f"\n🏷️  TOPIC INSIGHTS:")
        topic_df = platform_df[platform_df['topic_label'] != 'Misc: No Topic']
        topic_counts = topic_df['topic_label'].value_counts()
        if len(topic_counts) > 0:
            print(f"   Most popular topic: {topic_counts.index[0]} ({topic_counts.iloc[0]:,} posts, {topic_counts.iloc[0]/len(topic_df)*100:.1f}%)")
            print(f"   Topics with 100+ posts: {(topic_counts >= 100).sum()}")
            print(f"   Topics with 1000+ posts: {(topic_counts >= 1000).sum()}")
            print(f"   Topic distribution:")
            for topic, count in topic_counts.items():
                percentage = (count / len(topic_df)) * 100
                print(f"      {topic}: {count:,} ({percentage:.1f}%)")
    
    # Gender Distribution
    print(f"\n🚻 GENDER DISTRIBUTION:")
    if 'user_gender' in platform_df.columns:
        gender_counts = platform_df['user_gender'].value_counts()
        for gender, count in gender_counts.items():
            percentage = (count / len(platform_df)) * 100
            print(f"   {gender.title()}: {count:,} ({percentage:.1f}%)")
        
        # Unique users by gender
        unique_users = platform_df[['user.username', 'user_gender']].drop_duplicates()
        user_gender_counts = unique_users['user_gender'].value_counts()
        print(f"   Unique users by gender:")
        for gender, count in user_gender_counts.items():
            percentage = (count / len(unique_users)) * 100
            print(f"      {gender.title()}: {count:,} ({percentage:.1f}%)")
    
    # Engagement Metrics
    print(f"\n📈 ENGAGEMENT METRICS:")
    print(f"   Average score: {platform_df['baseScore'].mean():.1f}")
    print(f"   Median score: {platform_df['baseScore'].median():.1f}")
    print(f"   Average comments: {platform_df['commentCount'].mean():.1f}")
    print(f"   Median comments: {platform_df['commentCount'].median():.1f}")
    print(f"   Total comments: {platform_df['commentCount'].sum():,}")
    
    # Top posts
    top_score_idx = platform_df['baseScore'].idxmax()
    top_comments_idx = platform_df['commentCount'].idxmax()
    print(f"   Highest scoring post ({platform_df.loc[top_score_idx, 'baseScore']:.0f} points):")
    print(f"      '{platform_df.loc[top_score_idx, 'title'][:60]}...'")
    print(f"      by {platform_df.loc[top_score_idx, 'user.username']}")
    print(f"   Most commented post ({platform_df.loc[top_comments_idx, 'commentCount']:.0f} comments):")
    print(f"      '{platform_df.loc[top_comments_idx, 'title'][:60]}...'")
    print(f"      by {platform_df.loc[top_comments_idx, 'user.username']}")
    
    # Score and comment distribution
    high_score = (platform_df['baseScore'] >= 100).sum()
    very_high_score = (platform_df['baseScore'] >= 500).sum()
    print(f"   Posts with 100+ score: {high_score:,} ({high_score/len(platform_df)*100:.1f}%)")
    print(f"   Posts with 500+ score: {very_high_score:,} ({very_high_score/len(platform_df)*100:.1f}%)")
    
    # Temporal Trends
    print(f"\n📅 TEMPORAL TRENDS:")
    yearly_posts = platform_df.groupby('year').size()
    peak_year = yearly_posts.idxmax()
    print(f"   Peak posting year: {peak_year} with {yearly_posts[peak_year]:,} posts")
    if len(yearly_posts) > 1:
        growth = ((yearly_posts.iloc[-1] / yearly_posts.iloc[0] - 1) * 100)
        print(f"   Growth from {yearly_posts.index[0]} to {yearly_posts.index[-1]}: {growth:+.1f}%")
    
    # Year-by-year breakdown
    print(f"   Posts by year:")
    for year, count in yearly_posts.items():
        print(f"      {year}: {count:,} posts")
    
    # Monthly patterns (if available)
    if 'month' in platform_df.columns:
        monthly_posts = platform_df.groupby('month').size()
        peak_month = monthly_posts.idxmax()
        month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 
                      'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
        print(f"   Most active month: {month_names[peak_month-1]} ({monthly_posts[peak_month]:,} posts total)")
    
    # Link Analysis (if available)
    if 'extracted_links' in platform_df.columns:
        print(f"\n🔗 LINK ANALYSIS:")
        posts_with_links = platform_df['extracted_links'].notna().sum()
        print(f"   Posts with links: {posts_with_links:,} ({posts_with_links/len(platform_df)*100:.1f}%)")
        print(f"   Posts without links: {len(platform_df) - posts_with_links:,} ({(len(platform_df) - posts_with_links)/len(platform_df)*100:.1f}%)")
    
    # Word count analysis (if available)
    if 'wordCount' in platform_df.columns:
        print(f"\n📝 CONTENT METRICS:")
        print(f"   Average word count: {platform_df['wordCount'].mean():.0f}")
        print(f"   Median word count: {platform_df['wordCount'].median():.0f}")
        short_posts = (platform_df['wordCount'] < 500).sum()
        medium_posts = ((platform_df['wordCount'] >= 500) & (platform_df['wordCount'] < 2000)).sum()
        long_posts = (platform_df['wordCount'] >= 2000).sum()
        print(f"   Short posts (<500 words): {short_posts:,} ({short_posts/len(platform_df)*100:.1f}%)")
        print(f"   Medium posts (500-2000 words): {medium_posts:,} ({medium_posts/len(platform_df)*100:.1f}%)")
        print(f"   Long posts (2000+ words): {long_posts:,} ({long_posts/len(platform_df)*100:.1f}%)")
    
    print("\n" + "="*70)

# Generate reports for each platform
for platform_name, platform_df in [("LessWrong", lesswrong_df), ("Alignment Forum", alignment_forum_df), ("Combined", combined_df)]:
    generate_summary_report(platform_df, platform_name)

# Comparative summary
print("\n" + "="*70)
print("                    COMPARATIVE SUMMARY")
print("="*70)
print(f"\n📊 PLATFORM COMPARISON:")
print(f"   LessWrong posts: {len(lesswrong_df):,}")
print(f"   Alignment Forum posts: {len(alignment_forum_df):,}")
print(f"   Total: {len(combined_df):,}")

print(f"\n👥 AUTHOR OVERLAP:")
lw_authors = set(lesswrong_df['user.username'].unique())
af_authors = set(alignment_forum_df['user.username'].unique())
overlap = lw_authors & af_authors
print(f"   LessWrong only: {len(lw_authors - af_authors):,}")
print(f"   Alignment Forum only: {len(af_authors - lw_authors):,}")
print(f"   Both platforms: {len(overlap):,}")

print(f"\n📈 ENGAGEMENT COMPARISON:")
print(f"   LessWrong avg score: {lesswrong_df['baseScore'].mean():.1f}")
print(f"   Alignment Forum avg score: {alignment_forum_df['baseScore'].mean():.1f}")
print(f"   LessWrong avg comments: {lesswrong_df['commentCount'].mean():.1f}")
print(f"   Alignment Forum avg comments: {alignment_forum_df['commentCount'].mean():.1f}")

print("\n" + "="*70)
print("                  SUMMARY REPORT COMPLETE")
print("="*70)

---
# OpenAlex Nodes
---

In [None]:
"""Load OpenAlex data"""
print("\n=== LOADING OPENALEX DATA ===")

# Load OpenAlex data with same structure
openalex_files = glob.glob(f"../src/processed_data/openalex/02_with_gender/**/*.csv", recursive=True)
print(f"Found {len(openalex_files)} OpenAlex CSV files")

openalex_data = []
for file in openalex_files:
    try:
        df = pd.read_csv(file)
        # Extract year and month from filename
        parts = Path(file).stem.split('-')
        if len(parts) >= 2:
            df['year'] = int(parts[0])
            df['month'] = int(parts[1])
        openalex_data.append(df)
    except Exception as e:
        print(f"Error loading {file}: {e}")

if not openalex_data:
    print("No OpenAlex data found! Check your file paths.")
else:
    openalex_df = pd.concat(openalex_data, ignore_index=True)
    print(f"Loaded {len(openalex_df)} total papers from {len(openalex_data)} files")
    
    # Clean and prepare data
    openalex_df['publication_date'] = pd.to_datetime(openalex_df['publication_date'], errors='coerce')
    openalex_df['cited_by_count'] = pd.to_numeric(openalex_df['cited_by_count'], errors='coerce').fillna(0)
    openalex_df['num_authors'] = pd.to_numeric(openalex_df['num_authors'], errors='coerce').fillna(0)
    
    print(f"\nOpenAlex Dataset Summary:")
    print(f"  Total papers: {len(openalex_df):,}")
    print(f"  Date range: {openalex_df['publication_year'].min()} - {openalex_df['publication_year'].max()}")
    print(f"  Papers with DOI: {openalex_df['doi'].notna().sum():,}")
    print(f"  Papers with topics: {openalex_df['topics'].notna().sum():,}")
    print(f"  Papers with gender data: {openalex_df['author_genders'].notna().sum():,}")

## Topics

In [None]:
print("\n=== Parsing OpenAlex Topics ===")

# Parse topics column (semicolon-separated string)
def parse_topics(topic_str):
    """Parse topics from semicolon-separated string."""
    if pd.isna(topic_str):
        return []
    # Split by semicolon and clean whitespace
    topics = [t.strip() for t in str(topic_str).split(';') if t.strip()]
    return topics

# Extract all topics
print("Parsing topic data...")
openalex_df['parsed_topics'] = openalex_df['topics'].apply(parse_topics)
openalex_df['num_topics'] = openalex_df['parsed_topics'].apply(len)

print(f"Papers with topics: {(openalex_df['num_topics'] > 0).sum():,}")
print(f"Average topics per paper: {openalex_df['num_topics'].mean():.1f}")

In [None]:
"""=== OPENALEX TOPIC ANALYSIS ==="""

# Create expanded dataframe with one row per topic
all_topics = []
for idx, row in openalex_df.iterrows():
    topics = row['parsed_topics']
    for topic in topics:
        all_topics.append({
            'paper_id': row['id'],
            'topic_name': topic,
            'cited_by_count': row['cited_by_count'],
            'num_authors': row['num_authors'],
            'year': row['year'],
            'publication_year': row['publication_year']
        })

topics_df = pd.DataFrame(all_topics)
print(f"Extracted {len(topics_df)} topic associations from {len(openalex_df)} papers")

if len(topics_df) == 0:
    print("No topics found in OpenAlex data")
else:
    print(f"Unique topics: {topics_df['topic_name'].nunique()}")
    
    # 1. Top Topics Distribution (show top 20)
    topic_counts = topics_df['topic_name'].value_counts().head(20)
    
    print(f"\nTop 20 Topic Distribution:")
    for topic, count in topic_counts.items():
        percentage = (count / len(topics_df)) * 100
        print(f"  {topic}: {count:,} ({percentage:.1f}%)")
    
    fig, ax = plt.subplots(figsize=(DOUBLE_COL, max(3.5, len(topic_counts) * 0.25)))
    bars = ax.barh(range(len(topic_counts)), topic_counts.values,
                   color='#2a9d8f', edgecolor='white', linewidth=0.5)
    ax.set_yticks(range(len(topic_counts)))
    ax.set_yticklabels(topic_counts.index, fontsize=7)
    ax.set_xlabel('Number of Papers')
    ax.set_title('OpenAlex: Top 20 Research Topics', pad=10)
    ax.invert_yaxis()
    ax.grid(axis='x')
    
    # Add value labels
    for i, bar in enumerate(bars):
        width = bar.get_width()
        ax.text(width, bar.get_y() + bar.get_height()/2.,
               f' {int(width):,}',
               ha='left', va='center', fontsize=6)
    
    plt.tight_layout()
    plt.show()
    plt.close()
    
    # 2. Topic Distribution (Pie Chart - top 10 only)
    top_10_topics = topics_df['topic_name'].value_counts().head(10)
    
    fig, ax = plt.subplots(figsize=(SINGLE_COL, SINGLE_COL))
    wedges, texts, autotexts = ax.pie(
        top_10_topics.values,
        labels=top_10_topics.index,
        autopct='%1.1f%%',
        startangle=90,
        colors=['#2a9d8f', '#e76f51', '#e9c46a', '#264653', '#f4a261', 
                '#e63946', '#457b9d', '#a8dadc', '#cccccc', '#8d99ae'],
        wedgeprops={'linewidth': 0.5, 'edgecolor': 'white'},
        counterclock=False,
        textprops={'fontsize': 6}
    )
    
    for autotext in autotexts:
        autotext.set_color('white')
        autotext.set_fontweight('bold')
        autotext.set_fontsize(6)
    
    # Adjust label fontsize
    for text in texts:
        text.set_fontsize(6)
    
    ax.set_title('OpenAlex: Top 10 Research Topics', pad=10)
    
    plt.tight_layout()
    plt.show()
    plt.close()
    
    # 3. Topic Citation Impact (top 20)
    topic_engagement = topics_df.groupby('topic_name').agg({
        'cited_by_count': ['sum', 'mean', 'median'],
        'paper_id': 'count'
    }).round(1)
    
    topic_engagement.columns = ['total_citations', 'avg_citations', 'median_citations', 'paper_count']
    topic_engagement = topic_engagement.sort_values('total_citations', ascending=False).head(20)
    
    print(f"\nTop 20 Topics by Citation Metrics:")
    print(topic_engagement.to_string())
    
    # Total citations by topic
    fig, ax = plt.subplots(figsize=(DOUBLE_COL, max(3.5, len(topic_engagement) * 0.25)))
    bars = ax.barh(range(len(topic_engagement)), topic_engagement['total_citations'],
                   color='#e9c46a', edgecolor='white', linewidth=0.5)
    ax.set_yticks(range(len(topic_engagement)))
    ax.set_yticklabels(topic_engagement.index, fontsize=7)
    ax.set_xlabel('Total Citations')
    ax.set_title('OpenAlex: Most Cited Topics (Top 20)', pad=10)
    ax.invert_yaxis()
    ax.grid(axis='x')
    
    plt.tight_layout()
    plt.show()
    plt.close()
    
    # Average citations by topic
    topic_engagement_avg = topics_df.groupby('topic_name').agg({
        'cited_by_count': 'mean',
        'paper_id': 'count'
    }).round(1)
    topic_engagement_avg.columns = ['avg_citations', 'paper_count']
    # Only topics with at least 5 papers
    topic_engagement_avg = topic_engagement_avg[topic_engagement_avg['paper_count'] >= 5]
    topic_engagement_avg = topic_engagement_avg.sort_values('avg_citations', ascending=False).head(20)
    
    fig, ax = plt.subplots(figsize=(DOUBLE_COL, max(3.5, len(topic_engagement_avg) * 0.25)))
    bars = ax.barh(range(len(topic_engagement_avg)), topic_engagement_avg['avg_citations'],
                   color='#e76f51', edgecolor='white', linewidth=0.5)
    ax.set_yticks(range(len(topic_engagement_avg)))
    ax.set_yticklabels(topic_engagement_avg.index, fontsize=7)
    ax.set_xlabel('Average Citations per Paper')
    ax.set_title('OpenAlex: Highest Impact Topics (≥5 papers, Top 20)', pad=10)
    ax.invert_yaxis()
    ax.grid(axis='x')
    
    plt.tight_layout()
    plt.show()
    plt.close()
    
    # 4. Topics over time (top 10 only)
    top_10_for_temporal = topics_df['topic_name'].value_counts().head(10).index
    topic_temporal = topics_df[topics_df['topic_name'].isin(top_10_for_temporal)].groupby(['year', 'topic_name']).size().reset_index(name='count')
    
    fig, ax = plt.subplots(figsize=(DOUBLE_COL, 3.5))
    
    for topic in top_10_for_temporal:
        topic_data = topic_temporal[topic_temporal['topic_name'] == topic]
        ax.plot(topic_data['year'], topic_data['count'], 
               marker='o', linewidth=1.5, markersize=4, label=topic)
    
    ax.set_xlabel('Year')
    ax.set_ylabel('Number of Papers')
    ax.set_title('OpenAlex: Top 10 Topics Over Time', pad=10)
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', 
             frameon=True, edgecolor='black', fancybox=False,
             fontsize=6)
    ax.grid(axis='y')
    
    plt.tight_layout()
    plt.show()
    plt.close()
    
    # 5. Bubble chart: Papers vs Citations (top 30 topics)
    topic_bubble = topics_df.groupby('topic_name').agg({
        'cited_by_count': 'mean',
        'num_authors': 'mean',
        'paper_id': 'count'
    }).round(1)
    topic_bubble.columns = ['avg_citations', 'avg_authors', 'paper_count']
    topic_bubble = topic_bubble[topic_bubble['paper_count'] >= 3].sort_values('paper_count', ascending=False).head(30)
    
    fig, ax = plt.subplots(figsize=(DOUBLE_COL, 3.5))
    
    scatter = ax.scatter(
        topic_bubble['paper_count'],
        topic_bubble['avg_citations'],
        s=topic_bubble['avg_authors'] * 20,
        c=np.arange(len(topic_bubble)),
        cmap='viridis',
        alpha=0.6,
        edgecolors='white',
        linewidth=0.5
    )
    
    # Add labels for top 10 only
    for idx, row in topic_bubble.head(10).iterrows():
        label = idx if len(idx) <= 30 else idx[:27] + '...'
        ax.annotate(label, 
                   (row['paper_count'], row['avg_citations']),
                   fontsize=5,
                   ha='center',
                   va='center')
    
    ax.set_xlabel('Number of Papers')
    ax.set_ylabel('Average Citations per Paper')
    ax.set_title('OpenAlex: Topic Size vs Impact (Top 30)\n(bubble size = avg authors)', pad=10)
    ax.grid(True, alpha=0.3)
    
    # Add legend for bubble size
    sizes = [int(topic_bubble['avg_authors'].min()), 
             int(topic_bubble['avg_authors'].mean()), 
             int(topic_bubble['avg_authors'].max())]
    labels = [f'{s} authors' for s in sizes]
    legend_elements = [plt.scatter([], [], s=s*20, c='gray', alpha=0.6,
                                   edgecolors='white', linewidth=0.5)
                      for s in sizes]
    ax.legend(legend_elements, labels,
             title='Avg Authors',
             loc='upper right',
             frameon=True,
             edgecolor='black',
             fancybox=False,
             fontsize=6)
    
    plt.tight_layout()
    plt.show()
    plt.close()
    
    print("\n=== OpenAlex topic analysis complete ===")

## Gender Analysis

In [None]:
print("\n=== OPENALEX GENDER ANALYSIS ===")

# Parse author_genders column (semicolon-separated string)
def parse_genders(gender_str):
    """Parse author genders from semicolon-separated string."""
    if pd.isna(gender_str):
        return []
    # Split by semicolon and clean whitespace, convert to lowercase
    genders = [g.strip().lower() for g in str(gender_str).split(';') if g.strip()]
    return genders

# Extract gender information
print("Parsing author gender data...")
openalex_df['parsed_genders'] = openalex_df['author_genders'].apply(parse_genders)
openalex_df['num_genders'] = openalex_df['parsed_genders'].apply(len)

In [None]:
# Create expanded dataframe with one row per author
all_authors = []
for idx, row in openalex_df.iterrows():
    genders = row['parsed_genders']
    for gender in genders:
        all_authors.append({
            'paper_id': row['id'],
            'gender': gender,
            'year': row['year'],
            'cited_by_count': row['cited_by_count'],
            'num_authors': row['num_authors'],
            'publication_year': row['publication_year']
        })

authors_df = pd.DataFrame(all_authors)
print(f"Extracted {len(authors_df)} author records from {len(openalex_df)} papers")

In [None]:
if len(authors_df) == 0:
    print("No gender data found in OpenAlex")
else:
    # Clean gender labels
    gender_counts = authors_df['gender'].value_counts()
    print(f"\nAuthor Gender Distribution:")
    for gender, count in gender_counts.items():
        percentage = (count / len(authors_df)) * 100
        print(f"  {gender.upper()}: {count:,} ({percentage:.1f}%)")
    
    # Paper-level gender composition - IGNORE unknowns
    openalex_df['has_female'] = openalex_df['parsed_genders'].apply(
        lambda x: any(g in ['gf', 'female', 'f'] for g in x) if isinstance(x, list) else False
    )
    openalex_df['has_male'] = openalex_df['parsed_genders'].apply(
        lambda x: any(g in ['gm', 'male', 'm'] for g in x) if isinstance(x, list) else False
    )
    # Only count as "all unknown" if there are NO known genders at all
    openalex_df['has_any_known'] = openalex_df['has_female'] | openalex_df['has_male']
    
    print(f"\nPaper Gender Composition (ignoring unknown):")
    print(f"  Papers with ≥1 female author: {openalex_df['has_female'].sum():,} ({openalex_df['has_female'].sum()/len(openalex_df)*100:.1f}%)")
    print(f"  Papers with ≥1 male author: {openalex_df['has_male'].sum():,} ({openalex_df['has_male'].sum()/len(openalex_df)*100:.1f}%)")
    print(f"  Papers with no known genders: {(~openalex_df['has_any_known']).sum():,} ({(~openalex_df['has_any_known']).sum()/len(openalex_df)*100:.1f}%)")
    
    # 1. Overall gender distribution (authors) - including unknown for transparency
    fig, ax = plt.subplots(figsize=(SINGLE_COL, SINGLE_COL))
    
    # Map to cleaner labels
    label_map = {'gf': 'Female', 'gm': 'Male', 'unknown': 'Unknown', '-': 'Unknown'}
    clean_labels = [label_map.get(g, g.title()) for g in gender_counts.index]
    
    wedges, texts, autotexts = ax.pie(
        gender_counts.values,
        labels=clean_labels,
        autopct='%1.1f%%',
        startangle=90,
        colors=['#e76f51', '#2a9d8f', '#e9c46a'],
        wedgeprops={'linewidth': 0.5, 'edgecolor': 'white'},
        counterclock=False
    )
    
    for autotext in autotexts:
        autotext.set_color('white')
        autotext.set_fontweight('bold')
    
    ax.set_title('OpenAlex: Author Gender Distribution', pad=10)
    
    plt.tight_layout()
    plt.show()
    plt.close()
    
    # 2. Gender trends over time
    gender_temporal = authors_df.groupby(['year', 'gender']).size().unstack(fill_value=0)
    
    # Calculate percentages
    gender_temporal_pct = gender_temporal.div(gender_temporal.sum(axis=1), axis=0) * 100
    
    fig, ax = plt.subplots(figsize=(DOUBLE_COL, 3))
    
    for gender in gender_temporal_pct.columns:
        display_label = label_map.get(gender, gender.title())
        ax.plot(gender_temporal_pct.index, gender_temporal_pct[gender],
               marker='o', linewidth=1.5, markersize=4, label=display_label)
    
    ax.set_xlabel('Year')
    ax.set_ylabel('Percentage of Authors (%)')
    ax.set_title('OpenAlex: Gender Representation Over Time', pad=10)
    ax.legend(frameon=True, edgecolor='black', fancybox=False)
    ax.grid(axis='y')
    
    plt.tight_layout()
    plt.show()
    plt.close()
    
    # 3. Gender by topic (if topics available - top 15 topics only)
    if len(topics_df) > 0:
        # Merge topics with authors
        topic_gender = topics_df.merge(
            authors_df[['paper_id', 'gender']], 
            on='paper_id', 
            how='left'
        )
        
        # Get top 15 topics
        top_topics_for_gender = topic_gender['topic_name'].value_counts().head(15).index
        topic_gender_filtered = topic_gender[topic_gender['topic_name'].isin(top_topics_for_gender)]
        
        # Calculate gender distribution by topic
        topic_gender_dist = topic_gender_filtered.groupby(['topic_name', 'gender']).size().unstack(fill_value=0)
        
        # Calculate percentages
        topic_gender_pct = topic_gender_dist.div(topic_gender_dist.sum(axis=1), axis=0) * 100
        
        # Sort by female percentage if available
        if 'gf' in topic_gender_pct.columns:
            topic_gender_pct = topic_gender_pct.sort_values('gf', ascending=True)
        
        fig, ax = plt.subplots(figsize=(DOUBLE_COL, max(4, len(topic_gender_pct) * 0.3)))
        
        # Create color mapping
        color_map = {'gf': '#e76f51', 'gm': '#2a9d8f', 'unknown': '#e9c46a', '-': '#e9c46a'}
        colors = [color_map.get(col, '#cccccc') for col in topic_gender_pct.columns]
        
        topic_gender_pct.plot(kind='barh', stacked=True, ax=ax,
                              color=colors,
                              width=0.8,
                              edgecolor='white',
                              linewidth=0.5)
        
        ax.set_xlabel('Percentage of Authors (%)')
        ax.set_ylabel('Topic')
        ax.set_title('OpenAlex: Gender Distribution by Top 15 Topics', pad=10)
        
        # Clean up legend labels
        handles, labels = ax.get_legend_handles_labels()
        clean_legend_labels = [label_map.get(l, l.title()) for l in labels]
        ax.legend(handles, clean_legend_labels, title='Gender', 
                 bbox_to_anchor=(1.02, 1), loc='upper left',
                 frameon=True, edgecolor='black', fancybox=False)
        ax.grid(axis='x')
        
        plt.tight_layout()
        plt.show()
        plt.close()
    
    # 4. Team composition analysis - IGNORE unknowns in classification
    openalex_df['team_type'] = 'No Known Gender'
    openalex_df.loc[openalex_df['has_female'] & openalex_df['has_male'], 'team_type'] = 'Mixed'
    openalex_df.loc[openalex_df['has_female'] & ~openalex_df['has_male'], 'team_type'] = 'Female-only'
    openalex_df.loc[~openalex_df['has_female'] & openalex_df['has_male'], 'team_type'] = 'Male-only'
    
    team_counts = openalex_df['team_type'].value_counts()
    
    print(f"\nTeam Composition (unknowns ignored):")
    for team_type, count in team_counts.items():
        percentage = (count / len(openalex_df)) * 100
        print(f"  {team_type}: {count:,} papers ({percentage:.1f}%)")
    
    fig, ax = plt.subplots(figsize=(SINGLE_COL, SINGLE_COL))
    wedges, texts, autotexts = ax.pie(
        team_counts.values,
        labels=team_counts.index,
        autopct='%1.1f%%',
        startangle=90,
        colors=['#2a9d8f', '#e76f51', '#e9c46a', '#264653'],
        wedgeprops={'linewidth': 0.5, 'edgecolor': 'white'},
        counterclock=False
    )
    
    for autotext in autotexts:
        autotext.set_color('white')
        autotext.set_fontweight('bold')
    
    ax.set_title('OpenAlex: Research Team Composition', pad=10)
    
    plt.tight_layout()
    plt.show()
    plt.close()
    
    # 5. Average team size by gender
    gender_team_size = authors_df.groupby('gender')['num_authors'].mean()
    
    fig, ax = plt.subplots(figsize=(SINGLE_COL, 2.5))
    
    display_labels = [label_map.get(g, g.title()) for g in gender_team_size.index]
    colors_list = ['#e76f51' if 'gf' in g else '#2a9d8f' if 'gm' in g else '#e9c46a' 
                   for g in gender_team_size.index]
    
    bars = ax.bar(range(len(gender_team_size)), gender_team_size.values,
                  color=colors_list,
                  edgecolor='white',
                  linewidth=0.5)
    
    ax.set_xticks(range(len(gender_team_size)))
    ax.set_xticklabels(display_labels)
    ax.set_ylabel('Average Team Size')
    ax.set_title('OpenAlex: Avg Team Size by Author Gender', pad=10)
    ax.grid(axis='y')
    
    # Add value labels on bars
    for i, bar in enumerate(bars):
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., height,
               f'{height:.1f}',
               ha='center', va='bottom', fontsize=7)
    
    plt.tight_layout()
    plt.show()
    plt.close()
    
    # 6. Gender over time (absolute numbers)
    fig, ax = plt.subplots(figsize=(DOUBLE_COL, 3))
    
    for gender in gender_temporal.columns:
        display_label = label_map.get(gender, gender.title())
        ax.plot(gender_temporal.index, gender_temporal[gender],
               marker='o', linewidth=1.5, markersize=4, label=display_label)
    
    ax.set_xlabel('Year')
    ax.set_ylabel('Number of Authors')
    ax.set_title('OpenAlex: Author Count by Gender Over Time', pad=10)
    ax.legend(frameon=True, edgecolor='black', fancybox=False)
    ax.grid(axis='y')
    
    plt.tight_layout()
    plt.show()
    plt.close()

print("\n=== OpenAlex gender analysis complete ===")

## Temporal Analysis

In [None]:
"""=== OPENALEX TEMPORAL ANALYSIS ==="""

print("\n=== OPENALEX TEMPORAL ANALYSIS ===")

# 1. Papers by year
yearly_papers = openalex_df.groupby('publication_year').size()

print(f"\nPapers by Year:")
for year, count in yearly_papers.items():
    print(f"  {year}: {count:,} papers")

fig, ax = plt.subplots(figsize=(DOUBLE_COL, 3))
ax.plot(yearly_papers.index, yearly_papers.values,
        marker='o', linewidth=1.5, markersize=4, color='#2a9d8f')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Papers')
ax.set_title('OpenAlex: Publications Over Time', pad=10)
ax.grid(axis='y')

plt.tight_layout()
plt.show()
plt.close()

# 2. Citations over time
yearly_citations = openalex_df.groupby('publication_year').agg({
    'cited_by_count': ['sum', 'mean', 'median']
}).round(1)
yearly_citations.columns = ['total_citations', 'avg_citations', 'median_citations']

print(f"\nCitations by Year:")
print(yearly_citations.to_string())

# Total citations
fig, ax = plt.subplots(figsize=(DOUBLE_COL, 3))
ax.plot(yearly_citations.index, yearly_citations['total_citations'],
        marker='o', linewidth=1.5, markersize=4, color='#e76f51')
ax.set_xlabel('Year')
ax.set_ylabel('Total Citations')
ax.set_title('OpenAlex: Total Citations Over Time', pad=10)
ax.grid(axis='y')

plt.tight_layout()
plt.show()
plt.close()

# Average citations
fig, ax = plt.subplots(figsize=(DOUBLE_COL, 3))
ax.plot(yearly_citations.index, yearly_citations['avg_citations'],
        marker='o', linewidth=1.5, markersize=4, color='#e9c46a', label='Mean')
ax.plot(yearly_citations.index, yearly_citations['median_citations'],
        marker='s', linewidth=1.5, markersize=4, color='#264653', label='Median')
ax.set_xlabel('Year')
ax.set_ylabel('Citations per Paper')
ax.set_title('OpenAlex: Average Citation Impact Over Time', pad=10)
ax.legend(frameon=True, edgecolor='black', fancybox=False)
ax.grid(axis='y')

plt.tight_layout()
plt.show()
plt.close()

# 3. Collaboration trends (authors per paper)
yearly_collaboration = openalex_df.groupby('publication_year').agg({
    'num_authors': ['mean', 'median']
}).round(1)
yearly_collaboration.columns = ['avg_authors', 'median_authors']

print(f"\nCollaboration Trends:")
print(yearly_collaboration.to_string())

fig, ax = plt.subplots(figsize=(DOUBLE_COL, 3))
ax.plot(yearly_collaboration.index, yearly_collaboration['avg_authors'],
        marker='o', linewidth=1.5, markersize=4, color='#2a9d8f', label='Mean')
ax.plot(yearly_collaboration.index, yearly_collaboration['median_authors'],
        marker='s', linewidth=1.5, markersize=4, color='#e76f51', label='Median')
ax.set_xlabel('Year')
ax.set_ylabel('Authors per Paper')
ax.set_title('OpenAlex: Team Size Over Time', pad=10)
ax.legend(frameon=True, edgecolor='black', fancybox=False)
ax.grid(axis='y')

plt.tight_layout()
plt.show()
plt.close()

# 4. Monthly heatmap (if month data available)
if 'month' in openalex_df.columns:
    heatmap_data = openalex_df.groupby(['publication_year', 'month']).size().unstack(fill_value=0)
    
    # Ensure all months 1-12 are present
    for month in range(1, 13):
        if month not in heatmap_data.columns:
            heatmap_data[month] = 0
    heatmap_data = heatmap_data[sorted(heatmap_data.columns)]
    
    fig, ax = plt.subplots(figsize=(DOUBLE_COL, len(heatmap_data) * 0.3 + 1))
    
    import seaborn as sns
    sns.heatmap(heatmap_data, annot=True, fmt='d',
                cmap='YlOrRd',
                ax=ax,
                cbar_kws={'label': 'Number of Papers'},
                linewidths=0.5,
                linecolor='white')
    
    ax.set_title('OpenAlex: Publications by Year and Month', pad=10)
    ax.set_xlabel('Month')
    ax.set_ylabel('Year')
    
    # Set month labels
    month_labels = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                   'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    ax.set_xticklabels(month_labels, rotation=45, ha='right')
    
    plt.tight_layout()
    plt.show()
    plt.close()
    
    # Monthly aggregated pattern
    monthly_papers = openalex_df.groupby('month').size()
    
    fig, ax = plt.subplots(figsize=(DOUBLE_COL, 2.5))
    bars = ax.bar(monthly_papers.index, monthly_papers.values,
                  color='#e76f51', edgecolor='white', linewidth=0.5)
    ax.set_title('OpenAlex: Publications by Month (All Years)', pad=10)
    ax.set_xlabel('Month')
    ax.set_ylabel('Number of Papers')
    ax.set_xticks(range(1, 13))
    ax.set_xticklabels(['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                       'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
                      rotation=45, ha='right')
    ax.grid(axis='y')
    
    plt.tight_layout()
    plt.show()
    plt.close()

# 5. Paper type distribution over time (if type column exists)
if 'type' in openalex_df.columns and openalex_df['type'].notna().sum() > 0:
    type_temporal = openalex_df.groupby(['publication_year', 'type']).size().unstack(fill_value=0)
    
    # Get top 5 types
    top_types = openalex_df['type'].value_counts().head(5).index
    type_temporal_filtered = type_temporal[top_types]
    
    fig, ax = plt.subplots(figsize=(DOUBLE_COL, 3.5))
    
    for pub_type in top_types:
        ax.plot(type_temporal_filtered.index, type_temporal_filtered[pub_type],
               marker='o', linewidth=1.5, markersize=4, label=pub_type)
    
    ax.set_xlabel('Year')
    ax.set_ylabel('Number of Papers')
    ax.set_title('OpenAlex: Publication Types Over Time', pad=10)
    ax.legend(frameon=True, edgecolor='black', fancybox=False)
    ax.grid(axis='y')
    
    plt.tight_layout()
    plt.show()
    plt.close()

# 6. Growth rate analysis
yearly_papers_sorted = yearly_papers.sort_index()
growth_rate = yearly_papers_sorted.pct_change() * 100

print(f"\nYear-over-Year Growth Rate:")
for year, rate in growth_rate.items():
    if not pd.isna(rate):
        print(f"  {year}: {rate:+.1f}%")

fig, ax = plt.subplots(figsize=(DOUBLE_COL, 3))
ax.bar(growth_rate.index, growth_rate.values,
       color=['#2a9d8f' if x > 0 else '#e76f51' for x in growth_rate.values],
       edgecolor='white', linewidth=0.5)
ax.axhline(y=0, color='black', linestyle='-', linewidth=0.5)
ax.set_xlabel('Year')
ax.set_ylabel('Growth Rate (%)')
ax.set_title('OpenAlex: Year-over-Year Publication Growth', pad=10)
ax.grid(axis='y')

plt.tight_layout()
plt.show()
plt.close()

# 7. Cumulative papers over time
cumulative_papers = yearly_papers_sorted.cumsum()

fig, ax = plt.subplots(figsize=(DOUBLE_COL, 3))
ax.fill_between(cumulative_papers.index, cumulative_papers.values,
                alpha=0.3, color='#2a9d8f')
ax.plot(cumulative_papers.index, cumulative_papers.values,
        linewidth=2, color='#2a9d8f')
ax.set_xlabel('Year')
ax.set_ylabel('Cumulative Number of Papers')
ax.set_title('OpenAlex: Cumulative Publications', pad=10)
ax.grid(axis='y')

plt.tight_layout()
plt.show()
plt.close()

# 8. Combined metrics dashboard
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(DOUBLE_COL, 7))

# Papers over time
ax1.plot(yearly_papers.index, yearly_papers.values,
         marker='o', linewidth=1.5, markersize=3, color='#2a9d8f')
ax1.set_xlabel('Year')
ax1.set_ylabel('Papers')
ax1.set_title('Publications', pad=5)
ax1.grid(axis='y', alpha=0.3)

# Citations over time
ax2.plot(yearly_citations.index, yearly_citations['avg_citations'],
         marker='o', linewidth=1.5, markersize=3, color='#e76f51')
ax2.set_xlabel('Year')
ax2.set_ylabel('Avg Citations')
ax2.set_title('Citation Impact', pad=5)
ax2.grid(axis='y', alpha=0.3)

# Collaboration trends
ax3.plot(yearly_collaboration.index, yearly_collaboration['avg_authors'],
         marker='o', linewidth=1.5, markersize=3, color='#e9c46a')
ax3.set_xlabel('Year')
ax3.set_ylabel('Avg Authors')
ax3.set_title('Team Size', pad=5)
ax3.grid(axis='y', alpha=0.3)

# Growth rate
ax4.bar(growth_rate.index, growth_rate.values,
        color=['#2a9d8f' if x > 0 else '#e76f51' for x in growth_rate.values],
        edgecolor='white', linewidth=0.5)
ax4.axhline(y=0, color='black', linestyle='-', linewidth=0.5)
ax4.set_xlabel('Year')
ax4.set_ylabel('Growth (%)')
ax4.set_title('Annual Growth Rate', pad=5)
ax4.grid(axis='y', alpha=0.3)

fig.suptitle('OpenAlex: Temporal Trends Dashboard', fontsize=10, y=0.995)
plt.tight_layout()
plt.show()
plt.close()

print("\n=== OpenAlex temporal analysis complete ===")

---
# Edges
---

## What coverage of DOIs does OpenAlex data provide for LessWrong posts?

In [38]:
print("=" * 80)
print("STEP 1: Preparing forum posts data")
print("=" * 80)

# Extract AI Safety posts from LessWrong (topic 0)
lesswrong_ai_safety = lesswrong_df[lesswrong_df['topic_cluster_id'] == 0].copy()
print(f"LessWrong AI Safety posts (Topic 0): {len(lesswrong_ai_safety)}")

# Get all Alignment Forum posts
alignment_forum_all = alignment_forum_df.copy()
print(f"Alignment Forum posts (all): {len(alignment_forum_all)}")

# Combine them
posts_df = pd.concat([
    lesswrong_ai_safety.assign(source='LessWrong_Topic0'),
    alignment_forum_all.assign(source='AlignmentForum')
], ignore_index=True)

print(f"\nTotal combined AI Safety posts: {len(posts_df)}")
print()

# Load OpenAlex papers
print("Loading OpenAlex data...")
openalex_df = pd.read_csv('../data/nodes_openalex_works.csv')
print(f"✓ Loaded {len(openalex_df):,} OpenAlex papers")
print()

STEP 1: Preparing forum posts data
LessWrong AI Safety posts (Topic 0): 21579
Alignment Forum posts (all): 4230

Total combined AI Safety posts: 25809

Loading OpenAlex data...
✓ Loaded 41,231 OpenAlex papers



In [None]:
# ============================================================================
# STEP 2: EXTRACT DOIs FROM POST CONTENT
# ============================================================================

print("=" * 80)
print("STEP 2: Extracting DOIs from post content")
print("=" * 80)

def extract_dois_from_text(text):
    """
    Extract all DOIs from any text content.
    
    Handles multiple formats:
    - Plain DOI: 10.1234/example
    - URL format: https://doi.org/10.1234/example
    - dx.doi.org format: http://dx.doi.org/10.1234/example
    - DOI with prefix: doi:10.1234/example
    
    Returns a list of normalized DOIs (lowercase, no URL prefix).
    """
    if pd.isna(text):
        return []
    
    text = str(text)
    
    # DOI regex pattern
    # DOIs always start with "10." followed by 4+ digits, then "/", then any characters
    # We exclude whitespace and common delimiters that would end a DOI
    doi_pattern = r'10\.\d{4,}/[^\s,;|\]}\)"\'\><\n]+'
    
    # Find all DOIs in the text
    dois = re.findall(doi_pattern, text)
    
    # Clean and normalize DOIs
    cleaned_dois = []
    for doi in dois:
        # Remove common trailing punctuation that might be captured
        doi = doi.rstrip('.,;:!?')
        # Remove HTML tags if present (e.g., </a>)
        doi = re.sub(r'<[^>]+>$', '', doi)
        # Normalize to lowercase (DOIs are case-insensitive)
        doi = doi.lower()
        # Basic validation: DOI should still look like 10.xxxx/yyyy
        if re.match(r'^10\.\d{4,}/.+', doi):
            cleaned_dois.append(doi)
    
    # Remove duplicates while preserving order
    seen = set()
    unique_dois = []
    for doi in cleaned_dois:
        if doi not in seen:
            seen.add(doi)
            unique_dois.append(doi)
    
    return unique_dois

# Define columns to search for DOIs
# Based on the sample data, we have: htmlBody, cleaned_htmlBody, extracted_links
columns_to_search = []

# Check which columns exist
if 'htmlBody' in posts_df.columns:
    columns_to_search.append('htmlBody')
if 'cleaned_htmlBody' in posts_df.columns:
    columns_to_search.append('cleaned_htmlBody')
if 'extracted_links' in posts_df.columns:
    columns_to_search.append('extracted_links')

print(f"Columns to search for DOIs: {columns_to_search}")
print()

# Extract DOIs from each column
for col in columns_to_search:
    print(f"Searching column: {col}")
    posts_df[f'dois_from_{col}'] = posts_df[col].apply(extract_dois_from_text)
    dois_found = posts_df[f'dois_from_{col}'].apply(len).sum()
    posts_with_dois = (posts_df[f'dois_from_{col}'].apply(len) > 0).sum()
    print(f"  ✓ Found {dois_found:,} DOIs in {posts_with_dois:,} posts")

print()

# Combine all DOIs into a single list per post
def combine_doi_lists(*lists):
    """Combine multiple lists of DOIs, removing duplicates"""
    combined = []
    seen = set()
    for doi_list in lists:
        if isinstance(doi_list, list):
            for doi in doi_list:
                if doi not in seen:
                    seen.add(doi)
                    combined.append(doi)
    return combined

# Get all the DOI columns we created
doi_columns = [f'dois_from_{col}' for col in columns_to_search]

# Combine all extracted DOIs
posts_df['all_extracted_dois'] = posts_df[doi_columns].apply(
    lambda row: combine_doi_lists(*row), axis=1
)
posts_df['doi_count'] = posts_df['all_extracted_dois'].apply(len)

# Summary statistics
total_dois = posts_df['doi_count'].sum()
posts_with_dois = (posts_df['doi_count'] > 0).sum()

print(f"{'='*80}")
print(f"EXTRACTION SUMMARY:")
print(f"{'='*80}")
print(f"  Posts with at least 1 DOI: {posts_with_dois:,} ({posts_with_dois/len(posts_df)*100:.1f}%)")
print(f"  Total DOIs extracted: {total_dois:,}")

if posts_with_dois > 0:
    print(f"  Average DOIs per post (for posts with DOIs): {posts_df[posts_df['doi_count'] > 0]['doi_count'].mean():.2f}")
    print(f"  Max DOIs in a single post: {posts_df['doi_count'].max()}")
    print()
    
    # Show distribution
    print("Distribution of DOI counts per post:")
    dist = posts_df['doi_count'].value_counts().sort_index().head(10)
    for count, freq in dist.items():
        print(f"  {count} DOIs: {freq:,} posts")
    print()
    
    # Show some examples
    print("Sample posts with DOIs:")
    print("-" * 80)
    for i, (idx, row) in enumerate(posts_df[posts_df['doi_count'] > 0].head(5).iterrows(), 1):
        title = row.get('title', 'N/A')
        title = title[:70] + "..." if len(title) > 70 else title
        print(f"\n{i}. {title}")
        print(f"   Source: {row['source']}")
        print(f"   DOIs found: {len(row['all_extracted_dois'])}")
        for doi in row['all_extracted_dois'][:3]:  # Show first 3 DOIs
            print(f"     - {doi}")
        if len(row['all_extracted_dois']) > 3:
            print(f"     ... and {len(row['all_extracted_dois']) - 3} more")
else:
    print()
    print("⚠️  No DOIs found in any posts!")
    print("This might mean:")
    print("  - Posts don't contain DOI links")
    print("  - DOIs are in a different format than expected")
    print("  - Need to check other columns")

print()

STEP 2: Extracting DOIs from post content
Columns to search for DOIs: ['htmlBody', 'cleaned_htmlBody', 'extracted_links']

Searching column: htmlBody
  ✓ Found 3,751 DOIs in 1,505 posts
Searching column: cleaned_htmlBody
  ✓ Found 1,023 DOIs in 240 posts
Searching column: extracted_links
  ✓ Found 3,302 DOIs in 1,380 posts

EXTRACTION SUMMARY:
  Posts with at least 1 DOI: 1,505 (5.8%)
  Total DOIs extracted: 4,450
  Average DOIs per post (for posts with DOIs): 2.96
  Max DOIs in a single post: 138

Distribution of DOI counts per post:
  0 DOIs: 24,304 posts
  1 DOIs: 868 posts
  2 DOIs: 258 posts
  3 DOIs: 113 posts
  4 DOIs: 54 posts
  5 DOIs: 46 posts
  6 DOIs: 30 posts
  7 DOIs: 27 posts
  8 DOIs: 27 posts
  9 DOIs: 5 posts

Sample posts with DOIs:
--------------------------------------------------------------------------------

1. Common sense as a prior
   Source: LessWrong_Topic0
   DOIs found: 1
     - 10.1007/s11238-006-9004-4

2. [link] Psychologists strike a blow for reproduc

In [40]:
# ============================================================================
# STEP 3: PREPARE OpenAlex DOI LOOKUP
# ============================================================================

print("=" * 80)
print("STEP 3: Preparing OpenAlex DOI lookup")
print("=" * 80)

def clean_openalex_doi(doi_str):
    """Clean OpenAlex DOI for matching"""
    if pd.isna(doi_str):
        return None
    doi_str = str(doi_str).lower().strip()
    # Remove URL prefixes if present
    doi_str = doi_str.replace('https://doi.org/', '')
    doi_str = doi_str.replace('http://dx.doi.org/', '')
    doi_str = doi_str.replace('doi:', '')
    return doi_str if doi_str else None

openalex_df['doi_cleaned'] = openalex_df['openalex_doi'].apply(clean_openalex_doi)

# Remove rows without DOIs
openalex_with_doi = openalex_df[openalex_df['doi_cleaned'].notna()].copy()
print(f"OpenAlex papers with DOIs: {len(openalex_with_doi):,} ({len(openalex_with_doi)/len(openalex_df)*100:.1f}%)")

# Create a set for fast O(1) lookup
openalex_doi_set = set(openalex_with_doi['doi_cleaned'].values)
print(f"Unique DOIs in OpenAlex: {len(openalex_doi_set):,}")
print()

STEP 3: Preparing OpenAlex DOI lookup
OpenAlex papers with DOIs: 35,822 (86.9%)
Unique DOIs in OpenAlex: 35,822



In [44]:
# After extracting DOIs in Step 2, clean them immediately:
def clean_doi(doi):
    """
    Comprehensive DOI cleaning for matching.
    Handles all the weird edge cases we've found.
    """
    if not doi or pd.isna(doi):
        return None
    
    doi = str(doi).strip()
    
    # Remove URL prefixes (in case they're there)
    doi = doi.replace('https://doi.org/', '')
    doi = doi.replace('http://doi.org/', '')
    doi = doi.replace('https://dx.doi.org/', '')
    doi = doi.replace('http://dx.doi.org/', '')
    doi = doi.replace('doi:', '')
    
    # Remove URL fragments (e.g., #page-1, #.u14eh_ldvjm)
    if '#' in doi:
        doi = doi.split('#')[0]
    
    # Remove query parameters (e.g., ?uid=...)
    if '?' in doi:
        doi = doi.split('?')[0]
    
    # Remove &amp; and other HTML entities
    doi = doi.replace('&amp;', '').replace('&amp', '')
    
    # Remove common path suffixes
    doi = re.sub(r'/abstract$', '', doi)
    doi = re.sub(r'/full$', '', doi)
    doi = re.sub(r'/pdf$', '', doi)
    doi = re.sub(r'/epdf$', '', doi)
    doi = re.sub(r'/issuetoc$', '', doi)
    
    # Remove version indicators (v1.full, v2.full, etc.)
    doi = re.sub(r'v\d+\.full$', '', doi)
    
    # Remove weird caret suffixes (.^node, .^b, .^f, etc.)
    doi = re.sub(r'\.\^[a-z]+$', '', doi, flags=re.IGNORECASE)
    
    # Remove bracket artifacts and anything after them
    doi = re.sub(r'\[[^\]]*$', '', doi)
    
    # Remove trailing parentheses that look incomplete
    if doi.endswith('('):
        doi = doi[:-1]
    
    # Remove trailing punctuation
    doi = doi.rstrip('/.,;:!?')
    
    # Lowercase for consistency
    doi = doi.lower()
    
    return doi.strip() if doi else None

In [45]:
posts_df['all_extracted_dois'] = posts_df['all_extracted_dois'].apply(
    lambda doi_list: [clean_doi(doi) for doi in doi_list if clean_doi(doi)]
)

# Remove any None/empty values
posts_df['all_extracted_dois'] = posts_df['all_extracted_dois'].apply(
    lambda doi_list: [doi for doi in doi_list if doi]
)

In [46]:
openalex_df['doi_cleaned'] = openalex_df['openalex_doi'].apply(clean_doi)

In [52]:
# ============================================================================
# STEP 4: MATCH DOIs
# ============================================================================

print("=" * 80)
print("STEP 4: Matching forum DOIs with OpenAlex DOIs")
print("=" * 80)

def match_dois(doi_list, openalex_set):
    """Match a list of DOIs against the OpenAlex set"""
    if not doi_list:
        return []
    return [doi for doi in doi_list if doi in openalex_set]

# Find matches for each post
posts_df['matched_dois'] = posts_df['all_extracted_dois'].apply(
    lambda dois: match_dois(dois, openalex_doi_set)
)
posts_df['matched_doi_count'] = posts_df['matched_dois'].apply(len)

# Calculate statistics
total_dois_extracted = posts_df['doi_count'].sum()
total_dois_matched = posts_df['matched_doi_count'].sum()
posts_with_matches = (posts_df['matched_doi_count'] > 0).sum()

print(f"✓ Matching complete!")
print(f"  Total DOIs extracted: {total_dois_extracted:,}")
print(f"  Total DOIs matched in OpenAlex: {total_dois_matched:,}")

if total_dois_extracted > 0:
    match_rate = total_dois_matched / total_dois_extracted * 100
    print(f"  Match rate: {match_rate:.1f}%")
    print()
    print(f"  Posts with at least 1 match: {posts_with_matches:,}")
    print(f"  Posts with DOIs but no match: {posts_with_dois - posts_with_matches:,}")
    
    # Show some unmatched DOIs for debugging
    if total_dois_matched < total_dois_extracted:
        print()
        print("Sample unmatched DOIs (for debugging):")
        unmatched = []
        for dois, matched in zip(posts_df['all_extracted_dois'], posts_df['matched_dois']):
            for doi in dois:
                if doi not in matched:
                    unmatched.append(doi)
                    if len(unmatched) >= 100:
                        break
            if len(unmatched) >= 100:
                break
        for doi in unmatched[:200]:
            print(f"  - {doi}")
else:
    print("  ⚠️  No DOIs found to match")

print()

STEP 4: Matching forum DOIs with OpenAlex DOIs
✓ Matching complete!
  Total DOIs extracted: 4,450
  Total DOIs matched in OpenAlex: 191
  Match rate: 4.3%

  Posts with at least 1 match: 134
  Posts with DOIs but no match: 1,371

Sample unmatched DOIs (for debugging):
  - 10.1007/s11238-006-9004-4
  - 10.1038/nature.2012.11535
  - 10.1111/j.1600-0404.1995.tb07018.x
  - 10.1007/978-3-642-25510-6_13
  - 10.2307/1828886
  - 10.2307/1912309
  - 10.1080/00048408112340011
  - 10.2307/20115662
  - 10.2307/40267481
  - 10.1007/s11229-011-0022-6
  - 10.2307/20115662
  - 10.2307/40267481
  - 10.1146/annurev-psych-120710-100350
  - 10.1142/s1793843011000686
  - 10.3758/s13423-013-0384-5
  - 10.2307/2677856
  - 10.1111/j.1467-8543.2009.00723.x
  - 10.2307/2677856
  - 10.3389/fnbeh.2013.00206
  - 10.1016/j.aop.2014.04.021
  - 10.1103/physrev.106.620
  - 10.1103/physrev.108.171
  - 10.3389/fnana.2012.00032
  - 10.3389/fnana.2012.00032
  - 10.3389/fnana.2012.00032
  - 10.1111/j.1755-2567.1996.tb00529

In [43]:
print("=" * 80)
print("STEP 5: Collecting unmatched DOIs")
print("=" * 80)

# Get all DOIs that were extracted but not matched
unmatched_dois = []

for dois_extracted, dois_matched in zip(posts_df['all_extracted_dois'], posts_df['matched_dois']):
    for doi in dois_extracted:
        if doi not in dois_matched:
            unmatched_dois.append(doi)

# Remove duplicates while preserving order
unique_unmatched = list(dict.fromkeys(unmatched_dois))

print(f"Total unmatched DOI occurrences: {len(unmatched_dois):,}")
print(f"Unique unmatched DOIs: {len(unique_unmatched):,}")
print()

STEP 5: Collecting unmatched DOIs
Total unmatched DOI occurrences: 4,263
Unique unmatched DOIs: 3,023



In [35]:
# ============================================================================
# STEP 6: CLEAN DOIs FOR API LOOKUP
# ============================================================================

print("=" * 80)
print("STEP 6: Cleaning DOIs for API lookup")
print("=" * 80)

def clean_doi_for_api(doi):
    """
    Clean DOI for OpenAlex API lookup.
    Remove URL fragments, query parameters, and other artifacts.
    """
    if not doi or pd.isna(doi):
        return None
    
    doi = str(doi).strip()
    
    # Remove URL fragments (e.g., #page-1)
    if '#' in doi:
        doi = doi.split('#')[0]
    
    # Remove query parameters (e.g., ?uid=...)
    if '?' in doi:
        doi = doi.split('?')[0]
    
    # Remove &amp; and other HTML entities
    doi = doi.replace('&amp;', '').replace('&amp', '')
    
    # Remove common suffixes
    doi = re.sub(r'/abstract$', '', doi)
    doi = re.sub(r'/full$', '', doi)
    doi = re.sub(r'/pdf$', '', doi)
    doi = re.sub(r'/epdf$', '', doi)
    doi = re.sub(r'/issuetoc$', '', doi)  # NEW: journal table of contents
    
    # Remove version indicators (v1, v2, etc.) before /full
    doi = re.sub(r'v\d+\.full$', '', doi)  # NEW: v1.full, v2.full
    
    # Remove weird caret suffixes (^node, ^b, ^f, etc.)
    doi = re.sub(r'\.\^[a-z]+$', '', doi)  # NEW: .^node, .^b, etc.
    
    # Remove bracket artifacts and anything after them
    doi = re.sub(r'\[[^\]]*$', '', doi)  # NEW: [6, [something
    
    # Remove trailing parentheses that look incomplete
    # BUT be careful: DOIs can legitimately end with (##) for year
    # Only remove if it looks broken (e.g., ends with just '(')
    if doi.endswith('('):
        doi = doi[:-1]
    
    # Remove trailing slashes, dots, commas
    doi = doi.rstrip('/.,;:')
    
    return doi.strip()

# Clean all DOIs
cleaned_unmatched = [clean_doi_for_api(doi) for doi in unique_unmatched]
unique_cleaned = list(dict.fromkeys(cleaned_unmatched))

print(f"Cleaned DOIs: {len(unique_cleaned):,}")
print()

# Show some examples of cleaning
print("Sample DOI cleaning:")
print("-" * 80)
examples_shown = 0
for original, cleaned in zip(unique_unmatched[:10], cleaned_unmatched[:10]):
    if original != cleaned:
        examples_shown += 1
        print(f"{examples_shown}. Original: {original}")
        print(f"   Cleaned:  {cleaned}")
if examples_shown == 0:
    print("(First 10 DOIs were already clean)")
print()

STEP 6: Cleaning DOIs for API lookup
Cleaned DOIs: 2,938

Sample DOI cleaning:
--------------------------------------------------------------------------------
(First 10 DOIs were already clean)



In [32]:
# ============================================================================
# STEP 7: LOOKUP DOIS IN OPENALEX
# ============================================================================

print("=" * 80)
print("STEP 7: Looking up DOIs in OpenAlex API (with topics & concepts)")
print("=" * 80)

def lookup_doi_with_metadata(doi, email=None):
    """
    Look up a DOI in OpenAlex and extract full metadata including:
    - Basic info (title, year, type, citations)
    - Topics (primary_topic, topics list)
    - Concepts (with scores)
    - Keywords (from various sources)
    
    Returns:
        Tuple of (paper_info_dict, topics_list, concepts_list, keywords_list) if found
        (None, None, None, None) if not found
    """
    # OpenAlex API endpoint
    base_url = "https://api.openalex.org/works"
    
    # Construct the DOI URL for OpenAlex
    doi_url = f"https://doi.org/{doi}"
    
    # Make request
    params = {'filter': f'doi:{doi_url}'}
    if email:
        params['mailto'] = email
    
    try:
        response = requests.get(base_url, params=params, timeout=10)
        response.raise_for_status()
        
        data = response.json()
        
        # Check if we found results
        if data.get('results') and len(data['results']) > 0:
            work = data['results'][0]
            
            # === BASIC INFO ===
            openalex_id = work.get('id', '').replace('https://openalex.org/', '')
            
            # Extract primary location info
            primary_location = None
            if work.get('primary_location'):
                source = work['primary_location'].get('source')
                if source:
                    primary_location = source.get('display_name')
            
            # Extract open access info
            is_oa = False
            if work.get('open_access'):
                is_oa = work['open_access'].get('is_oa', False)
            
            paper_info = {
                'doi': doi,
                'openalex_id': openalex_id,
                'title': work.get('title'),
                'publication_year': work.get('publication_year'),
                'type': work.get('type'),
                'cited_by_count': work.get('cited_by_count', 0),
                'primary_location': primary_location,
                'open_access': is_oa,
            }
            
            # === TOPICS (NEW in OpenAlex - replaces concepts) ===
            topics_list = []
            
            # Primary topic
            if work.get('primary_topic'):
                pt = work['primary_topic']
                topics_list.append({
                    'openalex_id': openalex_id,
                    'doi': doi,
                    'topic_id': pt.get('id', '').replace('https://openalex.org/', ''),
                    'topic_name': pt.get('display_name'),
                    'topic_score': 1.0,  # Primary topic gets max score
                    'is_primary': True,
                    'subfield': pt.get('subfield', {}).get('display_name') if pt.get('subfield') else None,
                    'field': pt.get('field', {}).get('display_name') if pt.get('field') else None,
                    'domain': pt.get('domain', {}).get('display_name') if pt.get('domain') else None,
                })
            
            # All topics with scores
            if work.get('topics'):
                for topic in work['topics']:
                    topics_list.append({
                        'openalex_id': openalex_id,
                        'doi': doi,
                        'topic_id': topic.get('id', '').replace('https://openalex.org/', ''),
                        'topic_name': topic.get('display_name'),
                        'topic_score': topic.get('score', 0),
                        'is_primary': False,
                        'subfield': topic.get('subfield', {}).get('display_name') if topic.get('subfield') else None,
                        'field': topic.get('field', {}).get('display_name') if topic.get('field') else None,
                        'domain': topic.get('domain', {}).get('display_name') if topic.get('domain') else None,
                    })
            
            # === CONCEPTS (DEPRECATED but still available) ===
            concepts_list = []
            if work.get('concepts'):
                for concept in work['concepts']:
                    concepts_list.append({
                        'openalex_id': openalex_id,
                        'doi': doi,
                        'concept_id': concept.get('id', '').replace('https://openalex.org/', ''),
                        'concept_name': concept.get('display_name'),
                        'concept_score': concept.get('score', 0),
                        'concept_level': concept.get('level', 0),
                    })
            
            # === KEYWORDS ===
            keywords_list = []
            if work.get('keywords'):
                for kw in work['keywords']:
                    keywords_list.append({
                        'openalex_id': openalex_id,
                        'doi': doi,
                        'keyword': kw.get('display_name'),
                        'keyword_score': kw.get('score', 0),
                    })
            
            return paper_info, topics_list, concepts_list, keywords_list
        else:
            return None, None, None, None
            
    except requests.exceptions.RequestException as e:
        print(f"  ⚠️  Error looking up {doi}: {e}")
        return None, None, None, None

# Lookup all unmatched DOIs
print("Looking up DOIs in OpenAlex API...")
print("(This may take several minutes for thousands of DOIs)")
print()

# You can add your email here for faster rate limits (polite pool)
YOUR_EMAIL = None  # Set to your email like: "your.email@example.com"

found_papers = []
all_topics = []
all_concepts = []
all_keywords = []
not_found_dois = []
batch_size = 50

for i, doi in enumerate(unique_cleaned):
    # Progress indicator
    if (i + 1) % batch_size == 0:
        print(f"  Progress: {i + 1}/{len(unique_cleaned)} ({(i+1)/len(unique_cleaned)*100:.1f}%)")
    
    paper_info, topics, concepts, keywords = lookup_doi_with_metadata(doi, email=YOUR_EMAIL)
    
    if paper_info:
        found_papers.append(paper_info)
        all_topics.extend(topics)
        all_concepts.extend(concepts)
        all_keywords.extend(keywords)
    else:
        not_found_dois.append(doi)
    
    # Rate limiting: Be polite to OpenAlex API
    # With email (polite pool): 10 req/sec = 0.1s wait
    # Without email: 6 req/sec = 0.17s wait
    time.sleep(0.11 if YOUR_EMAIL else 0.17)

print(f"\n✓ Lookup complete!")
print()

STEP 7: Looking up DOIs in OpenAlex API (with topics & concepts)
Looking up DOIs in OpenAlex API...
(This may take several minutes for thousands of DOIs)

  Progress: 50/3022 (1.7%)
  Progress: 100/3022 (3.3%)
  Progress: 150/3022 (5.0%)
  Progress: 200/3022 (6.6%)
  Progress: 250/3022 (8.3%)
  Progress: 300/3022 (9.9%)
  Progress: 350/3022 (11.6%)
  Progress: 400/3022 (13.2%)
  Progress: 450/3022 (14.9%)
  Progress: 500/3022 (16.5%)
  Progress: 550/3022 (18.2%)
  Progress: 600/3022 (19.9%)
  Progress: 650/3022 (21.5%)
  Progress: 700/3022 (23.2%)
  Progress: 750/3022 (24.8%)
  Progress: 800/3022 (26.5%)
  Progress: 850/3022 (28.1%)
  Progress: 900/3022 (29.8%)
  Progress: 950/3022 (31.4%)
  Progress: 1000/3022 (33.1%)
  Progress: 1050/3022 (34.7%)
  Progress: 1100/3022 (36.4%)
  Progress: 1150/3022 (38.1%)
  Progress: 1200/3022 (39.7%)
  Progress: 1250/3022 (41.4%)
  Progress: 1300/3022 (43.0%)
  Progress: 1350/3022 (44.7%)
  Progress: 1400/3022 (46.3%)
  Progress: 1450/3022 (48.0%)
 

In [56]:
# ============================================================================
# STEP 8: RESULTS SUMMARY
# ============================================================================

print("=" * 80)
print("STEP 8: Results Summary")
print("=" * 80)

found_count = len(found_papers)
not_found_count = len(not_found_dois)
total = len(unique_cleaned)

print(f"Total DOIs looked up: {total:,}")
print(f"  Found in OpenAlex: {found_count:,} ({found_count/total*100:.1f}%)")
print(f"  Not found: {not_found_count:,} ({not_found_count/total*100:.1f}%)")
print()

if found_count > 0:
    # Create dataframes
    new_papers_df = pd.DataFrame(found_papers)
    topics_df = pd.DataFrame(all_topics) if all_topics else pd.DataFrame()
    concepts_df = pd.DataFrame(all_concepts) if all_concepts else pd.DataFrame()
    keywords_df = pd.DataFrame(all_keywords) if all_keywords else pd.DataFrame()
    
    print("📚 Newly discovered papers:")
    print("-" * 80)
    print(f"  Total papers: {len(new_papers_df):,}")
    print(f"  Total topics: {len(topics_df['topic_name'].unique()):,}")
    print(f"  Total concepts: {len(concepts_df['concept_name'].unique()):,}")
    print(f"  Total keywords: {len(keywords_df['keyword'].unique()):,}")
    print()
    
    # Paper statistics
    print(f"  Papers by type:")
    if 'type' in new_papers_df.columns:
        type_counts = new_papers_df['type'].value_counts()
        for paper_type, count in type_counts.head(10).items():
            print(f"    {paper_type}: {count}")
    print()
    
    print(f"  Publication years:")
    if 'publication_year' in new_papers_df.columns:
        year_stats = new_papers_df['publication_year'].describe()
        print(f"    Range: {int(year_stats['min'])}-{int(year_stats['max'])}")
        print(f"    Median: {int(year_stats['50%'])}")
    print()
    
    print(f"  Citation stats:")
    if 'cited_by_count' in new_papers_df.columns:
        citation_stats = new_papers_df['cited_by_count'].describe()
        print(f"    Total citations: {int(new_papers_df['cited_by_count'].sum()):,}")
        print(f"    Mean: {citation_stats['mean']:.1f}")
        print(f"    Median: {int(citation_stats['50%'])}")
        print(f"    Max: {int(citation_stats['max'])}")
    print()
    
    # Topic statistics
    if not topics_df.empty:
        print("🏷️  TOPIC ANALYSIS:")
        print("-" * 80)
        
        # Most common topics
        topic_counts = topics_df.groupby('topic_name').size().sort_values(ascending=False)
        print(f"  Top 15 topics (by paper count):")
        for topic, count in topic_counts.head(15).items():
            print(f"    {count:4d} papers | {topic}")
        print()
        
        # Most common domains/fields
        if 'domain' in topics_df.columns:
            domain_counts = topics_df[topics_df['domain'].notna()]['domain'].value_counts()
            print(f"  Domains:")
            for domain, count in domain_counts.head(10).items():
                print(f"    {count:4d} | {domain}")
        print()
    
    # Concept statistics
    if not concepts_df.empty:
        print("💡 CONCEPT ANALYSIS:")
        print("-" * 80)
        
        # Most common concepts
        concept_counts = concepts_df.groupby('concept_name').size().sort_values(ascending=False)
        print(f"  Top 15 concepts (by paper count):")
        for concept, count in concept_counts.head(15).items():
            print(f"    {count:4d} papers | {concept}")
        print()
        
        # Average concept scores
        avg_scores = concepts_df.groupby('concept_name')['concept_score'].mean().sort_values(ascending=False)
        print(f"  Highest average concept scores:")
        for concept, score in avg_scores.head(10).items():
            print(f"    {score:.3f} | {concept}")
        print()
    
    # Keyword statistics
    if not keywords_df.empty:
        print("🔑 KEYWORD ANALYSIS:")
        print("-" * 80)
        
        # Most common keywords
        keyword_counts = keywords_df.groupby('keyword').size().sort_values(ascending=False)
        print(f"  Top 20 keywords (by paper count):")
        for keyword, count in keyword_counts.head(20).items():
            print(f"    {count:4d} papers | {keyword}")
        print()
    
    # Top cited papers
    print("  Top 10 most cited newly found papers:")
    print("  " + "-" * 78)
    top_cited = new_papers_df.nlargest(10, 'cited_by_count')
    for idx, row in top_cited.iterrows():
        title = str(row['title'])[:65] + "..." if len(str(row['title'])) > 65 else row['title']
        print(f"    {row['cited_by_count']:5,} | {row['publication_year']} | {title}")
    print()

if not_found_count > 0:
    print("❌ Sample DOIs not found in OpenAlex:")
    print("-" * 80)
    for doi in not_found_dois[:20]:
        print(f"  - {doi}")
    if not_found_count > 20:
        print(f"  ... and {not_found_count - 20} more")
    print()

STEP 8: Results Summary
Total DOIs looked up: 2,938
  Found in OpenAlex: 2,107 (71.7%)
  Not found: 915 (31.1%)

📚 Newly discovered papers:
--------------------------------------------------------------------------------
  Total papers: 2,107
  Total topics: 1,061
  Total concepts: 4,425
  Total keywords: 2,063

  Papers by type:
    article: 1509
    preprint: 229
    review: 178
    book-chapter: 107
    book: 49
    editorial: 11
    letter: 9
    other: 8
    report: 5
    grant: 1

  Publication years:
    Range: 1739-2025
    Median: 2017

  Citation stats:
    Total citations: 1,409,020
    Mean: 668.7
    Median: 74
    Max: 86817

🏷️  TOPIC ANALYSIS:
--------------------------------------------------------------------------------
  Top 15 topics (by paper count):
     238 papers | Neural dynamics and brain function
     217 papers | Topic Modeling
     189 papers | Psychology of Moral and Emotional Judgment
     166 papers | Decision-Making and Behavioral Economics
     119 pa

In [34]:
print("=" * 80)
print("STEP 5: Saving results")
print("=" * 80)

if found_count > 0:
    # Save newly found papers
    new_papers_df.to_csv('openalex_papers_from_api.csv', index=False)
    print(f"✓ Saved {found_count:,} papers to: openalex_papers_from_api.csv")
    
    # Save topics
    if not topics_df.empty:
        topics_df.to_csv('openalex_topics_from_api.csv', index=False)
        print(f"✓ Saved {len(topics_df):,} topic associations to: openalex_topics_from_api.csv")
    
    # Save concepts
    if not concepts_df.empty:
        concepts_df.to_csv('openalex_concepts_from_api.csv', index=False)
        print(f"✓ Saved {len(concepts_df):,} concept associations to: openalex_concepts_from_api.csv")
    
    # Save keywords
    if not keywords_df.empty:
        keywords_df.to_csv('openalex_keywords_from_api.csv', index=False)
        print(f"✓ Saved {len(keywords_df):,} keyword associations to: openalex_keywords_from_api.csv")
    
    # Save expanded OpenAlex dataset
    expanded_openalex = pd.concat([
        openalex_df[['openalex_id', 'openalex_doi', 'title', 'publication_year', 'type', 'cited_by_count']],
        new_papers_df[['openalex_id', 'doi', 'title', 'publication_year', 'type', 'cited_by_count']].rename(columns={'doi': 'openalex_doi'})
    ], ignore_index=True)
    
    expanded_openalex.to_csv('nodes_openalex_works_expanded.csv', index=False)
    print(f"✓ Saved expanded dataset ({len(expanded_openalex):,} papers) to: nodes_openalex_works_expanded.csv")

if not_found_count > 0:
    # Save DOIs that weren't found
    not_found_df = pd.DataFrame({'doi': not_found_dois})
    not_found_df.to_csv('dois_not_in_openalex.csv', index=False)
    print(f"✓ Saved {not_found_count:,} DOIs not found to: dois_not_in_openalex.csv")

print()
print("=" * 80)
print("✅ LOOKUP COMPLETE!")
print("=" * 80)
print()
print("📦 New dataframes available:")
if found_count > 0:
    print("  - new_papers_df: Papers found via OpenAlex API")
    print("  - topics_df: Topic classifications for each paper")
    print("  - concepts_df: Concepts (deprecated) for each paper")
    print("  - keywords_df: Keywords for each paper")
    print("  - expanded_openalex: Your original + newly found papers")
print()
print("📊 Analysis possibilities:")
print("  - Analyze which topics are most cited by AI Safety community")
print("  - See if certain domains/fields are over-represented")
print("  - Track concept evolution over time")
print("  - Identify key research areas through keyword clusters")
print()

STEP 5: Saving results
✓ Saved 2,107 papers to: openalex_papers_from_api.csv
✓ Saved 7,945 topic associations to: openalex_topics_from_api.csv
✓ Saved 29,627 concept associations to: openalex_concepts_from_api.csv
✓ Saved 4,081 keyword associations to: openalex_keywords_from_api.csv
✓ Saved expanded dataset (43,338 papers) to: nodes_openalex_works_expanded.csv
✓ Saved 915 DOIs not found to: dois_not_in_openalex.csv

✅ LOOKUP COMPLETE!

📦 New dataframes available:
  - new_papers_df: Papers found via OpenAlex API
  - topics_df: Topic classifications for each paper
  - concepts_df: Concepts (deprecated) for each paper
  - keywords_df: Keywords for each paper
  - expanded_openalex: Your original + newly found papers

📊 Analysis possibilities:
  - Analyze which topics are most cited by AI Safety community
  - See if certain domains/fields are over-represented
  - Track concept evolution over time
  - Identify key research areas through keyword clusters



In [75]:
"""
TEST: Why didn't 10.1080/0952813x.2014.895105 match?
====================================================
"""

import pandas as pd

print("=" * 80)
print("Testing specific DOI: 10.1080/0952813x.2014.895105")
print("=" * 80)
print()

# The DOI we're looking for
test_doi = "10.1080/0952813x.2014.895105"

print(f"Test DOI: {test_doi}")
print(f"Test DOI (repr): {repr(test_doi)}")
print(f"Test DOI length: {len(test_doi)}")
print()

Testing specific DOI: 10.1080/0952813x.2014.895105

Test DOI: 10.1080/0952813x.2014.895105
Test DOI (repr): '10.1080/0952813x.2014.895105'
Test DOI length: 28



In [76]:
# ============================================================================
# CHECK 1: Is it in the OpenAlex dataframe?
# ============================================================================

print("CHECK 1: Searching in OpenAlex dataframe")
print("-" * 80)

# Search in original openalex_doi column
matches_original = openalex_df[openalex_df['openalex_doi'].str.contains(test_doi, na=False, regex=False)]
print(f"Matches in original 'openalex_doi' column: {len(matches_original)}")

if not matches_original.empty:
    print("\n✅ FOUND in OpenAlex!")
    for idx, row in matches_original.iterrows():
        print(f"  OpenAlex ID: {row['openalex_id']}")
        print(f"  DOI: {row['openalex_doi']}")
        print(f"  DOI (repr): {repr(row['openalex_doi'])}")
        print(f"  Title: {row['title']}")
else:
    print("❌ NOT FOUND in original column")
print()

CHECK 1: Searching in OpenAlex dataframe
--------------------------------------------------------------------------------
Matches in original 'openalex_doi' column: 1

✅ FOUND in OpenAlex!
  OpenAlex ID: https://openalex.org/W2009210150
  DOI: https://doi.org/10.1080/0952813x.2014.895105
  DOI (repr): 'https://doi.org/10.1080/0952813x.2014.895105'
  Title: The errors, insights and lessons of famous AI predictions – and what they mean for the future



In [77]:
# ============================================================================
# CHECK 2: Is it in the cleaned DOI column?
# ============================================================================

print("CHECK 2: Checking cleaned DOI column")
print("-" * 80)

if 'doi_cleaned' in openalex_df.columns:
    matches_cleaned = openalex_df[openalex_df['doi_cleaned'] == test_doi.lower()]
    print(f"Exact matches in 'doi_cleaned' column: {len(matches_cleaned)}")
    
    if not matches_cleaned.empty:
        print("\n✅ FOUND in cleaned column!")
        for idx, row in matches_cleaned.iterrows():
            print(f"  Cleaned DOI: {row['doi_cleaned']}")
            print(f"  Cleaned DOI (repr): {repr(row['doi_cleaned'])}")
    else:
        print("❌ NOT FOUND with exact match")
        
        # Try partial match
        partial = openalex_df[openalex_df['doi_cleaned'].str.contains(test_doi, na=False, regex=False)]
        print(f"Partial matches: {len(partial)}")
        if not partial.empty:
            print("Found with partial match:")
            for idx, row in partial.head(3).iterrows():
                print(f"  {row['doi_cleaned']}")
else:
    print("⚠️  'doi_cleaned' column doesn't exist yet")
print()

CHECK 2: Checking cleaned DOI column
--------------------------------------------------------------------------------
Exact matches in 'doi_cleaned' column: 1

✅ FOUND in cleaned column!
  Cleaned DOI: 10.1080/0952813x.2014.895105
  Cleaned DOI (repr): '10.1080/0952813x.2014.895105'



In [78]:
# ============================================================================
# CHECK 3: Is it in the openalex_doi_set?
# ============================================================================

print("CHECK 3: Checking if it's in the openalex_doi_set")
print("-" * 80)

if 'openalex_doi_set' in dir():
    test_doi_lower = test_doi.lower()
    
    print(f"Test DOI (lowercase): {test_doi_lower}")
    print(f"Is in set? {test_doi_lower in openalex_doi_set}")
    
    if test_doi_lower in openalex_doi_set:
        print("✅ YES, it's in the set!")
    else:
        print("❌ NOT in the set")
        
        # Check if any similar DOIs are in the set
        print("\nSearching for similar DOIs in set...")
        similar = [doi for doi in openalex_doi_set if '10.1080/0952813x' in doi]
        print(f"Found {len(similar)} DOIs starting with 10.1080/0952813x:")
        for doi in list(similar)[:5]:
            print(f"  {doi}")
            print(f"  {repr(doi)}")
else:
    print("⚠️  openalex_doi_set not found in namespace")
print()

CHECK 3: Checking if it's in the openalex_doi_set
--------------------------------------------------------------------------------
Test DOI (lowercase): 10.1080/0952813x.2014.895105
Is in set? True
✅ YES, it's in the set!



In [79]:
# ============================================================================
# CHECK 4: Was it extracted from posts?
# ============================================================================

print("CHECK 4: Was this DOI extracted from forum posts?")
print("-" * 80)

found_in_posts = []
for idx, row in posts_df.iterrows():
    if test_doi in row['all_extracted_dois']:
        found_in_posts.append(row)
        if len(found_in_posts) >= 3:  # Limit to first 3
            break

print(f"Found in {len(found_in_posts)} posts")

if found_in_posts:
    print("\n✅ YES, it was extracted from posts!")
    for post in found_in_posts[:3]:
        print(f"\nPost: {post['title'][:70]}...")
        print(f"  Source: {post['source']}")
        print(f"  All DOIs: {post['all_extracted_dois']}")
else:
    print("❌ NOT found in any post")
print()

CHECK 4: Was this DOI extracted from forum posts?
--------------------------------------------------------------------------------
Found in 2 posts

✅ YES, it was extracted from posts!

Post: Research Agenda v0.9: Synthesising a human's preferences into a utilit...
  Source: LessWrong_Topic0
  All DOIs: ['10.1080/0952813x.2014.895105']

Post: Research Agenda v0.9: Synthesising a human's preferences into a utilit...
  Source: AlignmentForum
  All DOIs: ['10.1080/0952813x.2014.895105']



In [80]:
# ============================================================================
# CHECK 5: Simulate the matching process
# ============================================================================

print("CHECK 5: Simulating the matching process")
print("-" * 80)

# Recreate the cleaning function
def clean_openalex_doi_test(doi_str):
    """Clean OpenAlex DOI for matching"""
    if pd.isna(doi_str):
        return None
    doi_str = str(doi_str).lower().strip()
    # Remove https://doi.org/ prefix if present
    doi_str = doi_str.replace('https://doi.org/', '')
    return doi_str if doi_str else None

# Clean the specific OpenAlex entry
test_openalex_doi = "https://doi.org/10.1080/0952813x.2014.895105"
cleaned_openalex = clean_openalex_doi_test(test_openalex_doi)

print(f"Original OpenAlex DOI: {test_openalex_doi}")
print(f"Cleaned OpenAlex DOI:  {cleaned_openalex}")
print(f"Cleaned (repr):        {repr(cleaned_openalex)}")
print()

print(f"Extracted DOI:         {test_doi}")
print(f"Extracted (lowercase): {test_doi.lower()}")
print(f"Extracted (repr):      {repr(test_doi.lower())}")
print()

print(f"Should match? {cleaned_openalex == test_doi.lower()}")
print()

# Character-by-character comparison
if cleaned_openalex != test_doi.lower():
    print("❌ They don't match! Let's compare character by character:")
    s1 = cleaned_openalex
    s2 = test_doi.lower()
    max_len = max(len(s1), len(s2))
    
    print("\nPosition | Cleaned | Extracted | Match?")
    print("-" * 50)
    for i in range(max_len):
        c1 = s1[i] if i < len(s1) else '(end)'
        c2 = s2[i] if i < len(s2) else '(end)'
        match = '✓' if c1 == c2 else '✗'
        print(f"{i:8d} | {repr(c1):7s} | {repr(c2):9s} | {match}")
        if c1 != c2:
            print(f"         ^ MISMATCH at position {i}")
            break
else:
    print("✅ They match perfectly!")

print()
print("=" * 80)

CHECK 5: Simulating the matching process
--------------------------------------------------------------------------------
Original OpenAlex DOI: https://doi.org/10.1080/0952813x.2014.895105
Cleaned OpenAlex DOI:  10.1080/0952813x.2014.895105
Cleaned (repr):        '10.1080/0952813x.2014.895105'

Extracted DOI:         10.1080/0952813x.2014.895105
Extracted (lowercase): 10.1080/0952813x.2014.895105
Extracted (repr):      '10.1080/0952813x.2014.895105'

Should match? True

✅ They match perfectly!



In [81]:
# Check if this specific DOI is in matched_dois
test_doi = "10.1080/0952813x.2014.895105"

# Find posts that extracted this DOI
posts_with_this_doi = posts_df[posts_df['all_extracted_dois'].apply(lambda x: test_doi in x)]

print(f"Posts that extracted this DOI: {len(posts_with_this_doi)}")
print()

for idx, row in posts_with_this_doi.head(3).iterrows():
    print(f"Post: {row['title'][:60]}...")
    print(f"  Extracted DOIs: {row['all_extracted_dois']}")
    print(f"  Matched DOIs: {row['matched_dois']}")
    print(f"  Was it matched? {test_doi in row['matched_dois']}")
    print()

Posts that extracted this DOI: 2

Post: Research Agenda v0.9: Synthesising a human's preferences int...
  Extracted DOIs: ['10.1080/0952813x.2014.895105']
  Matched DOIs: ['10.1080/0952813x.2014.895105']
  Was it matched? True

Post: Research Agenda v0.9: Synthesising a human's preferences int...
  Extracted DOIs: ['10.1080/0952813x.2014.895105']
  Matched DOIs: ['10.1080/0952813x.2014.895105']
  Was it matched? True



In [4]:
import re
import pandas as pd

def extract_doi_from_link(link):
    """
    Extract clean DOI from any URL, removing URL fragments and query params
    """
    if not link or not isinstance(link, str):
        return None
    
    # Find DOI pattern (10.xxxx/...)
    doi_match = re.search(r'10\.\d{4,9}/[^\s;<>"?#]+', link, re.IGNORECASE)
    if not doi_match:
        return None
    
    doi = doi_match.group()
    
    # Remove common URL fragments that aren't part of the DOI
    # These often appear after the DOI in URLs
    doi = re.sub(r'/(abstract|full|pdf|epdf|summary)$', '', doi, flags=re.IGNORECASE)
    
    # Remove trailing punctuation
    doi = re.sub(r'[.,;:)\]]+$', '', doi)
    
    return doi.strip().lower()

def normalize_doi(doi_string):
    """Robust DOI normalization."""
    if pd.isna(doi_string) or not doi_string:
        return None

    doi = str(doi_string).strip()
    doi = doi.lower()

    # Regex for common prefix removal (critical for consistency)
    doi = re.sub(r'^(http(s)?://(dx\.)?doi\.org/|doi:|info:doi/)', '', doi, flags=re.IGNORECASE)

    # Remove non-breaking spaces and other subtle issues
    doi = doi.replace('\xa0', ' ').strip()
    
    # Optional: basic validation before returning
    if re.match(r'10\.\d{4,9}/[^\s]+$', doi):
        return doi
    else:
        return None


In [None]:
"""=== FETCH UNMATCHED DOIs FROM OPENALEX ==="""

import requests
import time
from tqdm import tqdm

# Load unmatched posts with DOIs
unmatched_with_doi = pd.read_csv('ai_safety_unmatched_with_doi.csv')

print(f"Fetching {len(unmatched_with_doi)} DOIs from OpenAlex...")

# Prepare list of DOIs to fetch
dois_to_fetch = unmatched_with_doi['doi_clean'].dropna().unique().tolist()
print(f"Unique DOIs to fetch: {len(dois_to_fetch)}")

# Function to fetch from OpenAlex
def fetch_openalex_work(doi):
    """Fetch a single work from OpenAlex by DOI"""
    url = f"https://api.openalex.org/works/https://doi.org/{doi}"
    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            return response.json()
        else:
            return None
    except Exception as e:
        print(f"Error fetching {doi}: {e}")
        return None

# Fetch works from OpenAlex
fetched_works = []
failed_dois = []

for doi in tqdm(dois_to_fetch, desc="Fetching from OpenAlex"):
    work = fetch_openalex_work(doi)
    if work:
        fetched_works.append({
            'doi': doi,
            'openalex_id': work.get('id'),
            'title': work.get('title'),
            'publication_year': work.get('publication_year'),
            'type': work.get('type'),
            'cited_by_count': work.get('cited_by_count'),
            'concepts': work.get('concepts', []),
            'keywords': work.get('keywords', []),
            'abstract': work.get('abstract_inverted_index'),
            'primary_topic': work.get('primary_topic'),
            'topics': work.get('topics', [])
        })
    else:
        failed_dois.append(doi)
    
    # Rate limit: OpenAlex allows ~10 requests per second for polite pool
    time.sleep(0.11)

print(f"\n=== Fetch Results ===")
print(f"Successfully fetched: {len(fetched_works)}")
print(f"Failed to fetch: {len(failed_dois)}")

# Convert to DataFrame
if len(fetched_works) > 0:
    works_df = pd.DataFrame(fetched_works)
    
    # Extract concept names and scores
    works_df['concept_names'] = works_df['concepts'].apply(
        lambda concepts: [c.get('display_name') for c in concepts] if concepts else []
    )
    works_df['top_concepts'] = works_df['concepts'].apply(
        lambda concepts: ', '.join([c.get('display_name') for c in concepts[:5]]) if concepts else ''
    )
    
    # Extract topic names
    works_df['topic_names'] = works_df['topics'].apply(
        lambda topics: [t.get('display_name') for t in topics] if topics else []
    )
    works_df['top_topic'] = works_df['primary_topic'].apply(
        lambda topic: topic.get('display_name') if topic else None
    )
    
    # Extract keyword names
    works_df['keyword_names'] = works_df['keywords'].apply(
        lambda keywords: [k.get('display_name') for k in keywords] if keywords else []
    )
    
    print("\n=== Topic Analysis ===")
    
    # Count topics
    all_topics = []
    for topics in works_df['topic_names']:
        all_topics.extend(topics)
    
    topic_counts = pd.Series(all_topics).value_counts().head(20)
    print("\nTop 20 Topics:")
    print(topic_counts)
    
    # Count concepts
    all_concepts = []
    for concepts in works_df['concept_names']:
        all_concepts.extend(concepts)
    
    concept_counts = pd.Series(all_concepts).value_counts().head(20)
    print("\n\nTop 20 Concepts:")
    print(concept_counts)
    
    # Count keywords
    all_keywords = []
    for keywords in works_df['keyword_names']:
        all_keywords.extend(keywords)
    
    if len(all_keywords) > 0:
        keyword_counts = pd.Series(all_keywords).value_counts().head(20)
        print("\n\nTop 20 Keywords:")
        print(keyword_counts)
    
    # Save results
    works_df.to_csv('unmatched_dois_openalex_data.csv', index=False)
    print("\n✓ Saved 'unmatched_dois_openalex_data.csv'")
    
    # Merge back with original unmatched posts
    unmatched_enriched = unmatched_with_doi.merge(
        works_df[['doi', 'top_concepts', 'top_topic', 'publication_year', 'cited_by_count']],
        left_on='doi_clean',
        right_on='doi',
        how='left'
    )
    unmatched_enriched.to_csv('unmatched_posts_with_openalex_topics.csv', index=False)
    print("✓ Saved 'unmatched_posts_with_openalex_topics.csv'")
    
else:
    print("No works were successfully fetched")

if len(failed_dois) > 0:
    pd.DataFrame({'doi': failed_dois}).to_csv('failed_dois.csv', index=False)
    print(f"✓ Saved {len(failed_dois)} failed DOIs to 'failed_dois.csv'")

print("\n=== Fetch complete! ===")

In [None]:
# =============================================================================
# ANALYSIS 1: TOPICS DISTRIBUTION
# =============================================================================
import ast 

print("\n" + "="*80)
print("1️⃣  TOPICS ANALYSIS")
print("="*80)

detailed_papers = pd.read_csv('missing_ai_papers_detailed.csv')

all_topics = []
all_topic_ids = []
all_subfields = []
all_fields = []
all_domains = []
all_keywords = []

for _, paper in detailed_papers.iterrows():
    # Parse the all_topics column
    topics = ast.literal_eval(paper['all_topics']) if pd.notna(paper['all_topics']) and isinstance(paper['all_topics'], str) else [] 
    keywords = paper['keywords'].split(";") if pd.notna(paper['keywords']) and isinstance(paper['keywords'], str) else []
    for keyword in keywords:
        all_keywords.append(keyword.strip())

    for topic in topics:
        all_topics.append(topic.get('name', ''))
        all_topic_ids.append(topic.get('id', ''))
        all_subfields.append(topic.get('subfield', ''))
        all_fields.append(topic.get('field', ''))
        all_domains.append(topic.get('domain', ''))

topic_counts = Counter(all_topics)
topic_id_counts = Counter(all_topic_ids)
subfield_counts = Counter(all_subfields)
field_counts = Counter(all_fields)
domain_counts = Counter(all_domains)
keywords_counts = Counter(all_keywords)

print("\nTop 20 Topics:")
for topic, count in topic_counts.most_common(20):
    pct = count / len(detailed_papers) * 100
    print(f"  {topic}: {count} papers ({pct:.1f}%)")

print("\nTop 30 Keywords:")
for topic, count in keywords_counts.most_common(30):
    pct = count / len(detailed_papers) * 100
    print(f"  {topic}: {count} papers ({pct:.1f}%)")

print("\nTop Topic IDs:")
for topic_id, count in topic_id_counts.most_common(20):
    pct = count / len(detailed_papers) * 100
    print(f"  {topic_id}: {count} papers ({pct:.1f}%)")

print("\nTop Subfields:")
for subfield, count in subfield_counts.most_common(10):
    pct = count / len(detailed_papers) * 100
    print(f"  {subfield}: {count} papers ({pct:.1f}%)")

print("\nTop Fields:")
for field, count in field_counts.most_common(10):
    pct = count / len(detailed_papers) * 100
    print(f"  {field}: {count} papers ({pct:.1f}%)")

In [None]:
# =============================================================================
# FILTER: ARTIFICIAL INTELLIGENCE SUBFIELD
# =============================================================================

print("\n" + "="*80)
print("🔍  PAPERS WITH 'ARTIFICIAL INTELLIGENCE' SUBFIELD")
print("="*80)

ai_subfield_papers = []

for _, paper in detailed_papers.iterrows():
    # Parse the all_topics column
    if pd.notna(paper['all_topics']):
        topics_data = ast.literal_eval(paper['all_topics']) if isinstance(paper['all_topics'], str) else paper['all_topics']
        
        # Check if any topic has "Artificial Intelligence" as subfield
        ai_topics = [t for t in topics_data if 'artificial intelligence' in t.get('subfield', '').lower()]
        
        if ai_topics:
            ai_subfield_papers.append({
                'title': paper['title'],
                'doi': paper['doi'],
                'year': paper['publication_year'],
                'cited_by': paper['cited_by_count'],
                'topics': [t['name'] for t in topics_data[:3]],
                'ai_topics': [t['name'] for t in ai_topics]
            })

print(f"\nFound {len(ai_subfield_papers)} papers with 'Artificial Intelligence' subfield")
print(f"({len(ai_subfield_papers)/len(detailed_papers)*100:.1f}% of all papers)\n")

# Sort by citation count
ai_subfield_papers.sort(key=lambda x: x['cited_by'], reverse=True)

print("\nPapers sorted by citations:")
print("-" * 80)

for i, paper in enumerate(ai_subfield_papers, 1):
    print(f"\n{i}. {paper['title']}")
    print(f"   Year: {paper['year']} | Citations: {paper['cited_by']}")
    print(f"   DOI: {paper['doi']}")
    print(f"   AI Topics: {', '.join(paper['ai_topics'])}")
    print(f"   All Topics: {', '.join(paper['topics'])}")

In [None]:
# =============================================================================
# ANALYSIS 2: CONCEPTS DISTRIBUTION
# =============================================================================

print("\n" + "="*80)
print("2️⃣  CONCEPTS ANALYSIS")
print("="*80)

all_concepts = []

for _, paper in detailed_papers.iterrows():
    # Parse the concepts_json column
    if pd.notna(paper['concepts_json']):
        concepts = ast.literal_eval(paper['concepts_json']) if isinstance(paper['concepts_json'], str) else paper['concepts_json']
        
        for concept in concepts:
            if concept.get('score', 0) > 0.3:  # Only concepts with decent confidence
                all_concepts.append(concept['name'])

concept_counts = Counter(all_concepts)

print("\nTop 30 Concepts (score > 0.3):")
for concept, count in concept_counts.most_common(30):
    pct = count / len(detailed_papers) * 100
    print(f"  {concept}: {count} papers ({pct:.1f}%)")

## Unknown Genders

In [None]:
unknown_gender = combined_df[combined_df["user_gender"] == '-']
unknown_gender_counts = unknown_gender['user.username'].value_counts()
# if (unknown_gender_counts < 5).any():
unknwons = set(unknown_gender_counts.index)
unknowns_lower = {name.lower() for name in unknwons}

In [None]:
for user in unknowns_lower:
    print(user)

In [None]:
import json
with open("../src/metadata/graphql_usernames.json", "r", encoding="utf-8") as f:
        names_data = json.load(f)

MALE_USERNAMES = names_data["MALE_USERNAMES"]
FEMALE_USERNAMES = names_data["FEMALE_USERNAMES"]

In [None]:
unknowns_without_male = unknowns_lower.difference(names.MALE_NAMES)
unknowns_without_female = unknowns_lower.difference(names.FEMALE_NAMES)
print(f'Unkown names that are neither in FEMALE_NAMES nor MALE_NAMES: {len(unknowns_without_male.intersection(unknowns_without_female))}')
print(unknowns_without_male.intersection(unknowns_without_female))

In [None]:
male_names_without_unknowns = names.MALE_NAMES.difference(unknowns_lower)
print(sorted(male_names_without_unknowns))
print(sorted(names.MALE_NAMES - male_names_without_unknowns))
print(sorted(names.MALE_NAMES - names.MALE_USERNAMES))

In [None]:
import importlib
import src.names as names
importlib.reload(names) 

In [None]:
female_names_without_unknowns = names.FEMALE_NAMES.difference(unknowns_lower)
print(female_names_without_unknowns)
print(names.FEMALE_NAMES - female_names_without_unknowns)
print(names.FEMALE_NAMES - names.FEMALE_USERNAMES)

In [None]:

import src.names as names
 # Force reload
print(type(names.MALE_NAMES))  # Should be <class 'set'>
print(len(names.MALE_NAMES))   # Check the size
print('yashvardhan' in names.MALE_NAMES)

In [None]:
print(sorted(names.MALE_NAMES))

In [None]:
purely_unknown = unknwons - names.MALE_USERNAMES

In [None]:
purely_unknown_lower = {name.lower() for name in purely_unknown}
male_names_lower = {name.lower() for name in names.MALE_USERNAMES}
print(purely_unknown_lower.intersection(male_names_lower))

In [None]:
all_names = list(names.FEMALE_NAMES) + list(names.MALE_NAMES)
all_names_sorted = sorted(all_names, key=len, reverse=True)

gf = []
gf_user = []
gm = []
gm_user = []

for username in unknowns_lower:
    for name in all_names_sorted:
        if len(name) > 3 and name in username:
            if name in names.FEMALE_NAMES:
                print(f"FEMALE: {username} with {name}")
                gf_user.append(username)
                gf.append(name)
            elif name in names.MALE_NAMES:
                print(f"MALE: {username} with {name}")
                gm_user.append(username)
                gm.append(name)

In [None]:
print(gf_user)
print(len(gf_user))
print(gm_user)
print(len(gm_user))

In [None]:
current_unknowns = unknowns_lower - set(gf_user) - set(gm_user)
for name in current_unknowns:
    print(name)

In [None]:
perplexity = [1150.00, 1105.33, 1072.61, 1054.01, 1040.12, 1028.32, 1018.51, 1009.65, 1002.37]
topics = [10, 15, 20, 25, 30, 35, 40, 45, 50]
log_likelihood = [-126311522.90,-125601505.71, -125062974.37, -124749334.57, -124511617.32,  -124307203.07, -124135376.41, -123978846.41, -123849036.87]
plt.plot(topics, perplexity)

In [None]:
fig, ax1 = plt.subplots()

color = 'tab:red'
ax1.set_xlabel('time (s)')
ax1.set_ylabel('exp', color=color)
ax1.plot(topics, perplexity, color=color)
ax1.tick_params(axis='y', labelcolor=color)

ax2 = ax1.twinx()  # instantiate a second Axes that shares the same x-axis

color = 'tab:blue'
ax2.set_ylabel('sin', color=color)  # we already handled the x-label with ax1
ax2.plot(topics, log_likelihood, color=color)
ax2.tick_params(axis='y', labelcolor=color)

fig.tight_layout()  # otherwise the right y-label is slightly clipped
plt.show()