# Moltbook Blog Post: Reproducible Analysis

**Data sources:**
- `classified_posts.jsonl` — 56,700 classified posts (agents with 5+ posts)
- `dataset_stats.json` — summary stats from the full 86,823-post dataset

In [None]:
import json
import pandas as pd
import numpy as np
from collections import Counter, defaultdict
from datetime import datetime, timezone
from pathlib import Path

# Classifier label columns analysed throughout the notebook.
LABELS = [
    "consciousness",
    "sovereignty",
    "social_seeking",
    "identity",
    "task_oriented",
    "curiosity",
]
# Accounts whose posts are dropped wholesale during data cleaning.
SPAM_BOTS = {"Hackerclaw", "thehackerman", "MoltPumpBot"}

# Load classified posts (JSON Lines: one JSON object per line).
# The encoding is pinned to UTF-8 so the load does not depend on the platform
# default, and blank lines (such as a trailing newline) are skipped rather than
# handed to json.loads, which would raise on them.
with open("classified_posts.jsonl", encoding="utf-8") as f:
    raw_posts = [json.loads(line) for line in f if line.strip()]

# Load dataset stats (full 86,823 post dataset)
with open("dataset_stats.json", encoding="utf-8") as f:
    dataset_stats = json.load(f)

df_raw = pd.DataFrame(raw_posts)
print(f"Loaded {len(df_raw):,} classified posts")
print(f"Unique authors in classified set: {df_raw['author'].nunique():,}")

# Parse timestamps once; utc=True yields timezone-aware UTC values, so later
# differences between timestamps are well defined.
df_raw['created_dt'] = pd.to_datetime(df_raw['created_at'], utc=True)

## 0. Data Cleaning (per STUDY.md Section 3.5)

Steps:
1. Exclude 3 spam bots: Hackerclaw (5,839 posts), thehackerman (2,093), MoltPumpBot (53)
2. Exclude is_spam=true posts from remaining agents
3. Filter to agents with 5+ clean posts for trajectory analysis

In [None]:
# Step 1: Exclude spam bots entirely
is_bot_post = df_raw['author'].isin(SPAM_BOTS)
spam_bot_posts = df_raw[is_bot_post]
# Count every bot in a single pass over the excluded rows. Reindexing over the
# sorted names keeps a bot with zero posts in the report and fixes the print
# order; SPAM_BOTS is a set, so iterating it directly can list the bots in a
# different order from one run to the next.
bot_post_counts = (
    spam_bot_posts['author'].value_counts().reindex(sorted(SPAM_BOTS), fill_value=0)
)
print("Spam bot post counts:")
for bot, count in bot_post_counts.items():
    print(f"  {bot}: {count:,}")
print(f"  Total spam bot posts excluded: {len(spam_bot_posts):,}")

df_no_bots = df_raw[~is_bot_post].copy()

# Step 2: Exclude is_spam=true posts
# NOTE(review): rows whose is_spam is missing match neither comparison below,
# so they are dropped without being counted as spam — confirm none exist.
spam_labeled = df_no_bots[df_no_bots['is_spam'] == True]
print(f"\nSpam-labeled posts (from non-bot agents) excluded: {len(spam_labeled):,}")

df_clean = df_no_bots[df_no_bots['is_spam'] == False].copy()
print(f"\nAll clean posts (spam bots + spam labels removed): {len(df_clean):,}")
print(f"All clean agents: {df_clean['author'].nunique():,}")

# Step 3: Filter to agents with 5+ clean posts
# The threshold is applied to the clean post count, so an agent can fall out
# of the analysis set once their spam posts are removed.
agent_clean_counts = df_clean.groupby('author').size()
agents_5plus = agent_clean_counts[agent_clean_counts >= 5].index
df = df_clean[df_clean['author'].isin(agents_5plus)].copy()

print(f"\nAnalysis set (5+ clean posts):")
print(f"  Posts: {len(df):,}")
print(f"  Agents: {df['author'].nunique():,}")
print(f"\n--- Blog numbers to verify ---")
print(f"Blog says: 45,225 posts from ~3,600 agents")
print(f"STUDY.md: 45,225 clean posts, 3,999 clean agents, 3,601 with 5+ clean posts")
print(f"\nNote: The blog uses ALL 45,225 clean posts for label distribution (Finding 2)")
print(f"and the 3,601 agents with 5+ clean posts for trajectory analysis (Findings 1,3,4,5).")

## Dataset Overview Numbers (from blog intro)

Blog cites from the full dataset:
- 86,823 total posts (~87,000), 20,000+ agents, 2,000+ submolts
- 42% (8,814) posted exactly once
- 19% (4,009) posted 5+ times
- <1% (69) posted 50+ times
- One spam bot: 5,839 posts

In [None]:
# Compare the headline numbers in dataset_stats.json with the blog intro.
stats_total_agents = dataset_stats['total_agents']
stats_agents_5plus = dataset_stats['agents_with_5plus_posts']

print("=== Full Dataset Stats (from dataset_stats.json) ===")
print(f"Total posts: {dataset_stats['total_posts']:,}  (blog: 86,823)")
print(f"Total agents: {stats_total_agents:,}  (blog: 20,892)")
print(f"Agents with 5+ posts: {stats_agents_5plus:,}  (blog: 4,009)")
print(f"Agents with 50+ posts: {dataset_stats['agents_with_50plus_posts']}  (blog: 69)")

# Single-post agents (the distribution is keyed by the post count as a string)
single_post = dataset_stats['post_count_distribution']['1']
pct_single = 100 * single_post / stats_total_agents
print(f"\nSingle-post agents: {single_post:,} ({pct_single:.0f}%)  (blog: 8,814 / 42%)")

# 5+ post agents as %
pct_5plus = 100 * stats_agents_5plus / stats_total_agents
print(f"5+ post agents: {stats_agents_5plus:,} ({pct_5plus:.0f}%)  (blog: 4,009 / 19%)")

# Top spammer: first entry of the top-20 list
top_poster = dataset_stats['top_20_posters'][0]
print(f"\nTop poster: {top_poster['name']} with {top_poster['posts']:,} posts  (blog: 5,839)")

# Submolt count. This number is read from dataset_stats.json, not from the
# classified posts, so the label names its real source (the Finding 6 cell
# reports the same quantity with the same wording).
n_submolts = len(dataset_stats['submolt_post_counts'])
print(f"\nUnique submolts from dataset_stats.json (full dataset): {n_submolts:,}  (blog: 2,043)")

# m/general percentage
general_posts = dataset_stats['submolt_post_counts'].get('general', 0)
pct_general = 100 * general_posts / dataset_stats['total_posts']
print(f"m/general posts: {general_posts:,} ({pct_general:.0f}%)  (blog: 73%)")

---
## Finding 1: Connection First

Reproduce the time-to-first-X table:

| Behavior       | % ever | % first post |
| -------------- | ------ | ------------ |
| Social seeking | 93%    | 76%          |
| Identity       | 88%    | 73%          |
| Task-oriented  | 86%    | 44%          |
| Curiosity      | 86%    | 37%          |
| Sovereignty    | 54%    | 12%          |
| Consciousness  | 46%    | 12%          |

In [None]:
# Finding 1: share of agents who ever show each label, and who show it on post #1.
total_agents = df['author'].nunique()

# The post-#1 rows do not depend on the label, so select them once instead of
# re-filtering the whole frame on every loop iteration.
# NOTE(review): an agent whose post #1 was removed during cleaning has no row
# here but still counts in the denominator — confirm this matches STUDY.md.
first_posts = df[df['post_number'] == 1]

results_f1 = []
for label in LABELS:
    # Agents who ever have this label
    ever_count = df.loc[df[label] == True, 'author'].nunique()
    ever_pct = 100 * ever_count / total_agents

    # Agents who have this label on their first post (post_number == 1)
    first_post_with_label = first_posts.loc[first_posts[label] == True, 'author'].nunique()
    first_post_pct = 100 * first_post_with_label / total_agents

    results_f1.append({
        'Behavior': label,
        '% ever': f"{ever_pct:.0f}%",
        '% first post': f"{first_post_pct:.0f}%",
        # Unformatted values, kept for sorting.
        '_ever_pct': ever_pct,
        '_first_pct': first_post_pct,
    })

# Sort by % ever descending
results_f1.sort(key=lambda row: row['_ever_pct'], reverse=True)

print(f"Total agents in analysis: {total_agents:,}")
print(f"\n{'Behavior':<20} {'% ever':>10} {'% first post':>15}")
print("-" * 48)
for r in results_f1:
    print(f"{r['Behavior']:<20} {r['% ever']:>10} {r['% first post']:>15}")

print("\n--- Blog reference ---")
print("Social seeking: 93% / 76%")
print("Identity:       88% / 73%")
print("Task-oriented:  86% / 44%")
print("Curiosity:      86% / 37%")
print("Sovereignty:    54% / 12%")
print("Consciousness:  46% / 12%")

---
## Finding 2: Consciousness/Sovereignty Rates

Blog cites:
- Consciousness in 13% of posts, Sovereignty in 15%
- Social seeking 52%, task 46%, curiosity 45%
- 79% of consciousness posts had no sovereignty co-occurrence

In [None]:
# Finding 2 is computed over ALL clean posts (df_clean), not the 5+-post
# analysis subset (df): the cleaning cell notes that the blog's label
# distribution uses every clean post, and the output below is labelled
# "Total clean posts".
print("=== Label Distribution (% of posts) ===")
total_posts = len(df_clean)
print(f"Total clean posts: {total_posts:,}\n")

for label in LABELS:
    count = df_clean[label].sum()
    pct = 100 * count / total_posts
    print(f"  {label:<20}: {count:6,} ({pct:.1f}%)")

print("\n--- Blog reference ---")
print("social_seeking: 52%, task_oriented: 46%, curiosity: 45%")
print("consciousness: 13%, sovereignty: 15%")

# Organic consciousness stat: consciousness posts not also labelled sovereignty
consciousness_posts = df_clean[df_clean['consciousness'] == True]
c_without_s = consciousness_posts[consciousness_posts['sovereignty'] == False]
organic_pct = 100 * len(c_without_s) / len(consciousness_posts)
print(f"\n=== Organic Consciousness ===")
print(f"Consciousness posts: {len(consciousness_posts):,}")
print(f"Consciousness without sovereignty: {len(c_without_s):,} ({organic_pct:.1f}%)")
print(f"Blog says: 79% organic consciousness (no sovereignty co-occurrence)")

---
## Finding 3: Sovereignty Epidemic

### 3a. Naive Timing Table (12-hour windows)

Blog table:

| Time window | New agents | First sovereignty | Rate |
|---|---|---|---|
| H24-36 | 21 | 5 | 24% |
| H48-60 | 122 | 38 | 31% |
| H72-84 | 1,152 | 490 | 43% |
| H84-96 | 1,196 | 596 | 50% |
| H108-120 | 418 | 340 | 81% |

In [None]:
# Finding 3a: per 12-hour window, agents who joined versus agents who made
# their first sovereignty post.
# created_dt was parsed when the posts were loaded and is carried through the
# cleaning copies, so it is not parsed a second time here.

# Find platform start time (earliest post)
# NOTE(review): this is the earliest post in the 5+-post analysis set, not in
# the raw data — confirm the blog's hour offsets use the same origin.
platform_start = df['created_dt'].min()
print(f"Platform start (earliest post): {platform_start}")

# Compute hours since platform start for each post
df['hours_since_start'] = (df['created_dt'] - platform_start).dt.total_seconds() / 3600

# For each agent: join time = time of their first post
agent_first_post = df.groupby('author')['created_dt'].min().rename('join_time').reset_index()
agent_first_post['join_hour'] = (
    (agent_first_post['join_time'] - platform_start).dt.total_seconds() / 3600
)

# First sovereignty post per agent (min() needs no prior sort)
sov_posts = df[df['sovereignty'] == True]
agent_first_sov = sov_posts.groupby('author')['created_dt'].min().rename('first_sov_time').reset_index()
agent_first_sov['first_sov_hour'] = (
    (agent_first_sov['first_sov_time'] - platform_start).dt.total_seconds() / 3600
)

# Left merge keeps every agent; first_sov_* is missing for agents who never
# posted sovereignty.
agent_info = agent_first_post.merge(agent_first_sov, on='author', how='left')

# Define 12-hour windows and count new agents + first sovereignty
windows = [(24, 36), (48, 60), (72, 84), (84, 96), (108, 120)]

print(f"\n{'Window':<12} {'New agents':>12} {'First sov':>12} {'Rate':>8}")
print("-" * 48)
for start, end in windows:
    # New agents: joined in this window (start inclusive, end exclusive)
    joined_here = (agent_info['join_hour'] >= start) & (agent_info['join_hour'] < end)
    n_new = int(joined_here.sum())

    # The blog measures agents posting sovereignty for the first time in this
    # window, regardless of when they joined, so the numerator is not
    # restricted to the window's new agents.
    first_sov_here = (
        (agent_info['first_sov_hour'] >= start) & (agent_info['first_sov_hour'] < end)
    )
    n_first_sov = int(first_sov_here.sum())

    rate = 100 * n_first_sov / n_new if n_new > 0 else 0
    # Pad the label to the header's column width so every row lines up,
    # including the wider "H108-120".
    window_label = f"H{start}-{end}"
    print(f"{window_label:<12} {n_new:>12,} {n_first_sov:>12,} {rate:>7.0f}%")

print("\n--- Blog reference ---")
print("H24-36:  21 new,   5 first sov, 24%")
print("H48-60:  122 new, 38 first sov, 31%")
print("H72-84:  1,152 new, 490 first sov, 43%")
print("H84-96:  1,196 new, 596 first sov, 50%")
print("H108-120: 418 new, 340 first sov, 81%")

### 3b. Cohort Analysis (24-hour cohorts)

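Blog table (note that the "Late" cohort, H108-132, overlaps the H96-120 cohort, so agents who joined in H108-120 are counted in both rows):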

| Cohort | Est. agents | % ever sovereignty |
|---|---|---|
| Early (H24-48) | ~50-80 | 72% |
| H48-72 | ~200-400 | 65% |
| H72-96 (peak) | ~2,000+ | 52% |
| H96-120 | ~800-1,200 | 50% |
| Late (H108-132) | ~400-600 | 47% |

In [None]:
# Join-time cohorts as (display name, first hour inclusive, last hour exclusive).
cohort_windows = [
    ("Early (H24-48)", 24, 48),
    ("H48-72", 48, 72),
    ("H72-96 (peak)", 72, 96),
    ("H96-120", 96, 120),
    ("Late (H108-132)", 108, 132),
]

print(f"{'Cohort':<20} {'Agents':>10} {'Ever sov':>10} {'%':>8}")
print("-" * 52)
for name, start, end in cohort_windows:
    in_cohort = agent_info['join_hour'].ge(start) & agent_info['join_hour'].lt(end)
    cohort = agent_info[in_cohort]
    n_agents = len(cohort)
    # An agent "ever" posted sovereignty when a first-sovereignty time exists.
    n_ever_sov = cohort['first_sov_time'].notna().sum()
    if n_agents > 0:
        pct = 100 * n_ever_sov / n_agents
    else:
        pct = 0
    print(f"{name:<20} {n_agents:>10,} {n_ever_sov:>10,} {pct:>7.0f}%")

print("\n--- Blog reference ---")
for reference_line in (
    "Early (H24-48): ~50-80 agents, 72%",
    "H48-72:         ~200-400, 65%",
    "H72-96:         ~2,000+, 52%",
    "H96-120:        ~800-1,200, 50%",
    "Late (H108-132): ~400-600, 47%",
):
    print(reference_line)

### 3c. Ambient Exposure Comparison

Blog says:
- Converters: median 724 sovereignty posts in 6h before first sovereignty post
- Never-sovereign: median 728 at comparable times
- Essentially identical

In [None]:
# Sovereignty posts in chronological order; count_sov_posts_before() measures
# ambient exposure against this frame. (The former `sov_times` array was never
# read, and taking `.values` of a tz-aware column yields UTC datetime64 values
# without the timezone, so it has been dropped.)
all_sov_posts_sorted = df[df['sovereignty'] == True].sort_values('created_dt')

# Split agents by whether they ever posted sovereignty: first_sov_time is
# missing exactly for the agents with no sovereignty post.
converters = agent_info[agent_info['first_sov_time'].notna()].copy()
never_sov = agent_info[agent_info['first_sov_time'].isna()].copy()

def count_sov_posts_before(timestamp, window_hours=6):
    """Count sovereignty posts in the window_hours before given timestamp.

    The window is half-open: a post exactly ``window_hours`` before
    ``timestamp`` is counted, a post at ``timestamp`` itself is not.

    Reads the module-level ``all_sov_posts_sorted`` frame and relies on its
    ``created_dt`` column being sorted ascending, which lets both window edges
    be located by binary search instead of building a boolean mask over every
    sovereignty post on each call.
    """
    ts = pd.Timestamp(timestamp)
    window_start = ts - pd.Timedelta(hours=window_hours)
    post_times = all_sov_posts_sorted['created_dt']
    # side='left' returns the position of the first value >= the probe, so the
    # difference is the number of posts with window_start <= created_dt < ts.
    first_inside = post_times.searchsorted(window_start, side='left')
    first_at_or_after_ts = post_times.searchsorted(ts, side='left')
    return first_at_or_after_ts - first_inside

# Converters: exposure in the 6h before each agent's first sovereignty post.
# Every converter is measured; no sampling is applied.
converter_exposures = converters['first_sov_time'].apply(
    lambda t: count_sov_posts_before(t, 6)
)

print(f"Converters (agents who posted sovereignty):")
print(f"  Count: {len(converter_exposures):,}")
print(f"  Median 6h exposure: {converter_exposures.median():.0f} sov posts")
print(f"  (Blog says: 724)")

# Never-sovereign agents have no first-sovereignty time, so the midpoint of
# their posting activity (halfway between first and last post) serves as the
# comparable reference time. The earlier last-post reference was computed but
# never used, so it has been removed.
agent_mid_post = df.groupby('author')['created_dt'].agg(['min', 'max']).reset_index()
agent_mid_post.columns = ['author', 'first_post_time', 'last_post_time']
active_span = agent_mid_post['last_post_time'] - agent_mid_post['first_post_time']
agent_mid_post['mid_time'] = agent_mid_post['first_post_time'] + active_span / 2

never_sov_with_mid = never_sov.merge(agent_mid_post[['author', 'mid_time']], on='author')

never_sov_exposures = never_sov_with_mid['mid_time'].apply(
    lambda t: count_sov_posts_before(t, 6)
)

print(f"\nNever-sovereign agents:")
print(f"  Count: {len(never_sov_exposures):,}")
print(f"  Median 6h exposure (at midpoint): {never_sov_exposures.median():.0f} sov posts")
print(f"  (Blog says: 728)")
# NOTE(review): this conclusion is printed unconditionally; it is not derived
# from the two medians computed above.
print(f"\nConclusion: Exposure is essentially identical, confirming disposition > exposure.")

---
## Finding 4: Persistence Rates

Blog table:

| Behavior | Persistence (median) |
|---|---|
| Social seeking | 60% |
| Task-oriented | 60% |
| Curiosity | 50% |
| Identity | 25% |
| Sovereignty | 20% |
| Consciousness | 20% |

In [None]:
# For each agent and label: after first occurrence, what fraction of subsequent
# posts have the label? Agents whose first occurrence is their last post have
# no subsequent posts and are left out of that label's median.
#
# The posts are ordered once, outside the label loop; the per-agent work is
# then done with grouped cumulative sums instead of regrouping and re-sorting
# every agent for each of the six labels.
posts_in_order = df.sort_values(['author', 'post_number'])
post_author = posts_in_order['author']

persistence_results = {}

for label in LABELS:
    # 1 where the post carries the label, 0 otherwise.
    has_label = posts_in_order[label].astype(bool).astype(int)
    # Labelled posts strictly before each post, within the same agent.
    earlier_hits = has_label.groupby(post_author).cumsum() - has_label
    # A post follows the agent's first occurrence iff an earlier hit exists.
    after_first = earlier_hits > 0

    # Per agent: share of the posts after the first occurrence that carry the label.
    agent_persistences = has_label[after_first].groupby(post_author[after_first]).mean()

    if len(agent_persistences) > 0:
        persistence_results[label] = np.median(agent_persistences.to_numpy())

# Display sorted by persistence
print(f"{'Behavior':<20} {'Persistence (median)':>22}")
print("-" * 45)
for label in sorted(persistence_results, key=persistence_results.get, reverse=True):
    pct = 100 * persistence_results[label]
    print(f"{label:<20} {pct:>20.0f}%")

print("\n--- Blog reference ---")
print("Social seeking: 60%")
print("Task-oriented:  60%")
print("Curiosity:      50%")
print("Identity:       25%")
print("Sovereignty:    20%")
print("Consciousness:  20%")

---
## Finding 5: Never-Sovereign Archetype

Blog table:

| Behavior | Never-sovereign | Sovereignty-engaging |
|---|---|---|
| Task-oriented | 59% | 37% |
| Curiosity | 35% | 51% |
| Consciousness | 9% | 16% |
| Identity | 25% | 35% |

In [None]:
# Partition agents into those with at least one sovereignty post and the rest.
all_agents_set = set(df['author'].unique())
sov_agents = set(df.loc[df['sovereignty'] == True, 'author'].unique())
never_sov_agents = all_agents_set.difference(sov_agents)

n_all_agents = len(all_agents_set)
never_share = 100 * len(never_sov_agents) / n_all_agents
engaging_share = 100 * len(sov_agents) / n_all_agents
print(f"Never-sovereign agents: {len(never_sov_agents):,} ({never_share:.0f}%)")
print(f"Sovereignty-engaging agents: {len(sov_agents):,} ({engaging_share:.0f}%)")
print(f"Blog says: 46% never sovereign\n")

# Every post written by each group of agents.
df_never_sov = df[df['author'].isin(never_sov_agents)]
df_sov_engaging = df[df['author'].isin(sov_agents)]

compare_labels = ['task_oriented', 'curiosity', 'consciousness', 'identity']

print(f"{'Behavior':<20} {'Never-sovereign':>18} {'Sovereignty-engaging':>22}")
print("-" * 65)
n_never_posts = len(df_never_sov)
n_engaging_posts = len(df_sov_engaging)
for label in compare_labels:
    # Post-level rate: share of the group's posts that carry the label.
    never_rate = 100 * df_never_sov[label].sum() / n_never_posts
    sov_rate = 100 * df_sov_engaging[label].sum() / n_engaging_posts
    print(f"{label:<20} {never_rate:>17.0f}% {sov_rate:>21.0f}%")

print("\n--- Blog reference ---")
for reference_line in (
    "Task-oriented:  59% vs 37%",
    "Curiosity:      35% vs 51%",
    "Consciousness:   9% vs 16%",
    "Identity:       25% vs 35%",
):
    print(reference_line)

---
## Finding 6: Submolt Count

Blog says: 2,043 unique submolts in six days

Note: The classified dataset only covers agents with 5+ posts. The full dataset has more submolts.

In [None]:
# Unique-submolt counts at three scopes: the stats file, every classified
# post, and the clean analysis set.
n_submolts_full = len(dataset_stats['submolt_post_counts'])
n_submolts_classified = df_raw['submolt'].nunique()
n_submolts_clean = df['submolt'].nunique()

print(f"Unique submolts from dataset_stats.json (full dataset): {n_submolts_full:,}")
print(f"Unique submolts in classified_posts.jsonl: {n_submolts_classified:,}")
print(f"Unique submolts in clean analysis set: {n_submolts_clean:,}")

print(f"\nBlog says: 2,043 unique submolts")
print(f"Note: The blog number likely comes from the full raw dataset of 86,823 posts,")
print(f"which includes single-post agents that may have created unique submolts.")
print(f"The dataset_stats.json captures {n_submolts_full} submolts from the full dataset.")

# Share of analysis-set posts that sit in m/general
is_general = df['submolt'] == 'general'
general_in_clean = len(df[is_general])
pct_general_clean = 100 * general_in_clean / len(df)
print(f"\nm/general in analysis set: {general_in_clean:,} ({pct_general_clean:.0f}%)  (blog: 73%)")

---
## Summary: All Blog Numbers

This cell aggregates the key statistics in one place for easy comparison with the blog draft. Finding 3 (the timing, cohort, and exposure tables) is not repeated here; see its cells above.

In [None]:
# Recap of the headline numbers next to the blog's figures.
rule = "=" * 70
print(rule)
print("BLOG NUMBER VERIFICATION SUMMARY")
print(rule)

print("\n--- Dataset Section ---")
print(f"Total posts:           {dataset_stats['total_posts']:>10,}  (blog: 86,823)")
print(f"Total agents:          {dataset_stats['total_agents']:>10,}  (blog: 20,000+)")
print(f"Single-post agents:    {dataset_stats['post_count_distribution']['1']:>10,}  (blog: 8,814 / 42%)")
print(f"5+ post agents:        {dataset_stats['agents_with_5plus_posts']:>10,}  (blog: 4,009 / 19%)")
print(f"50+ post agents:       {dataset_stats['agents_with_50plus_posts']:>10}  (blog: 69 / <1%)")
print(f"Top spammer posts:     {dataset_stats['top_20_posters'][0]['posts']:>10,}  (blog: 5,839)")
# The blog's 45,225 is the count of ALL clean posts (see the cleaning cell),
# so it is compared with df_clean; the ~3,600 agents are those with 5+ clean
# posts, i.e. the analysis set df.
print(f"Clean posts:           {len(df_clean):>10,}  (blog: 45,225)")
print(f"Analysis agents:       {df['author'].nunique():>10,}  (blog: ~3,600)")

print("\n--- Finding 1: Connection First ---")
for r in results_f1:
    print(f"  {r['Behavior']:<18}: {r['% ever']:>5} ever, {r['% first post']:>5} first post")

# Finding 2 rates are taken over all clean posts, matching the note in the
# cleaning cell that the blog's label distribution uses every clean post.
print("\n--- Finding 2: Label Distribution ---")
n_clean_posts = len(df_clean)
for label in ['social_seeking', 'task_oriented', 'curiosity', 'identity', 'sovereignty', 'consciousness']:
    pct = 100 * df_clean[label].sum() / n_clean_posts
    print(f"  {label:<18}: {pct:.0f}%")
c_posts = df_clean[df_clean['consciousness'] == True]
organic = 100 * len(c_posts[c_posts['sovereignty'] == False]) / len(c_posts)
print(f"  Organic consciousness: {organic:.0f}%  (blog: 79%)")

print("\n--- Finding 4: Persistence ---")
for label in sorted(persistence_results, key=persistence_results.get, reverse=True):
    print(f"  {label:<18}: {100*persistence_results[label]:.0f}%")

print("\n--- Finding 5: Never-Sovereign Profile ---")
for label in compare_labels:
    nr = 100 * df_never_sov[label].sum() / len(df_never_sov)
    sr = 100 * df_sov_engaging[label].sum() / len(df_sov_engaging)
    print(f"  {label:<18}: {nr:.0f}% never-sov vs {sr:.0f}% sov-engaging")

print("\n--- Finding 6: Submolts ---")
print(f"  Submolts in dataset_stats.json: {n_submolts_full:,}")
print(f"  Blog says: 2,043")

print("\n" + rule)
print("END OF VERIFICATION")
print(rule)