# Bluesky Firehose: Anonymized Posts (Dec 2025)

Exploratory analysis of 101,040 Bluesky posts collected December 2–25, 2025 via the AT Protocol firehose.

All author DIDs, post URIs, and thread relationships are SHA-256 hashed. Each record also includes VADER sentiment, language detection, media flags, and thread structure.

**Dataset**: [github.com/lukeslp/bsky-firehose-anonymized-dec-2025](https://github.com/lukeslp/bsky-firehose-anonymized-dec-2025)

In [None]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import numpy as np
from collections import Counter

# Notebook-wide plotting defaults: whitegrid theme, wide figures, 11pt text.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 5),
    'font.size': 11,
})

# Accent colour shared by the charts in this notebook.
BLUE = '#1DA1F2'

print('Libraries loaded')

## 1. Load Data

In [None]:
# Columns stored in the CSV as JSON-encoded arrays.
JSON_LIST_COLUMNS = ('hashtags', 'mentions', 'links')


def _parse_json_list(cell):
    """Decode one JSON-array cell; missing, '' and '[]' cells become an empty list."""
    if pd.isna(cell) or cell in ('[]', ''):
        return []
    return json.loads(cell)


df = pd.read_csv('bluesky_posts.csv')
print(f'Raw rows: {len(df):,}')

# The export carries a long tail of empty rows (about 89K); keep only rows with text.
df = df.dropna(subset=['text'])
print(f'After dropping blanks: {len(df):,}')

# Timestamps become timezone-aware UTC; 'date' is the UTC calendar day of each post.
df['created_at'] = pd.to_datetime(df['created_at'], utc=True)
df['date'] = df['created_at'].dt.date

# Turn the JSON-array columns into real Python lists.
for column in JSON_LIST_COLUMNS:
    df[column] = df[column].apply(_parse_json_list)

df.head(3)

## 2. Language Distribution

In [None]:
# Twelve most frequent language codes, most common first.
lang_counts = df['language'].value_counts().head(12)
total_posts = len(df)

fig, ax = plt.subplots(figsize=(12, 5))
bars = ax.bar(
    lang_counts.index,
    lang_counts.values,
    color=BLUE,
    edgecolor='white',
    linewidth=0.5,
)
ax.set_title('Language Distribution (Top 12)', fontsize=14, fontweight='bold', pad=12)
ax.set_xlabel('Language Code')
ax.set_ylabel('Post Count')

# Show y ticks with thousands separators.
thousands = mticker.FuncFormatter(lambda value, _pos: f'{value:,.0f}')
ax.yaxis.set_major_formatter(thousands)

# Annotate every bar with its share of ALL posts (not just of the top 12).
for rect, n_posts_in_lang in zip(bars, lang_counts.values):
    share = n_posts_in_lang / total_posts * 100
    x_center = rect.get_x() + rect.get_width() / 2
    y_text = rect.get_height() + 200
    ax.text(x_center, y_text, f'{share:.1f}%', ha='center', fontsize=9)

plt.tight_layout()
plt.show()
print(lang_counts.to_string())

## 3. Sentiment Distribution

In [None]:
fig, (ax_pie, ax_kde) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: share of posts per sentiment label.
sent_counts = df['sentiment'].value_counts()
label_colors = {'positive': '#4CAF50', 'neutral': '#9E9E9E', 'negative': '#F44336'}
ax_pie.pie(
    sent_counts.values,
    labels=sent_counts.index,
    autopct='%1.1f%%',
    # Labels outside the three known ones fall back to grey.
    colors=[label_colors.get(label, '#999') for label in sent_counts.index],
    startangle=90,
)
ax_pie.set_title('Sentiment Category', fontsize=13, fontweight='bold')

# Right panel: kernel density estimate of the non-missing sentiment scores.
scores = df['sentiment_score'].dropna()
mean_score = scores.mean()
scores.plot.kde(ax=ax_kde, color=BLUE, linewidth=2)
ax_kde.axvline(0, color='red', linestyle='--', alpha=0.5, label='Neutral boundary')
ax_kde.axvline(mean_score, color='green', linestyle='--', alpha=0.7, label=f'Mean={mean_score:.3f}')
ax_kde.set_title('VADER Score Distribution', fontsize=13, fontweight='bold')
ax_kde.set_xlabel('VADER Compound Score (-1 to 1)')
ax_kde.legend()

plt.tight_layout()
plt.show()

## 4. Posting Volume by Day (Dec 2–25)

In [None]:
# One row per calendar day with the number of posts created that day.
daily = df.groupby('date').size().reset_index(name='posts')

# Days are drawn at evenly spaced integer positions and labelled with their date.
positions = np.arange(len(daily))
day_labels = daily['date'].map(str).tolist()

fig, ax = plt.subplots(figsize=(14, 5))
ax.fill_between(positions, daily['posts'], alpha=0.3, color=BLUE)
ax.plot(positions, daily['posts'], color=BLUE, linewidth=2, marker='o', markersize=5)
ax.set_xticks(positions)
ax.set_xticklabels(day_labels, rotation=45, ha='right', fontsize=9)
ax.set_title('Daily Post Volume (December 2025)', fontsize=14, fontweight='bold', pad=12)
ax.set_ylabel('Posts')

# Show y ticks with thousands separators.
thousands = mticker.FuncFormatter(lambda value, _pos: f'{value:,.0f}')
ax.yaxis.set_major_formatter(thousands)

plt.tight_layout()
plt.show()

## 5. Media Type Breakdown

In [None]:
# Raw tally of embed types (missing values counted as 'none'). It is not used by
# the chart in this cell; it is available for ad-hoc inspection.
embed_counts = df['embed_type'].fillna('none').value_counts()

# (flag column, display label) pairs, in wedge order.
media_labels = [
    ('has_images', 'Images'),
    ('has_video', 'Video'),
    ('has_link', 'Links'),
]
flag_columns = [col for col, _ in media_labels]

# Number of posts carrying each flag, plus posts carrying none of them.
media_data = {label: df[col].sum() for col, label in media_labels}
media_data['No Media'] = (~df[flag_columns].any(axis=1)).sum()

n_posts = len(df)
wedge_total = sum(media_data.values())


def _pct_of_posts(pct_of_wedges):
    """Relabel a wedge as a share of posts rather than a share of all wedges.

    matplotlib hands ``autopct`` the wedge's percentage of the SUM of the
    plotted values. Nothing in this cell guarantees the has_* flags are
    mutually exclusive, so that sum may exceed the number of posts and the
    default labels would then disagree with the printed breakdown below.
    When the flags never overlap, the result is the same as before.
    """
    wedge_count = pct_of_wedges / 100 * wedge_total
    return f'{wedge_count / n_posts * 100:.1f}%'


fig, ax = plt.subplots(figsize=(8, 8))
wedges, texts, autotexts = ax.pie(
    media_data.values(),
    labels=media_data.keys(),
    autopct=_pct_of_posts,
    # Colours follow the insertion order of media_data.
    colors=['#2196F3', '#FF9800', '#4CAF50', '#9E9E9E'],
    startangle=90,
    pctdistance=0.8,
)
ax.set_title('Media Type Distribution', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

# Text version of the same numbers, each as a share of all posts.
for k, v in media_data.items():
    print(f'{k}: {v:,} ({v/n_posts*100:.1f}%)')

## 6. Top Hashtags & Mentions

In [None]:
def _draw_ranking(ax, ranked, *, color, title, empty_text, empty_title):
    """Draw (label, count) pairs as horizontal bars with the largest on top.

    ``ranked`` is ordered most-common first, as returned by
    ``Counter.most_common``. When it is empty, a centred placeholder message
    is shown instead of bars.
    """
    if not ranked:
        ax.text(0.5, 0.5, empty_text, ha='center', va='center', transform=ax.transAxes)
        ax.set_title(empty_title, fontsize=13, fontweight='bold')
        return
    # barh draws the first entry at the bottom, so reverse to put rank 1 on top.
    labels, values = zip(*reversed(ranked))
    ax.barh(list(labels), list(values), color=color)
    ax.set_title(title, fontsize=13, fontweight='bold')
    ax.set_xlabel('Occurrences')


# Normalise hashtags (drop leading '#', lowercase) and only then discard empties,
# so a bare '#' does not end up counted as an empty-string hashtag.
normalized_tags = (tag.lstrip('#').lower() for tags in df['hashtags'] for tag in tags if tag)
all_hashtags = [tag for tag in normalized_tags if tag]
tag_counts = Counter(all_hashtags).most_common(20)

all_mentions = [m for mentions in df['mentions'] for m in mentions if m]
mention_counts = Counter(all_mentions).most_common(15)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

_draw_ranking(
    ax1,
    tag_counts,
    color='#AB47BC',
    title=f'Top Hashtags (total: {len(all_hashtags):,})',
    empty_text='No hashtags found',
    empty_title='Top Hashtags',
)
_draw_ranking(
    ax2,
    mention_counts,
    color='#26A69A',
    title=f'Top Mentions (total: {len(all_mentions):,})',
    empty_text='No mentions found',
    empty_title='Top Mentions (anonymized)',
)

plt.tight_layout()
plt.show()

## 7. Thread Structure

In [None]:
root_missing = df['reply_root_hash'].isna()
parent_missing = df['reply_parent_hash'].isna()

# Reply: has a parent hash. Standalone: has no root hash.
# Thread root: has a root hash but no parent hash.
# NOTE(review): whether the export ever produces rows with a root hash and no
# parent hash is not visible here; confirm this bucket measures what is intended.
is_reply = ~parent_missing
is_thread_root = ~root_missing & parent_missing
is_standalone = root_missing

thread_data = {
    'Standalone': is_standalone.sum(),
    'Reply': is_reply.sum(),
    'Thread root': is_thread_root.sum(),
}
total_posts = len(df)

fig, ax = plt.subplots(figsize=(8, 5))
bars = ax.bar(
    thread_data.keys(),
    thread_data.values(),
    color=['#607D8B', '#1565C0', '#00838F'],
)
ax.set_title('Post Types: Standalone vs Thread Structure', fontsize=14, fontweight='bold', pad=12)
ax.set_ylabel('Count')

# Show y ticks with thousands separators.
thousands = mticker.FuncFormatter(lambda value, _pos: f'{value:,.0f}')
ax.yaxis.set_major_formatter(thousands)

# Label each bar with its count and its share of all posts.
for rect, n in zip(bars, thread_data.values()):
    x_center = rect.get_x() + rect.get_width() / 2
    y_text = rect.get_height() + 100
    ax.text(x_center, y_text, f'{n:,}\n({n/total_posts*100:.1f}%)', ha='center', fontsize=10)

plt.tight_layout()
plt.show()

## 8. Text Length Distributions by Language

In [None]:
# Restrict the comparison to the five most common languages.
top_langs = df['language'].value_counts().head(5).index.tolist()
lang_subset = df[df['language'].isin(top_langs)]

fig, (ax_chars, ax_words) = plt.subplots(1, 2, figsize=(14, 5))

# (axis, column, upper clip, title, x label) for each panel.
panels = (
    (ax_chars, 'char_count', 400, 'Character Count Distribution by Language', 'Character Count'),
    (ax_words, 'word_count', 100, 'Word Count Distribution by Language', 'Word Count'),
)

for lang in top_langs:
    rows = lang_subset[lang_subset['language'] == lang]
    # Languages with 10 or fewer posts are left out of the density plots.
    if len(rows) <= 10:
        continue
    for axis, column, upper, _title, _xlabel in panels:
        # Values are clipped to [0, upper] before the density is estimated.
        rows[column].clip(0, upper).plot.kde(ax=axis, label=lang, linewidth=1.5)

for axis, _column, _upper, title, xlabel in panels:
    axis.set_title(title, fontsize=13, fontweight='bold')
    axis.set_xlabel(xlabel)
    axis.legend()

plt.tight_layout()
plt.show()

print('\nMedian character count by language:')
median_chars = lang_subset.groupby('language')['char_count'].median()
print(median_chars.sort_values(ascending=False))

## Summary

In [None]:
# Values reused by several lines of the report.
total_posts = len(df)
image_flags = df['has_images']
link_flags = df['has_link']
reply_mask = df['reply_parent_hash'].notna()

print('=== Bluesky Firehose Dec 2025 — Summary Statistics ===')
print(f'Total posts: {total_posts:,}')
print(f'Unique authors: {df["author_did_hash"].nunique():,}')
print(f'Date range: {df["date"].min()} to {df["date"].max()}')
print(f'Languages: {df["language"].nunique()}')

print('\nSentiment breakdown:')
for label, n in df['sentiment'].value_counts().items():
    print(f'  {label}: {n:,} ({n/total_posts*100:.1f}%)')

print(f'\nMean VADER score: {df["sentiment_score"].mean():.4f}')
print(f'Posts with images: {image_flags.sum():,} ({image_flags.mean()*100:.1f}%)')
print(f'Posts with links: {link_flags.sum():,} ({link_flags.mean()*100:.1f}%)')
print(f'Replies: {reply_mask.sum():,} ({reply_mask.mean()*100:.1f}%)')