In [None]:
import datetime
import os
import time
from collections import defaultdict

import matplotlib.ticker as mtick
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from pymongo import MongoClient
from wordcloud import WordCloud

# License

Please feel free to share this document in any medium you so choose. Attribution is nice, but not required. 

# Intro

This document was produced from a collection of metadata gathered from Archive of Our Own on `2023-03-14`. The data is filtered down to only works tagged under the `Persona 5` fandom, and then parsed here. Here are some fun, nerdy stats on that data set!

In [None]:
# Connection string for the AO3 metadata database. Set AO3_MONGO_URI in the
# environment to point the notebook at a different server or account; the
# fallback keeps the notebook runnable exactly as it was published.
# NOTE(review): the fallback embeds a username and password in the notebook.
# Presumably this is a public read-only account — confirm, or remove the
# default and require the environment variable.
MONGO_URI = os.environ.get(
    "AO3_MONGO_URI",
    "mongodb://owlzyhoots:owlzyhoots@aceden.xyz:27017/ao3",
)

mc = MongoClient(MONGO_URI)
db = mc['ao3']
works_col = db['works']      # collection the analysis below samples from
targets_col = db['targets']  # not referenced by the cells visible in this notebook

In [None]:
# Number of documents the `$sample` stage draws at random. Because the draw is
# random, the figures further down vary from run to run.
SAMPLE_SIZE = 100_000

pipeline = [
    # Stage 1 - filter. Currently empty, so every document is eligible; the
    # commented entries are example filters kept for reference.
    # NOTE(review): the intro says the data is restricted to the Persona 5
    # fandom, but no filter is active here — confirm the collection is
    # already pre-filtered.
    {
        '$match': {
            # 'language': 'English',
            # 'fandoms': {'$regex': 'Stardew Valley'},
            # 'fandoms':"Stardew Valley (Video Game)",
            # 'meta.updated': {"$exists": True},
        }
    },
    # Stage 2 - exclude fields that the analysis never reads.
    {
        '$project': {
            'url': 0,
            'relationships': 0,
        }
    },
    # Stage 3 - random sample of the remaining documents.
    {
        '$sample': {
            'size': SAMPLE_SIZE
        }
    }
]

# perf_counter() is monotonic, so the measured duration cannot be distorted by
# system clock adjustments the way a time.time() difference can.
started = time.perf_counter()
works = pd.DataFrame(list(works_col.aggregate(pipeline, allowDiskUse=True)))
elapsed = time.perf_counter() - started
print(f"Pipeline ran in [{elapsed:.3f}] seconds, returning [{works.shape[0]:,.0f}] works.")

In [None]:
# Yardstick used in the first message: the figure the notebook uses for the
# total word count of the Harry Potter series.
HARRY_POTTER_SERIES_WORDS = 1084170

# Aggregate once and reuse in both messages.
total_words = works.words.sum()
total_chapters = works.nchapters.sum()
n_works = works.shape[0]

print(
    f"This data set contains [{total_words:,}] total words. "
    f"Thats [{total_words/HARRY_POTTER_SERIES_WORDS:,.1f}x] the total number of words in the Harry Potter Series."
)
print(
    f"We found an average of [{total_words/total_chapters:,.0f}] words per chapter, "
    f"and [{total_words/n_works:,.0f}] words per work."
)

In [None]:
# Ratio of all hits to all words across the sampled works.
hits_per_word = works.hits.sum() / works.words.sum()
print(f"These works are generating an average of [{hits_per_word:.3f}] hits per word.")

# The Graphs
The following sections consist of various histograms. Each chart is shown first with a logarithmic count axis and then with a linear one. The log view makes the sparsely populated bins visible, while the linear view shows the true size of the spikes.

In [None]:
# Bin edges shared by both views: 5,000-word steps starting at 0, with the
# last edge at 495,000. Works outside the edges are not drawn.
word_bin_edges = range(0, 500_000, 5000)

# First pass draws log-scaled counts (and carries the title); second pass is linear.
for log_scale in (True, False):
    fig, ax = plt.subplots()
    ax.hist(works.words, bins=word_bin_edges, log=log_scale)
    if log_scale:
        ax.set_title('Words Histogram')

In [None]:
# Share of works whose word count strictly exceeds each threshold. The mean of
# a boolean mask is the fraction of True entries.
word_thresholds = [
    (10, "10"),
    (100, "100"),
    (1_000, "1K"),
    (10_000, "10K"),
    (100_000, "100K"),
]
for threshold, label in word_thresholds:
    share = (works.words > threshold).mean()
    print(f"About [{share:7.3%}] of works will have more than {label} words.")

print()

# quantile(q) is the word count below which a fraction q of works fall, so the
# longest (1 - q) share of works have at least that many words. Every message
# says "at least": the value is a lower bound for the group, not an exact count.
longest_share_quantiles = [
    ("99%", 0.01),
    ("75%", 0.25),
    ("50%", 0.5),
    ("25%", 0.75),
    ("5%", 0.95),
    ("1%", 0.99),
    ("0.1%", 0.999),
]
for label, q in longest_share_quantiles:
    print(f"The longest {label} of works will have at least [{works.words.quantile(q):,.0f}] words")

In [None]:
fig = plt.figure()
_ = plt.hist(works.nchapters, bins=range(1,201), log=True)
_ = plt.title('Chapters Histogram')

fig = plt.figure()
_ = plt.hist(works.nchapters, bins=range(1,201))

In [None]:
# Fraction of works with strictly more chapters than each threshold.
n_works = len(works.nchapters)
for threshold, label in [(1, "one chapter"), (10, "ten chapters")]:
    share = len(works.nchapters[works.nchapters > threshold]) / n_works
    print(f"Only [{share:.1%}] of works have more than {label}.")
    if threshold == 1:
        # Blank source line between the two statements in the original cell
        # produced no output, so nothing is printed here either.
        pass

In [None]:
# 100 equal-width bins spanning the observed range of hit counts; drawn once
# with log-scaled counts (titled) and once with linear counts.
for log_scale in (True, False):
    fig, ax = plt.subplots()
    ax.hist(works.hits, bins=100, log=log_scale)
    if log_scale:
        ax.set_title('Hits Histogram')

In [None]:
# Fraction of works whose hit count strictly exceeds each threshold.
n_works = len(works.hits)
hit_thresholds = [
    (1, "1 hit"),
    (10, "10 hits"),
    (100, "100 hits"),
    (1000, "1,000 hits"),
    (10000, "10,000 hits"),
    (100000, "100,000 hits"),
]
for threshold, label in hit_thresholds:
    share = len(works.hits[works.hits > threshold]) / n_works
    print(f"[{share:7.3%}] of works have more than {label}.")

print()

# (top share label, wording, quantile) — the quantile is written as 1 - share
# so the number passed to quantile() is exactly what the original cell used.
hit_quantiles = [
    ("99%", "at least", 1 - 0.99),
    ("50%", "at least", 1 - 0.50),
    ("10%", "more than", 1 - 0.1),
    ("1%", "more than", 1 - 0.01),
    ("0.1%", "more than", 1 - 0.001),
    ("0.01%", "more than", 1 - 0.0001),
]
for label, wording, q in hit_quantiles:
    print(f"The top {label} of works will get {wording} [{works.hits.quantile(q):,.0f}] hits.")

In [None]:
# Kudos distribution in 100 equal-width bins: log-scaled counts first.
fig, ax = plt.subplots()
ax.hist(works.kudos, bins=100, log=True)
ax.set_title('Kudos Histogram')

# Linear counts second (untitled, as before).
fig, ax = plt.subplots()
ax.hist(works.kudos, bins=100);

In [None]:
# Share of works with at least one kudos; its complement is reported.
share_with_kudos = len(works.kudos[works.kudos > 0]) / len(works.kudos)
print(f"[{1-share_with_kudos:.1%}] of works will get no kudos.")

print()

# (share label, wording, quantile) — quantiles stay written as 1 - share so the
# value handed to quantile() matches the original cell exactly.
kudos_quantiles = [
    ("90%", "at least", 1 - 0.90),
    ("50%", "at least", 1 - 0.50),
    ("10%", "more than", 1 - 0.1),
    ("1%", "more than", 1 - 0.01),
    ("0.1%", "more than", 1 - 0.001),
    ("0.01%", "more than", 1 - 0.0001),
]
for label, wording, q in kudos_quantiles:
    print(f"{label} of works will get {wording} [{works.kudos.quantile(q):,.0f}] kudos.")

In [None]:
# Comment-count distribution in 100 equal-width bins, log-scaled (titled) and
# then linear.
for log_scale in (True, False):
    fig, ax = plt.subplots()
    ax.hist(works.comments, bins=100, log=log_scale)
    if log_scale:
        ax.set_title('Comments Histogram')

In [None]:
# Fraction of works whose comment count strictly exceeds each threshold
# ("> 0" is reported as "at least one comment").
n_works = len(works.comments)
comment_thresholds = [
    (0, "at least one comment"),
    (10, "more than 10 comments"),
    (100, "more than 100 comments"),
]
for threshold, label in comment_thresholds:
    share = len(works.comments[works.comments > threshold]) / n_works
    print(f"Only [{share:.1%}] of works will get {label}.")

print()

# (top share label, wording, quantile). The 90% row is disabled, as it was in
# the original cell.
comment_quantiles = [
    # ("90%", "at least", 1 - 0.90),
    ("50%", "at least", 1 - 0.50),
    ("10%", "more than", 1 - 0.1),
    ("1%", "more than", 1 - 0.01),
    ("0.1%", "more than", 1 - 0.001),
    ("0.01%", "more than", 1 - 0.0001),
]
for label, wording, q in comment_quantiles:
    print(f"The top {label} of works will get {wording} [{works.comments.quantile(q):,.0f}] comments.")

In [None]:
# Tally how often each category appears. Each work's `categories` entry is
# iterated as a collection, so one work can contribute to several tallies.
categories = defaultdict(int)
for work_categories in works.categories:
    for category in work_categories:
        categories[category] += 1

fig, ax = plt.subplots()
# pie() coerces its values to a NumPy array; dict views are not sequences, so
# hand it concrete lists (values and labels stay in matching insertion order).
ax.pie(list(categories.values()), labels=list(categories.keys()), autopct='%.1f')
ax.set_title('Categories Breakdown');

In [None]:
# Tally how many works carry each rating (one rating value per work).
rating_counts = defaultdict(int)
for rating in works.rating:
    rating_counts[rating] += 1

fig, ax = plt.subplots()
# pie() coerces its values to a NumPy array; dict views are not sequences, so
# hand it concrete lists (values and labels stay in matching insertion order).
ax.pie(list(rating_counts.values()), labels=list(rating_counts.keys()), autopct='%.1f')
# This chart shows ratings, not categories — the earlier title was a copy-paste leftover.
ax.set_title('Ratings Breakdown');

In [None]:
FIRST_YEAR, LAST_YEAR = 2014, 2025

# Histogram bins are half-open except the last, which is closed on both ends.
# The edges therefore have to run to LAST_YEAR + 1 so that LAST_YEAR gets its
# own bin; stopping at LAST_YEAR would lump it together with the year before.
year_bin_edges = range(FIRST_YEAR, LAST_YEAR + 2)
published_year = works.date_published.dt.year

# Log-scaled counts.
fig = plt.figure()
_ = plt.hist(published_year, bins=year_bin_edges, log=True)
_ = plt.title('Works by Year')

# Linear counts.
fig = plt.figure()
_ = plt.hist(published_year, bins=year_bin_edges)

In [None]:
# One histogram of publication month per calendar year.
month_names = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
# Bin edges 1..13 give one unit-wide bin per month; ticks sit at each bin's centre.
month_bin_edges = range(1, 14, 1)
month_tick_positions = [month + 0.5 for month in range(1, 13)]

published_year = works.date_published.dt.year
for year in range(2014, 2025 + 1):
    works_in_year = works[published_year == year]
    fig, ax = plt.subplots()
    ax.hist(works_in_year.date_published.dt.month, bins=month_bin_edges)
    ax.set_title(f'Works Published by Month of Year in {year}')
    ax.set_xticks(month_tick_positions)
    ax.set_xticklabels(month_names)

In [None]:

# Publication month across every work in the sample. No year filter is applied
# here, so the title says "All Years" rather than naming a fixed range that
# can drift out of date.
fig = plt.figure()
# Bin edges 1..13 give one unit-wide bin per month.
_ = plt.hist(works.date_published.dt.month, bins=range(1, 14, 1))
_ = plt.title('Works Published by Month of Year (All Years)')
# Ticks sit at the centre of each month's bin.
_ = plt.xticks(
    [1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5, 10.5, 11.5, 12.5],
    ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
)

In [None]:
fig = plt.figure()
# Explicit edges 1..32 give exactly one unit-wide bin per calendar day. With a
# bare bin count the edges are derived from the smallest and largest day in
# the sample, so bars only line up with days when both 1 and 31 are present.
_ = plt.hist(works.date_published.dt.day, bins=range(1, 33), density=True)
_ = plt.title('Works Published by Day of Month')
_ = plt.xlabel('Day of month')

In [None]:
# Normalised histogram of publication weekday; pandas numbers weekdays 0-6
# and the labels below map 0 to Monday.
weekday_names = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]

fig, ax = plt.subplots()
ax.hist(works.date_published.dt.weekday, bins=range(0, 8), density=True)
ax.set_title('Works Published by Day of Week')
# Ticks sit at the centre of each unit-wide bin.
ax.set_xticks([day + 0.5 for day in range(7)])
ax.set_xticklabels(weekday_names);

# Deeper Dive

What happens when we only look at the top 10% of stories by hits?

In [None]:
# Hit count at the 90th percentile; works strictly above it form the subset.
hits_cutoff = works.hits.quantile(0.9)
works_top_10_percent_hits = works.loc[works.hits > hits_cutoff]
print(f"There are [{len(works_top_10_percent_hits):,}] works in this range.")

In [None]:
# Word counts of the top-hits subset, using 5,000-word steps from 0 with the
# last edge at 495,000. Log-scaled counts (titled) first, linear second.
word_bin_edges = range(0, 500_000, 5000)
for log_scale in (True, False):
    fig, ax = plt.subplots()
    ax.hist(works_top_10_percent_hits.words, bins=word_bin_edges, log=log_scale)
    if log_scale:
        ax.set_title('Words')

In [None]:
# Compare the subset's word counts against the whole sample.
top_words = works_top_10_percent_hits.words
all_words = works.words

print(
    f"These works have a median word count of [{top_words.median():,.0f}], "
    f"vs the global median of [{all_words.median():,.0f}]."
)
print(
    f"These works have a mean word count of [{top_words.mean():,.0f}], "
    f"vs the global mean of [{all_words.mean():,.0f}]."
)

In [None]:
# Chapter counts of the top-hits subset in 100 equal-width bins.
top_chapters = works_top_10_percent_hits.nchapters

# Log-scaled counts, titled.
fig, ax = plt.subplots()
ax.hist(top_chapters, bins=100, log=True)
ax.set_title('Chapters')

# Linear counts.
fig, ax = plt.subplots()
ax.hist(top_chapters, bins=100);

In [None]:
# Compare the subset's chapter counts against the whole sample.
top_chapters = works_top_10_percent_hits.nchapters
all_chapters = works.nchapters

print(
    f"These works have a median chapter count of [{top_chapters.median():,.1f}], "
    f"vs the global median of [{all_chapters.median():,.1f}]."
)
print(
    f"These works have a mean chapter count of [{top_chapters.mean():,.1f}], "
    f"vs the global mean of [{all_chapters.mean():,.1f}]."
)

In [None]:
# Hit counts of the top-hits subset in 100 equal-width bins.
# Log-scaled counts. The figure is created explicitly, as in the other
# histogram cells, so the plot never lands on whatever figure happens to be
# current.
fig = plt.figure()
_ = plt.hist(works_top_10_percent_hits.hits, bins=100, log=True)
_ = plt.title('Hits')

# Linear counts.
fig = plt.figure()
_ = plt.hist(works_top_10_percent_hits.hits, bins=100)

In [None]:
# Kudos counts of the top-hits subset in 100 equal-width bins.
# Log-scaled counts. The figure is created explicitly, as in the other
# histogram cells, so the plot never lands on whatever figure happens to be
# current.
fig = plt.figure()
_ = plt.hist(works_top_10_percent_hits.kudos, bins=100, log=True)
_ = plt.title('Kudos')

# Linear counts.
fig = plt.figure()
_ = plt.hist(works_top_10_percent_hits.kudos, bins=100)

In [None]:
# Flatten every work's tag list into one text blob, each tag preceded by a
# single space. str.join builds the string in one pass instead of growing it
# with repeated concatenation.
cloud_string = "".join(
    f" {tag}"
    for tag_list in works.tags
    for tag in tag_list
)

In [None]:
# Build a word cloud with default settings from the combined tag text.
cloud_builder = WordCloud()
wordcloud = cloud_builder.generate(cloud_string)

This is a word cloud of all the tags across ALL fics in the dataset.

In [None]:
# Render the word cloud as an image on a 300-dpi figure with the axes hidden.
fig, ax = plt.subplots(dpi=300)
ax.imshow(wordcloud, interpolation='antialiased')
ax.axis("off");

The following data is binned by year. The early years look a bit odd because there are so few works that a handful of tags stand out.

In [None]:
# One tag word cloud per publication year. The upper bound is inclusive of
# 2025 so this cell covers the same final year as the per-year histograms.
published_year = works.date_published.dt.year
for year in range(2000, 2025 + 1):
    tags_in_year = works.tags[published_year == year]
    # Same flattening as the all-years cloud: each tag preceded by one space.
    cloud_string = "".join(
        f" {tag}"
        for tag_list in tags_in_year
        for tag in tag_list
    )
    try:
        wordcloud = WordCloud().generate(cloud_string)
    except ValueError:
        # Presumably raised when there is no text to draw (a year with no
        # tagged works); such years are skipped rather than plotted empty.
        continue

    fig = plt.figure(dpi=300)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.title(str(year))

---