In [1]:
import simdjson


def load_json(path):
    """Load the JSON file at `path` through a newly created `simdjson.Parser`.

    The value returned is whatever `Parser.load` hands back; the original
    note on this helper describes it as a lazily loaded proxy object.
    """
    # NOTE(review): presumably one parser per file so that proxies from
    # earlier loads remain valid -- confirm against the pysimdjson docs.
    return simdjson.Parser().load(path)

In [2]:
# Load datasets.
# Files are loaded in the order listed and unpacked positionally, so the
# target names and the paths below must stay aligned one-to-one.
(
    tags,
    posts,
    tag_names_to_ids,
    tags_with_rankings,
    tags_with_average_score,
) = map(load_json, (
    "processed/tags.json",
    "processed/posts.json",
    "processed/tag_names_to_ids.json",
    "processed/tags_with_rankings.json",
    "processed/tags_with_average_score.json",
))

In [89]:
import os
from collections import defaultdict
from datetime import datetime

import pandas as pd
import plotly.express as px


def plot(name, figure):
    """Center the figure's title, save the figure as JSON, and display it.

    Args:
        name: Base name of the output file; the figure is written to
            ``processed/plots/{name}.json``.
        figure: Object exposing ``update_layout``, ``write_json`` and
            ``show`` (the notebook passes plotly express figures).

    Returns:
        None.
    """
    output_path = f"processed/plots/{name}.json"
    # Make sure the target directory exists so that a fresh checkout does not
    # fail with FileNotFoundError when the first figure is saved.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    figure.update_layout(title_x=0.5)
    figure.write_json(output_path)
    figure.show()

In [4]:
# Tag category counts.
# Tally how many tags carry each value of the "category" field.
tag_category_counts = defaultdict(int)
for category in (tag["category"] for tag in tags.values()):
    tag_category_counts[category] += 1

In [5]:
# Show the per-category tag tallies as this cell's output.
tag_category_counts

defaultdict(int, {1: 245719, 4: 162764, 0: 58053, 3: 31932, 5: 392})

In [6]:
def reindex_ratings(df):
    """Order the rows of `df` as s, q, e and give them readable labels.

    Rows are selected by index label via `reindex`, so a label absent from
    `df` yields a NaN-filled row; the index is then renamed to
    "Safe", "Questionable" and "Explicit".
    """
    # Insertion order of this mapping doubles as the desired row order.
    rating_labels = {
        "s": "Safe",
        "q": "Questionable",
        "e": "Explicit",
    }
    ordered = df.reindex(list(rating_labels))
    return ordered.rename(index=rating_labels)

In [7]:
# Post rating counts.
# Number of posts carrying each value of the "rating" field.
post_rating_counts = defaultdict(int)
for rating in (post["rating"] for post in posts.values()):
    post_rating_counts[rating] += 1

In [91]:
# The dict keys become the columns of a one-row frame labelled "count";
# transposing gives one row per rating, which reindex_ratings then orders
# and relabels.
post_rating_counts_df = pd.DataFrame(post_rating_counts, index=["count"]).T
post_rating_counts_df = reindex_ratings(post_rating_counts_df)
plot(
    "post_rating_counts",
    px.bar(post_rating_counts_df).update_layout(
        title="Post count for each rating category",
        xaxis_title="Rating category",
        yaxis_title="Post count",
        showlegend=False,
    ),
)

In [24]:
# Post rating cumulative scores.
# Sum of the "score" field over all posts that share a rating.
post_rating_cumulative_scores = defaultdict(int)
for rating, score in ((post["rating"], post["score"]) for post in posts.values()):
    post_rating_cumulative_scores[rating] += score

In [26]:
# Post rating average scores.
# Cumulative score of each rating divided by the number of posts with it.
post_rating_average_scores = {
    rating: post_rating_cumulative_scores[rating] / post_total
    for rating, post_total in post_rating_counts.items()
}

In [92]:
# The values here are per-rating average scores, so the single column is
# labelled "average_score" (it was previously labelled "count", copied from
# the rating-count cell). The column label is what the bar trace is named
# after, so it shows up in the saved figure's hover text.
post_rating_average_scores_df = reindex_ratings(
    pd.DataFrame(post_rating_average_scores, index=["average_score"]).T
)
plot("post_rating_average_scores", px.bar(post_rating_average_scores_df).update_layout(
    title="Average score for each rating category",
    xaxis_title="Rating category",
    yaxis_title="Average score",
    showlegend=False,
))

In [45]:
def timestamp_to_date(timestamp):
    """Parse the part of `timestamp` that precedes its first space.

    Everything from the first space onwards is discarded and the remaining
    prefix is passed to `datetime.fromisoformat`, whose result is returned.
    """
    date_part, _, _ = timestamp.partition(" ")
    return datetime.fromisoformat(date_part)

In [46]:
# Post scores against time.
# One record per post: its parsed creation date and its score.
post_scores_and_time = [
    {"created_at": timestamp_to_date(created_at), "score": score}
    for created_at, score in (
        (post["created_at"], post["score"]) for post in posts.values()
    )
]

In [47]:
# Average the per-post scores within "1W" bins of the creation date.
post_scores_and_time_df = (
    pd.DataFrame(post_scores_and_time)
    .resample("1W", on="created_at")
    .mean()
)

In [93]:
# Scatter of the resampled average score against creation time.
plot(
    "post_scores_and_time",
    px.scatter(post_scores_and_time_df).update_layout(
        title="Average score over post creation time",
        xaxis_title="Creation time",
        yaxis_title="Average score",
        showlegend=False,
    ),
)

In [49]:
# Post tag counts.
# Maps a tag total to the number of posts that have exactly that many tags.
post_tag_counts = defaultdict(int)
for tag_total in (len(post["tags"]) for post in posts.values()):
    post_tag_counts[tag_total] += 1

In [94]:
# One row per tag total with a single "count" column, then plot it.
post_tag_counts_df = pd.DataFrame(post_tag_counts, index=["count"]).transpose()
plot(
    "post_tag_counts",
    px.scatter(post_tag_counts_df).update_layout(
        title="Number of posts with tag count",
        xaxis_title="Tag count",
        yaxis_title="Post count",
        showlegend=False,
    ),
)

In [51]:
# Upper bound (inclusive) on a post's number of tags for the post to be kept
# when building the score-versus-tag-count records later in this notebook.
post_tag_count_cutoff = 200

In [52]:
# Post tag count against time.
# One record per post: its parsed creation date and how many tags it has.
post_tag_counts_and_time = [
    {"created_at": timestamp_to_date(created_at), "tag_count": len(post_tags)}
    for created_at, post_tags in (
        (post["created_at"], post["tags"]) for post in posts.values()
    )
]

In [53]:
# Average the per-post tag counts within "1W" bins of the creation date.
post_tag_counts_and_time_df = (
    pd.DataFrame(post_tag_counts_and_time)
    .resample("1W", on="created_at")
    .mean()
)

In [95]:
# Scatter of the resampled average tag count against creation time.
plot(
    "post_tag_counts_and_time",
    px.scatter(post_tag_counts_and_time_df).update_layout(
        title="Average post tag count over post creation time",
        xaxis_title="Creation time",
        yaxis_title="Average tag count",
        showlegend=False,
    ),
)

In [57]:
# Post score against tag count.
# One record per post whose tag total does not exceed post_tag_count_cutoff.
post_scores_and_tag_count = [
    {"tag_count": tag_total, "score": score}
    for tag_total, score in (
        (len(post["tags"]), post["score"]) for post in posts.values()
    )
    if tag_total <= post_tag_count_cutoff
]

In [58]:
# Mean score of all retained posts that share the same tag count.
post_scores_and_tag_count_df = (
    pd.DataFrame(post_scores_and_tag_count)
    .groupby("tag_count")
    .mean()
)

In [96]:
# Scatter of the mean score against the number of tags on a post.
plot(
    "post_scores_and_tag_count",
    px.scatter(post_scores_and_tag_count_df).update_layout(
        title="Average score for post tag count",
        xaxis_title="Tag count",
        yaxis_title="Average score",
        showlegend=False,
    ),
)

In [61]:
# Post creation date counts.
# Number of posts per parsed creation date.
post_creation_date_counts = defaultdict(int)
for post in posts.values():
    post_creation_date_counts[timestamp_to_date(post["created_at"])] += 1

In [62]:
# One row per creation date with a single "count" column, then summed
# within "1M" resampling bins.
post_creation_date_counts_df = (
    pd.DataFrame(post_creation_date_counts, index=["count"])
    .T
    .resample("1M")
    .sum()
)

In [97]:
# Scatter of the resampled post counts against creation time.
plot(
    "post_creation_date_counts",
    px.scatter(post_creation_date_counts_df).update_layout(
        title="Number of posts with post creation time",
        xaxis_title="Creation time",
        yaxis_title="Post count",
        showlegend=False,
    ),
)

In [84]:
# Top tags sorted by average post score.
# Order the (tag id, score) pairs by score, highest first, keep the first 50,
# and resolve each tag id to its name through `tags`.
tags_with_average_score_sorted = [
    {"name": tags[tag_id]["name"], "score": score}
    for tag_id, score in sorted(
        tags_with_average_score.items(),
        key=lambda item: item[1],
        reverse=True,
    )[:50]
]

In [98]:
# Bar chart of (at most) the first 50 name/score records.
tags_with_average_score_df = pd.DataFrame(tags_with_average_score_sorted[:50])
plot(
    "tags_with_average_score",
    px.bar(tags_with_average_score_df, x="name", y="score").update_layout(
        title="Average score for top scoring tags (post count ≥ 1000)",
        xaxis_title="Tag name",
        yaxis_title="Average score",
        showlegend=False,
    ),
)