In [60]:
import duckdb
# Build a {story id: time} lookup for stories newer than 2020-01-01.
# `time` is fed to to_timestamp() in the query, so it is presumably epoch
# seconds -- confirm against the parquet schema.
_story_rows = duckdb.query(
    "select id, time from 'story.parquet' where to_timestamp(time) > timestamp '2020-01-01'"
).execute().fetchall()
post_times = dict(_story_rows)

In [59]:
from metaflow import Flow, namespace
# NOTE(review): namespace(None) presumably lifts Metaflow's namespace filter so
# that runs started by other users / production deployments are visible --
# confirm against the Metaflow client docs.
namespace(None)

# Take the first run yielded for the 'analyze_this' tag.
_tagged_runs = Flow('HNSentimentAnalyzePosts').runs('analyze_this')
post_run = next(_tagged_runs)

# Re-key the 'post_tags' artifact of the 'end' step by integer post id.
# Downstream cells unpack each value as a 2-tuple whose first item is a list
# of topic strings.
_post_tags = post_run['end'].task['post_tags'].data
topics = {int(post_id): tagged for post_id, tagged in _post_tags.items()}
post_run

Run('HNSentimentAnalyzePosts/argo-hnsentiment.user.tuulosgmail.com.hnsentimentanalyzeposts-rhdls')

In [50]:
from collections import Counter
def all_topics(filterset=None):
    """Yield every topic label attached to the posts in the global `topics`.

    Labels are yielded once per occurrence, with surrounding whitespace
    stripped.

    Parameters
    ----------
    filterset : container of post ids, optional
        When given, only posts whose id is in this container contribute.
        ``None`` (the default) means all posts.
    """
    for post_id, entry in topics.items():
        topic_lst, _ = entry
        if filterset is not None and post_id not in filterset:
            continue
        yield from (topic.strip() for topic in topic_lst)

In [61]:
from itertools import chain, islice

import altair as alt
import pandas as pd

# Select Altair's 'html' renderer for chart output in this notebook.
alt.renderers.enable('html')

# Register the project-local THEME under the name 'ob' and make it the active
# theme for every chart created below.
from altair_theme import THEME
alt.themes.register('ob', lambda: THEME)
alt.themes.enable('ob')

#alt.themes.enable('fivethirtyeight')
#theme = alt.themes.get()
#theme['config']['background'] = 'black'

# Tally every topic occurrence across all posts and keep the 20 most common.
c = Counter(all_topics())
_top20 = c.most_common(20)
df = pd.DataFrame(_top20, columns=['Topic', 'Count'])

# Horizontal bars ordered by descending count, each annotated with its count.
_x = alt.X('Count', title="Number of posts")
_y = alt.Y("Topic").sort('-x')
base = (
    alt.Chart(df)
    .encode(x=_x, y=_y, text='Count')
    .properties(width=1000, height=700)
)
_bars = base.mark_bar()
_labels = base.mark_text(align='left', dx=5)
chart = _bars + _labels

# Last expression of the cell: display the chart with enlarged axis fonts.
chart.configure(axis=alt.AxisConfig(labelFontSize=16, titleFontSize=20))

In [69]:
from datetime import datetime
# Split posts into those before and those on/after 2022-01-01.
# datetime(...) is naive here, so the cutoff is interpreted in the kernel's
# local timezone.
cutoff = datetime(2022, 1, 1).timestamp()
old = set()
new = set()
for post_id, posted_at in post_times.items():
    if posted_at < cutoff:
        old.add(post_id)
    elif posted_at >= cutoff:
        new.add(post_id)

old_topics = Counter(all_topics(old))
new_topics = Counter(all_topics(new))

# For every topic seen at least 100 times overall, compute
# (occurrences per old post) - (occurrences per new post), then sort ascending
# so the most negative values (largest gain in the new period) come first.
diff = sorted(
    (old_topics[topic] / len(old) - new_topics[topic] / len(new), topic)
    for topic in old_topics.keys() | new_topics.keys()
    if old_topics[topic] + new_topics[topic] >= 100
)

# First ten entries: biggest gains after the cutoff; last ten: biggest losses.
up_topics = frozenset(topic for _, topic in diff[:10])
down_topics = frozenset(topic for _, topic in diff[-10:])
up_topics

frozenset({'AI',
           'Artificial Intelligence',
           'ChatGPT',
           'GitHub',
           'Layoffs',
           'Machine Learning',
           'Natural Language Processing',
           'Open Source',
           'Productivity',
           'Technology'})

In [105]:
from datetime import datetime
import pandas as pd

# Posts with a timestamp at or after this instant are left out of the time
# series. The datetime is naive, so it is interpreted in the kernel's local
# timezone.
CUTOFF = datetime(2023, 6, 1).timestamp()

def timeseries(toi, post_times_by_id=None, topics_by_post=None, cutoff=None):
    """Count posts per calendar month for each topic of interest.

    Parameters
    ----------
    toi : iterable of str
        Topics of interest. Each one becomes a series in the result.
    post_times_by_id : mapping of post id -> epoch timestamp, optional
        Defaults to the notebook-level ``post_times``.
    topics_by_post : mapping of post id -> (topic list, other), optional
        Defaults to the notebook-level ``topics``.
    cutoff : float, optional
        Only posts with a timestamp strictly below this value are counted.
        Defaults to ``CUTOFF``.

    Returns
    -------
    pandas.DataFrame
        One row per (topic, month) that has at least one post, with columns
        ``topic``, ``date`` (first instant of the month, local time) and
        ``posts``. The columns are present even when there are no rows.
    """
    if post_times_by_id is None:
        post_times_by_id = post_times
    if topics_by_post is None:
        topics_by_post = topics
    if cutoff is None:
        cutoff = CUTOFF

    days = {t: Counter() for t in toi}
    for post_id, tstamp in post_times_by_id.items():
        if tstamp < cutoff and post_id in topics_by_post:
            post_topics, _ = topics_by_post[post_id]
            # Strip labels before matching, exactly as all_topics() does;
            # otherwise labels with stray whitespace are silently dropped.
            # Membership is tested against `days` (a dict keyed by toi) so it
            # is O(1) and also works when `toi` is a list or a generator.
            matched = [name for name in (t.strip() for t in post_topics) if name in days]
            if not matched:
                continue
            # Truncate to the start of the month. microsecond must be reset
            # too, or fractional timestamps would each get their own bucket.
            month = datetime.fromtimestamp(tstamp).replace(
                day=1, hour=0, minute=0, second=0, microsecond=0
            )
            for name in matched:
                days[name][month] += 1

    dfdata = [
        {'topic': t, 'date': day, 'posts': count}
        for t, series in days.items()
        for day, count in series.items()
    ]
    # Explicit columns keep the frame well-formed when nothing matched.
    return pd.DataFrame(dfdata, columns=['topic', 'date', 'posts'])

In [122]:
def timeseries_chart(df):
    """Build a layered Altair chart of posts over time, one line per topic.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``topic``, ``date`` and ``posts`` columns, as returned by
        ``timeseries``.

    Returns
    -------
    A configured Altair layered chart: a line per topic, a dot on each
    topic's latest data point, and the topic name as a text label next to
    that dot.
    """
    legend = alt.Legend(
        legendX=100,
        legendY=40,
        direction='horizontal',
        orient='top',
        labelFontSize=14,
        title='',
    )
    # Every layer shares the per-topic colour encoding.
    colored = alt.Chart(df).encode(color=alt.Color("topic", legend=legend))

    trend = colored.mark_line().encode(x="date", y="posts")

    # For each topic keep only the row with the greatest date, so the dot and
    # the label sit at the end of the line.
    endpoint = colored.mark_circle().encode(
        alt.X("last_date['date']:T"),
        alt.Y("last_date['posts']:Q"),
    ).transform_aggregate(last_date="argmax(date)", groupby=["topic"])
    endpoint_label = endpoint.mark_text(align="left", dx=4, fontSize=14).encode(text="topic")

    layered = (trend + endpoint + endpoint_label).encode(
        x=alt.X(axis=alt.Axis(title='')).title("date"),
        y=alt.Y().title("posts"),
    )
    sized = layered.properties(width=1000, height=700)
    axis_fonts = alt.AxisConfig(labelFontSize=16, titleFontSize=20)
    return sized.configure(axis=axis_fonts)

In [123]:
# Monthly post counts for `up_topics`: the ten topics at the head of `diff`,
# i.e. those whose per-post frequency grew most after the 2022-01-01 cutoff.
timeseries_chart(timeseries(up_topics))

In [124]:
# Monthly post counts for `down_topics`: the ten topics at the tail of `diff`,
# i.e. those whose per-post frequency dropped most after the 2022-01-01 cutoff.
timeseries_chart(timeseries(down_topics))