In [69]:
import duckdb
# Load (id, time) pairs for stories dated after 2020-01-01 from the local
# parquet file and index them by story id. `time` is fed to to_timestamp()
# in SQL and later compared with datetime.timestamp() values, so it is
# presumably Unix epoch seconds.
story_sql = "select id, time from 'story.parquet' where to_timestamp(time) > timestamp '2020-01-01'"
story_rows = duckdb.query(story_sql).execute().fetchall()
post_times = dict(story_rows)

In [70]:
from metaflow import Flow, namespace
# Disable Metaflow namespace filtering so runs created by other users or
# by a production deployment are visible to the client.
namespace(None)
# Take the first run the client returns among those tagged 'analyze_this'.
post_run =  next(Flow('HNSentimentAnalyzePosts').runs('analyze_this'))
# post_tags maps a post id to a 2-tuple whose first element is the list of
# topic strings; the second element is not used in this notebook.
raw_post_tags = post_run['end'].task['post_tags'].data
# Normalise each topic name BEFORE de-duplicating: strip surrounding
# whitespace first (otherwise ' ai'.capitalize() stays ' ai', and 'Ai ' and
# 'Ai' are kept as two distinct entries), then capitalize (which already
# lowercases the rest of the string). Blank tags are dropped, and the result
# is sorted so the per-post list order is deterministic across runs.
topics = {
    int(post_id): sorted({tag.strip().capitalize() for tag in tags if tag.strip()})
    for post_id, (tags, _) in raw_post_tags.items()
}
print('num docs', len(topics))
post_run

num docs 84365


Run('HNSentimentAnalyzePosts/argo-hnsentiment.user.tuulosgmail.com.hnsentimentanalyzeposts-rhdls')

In [71]:
from collections import Counter
def all_topics(filterset=None):
    """Yield every topic name, whitespace-stripped, from the global ``topics``.

    Args:
        filterset: optional container of post ids. When given, only posts
            whose id is in it contribute topics; ``None`` means all posts.

    Yields:
        One stripped topic string per (post, topic) pair, in the iteration
        order of ``topics`` and of each post's topic list.
    """
    for post_id, labels in topics.items():
        if filterset is not None and post_id not in filterset:
            continue
        yield from (label.strip() for label in labels)

In [80]:
from itertools import chain, islice

import altair as alt
import pandas as pd

# Select Altair's 'html' renderer for chart output in this notebook.
alt.renderers.enable('html')

# THEME comes from the altair_theme module (presumably a project-local file
# next to this notebook -- confirm it is checked in alongside it).
from altair_theme import THEME
# Register the theme under the name 'ob' and make it the active theme for
# every chart created below.
alt.themes.register('ob', lambda: THEME)
alt.themes.enable('ob')

# Alternative styling kept for reference: built-in 'fivethirtyeight' theme
# with a black background. Not active.
#alt.themes.enable('fivethirtyeight')
#theme = alt.themes.get()
#theme['config']['background'] = 'black'

# Tally every topic across all posts and keep the 20 most frequent ones.
c = Counter(all_topics())
top20 = c.most_common(20)
df = pd.DataFrame(top20, columns=['Topic', 'Count'])

# Share of all topic mentions covered by the top 20. Note that the value
# printed is a fraction in [0, 1], not a percentage.
top20_total = sum(count for _, count in top20)
print('top %', top20_total / sum(c.values()))

# Horizontal bar chart sorted by count, with the count written next to each bar.
base = (
    alt.Chart(df)
    .encode(
        x=alt.X('Count', title="Number of posts"),
        y=alt.Y("Topic").sort('-x'),
        text='Count',
    )
    .properties(width=1000, height=900)
)
bars = base.mark_bar()
bar_labels = base.mark_text(align='left', dx=5, fontSize=22)
chart = bars + bar_labels

# Last expression of the cell: the configured chart is what gets displayed.
axis_style = alt.AxisConfig(labelFontSize=22, titleFontSize=22)
chart.configure(axis=axis_style, padding=40)

top % 0.09515687396550557
num topics 144788


In [73]:
from datetime import datetime
# Split the posts into two periods at 2022-01-01 (local time).
cutoff = datetime(2022, 1, 1).timestamp()
old = {pid for pid, posted_at in post_times.items() if posted_at < cutoff}
new = {pid for pid, posted_at in post_times.items() if posted_at >= cutoff}

# Topic mention counts within each period.
old_topics = Counter(all_topics(old))
new_topics = Counter(all_topics(new))

# For every topic mentioned at least 100 times overall, compute
# (mentions per old post) - (mentions per new post). Sorting ascending puts
# the most negative values first, i.e. topics whose share grew the most.
# Counter lookups return 0 for a missing topic without inserting it.
n_old, n_new = len(old), len(new)
diff = sorted(
    (old_topics[name] / n_old - new_topics[name] / n_new, name)
    for name in old_topics.keys() | new_topics.keys()
    if old_topics[name] + new_topics[name] >= 100
)

# First ten entries: biggest risers. Last ten: biggest decliners.
up_topics = frozenset(name for _, name in diff[:10])
down_topics = frozenset(name for _, name in diff[-10:])
up_topics

frozenset({'Ai',
           'Artificial intelligence',
           'Chatgpt',
           'Layoffs',
           'Machine learning',
           'Natural language processing',
           'Open source',
           'Software development',
           'Technology',
           'Web development'})

In [74]:
from datetime import datetime
import pandas as pd

# Posts at or after this instant (2023-06-01, local time) are left out of
# the time series.
CUTOFF = datetime(2023, 6, 1).timestamp()

def timeseries(toi):
    """Count posts per calendar month for each topic of interest.

    Reads the notebook-level ``post_times`` (post id -> timestamp) and
    ``topics`` (post id -> list of topic names). Posts timestamped at or
    after ``CUTOFF`` and posts without an entry in ``topics`` are ignored.

    Args:
        toi: iterable of topic names to track. Names are expected in the
            whitespace-stripped form produced by ``all_topics``.

    Returns:
        A DataFrame with columns ``topic``, ``date`` (first instant of the
        month, local time) and ``posts`` (number of posts in that month
        carrying the topic). The three columns are present even when no
        post matches.
    """
    days = {t: Counter() for t in toi}
    for post_id, tstamp in post_times.items():
        if tstamp < CUTOFF and post_id in topics:
            # Strip names the same way all_topics() does, so a tag stored
            # with stray whitespace still matches its entry in `toi`; the
            # set keeps one post from counting twice for the same topic.
            hits = {t.strip() for t in topics[post_id]}
            hits = [t for t in hits if t in days]
            if not hits:
                continue
            # Truncate to the start of the month. microsecond must be
            # zeroed as well, otherwise fractional timestamps would each
            # land in their own bucket.
            month = datetime.fromtimestamp(tstamp).replace(
                day=1, hour=0, minute=0, second=0, microsecond=0
            )
            for t in hits:
                days[t][month] += 1
    dfdata = [
        {'topic': t, 'date': month, 'posts': count}
        for t, series in days.items()
        for month, count in series.items()
    ]
    # Explicit columns keep the schema stable when dfdata is empty.
    return pd.DataFrame(dfdata, columns=['topic', 'date', 'posts'])

In [75]:
def timeseries_chart(df, enable_legend=True):
    """Build a layered line chart of posts over time, one line per topic.

    Args:
        df: data with ``topic``, ``date`` and ``posts`` fields, as referenced
            by the encodings below.
        enable_legend: when truthy, show a horizontal colour legend at the
            top; otherwise the colour legend is disabled.

    Returns:
        The configured Altair layered chart: a line per topic, a dot on each
        topic's latest data point, and the topic name as text next to it.
    """
    legend = (
        alt.Legend(
            legendX=100,
            legendY=40,
            direction='horizontal',
            orient='top',
            labelFontSize=22,
            title='',
        )
        if enable_legend
        else None
    )
    # Every layer shares the per-topic colour encoding.
    colored = alt.Chart(df).encode(color=alt.Color("topic", legend=legend))

    trend = colored.mark_line().encode(x="date", y="posts")

    # Per topic, keep only the row with the greatest date and plot it.
    end_marker = (
        colored.mark_circle()
        .encode(
            alt.X("last_date['date']:T"),
            alt.Y("last_date['posts']:Q"),
        )
        .transform_aggregate(last_date="argmax(date)", groupby=["topic"])
    )
    # Derived from end_marker, so it reuses its encodings and aggregate.
    end_label = end_marker.mark_text(align="left", dx=4, fontSize=20).encode(text="topic")

    layered = (trend + end_marker + end_label).encode(
        x=alt.X(axis=alt.Axis(title='')).title("date"),
        y=alt.Y().title("posts"),
    ).properties(width=1000, height=900)

    axis_style = alt.AxisConfig(labelFontSize=22, titleFontSize=22)
    return layered.configure(axis=axis_style, padding=40)

In [76]:
# Monthly post counts for the topics in up_topics. The legend is disabled;
# timeseries_chart still writes each topic's name next to its last point.
timeseries_chart(timeseries(up_topics), enable_legend=False)

In [77]:
# Monthly post counts for the topics in down_topics, with the default
# colour legend shown.
timeseries_chart(timeseries(down_topics))

In [78]:
# Topics that occur only in the newer period, as (count, topic) pairs
# ordered from most to least frequent; show the first 15.
brand_new = sorted(
    ((new_topics[name], name) for name in new_topics.keys() - old_topics.keys()),
    reverse=True,
)
brand_new[:15]

[(80, 'Gpt-4'),
 (56, 'Stable diffusion'),
 (51, 'Silicon valley bank'),
 (42, 'Russia-ukraine war'),
 (18, 'Ventura'),
 (16, 'Bank failure'),
 (15, 'Midjourney'),
 (15, 'Mac studio'),
 (15, 'Hiring freeze'),
 (14, 'War in ukraine'),
 (14, 'Meta ai'),
 (14, 'Fido alliance'),
 (14, 'Fdic'),
 (14, 'Cost of living crisis'),
 (13, 'Ukraine crisis')]

In [79]:
# Topics that occur only in the older period, as (count, topic) pairs
# ordered from most to least frequent; show the first 15.
# NOTE: this reuses the name `brand_new`, although here it holds topics
# that disappeared rather than new ones.
brand_new = sorted(
    ((old_topics[name], name) for name in old_topics.keys() - new_topics.keys()),
    reverse=True,
)
brand_new[:15]

[(32, 'George floyd'),
 (26, 'Herd immunity'),
 (22, 'Antibodies'),
 (20, 'Ios 14'),
 (19, 'Freenode'),
 (18, 'Suez canal'),
 (18, 'Hydroxychloroquine'),
 (18, 'Community update'),
 (15, 'Racial injustice'),
 (15, 'Immune response'),
 (14, 'Ventilators'),
 (14, 'Infection rates'),
 (12, 'Wallstreetbets'),
 (12, 'Ventilator'),
 (12, 'Remdesivir')]