In [1]:
from metaflow import Flow, namespace
# Switch the Metaflow client to the global namespace before looking up runs.
namespace(None)
# First run returned for the 'analyze_this' tag filter.
post_run = next(Flow('HNSentimentAnalyzeComments').runs('analyze_this'))

sentiments = {}      # post id (int) -> sentiment score
comments_meta = {}   # post id (int) -> num_tokens value stored with that score
total_tokens = 0

for task in post_run['analyze_comments']:
    # Tasks without the 'post_sentiment' artifact contribute nothing.
    if 'post_sentiment' not in task:
        continue
    for raw_id, (score, num_tokens) in task['post_sentiment'].data.items():
        pid = int(raw_id)
        sentiments[pid] = score
        comments_meta[pid] = num_tokens
        total_tokens += num_tokens

print(f"Sentiments found for {len(sentiments)} posts, {total_tokens} processed")

Sentiments found for 109512 posts, 236575696 processed


In [2]:
from collections import Counter
def all_topics(filterset=None):
    """Yield every topic string, whitespace-stripped, across posts.

    Reads the notebook-level ``topics`` mapping when iterated, so the mapping
    only has to exist by the time the generator is consumed.

    Args:
        filterset: optional container of post ids; when given, only posts
            whose id is in it contribute topics. ``None`` means all posts.

    Yields:
        One stripped topic string per (post, topic) pair. Duplicates across
        posts are not removed.
    """
    for post_id, entry in topics.items():
        if filterset is not None and post_id not in filterset:
            continue
        # ``topics`` maps post id -> list of topic strings. A
        # ``(topic_list, num_tokens)`` tuple is still accepted so the function
        # keeps working if it is pointed at a mapping with that older shape.
        topic_lst = entry[0] if isinstance(entry, tuple) else entry
        for topic in topic_lst:
            yield topic.strip()

In [3]:
from metaflow import Flow, namespace
# Switch the Metaflow client to the global namespace before looking up runs.
namespace(None)
post_run = next(Flow('HNSentimentAnalyzePosts').runs('analyze_this'))
post_tags = post_run['end'].task['post_tags'].data
# Normalise each tag (strip, then lower-case with a leading capital) and
# de-duplicate per post. Stripping has to happen first: capitalising a tag
# that starts with whitespace leaves the word itself lower-case, so padded
# variants would otherwise survive as separate topics.
topics = {
    int(post_id): list({tag.strip().lower().capitalize() for tag in tag_lst})
    for post_id, (tag_lst, _) in post_tags.items()
}
post_run

Run('HNSentimentAnalyzePosts/argo-hnsentiment.user.tuulosgmail.com.hnsentimentanalyzeposts-rhdls')

In [4]:
# Each post_tags value is a pair; the second element is summed here.
print('post tokens', sum(token_count for _tags, token_count in post_tags.values()))

post tokens 139459989


In [5]:
from collections import defaultdict
# topic -> list of sentiment scores of the posts tagged with that topic.
topic_sentiment = defaultdict(list)
for post_id, topic_lst in topics.items():
    score = sentiments.get(post_id)
    if score is None:
        # No sentiment was recorded for this post.
        continue
    for raw_topic in topic_lst:
        topic = raw_topic.strip()
        if not topic:
            continue
        # Skip entries containing these phrases (presumably model non-answers
        # rather than real topics).
        if 'Please provide the article' in topic or "which is not an article" in topic:
            continue
        topic_sentiment[topic].append(score)

In [6]:
import duckdb
# Rows of (id, time, title, url) for stories after 2020-01-01.
post_times = list(
    duckdb.query(
        "select id, time, title, url from 'story.parquet' where to_timestamp(time) > timestamp '2020-01-01'"
    ).execute().fetchall()
)
# Keep metadata only for stories that have a sentiment score.
post_meta = {
    story_id: (title, url, tstamp)
    for story_id, tstamp, title, url in post_times
    if story_id in sentiments
}

In [7]:
from datetime import datetime
import pandas as pd
from collections import defaultdict

# Epoch seconds for 2023-06-01 00:00 (naive datetime, so local time); posts at
# or after this instant are left out of the time series.
CUTOFF = datetime(year=2023, month=6, day=1).timestamp()

def timeseries(posts=None, scores=None, cutoff=None):
    """Build a per-day average sentiment ("mood") table.

    Args:
        posts: iterable of ``(post_id, timestamp, _, _)`` rows, timestamp in
            epoch seconds. Defaults to the notebook-level ``post_times``.
        scores: mapping of post id -> sentiment score. Defaults to the
            notebook-level ``sentiments``.
        cutoff: epoch seconds; only posts strictly before it are used.
            Defaults to the notebook-level ``CUTOFF``.

    Returns:
        A DataFrame with columns ``date`` (local midnight of the post's day)
        and ``mood`` (arithmetic mean of that day's scores). The columns are
        present even when no post qualifies.
    """
    posts = post_times if posts is None else posts
    scores = sentiments if scores is None else scores
    cutoff = CUTOFF if cutoff is None else cutoff

    days = defaultdict(list)
    for post_id, tstamp, _, _ in posts:
        if tstamp < cutoff and post_id in scores:
            # Zero the microseconds as well, otherwise fractional timestamps
            # from the same day would land in different buckets.
            day = datetime.fromtimestamp(tstamp).replace(
                hour=0, minute=0, second=0, microsecond=0
            )
            days[day].append(scores[post_id])

    # The value is a mean (not a median), so no sorting is required.
    dfdata = [
        {'date': day, 'mood': sum(series) / len(series)}
        for day, series in days.items()
    ]
    return pd.DataFrame(dfdata, columns=['date', 'mood'])

# Build the per-day mood table and preview its first rows.
mood_df = timeseries()
mood_df.head()

Unnamed: 0,date,mood
0,2020-07-06,6.24
1,2022-05-13,6.72973
2,2022-06-09,6.430233
3,2021-08-19,5.907407
4,2020-03-08,6.364865


In [8]:

import altair as alt

# Select Altair's 'html' renderer for chart output.
alt.renderers.enable('html')

# Register the THEME object from the local altair_theme module under the
# name 'ob' and make it the active theme.
from altair_theme import THEME
alt.themes.register('ob', lambda: THEME)
alt.themes.enable('ob')

# Point chart of mood per date, with a LOESS-smoothed line of the same data
# (coloured #fc6603) added on top; the combined chart is the cell's output.
base = alt.Chart(mood_df).properties(width=1000, height=700)
chart = base.mark_point().encode(x="date", y="mood")
chart + chart.transform_loess('date', 'mood').mark_line().encode(color=alt.value('#fc6603'))

In [9]:
from collections import Counter
# One row per distinct sentiment score with the number of posts that got it.
score_counts = Counter(sentiments.values())
df = pd.DataFrame([{'score': value, 'count': tally} for value, tally in score_counts.items()])

bottom_axis = alt.AxisConfig(labelFontSize=16, titleFontSize=16, labelAngle=0)
left_axis = alt.AxisConfig(labelFontSize=14, titleFontSize=16)

# Bar chart of post counts per score; the expression is the cell's output.
(
    alt.Chart(df)
    .mark_bar()
    .encode(
        x=alt.X("score:O", title="Sentiment score"),
        y=alt.Y('count', title="Number of posts"),
    )
    .properties(width=600, height=300)
    .configure(axisBottom=bottom_axis, axisLeft=left_axis)
)

In [10]:
import math
def _divisiveness(scores, high=7, low=4):
    """Score how evenly a topic's sentiment splits between the two tails.

    Args:
        scores: non-empty sequence of sentiment scores for one topic.
        high: scores >= this count as the high tail.
        low: scores <= this count as the low tail.

    Returns:
        ``tail_heavy * symmetric``, where ``tail_heavy`` is the share of
        scores in either tail and ``symmetric`` is 1.0 when both tails are the
        same size and 0.0 when only one tail is populated. Returns 0.0 when no
        score falls in either tail.
    """
    num_high = sum(1 for s in scores if s >= high)
    num_low = sum(1 for s in scores if s <= low)
    tails = num_high + num_low
    if tails == 0:
        # Every score sits in the middle band: not divisive, and the ratio
        # below would divide by zero.
        return 0.0
    symmetric = 1.0 - abs(num_high - num_low) / tails
    tail_heavy = tails / len(scores)
    return tail_heavy * symmetric


# Only topics with more than 40 scored posts get a divisiveness value.
divisive = {}
for topic, scores in topic_sentiment.items():
    if len(scores) > 40:
        divisive[topic] = _divisiveness(scores)


In [11]:
from datetime import datetime
import json

def post_entry(arg, max_score=10):
    """Turn a ``(score, post_id)`` pair into a JSON-ready dict.

    Looks the post up in the notebook-level ``post_meta`` mapping.

    Args:
        arg: ``(score, post_id)`` tuple.
        max_score: accepted but not used by the body.

    Returns:
        Dict with ``title``, ``url`` (the Hacker News item link built from the
        post id), ``time`` (``YYYY-MM-DD``) and ``score``.

    Raises:
        KeyError: if ``post_id`` is not in ``post_meta``.
    """
    score, post_id = arg
    # The url stored in post_meta is not emitted; the entry links to the HN item.
    title, _stored_url, posted_at = post_meta[post_id]
    entry = {'title': title}
    entry['url'] = f'https://news.ycombinator.com/item?id={post_id}'
    entry['time'] = datetime.fromtimestamp(posted_at).strftime('%Y-%m-%d')
    entry['score'] = score
    return entry

from collections import defaultdict
# topic -> list of (sentiment score, post id) for posts that have a sentiment
# and a comments_meta value above 200.
inv_index = defaultdict(list)
for post_id, topic_lst in topics.items():
    # The post-level filter does not depend on the topic, so test it once.
    if post_id not in sentiments or comments_meta.get(post_id, 0) <= 200:
        continue
    entry = (sentiments[post_id], post_id)
    # Key by the stripped topic so lookups with topic_sentiment's (stripped)
    # keys find these posts; the set keeps a post from being listed twice
    # under one topic when its tags differ only by whitespace.
    for topic in {raw.strip() for raw in topic_lst}:
        if topic:
            inv_index[topic].append(entry)

positives = []   # (post count, topic) for topics whose median score is > 7
negatives = []   # (post count, topic) for topics whose median score is < 4
topic_data = {}
for topic, scores in topic_sentiment.items():
    scores.sort()
    posts = sorted(inv_index[topic])
    num_posts = len(posts)
    # Topics with five or fewer indexed posts are left out entirely.
    if num_posts <= 5:
        continue
    half = num_posts // 2
    # Middle element of the sorted scores (upper middle for even lengths).
    median = scores[len(scores) // 2]
    if median > 7:
        positives.append((num_posts, topic))
    if median < 4:
        negatives.append((num_posts, topic))
    # Up to five posts from each end of the score-sorted list.
    lowest = posts[:min(half, 5)]
    highest = posts[max(-5, -half):]
    topic_data[topic] = {
        'topic': topic,
        'num_posts': num_posts,
        'angry_posts': [post_entry(item) for item in lowest if item[0] < 7],
        'happy_posts': [post_entry(item) for item in highest],
        'median_score': median,
        'divisiveness': divisive.get(topic, 0)
    }

with open('topic_data.json', 'w') as out:
    json.dump(topic_data, out)

In [14]:
# Sort both lists in place, largest first. Entries are (post count, topic)
# tuples, so the order is by post count. Then show the first 20 negatives.
positives.sort(reverse=True)
negatives.sort(reverse=True)
negatives[:20]

[(86, 'Ftx'),
 (63, 'Police misconduct'),
 (62, 'Sam bankman-fried'),
 (42, 'Xinjiang'),
 (29, 'Torture'),
 (24, 'Employee monitoring'),
 (23, 'Cost cutting'),
 (22, 'Racial profiling'),
 (22, 'Online safety bill'),
 (21, 'War on terror'),
 (20, 'Atlassian'),
 (19, 'Csam'),
 (18, 'Nypd'),
 (18, 'Alameda research'),
 (16, 'International students'),
 (15, 'Tsa'),
 (15, 'Earn it act'),
 (15, 'Car features'),
 (14, 'Bloatware'),
 (13, 'Human rights abuses')]

In [15]:
# First 20 entries of `positives`; the list is sorted in place by the
# preceding cell, so that cell has to run first.
positives[:20]

[(3797, 'Programming'),
 (2661, 'Computer science'),
 (2505, 'Open-source'),
 (1669, 'Python'),
 (918, 'Game development'),
 (870, 'Rust'),
 (800, 'Electronics'),
 (780, 'Mathematics'),
 (714, 'Functional programming'),
 (708, 'Programming language'),
 (695, 'Physics'),
 (639, 'Embedded systems'),
 (573, 'Self-improvement'),
 (573, 'Database'),
 (537, 'Unix'),
 (520, 'Astronomy'),
 (519, 'Retro computing'),
 (514, 'Nostalgia'),
 (504, 'Rust programming language'),
 (502, 'Debugging')]