# PubPulse laboratory

In [1]:
%reload_ext autoreload
%autoreload 2
%reload_ext sql

Import some handy things...

In [2]:
import os, sys
sys.path.insert(0, os.path.dirname(os.getcwd()))

from ipywidgets import IntProgress
from IPython.display import display
import time
import os
import numpy as np
import pandas as pd
import psycopg2
from pgvector.psycopg2 import register_vector
from tqdm.notebook import trange, tqdm
from IPython.display import HTML

pd.set_option('display.max_colwidth', 100)

Set up some configuration so we can effectively use mastodon_agent modules...

In [3]:
# Load variables from a local .env file into the process environment so that
# DATABASE_URL (read below) is available when the notebook runs on the host.
from dotenv import load_dotenv
load_dotenv()

from mastodon_agent.config import config

# Notebook-local overrides applied directly to the shared config object.
# NOTE(review): presumably mastodon_agent modules read `config` lazily, so these
# assignments take effect for the imports in later cells — confirm.
# TODO: move more of this into env vars in ./scripts/start-host-notebook.sh?
config.debug = True
# Raises KeyError if DATABASE_URL is not set in .env or the environment.
config.database_url = os.environ["DATABASE_URL"]
# Service endpoints are hard-coded to localhost for this host-side notebook.
config.embeddings_api_url = 'http://127.0.0.1:8674/predictions/my_model'
config.celery_broker_url = 'amqp://localhost'
config.celery_results_backend = 'rpc://localhost'
config.ml_api_url = 'http://127.0.0.1:8673'

In [4]:
from mastodon_agent.tasks import ml_gpu

In [5]:
from sqlalchemy import create_engine
from sqlalchemy.orm import Session

# SQLAlchemy engine for the configured database; used by the Python query cell
# further down via engine.connect().
engine = create_engine(config.database_url)

In [6]:
import torch
import math
# Prints whether the MPS backend is usable on this machine right now
# (this needs macOS 12.3 or newer).
print(torch.backends.mps.is_available())
# Prints whether the current PyTorch installation was built with MPS activated.
print(torch.backends.mps.is_built())

True
True


How many statuses have we ingested so far?

In [21]:
%sql SELECT count(*) FROM statuses

 * postgresql://postgres:***@localhost:55432/example
1 rows affected.


count
198505


Let's take a look at the latest posts ingested:

In [20]:
%%sql
SELECT
    url,
    ingested_at,
    status->>'created_at' as created_at,
    status->'account'->>'acct' as acct
FROM statuses
ORDER BY ingested_at DESC
LIMIT 5;

 * postgresql://postgres:***@localhost:55432/example
5 rows affected.


url,ingested_at,created_at,acct
https://mastodon.social/@Lucius_Chen/112334318110553763,2024-04-25 22:54:24.680186+00:00,2024-04-25T22:54:24.535000+00:00,Lucius_Chen
https://rss-parrot.net/u/nos.nl/status/1713448386088282074,2024-04-25 22:54:23.838612+00:00,2024-04-25T22:54:23+00:00,nos.nl@rss-parrot.net
https://newsmast.social/@newzealandheraldrss/112334317902897313,2024-04-25 22:54:23.648778+00:00,2024-04-25T22:54:21+00:00,newzealandheraldrss@newsmast.social
https://rss-parrot.net/u/nos.nl/status/1713448386088282073,2024-04-25 22:54:23.624196+00:00,2024-04-25T22:54:23+00:00,nos.nl@rss-parrot.net
https://rss-parrot.net/u/nos.nl/status/1713448386088282072,2024-04-25 22:54:23.619846+00:00,2024-04-25T22:54:23+00:00,nos.nl@rss-parrot.net


Try fetching the latest posts using Python:

In [19]:
from collections import namedtuple

from sqlalchemy.sql import text

# Fetch the three most recently ingested statuses through SQLAlchemy.
with engine.connect() as conn:
    stmt = text("""
        SELECT
            ingested_at,
            status->>'created_at' as created_at,
            url,
            status->'account'->>'acct' as acct,
            status->>'content' as content
        FROM statuses
        ORDER BY ingested_at DESC
        LIMIT 3;
    """)
    result = conn.execute(stmt)

    # Consume the result while the connection is still checked out. Reading
    # keys/rows after the `with` block exits depends on the driver having
    # already buffered everything and is not guaranteed to keep working.
    Record = namedtuple('Record', result.keys())
    records = [Record(*r) for r in result.fetchall()]

# Non-empty status bodies; not used further in this cell, kept for ad-hoc use.
texts = [r.content for r in records if r.content]

df = pd.DataFrame(records)
# NOTE(review): escape=False renders the remote-authored status HTML as-is in
# the notebook output; keep it only if that markup is trusted.
HTML(df.to_html(render_links=True, escape=False))

Unnamed: 0,ingested_at,created_at,url,acct,content
0,2024-04-25 22:54:16.958588+00:00,2024-04-25T22:54:15+00:00,https://defcon.social/@corbden/112334317497577398,corbden@defcon.social,"Pinky: What are we doing tonight, Bwain?Brain: Same thing we do every night, Pinky. Drugs."
1,2024-04-25 22:54:16.893231+00:00,2024-04-25T22:54:16+00:00,https://mas.to/@leadstoriescom/112334317574062130,leadstoriescom@mas.to,"Fact Check: Carrot, Baking Soda Recipe Does NOT Repair Eyesight In 1 Week https://leadstories.com/hoax-alert/2024/04/fact-check-carrot-baking-soda-recipe-does-not-repair-eyesight-in-one-week.html"
2,2024-04-25 22:54:16.781936+00:00,2024-04-25T22:54:12+00:00,https://techhub.social/@fakenewsbot/112334317290814288,fakenewsbot@techhub.social,this news is not real: BrickPlanet Kid is monitoring media!


Let's load up a local embedding model:

In [10]:
from sentence_transformers import SentenceTransformer
# In-process copy of the MiniLM sentence-embedding model; later cells call
# embedding_model.encode(...) and compare it with the other embedding backends.
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

[2024-04-25T15:52:22] INFO [sentence_transformers.SentenceTransformer.__init__:107] Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
[2024-04-25T15:52:24] INFO [sentence_transformers.SentenceTransformer.__init__:213] Use pytorch device_name: mps


Try comparing a few different ways to access an embedding model:

In [22]:
import requests
from mastodon_agent.tasks import ml_gpu

# Sample inputs shared by all four backends so their vectors can be compared.
texts = [
    "I like pie",
    "Have you the like of pie!",
    "Lorem ipsum dolor sit amet consectetur adipiscing elit Aliquam mattis arcu sit amet ex convallis ac varius lacus vehicula",
    "Etiam non feugiat sapien. Vestibulum accumsan elit massa, at volutpat augue lacinia lacinia.",
]

# 1) Local ML API over HTTP.
local_api_resp = requests.post(
    f"{config.ml_api_url}/embeddings",
    json={"inputs": texts},
)
# Fail here on an HTTP error instead of indexing into an error payload below.
local_api_resp.raise_for_status()
embeddings_from_local_api = local_api_resp.json()

# 2) Hugging Face hosted inference API (same model id as the in-process model).
response = requests.post(
    "https://api-inference.huggingface.co/pipeline/feature-extraction/sentence-transformers/all-MiniLM-L6-v2",
    headers={"Authorization": f"Bearer {config.hf_token}"},
    json={
        "inputs": texts,
        "options": {"wait_for_model": True},
    },
)
response.raise_for_status()
embeddings_from_hf = response.json()

# 3) SentenceTransformer model loaded in this kernel.
embeddings_from_model = embedding_model.encode(texts)

# 4) Celery task; waits at most 10 seconds for the worker's result.
embeddings_from_celery = ml_gpu.embed.delay(texts).get(timeout=10)

# One row per backend, showing the embedding of the first text only.
pd.DataFrame([
    embeddings_from_local_api[0],
    embeddings_from_hf[0],
    embeddings_from_model[0],
    embeddings_from_celery[0],
])

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
0,-0.021575,-0.022051,-0.0046,0.021153,-0.123979,0.027267,0.133419,-0.011202,0.082663,-0.004497,...,0.14947,0.004144,0.028378,0.051224,0.028115,0.103199,0.135535,0.088757,0.09187,-0.04555
1,-0.021575,-0.022051,-0.0046,0.021153,-0.123979,0.027267,0.133419,-0.011202,0.082663,-0.004497,...,0.14947,0.004144,0.028378,0.051224,0.028115,0.103199,0.135535,0.088757,0.09187,-0.04555
2,-0.021575,-0.022051,-0.0046,0.021153,-0.123979,0.027267,0.133419,-0.011202,0.082663,-0.004497,...,0.14947,0.004144,0.028378,0.051224,0.028115,0.103199,0.135535,0.088757,0.09187,-0.04555
3,-0.021575,-0.022051,-0.0046,0.021153,-0.123979,0.027267,0.133419,-0.011202,0.082663,-0.004497,...,0.14947,0.004144,0.028378,0.051224,0.028115,0.103199,0.135535,0.088757,0.09187,-0.04555


How many statuses have been ingested since the newest embedding was generated?

In [23]:
%%sql
SELECT count(url)
FROM statuses
WHERE ingested_at > (SELECT created_at FROM status_embeddings ORDER BY created_at DESC LIMIT 1);

 * postgresql://postgres:***@localhost:55432/example
1 rows affected.


count
286


Generate embeddings for statuses newer than the newest embedding:

In [24]:
from mastodon_agent.tasks import ml_gpu

# Dedicated psycopg2 connection for the embedding backfill; register_vector
# installs the pgvector adapters on it.
conn = psycopg2.connect(os.environ["DATABASE_URL"])
register_vector(conn)

# Select up to 5000 of the newest statuses ingested after the most recent
# embedding was created. COALESCE covers the bootstrap case: with an empty
# status_embeddings table the scalar subquery is NULL, and `ingested_at > NULL`
# would otherwise match no rows at all.
cur = conn.cursor()
cur.execute("""
    SELECT
        url,
        status->>'content' as content
    FROM statuses
    WHERE ingested_at > COALESCE(
        (
            SELECT created_at
            FROM status_embeddings
            ORDER BY created_at DESC
            LIMIT 1
        ),
        '-infinity'
    )
    ORDER BY ingested_at DESC
    LIMIT 5000
""")

def embed_with_local_api(texts, timeout=None):
    """Embed `texts` by POSTing them to the local ML API's `/embeddings` endpoint.

    Args:
        texts: list of strings to embed.
        timeout: optional number of seconds passed to `requests.post`. The
            default of None keeps the previous behaviour of waiting indefinitely.

    Returns:
        The decoded JSON body of the response (presumably one embedding per
        input text, in order — confirm against the API).

    Raises:
        requests.HTTPError: if the API answers with a 4xx/5xx status.
    """
    response = requests.post(
        f"{config.ml_api_url}/embeddings",
        json={"inputs": texts},
        timeout=timeout,
    )
    # Surface HTTP failures here rather than returning an error payload that
    # would later be indexed and inserted as if it were embeddings.
    response.raise_for_status()
    return response.json()

def embed_with_hf_api(texts, timeout=None):
    """Embed `texts` with the Hugging Face hosted feature-extraction pipeline.

    Args:
        texts: list of strings to embed.
        timeout: optional number of seconds passed to `requests.post`. The
            default of None keeps the previous behaviour of waiting indefinitely.

    Returns:
        The decoded JSON body of the response (presumably one embedding per
        input text, in order — confirm against the API).

    Raises:
        requests.HTTPError: if the API answers with a 4xx/5xx status.
    """
    response = requests.post(
        "https://api-inference.huggingface.co/pipeline/feature-extraction/sentence-transformers/all-MiniLM-L6-v2",
        headers={"Authorization": f"Bearer {config.hf_token}"},
        json={
            "inputs": texts,
            "options": {"wait_for_model": True},
        },
        timeout=timeout,
    )
    # Surface HTTP failures (bad token, rate limit, ...) here rather than
    # returning an error payload in place of embeddings.
    response.raise_for_status()
    return response.json()
    
def embed_with_inprocess_model(texts):
    """Embed `texts` with the `embedding_model` loaded in this kernel.

    Returns whatever `embedding_model.encode` produces for the given texts.
    """
    vectors = embedding_model.encode(texts)
    return vectors

def embed_with_celery_job(texts, timeout=10):
    """Embed `texts` by dispatching the `ml_gpu.embed` Celery task and waiting for it.

    Args:
        texts: list of strings to embed.
        timeout: seconds to wait for the task result. Defaults to 10, the value
            that was previously hard-coded; raise it for larger batches or a
            busy worker.

    Returns:
        The task's result as returned by `.get()`.
    """
    pending = ml_gpu.embed.delay(texts)
    return pending.get(timeout=timeout)

# Backend called by embed_statuses_chunk(); any of the embed_with_* functions
# defined above can be assigned here.
embed = embed_with_celery_job

# Number of pending rows that triggers a batch in the driver loop below.
CHUNK_SIZE = 100
# Pending (url, content) rows; filled by the driver loop, reset by embed_statuses_chunk().
chunks = []

def embed_statuses_chunk():
    """Embed the pending rows in `chunks` and upsert them into `status_embeddings`.

    Reads the module-level `chunks` list of (url, content) entries, embeds the
    contents with the currently selected `embed` backend, resets `chunks` to an
    empty list, and writes one row per url on the module-level `conn`.

    Does nothing when `chunks` is empty, so the trailing flush after the driver
    loop never sends an empty batch to the embedding backend.

    Raises:
        ValueError: if the backend returns a different number of embeddings
            than texts submitted (for example an error payload). `chunks` is
            left untouched in that case.
    """
    global chunks
    if not chunks:
        return

    urls = [c[0] for c in chunks]
    texts = [c[1] for c in chunks]
    embeddings = embed(texts)
    if len(embeddings) != len(urls):
        raise ValueError(
            f"embedding backend returned {len(embeddings)} results for {len(urls)} texts"
        )

    chunks = []

    # `with conn` wraps the inserts in one transaction: commit on success,
    # rollback if any insert raises. It does not close the connection.
    with conn:
        with conn.cursor() as cur:
            for url, embedding in zip(urls, embeddings):
                cur.execute(
                    """
                        INSERT INTO status_embeddings (url, embedding) VALUES (%s, %s)
                          ON CONFLICT (url) DO UPDATE SET embedding = EXCLUDED.embedding;
                    """,
                    (url, embedding)
                )

try:
    # cur.rowcount only sizes the progress bar.
    for row in tqdm(cur, total=cur.rowcount):
        chunks.append((row[0], row[1]))
        if len(chunks) >= CHUNK_SIZE:
            embed_statuses_chunk()

    # Flush whatever is left over after the last full batch.
    embed_statuses_chunk()
finally:
    # Release the cursor and connection opened at the top of this cell so
    # re-running the cell does not pile up open database connections.
    cur.close()
    conn.close()

  0%|          | 0/372 [00:00<?, ?it/s]

In [14]:
%sql SELECT count(embedding) FROM status_embeddings

 * postgresql://postgres:***@localhost:55432/example
1 rows affected.


count
145730


In [25]:
# Alternative: embed the query text in-process instead of through Celery.
#embeddings = embedding_model.encode([
#    """I really like banana bread"""
#])

# Embed the query text via the Celery task (waits up to 10 seconds).
embeddings = ml_gpu.embed.delay([
    """retro gaming is nifty"""
]).get(timeout=10)

# Of the embeddings created in the last 6 hours, take the 25 closest to the
# query vector (pgvector `<->` distance), then list their statuses newest first.
nearest_sql = """
    SELECT
        ingested_at,
        url,
        status->'account'->>'acct' as acct,
        status->>'content' as content
    FROM statuses
    WHERE url in (
        SELECT url
        FROM status_embeddings
        WHERE created_at > now() - INTERVAL '6 hours'
        ORDER BY embedding <-> %s
        LIMIT 25
    )
    ORDER BY ingested_at DESC
    LIMIT 25
    """

conn = psycopg2.connect(os.environ["DATABASE_URL"])
try:
    register_vector(conn)
    with conn.cursor() as cur:
        cur.execute(nearest_sql, (np.array(embeddings[0]),))
        rows = cur.fetchall()
finally:
    # The rows are fully fetched, so close the connection instead of leaving
    # it open each time this cell is re-run.
    conn.close()

df = pd.DataFrame(rows, columns=("ingested_at", "url", "acct", "content"))
# NOTE(review): escape=False renders the remote-authored status HTML as-is in
# the notebook output; keep it only if that markup is trusted.
HTML(df.to_html(render_links=True, escape=False))

Unnamed: 0,ingested_at,url,acct,content
0,2024-04-25 22:44:20.684419+00:00,https://freeradical.zone/@karlexceed/112334278334110478,karlexceed@freeradical.zone,"I have a sudden urge to use my Nintendo Wii as a generic desktop computer. But I want 120% execution on the idea...I want the little console standing on my desk, with matching white plastic peripherals. I'm imagining something like the classic Apple one button mouse and the monitor would have the IR sensor bar built in. The operating system needs to be something like Mac OS 9, but even simpler. And I need to be able to launch into game discs from the desktop and play Wii Sports as normal."
1,2024-04-25 22:44:03.499047+00:00,https://v2br.social/@karlexceed/112334277299521438,karlexceed@v2br.social,"I have a sudden urge to use my Nintendo Wii as a generic desktop computer. But I want 120% execution on the idea...I want the little console standing on my desk, with matching white plastic peripherals. I'm imagining something like the classic Apple one button mouse and the monitor would have the IR sensor bar built in. The operating system needs to be something like Mac OS 9, but even simpler. And I need to be able to launch into game discs from the desktop and play Wii Sports as normal."
2,2024-04-25 22:31:32.059707+00:00,https://kind.social/@wrenderlust/112334228138009785,wrenderlust@kind.social,"Well looking through some old Mac magazines on archive.org, I saw an ad for video game but I've been searching for the name of for a few years. The journeyman project. A demo copy came with one of the first computers I've ever had way back when. I didn't know what I was doing then, but I was impressed by the immersion in the visuals. I wanted to live in that little world. What are the odds that I rediscover the name this way."
3,2024-04-25 22:25:56.538717+00:00,https://wetdry.world/@driftini/112334206122103106,driftini@wetdry.world,"honestly this Epic Gamer mat was such a mistakeonly reason I use it is because it was a gift, but still it just makes everything look a lot shittier and messier, also it doesn't even light up properly anymorenow that I'm talking about it I feel like just getting it off my desk at last, idk if I have any mouse mat lying around thoughalso yeah the keyboard is in a very funny state right now, another thing I'll have to change later down the line when I got money to waste"
4,2024-04-25 22:20:59.392523+00:00,https://tech.lgbt/@Tourma/112334186489791404,Tourma@tech.lgbt,"Playing: Steam World Quest: the Hand of Gilgamech (Switch)I have yet to at a bad Steam World game. I beat Dig on my 2DS, got rather far in Dig 2 on my PC, and played some of Heist on my 2DS. All very different from each other, all great.Quest is a turn based RPG with card mechanics, and it works. It has the same irreverent tone the other games have, but lighter thanks to the quasi-medieval setting.I'm not typically a card mechanics fan, but it works here. It varies the combat without beging cumbersome. The game nould be dull without it.The biggest knock I'll give it is that it seems to drain my Switch's battery pretty fast, though that's more of a Switch problem than the game.Highly recommend.#TourmaGaming #VideoGames #IndieGames #SteamWorld #SteamWorldQuest"
5,2024-04-25 22:17:26.577123+00:00,https://catgirl.center/notes/9sjdc7jo64xe0mtq,luna@catgirl.center,"oh yeah and also I get better video this way too, the HDMI port on my wii u is dead so I have it hooked up via composite which is... not great"
6,2024-04-25 21:39:20.719057+00:00,https://meta.masto.host/@GamingNews/112333465343135063,GamingNews@meta.masto.host,"Gamespot just posted:Capcom Is Delisting Three Of Its Most Underrated Games--And It's Unclear WhyCapcom is delisting some of its most underrated games soon, namely ones from the Dark Void series.According to Steam, the games Dark Void, Dark Void Zero, and Flock will be removed from the storefront on May 8. The reasoning behind the delisting is currently unclear, as well as if the removal will also extend to PSN Store and Xbox Live Marketplace.Dark ...https://www.gamespot.com/articles/capcom-is-delisting-three-of-its-most-underrated-games-and-its-unclear-why/1100-6522957/?ftag=CAD-01-10abi2f#gamingNews"
7,2024-04-25 21:37:25.602035+00:00,https://ruby.social/@peterphillips/112334015231629442,peterphillips@ruby.social,Current gaming status
8,2024-04-25 19:27:45.044047+00:00,https://infosec.exchange/@furtivedefect/112333505403663740,furtivedefect@infosec.exchange,"After spending too much time in windows, I've moved back to Linux for my daily driver (apart from some games) and it feels so good. Considering Arch and i3 but I'm lazy and allergic to effort. Maybe in the future. Open to suggestions on how to optimize my Linux experience for day to day use."
9,2024-04-25 18:24:14.531481+00:00,https://tombraidersocial.com/@tombraiderchronicles/112333255575948933,tombraiderchronicles@tombraidersocial.com,"Blaze Entertainment has released a new video showcasing Lara Croft speeding through the canals of Venice in #TombRaider II on Evercade, a cartridge-based retro gaming system. Tomb Raider Collection 1 will be released in July of 2024.https://www.tombraiderchronicles.com/headlines4871.html"
