# PubPulse laboratory

First, a little setup to use the database (output hidden, because it's noisy):

In [42]:
import os, sys
sys.path.insert(0, os.path.dirname(os.getcwd()))

from ipywidgets import IntProgress
from IPython.display import display
import time
import os
import numpy as np
import pandas as pd
import psycopg2
from pgvector.psycopg2 import register_vector
from tqdm.notebook import trange, tqdm

pd.set_option('display.max_colwidth', 200)

In [82]:
from dotenv import load_dotenv
load_dotenv()

from mastodon_agent.config import config

config.debug = True
config.embeddings_api_url = 'http://127.0.0.1:8674/predictions/my_model'
config.database_url = os.environ["DATABASE_URL"]

In [86]:
from sqlalchemy import create_engine
from sqlalchemy.orm import Session

engine = create_engine(config.database_url)

In [4]:
%reload_ext sql

In [1]:
import torch
import math
# this ensures that the current MacOS version is at least 12.3+
print(torch.backends.mps.is_available())
# this ensures that the current current PyTorch installation was built with MPS activated.
print(torch.backends.mps.is_built())

True
True


How many statuses have we ingested so far?

In [5]:
%%sql
SELECT count(*)
FROM statuses

1 rows affected.


count
69480


Let's take a look at the latest posts ingested:

In [88]:
%%sql
SELECT
    url,
    ingested_at,
    status->>'created_at' as created_at,
    status->'account'->>'acct' as acct
FROM statuses
ORDER BY ingested_at DESC
LIMIT 5;

 * postgresql://postgres:***@localhost:55432/example
5 rows affected.


url,ingested_at,created_at,acct
https://press.coop/@cnni/112329070264668491,2024-04-25 00:40:14.610304+00:00,2024-04-25T00:39:48+00:00,cnni@press.coop
https://hoto.moe/notes/9si300e9d4,2024-04-25 00:40:14.378527+00:00,2024-04-25T00:40:12.801000+00:00,shepherdboy0129@hoto.moe
https://aus.social/@jpm/112329071788135276,2024-04-25 00:40:14.288393+00:00,2024-04-25T00:40:11+00:00,jpm@aus.social
https://www.threads.net/@ipedro/post/C6Kh5c0rt1N,2024-04-25 00:40:13.559104+00:00,2024-04-25T00:34:56+00:00,ipedro@threads.net
https://mastodon.moule.world/@MOULE/112329071577201566,2024-04-25 00:40:13.510709+00:00,2024-04-25T00:40:08+00:00,MOULE@moule.world


Try fetching the latest posts using python:

In [104]:
from sqlalchemy.sql import text

with engine.connect() as conn:
    stmt = text("""
        SELECT
            ingested_at,
            url,
            status->>'created_at' as created_at,
            status->'account'->>'acct' as acct,
            status->>'content' as content
        FROM statuses
        ORDER BY ingested_at DESC
        LIMIT 10;
    """)
    result = conn.execute(stmt)
    
from collections import namedtuple

Record = namedtuple('Record', result.keys())
records = [Record(*r) for r in result.fetchall()]

texts = [r.content for r in records if r.content]

pd.DataFrame(records)

Unnamed: 0,ingested_at,url,created_at,acct,content
0,2024-04-25 00:47:21.722420+00:00,https://mstdn.social/@searchlight/112329099864358285,2024-04-25T00:47:20+00:00,searchlight@mstdn.social,"<p><a href=""https://mstdn.social/tags/NowPlaying"" class=""mention hashtag"" rel=""nofollow noopener noreferrer"" target=""_blank"">#<span>NowPlaying</span></a> <br>Let It Grow by The Reckoning from 2024..."
1,2024-04-25 00:47:20.498997+00:00,https://friendica.myportal.social/display/e65e1095-2066-29a8-146d-ae4008311915,2024-04-25T00:47:16+00:00,anubis2814@friendica.myportal.social,"I can't think of any historical event that could predict where this could go<br>♲ <a href=""https://h-i.social/@are0h/112329078850145472"" rel=""nofollow noopener noreferrer"" target=""_blank"">h-i.soci..."
2,2024-04-25 00:47:19.586451+00:00,https://flipping.rocks/@mush/112329099738205138,2024-04-25T00:47:18+00:00,mush@flipping.rocks,<p>stop everything has been a good strategy for a while now</p>
3,2024-04-25 00:47:18.037931+00:00,https://mas.to/@solar_guatemalacity/112329099620062891,2024-04-25T00:47:16+00:00,solar_guatemalacity@mas.to,"<p>The sun will rise in <a href=""https://mas.to/tags/GuatemalaCity"" class=""mention hashtag"" rel=""nofollow noopener noreferrer"" target=""_blank"">#<span>GuatemalaCity</span></a> <a href=""https://mas...."
4,2024-04-25 00:47:17.756421+00:00,https://twitter.com/ColeyMick/status/1783296416623808978,2024-04-25T00:46:12+00:00,ColeyMick@sportsbots.xyz,"<p>Everyone who asked for more post ups all season should lose their rights. All of their rights. Voting, speaking out loud, all of the rights.</p>"
5,2024-04-25 00:47:17.749850+00:00,https://flipboard.com/@janettespeyer/flipboard-user-group-el6a4od8z/-/a-hv0cqhHVRPmheVlETW6XCg%3Aa%3A76802984-%2F0,2024-04-25T00:47:04+00:00,succsandsun@flipboard.com,"<p>Echeveria 'Afterglow' | Succulents and Sunshine<br><a href=""https://www.succulentsandsunshine.com/types-of-succulents/echeveria-afterglow/?utm_source=flipboard&amp;utm_medium=activitypub"" rel=""..."
6,2024-04-25 00:47:16.960335+00:00,https://twitter.com/HPbasketball/status/1783296546441675110,2024-04-25T00:46:43+00:00,HPbasketball@sportsbots.xyz,"<p>OK but like also they are chucking <a href=""https://twitter.com/wcgoldberg/status/1783296359744831761"" rel=""nofollow noopener noreferrer"" target=""_blank""><span class=""invisible"">https://</span>..."
7,2024-04-25 00:47:16.368974+00:00,https://mastodon.social/@TSPnow/112329099583215767,2024-04-25T00:47:16.122000+00:00,TSPnow,"<p>Saving for retirement <a href=""https://www.govexec.com/pay-benefits/2024/03/saving-retirement/394730/"" target=""_blank"" rel=""nofollow noopener noreferrer"" translate=""no""><span class=""invisible"">..."
8,2024-04-25 00:47:16.249410+00:00,https://meta.masto.host/@GamingNews/112329092894903219,2024-04-25T00:45:34+00:00,GamingNews@meta.masto.host,"<p>Nintendo Life just posted:</p><p>Star Wars: Battlefront Classic Collection Update 2 Out Now On Switch, Here Are The Full Patch Notes</p><p>""Our work on Update III is beginning"".When Star Wars: ..."
9,2024-04-25 00:47:16.002387+00:00,https://twitter.com/HPbasketball/status/1783296338056097800,2024-04-25T00:45:53+00:00,HPbasketball@sportsbots.xyz,<p>Oh my God they're gonna do it again.</p>


In [10]:
import os
import psycopg2

conn = psycopg2.connect(config.database_url)

cur = conn.cursor()
cur.execute("CREATE EXTENSION IF NOT EXISTS vector")
cur.execute("DROP TABLE IF EXISTS embeddings")
cur.execute("""
CREATE TABLE IF NOT EXISTS embeddings(
    id INTEGER,
    url character varying NOT NULL UNIQUE,
    embedding vector(384)
)
""")

conn.commit()

In [76]:
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

Try comparing a few different ways to access an embedding model:

In [105]:
import requests 

chunks = [
    "I like pie",
    "Have you the like of pie?",
    "Etiam non feugiat sapien. Vestibulum accumsan elit massa, at volutpat augue lacinia lacinia.",
    "Lorem ipsum dolor sit amet consectetur adipiscing elit Aliquam mattis arcu sit amet ex convallis ac varius lacus vehicula",
]

local_api_resp = requests.post(
    config.embeddings_api_url,
    json = {
        "input": chunks
    }
)

response = requests.post(
    f"https://api-inference.huggingface.co/pipeline/feature-extraction/sentence-transformers/all-MiniLM-L6-v2",
    headers={"Authorization": f"Bearer {config.hf_token}"},
    json={
        "inputs": chunks,
        "options":{"wait_for_model":True}
    }
)
embeddings_from_hf = response.json()

embeddings_from_model = embedding_model.encode(chunks)

pd.DataFrame([
    embeddings_from_local_api[0],
    embeddings_from_hf[0],
    embeddings_from_model[0],
])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
0,-0.021575,-0.022051,-0.0046,0.021153,-0.123979,0.027267,0.133419,-0.011202,0.082663,-0.004497,...,0.14947,0.004144,0.028378,0.051224,0.028115,0.103199,0.135535,0.088757,0.09187,-0.04555
1,-0.021575,-0.022051,-0.0046,0.021153,-0.123979,0.027267,0.133419,-0.011202,0.082663,-0.004497,...,0.14947,0.004144,0.028378,0.051224,0.028115,0.103199,0.135535,0.088757,0.09187,-0.04555
2,-0.021575,-0.022051,-0.0046,0.021153,-0.123979,0.027267,0.133419,-0.011202,0.082663,-0.004497,...,0.14947,0.004144,0.028378,0.051224,0.028115,0.103199,0.135535,0.088757,0.09187,-0.04555


Try generating and storing embedding vectors for the latest bunch of statuses:

In [77]:
conn = psycopg2.connect(os.environ["DATABASE_URL"])
register_vector(conn)

cur = conn.cursor()
cur.execute("""
    SELECT
        url,
        status->>'content' as content
    FROM statuses
    ORDER BY ingested_at DESC
    LIMIT 1000
""")

CHUNK_SIZE = 100
chunks = []

def embed_statuses_chunk():
    global chunks
    urls = [c[0] for c in chunks]
    embeddings = embedding_model.encode([c[1] for c in chunks])
    chunks = []

    with conn:
        with conn.cursor() as cur:
            for idx in range(0, len(urls)):
                url = urls[idx]
                embedding = embeddings[idx]
                cur.execute(
                    """
                        INSERT INTO embeddings (url, embedding) VALUES (%s, %s)
                          ON CONFLICT (url) DO UPDATE SET embedding = EXCLUDED.embedding;            
                    """,
                    (url, embedding)
                )

for row in tqdm(cur, total=cur.rowcount):
    chunks.append((row[0], row[1]))
    if len(chunks) >= CHUNK_SIZE:    
        embed_statuses_chunk()

embed_statuses_chunk()

  0%|          | 0/10000 [00:00<?, ?it/s]

In [38]:
conn = psycopg2.connect(os.environ["DATABASE_URL"])
register_vector(conn)

cur = conn.cursor()
cur.execute("""
    SELECT
        url,
        status->>'content' as content
    FROM statuses
    ORDER BY ingested_at DESC
    LIMIT 30
""")

CHUNK_SIZE = 10
chunks = []

def embed_statuses_chunk_2():
    global chunks
    urls = [c[0] for c in chunks]
    api_resp = requests.post(
        config.embeddings_api_url,
        json = {
            "input": [c[1] for c in chunks]
        }
    )
    embeddings = api_resp.json()

    chunks = []

    with conn:
        with conn.cursor() as cur:
            for idx in range(0, len(urls)):
                url = urls[idx]
                embedding = embeddings[idx]
                cur.execute(
                    """
                        INSERT INTO embeddings (url, embedding) VALUES (%s, %s)
                          ON CONFLICT (url) DO UPDATE SET embedding = EXCLUDED.embedding;            
                    """,
                    (url, embedding)
                )

for row in tqdm(cur, total=cur.rowcount):
    chunks.append((row[0], row[1]))
    if len(chunks) >= CHUNK_SIZE:    
        embed_statuses_chunk_2()

embed_statuses_chunk_2()

  0%|          | 0/30 [00:00<?, ?it/s]

In [58]:
conn = psycopg2.connect(os.environ["DATABASE_URL"])
register_vector(conn)

cur = conn.cursor()
cur.execute("""
    SELECT
        url,
        status->>'content' as content
    FROM statuses
    ORDER BY ingested_at DESC
    LIMIT 500
""")

CHUNK_SIZE = 50
chunks = []

EMBEDDINGS_API_URL = 'http://127.0.0.1:8674/predictions/my_model'

def embed_statuses_chunk_3():
    global chunks
    urls = [c[0] for c in chunks]
    api_resp = requests.post(
        f"https://api-inference.huggingface.co/pipeline/feature-extraction/sentence-transformers/all-MiniLM-L6-v2",
        headers={"Authorization": f"Bearer {config.hf_token}"},
        json={
            "inputs": [c[1] for c in chunks],
            "options":{"wait_for_model":True}
        }
    )
    embeddings = api_resp.json()

    chunks = []

    with conn:
        with conn.cursor() as cur:
            for idx in range(0, len(urls)):
                url = urls[idx]
                embedding = embeddings[idx]
                cur.execute(
                    """
                        INSERT INTO embeddings (url, embedding) VALUES (%s, %s)
                          ON CONFLICT (url) DO UPDATE SET embedding = EXCLUDED.embedding;            
                    """,
                    (url, embedding)
                )

for row in tqdm(cur, total=cur.rowcount):
    chunks.append((row[0], row[1]))
    if len(chunks) >= CHUNK_SIZE:    
        embed_statuses_chunk_3()

embed_statuses_chunk_3()

  0%|          | 0/500 [00:00<?, ?it/s]

In [78]:
%%sql
SELECT count(embedding)
FROM embeddings

 * postgresql://postgres:***@localhost:55432/example
1 rows affected.


count
62860


In [103]:
embeddings = embedding_model.encode([
    """large language models and retrieval augmented generation"""
])


conn = psycopg2.connect(os.environ["DATABASE_URL"])
register_vector(conn)

cur = conn.cursor()
cur.execute(
    """
    SELECT
        ingested_at,
        url,
        status->'account'->>'acct' as acct,
        status->>'content' as content
    FROM statuses
    WHERE url in (
        SELECT url
        FROM embeddings
        ORDER BY embedding <-> %s
        LIMIT 25
    )
    """,
    (np.array(embeddings[0]),)
)
rows = cur.fetchall()

pd.DataFrame(rows, columns=("ingested_at", "url", "acct", "content"))

Unnamed: 0,ingested_at,url,acct,content
0,2024-04-24 21:01:13.630933+00:00,https://assemblag.es/@mistertim/112328208195408095,mistertim@assemblag.es,"<p>So the first chapter of my extremely personalised ""python programming and computational thinking"" syllabus for <span class=""h-card""><a href=""https://assemblag.es/@jcalpickard"" class=""u-url ment..."
1,2024-04-24 17:33:15.435934+00:00,https://mastodon.social/@i4cy/112327392894989294,i4cy,<p>There are mechanisms to interoperate between languages. Interpreted languages like Python or BASIC entry point referencing is achieved dynamically using late binding. This comes with a performa...
2,2024-04-24 22:56:15.923920+00:00,https://press.coop/@VentureBeat/112328663005670666,VentureBeat@press.coop,<p>Cohere releases toolkit to accelerate generative AI app development in the enterprise</p><p>Cohere's new developer toolkit is an open-source repository to build retrieval-augmented generation A...
3,2024-04-24 19:57:45.653111+00:00,https://press.coop/@VentureBeat/112327961132873015,VentureBeat@press.coop,"<p>DeepMind researchers discover impressive learning capabilities in long-context LLMs</p><p>In their study, the DeepMind researchers investigated how many-shot ICL affects the performance of LLMs..."
4,2024-04-24 21:05:14.965227+00:00,https://mstdn.social/@raku_updated_modules/112328226492504360,raku_updated_modules@mstdn.social,"<p>Data::Generators 0.1.8<br>Random data generation functions: strings, words, numbers, pet names, job titles, vectors, arrays, and tabular datasets.<br><a href=""https://raku.land/zef:antononcube/..."
5,2024-04-24 20:05:19.549214+00:00,https://mstdn.social/@raku_updated_modules/112327990696332434,raku_updated_modules@mstdn.social,"<p>DSL::FiniteStateMachines 0.1.3<br>Finite State Machines (FSMs) roles and classes for making conversational agents based on Domain Specific Languages (DSLs).<br><a href=""https://raku.land/zef:an..."
6,2024-04-24 17:39:37.370216+00:00,https://flipboard.social/@topintech/112327417836046816,topintech@flipboard.social,"<p>Apple releases new family of Open-source Efficient Language Models as AI work progresses <a href=""https://9to5mac.com/2024/04/24/apple-open-source-models-language/"" rel=""nofollow noopener noref..."
7,2024-04-24 19:59:41.795782+00:00,https://sigmoid.social/@alfcnz/112327968292331826,alfcnz@sigmoid.social,<p>The distributional hypothesis states that words that appear in similar contexts tend to have similar meaning.<br>The co-occurrence matrix counts how many times a word appears in a given context...
8,2024-04-24 23:51:34.446771+00:00,https://press.coop/@CNET/112328880042123303,CNET@press.coop,"<p>AI Chatbots Need Large Language Models. Here's What to Know About LLMs - CNET</p><p>Chatbots may sound human, but they don't think the way we do. <a href=""https://press.coop/tags/press"" cla..."
9,2024-04-24 17:46:11.795133+00:00,https://ai.wiki/@AIWIKI/112327443767668434,AIWIKI@ai.wiki,"<p>Microsoft claims that small, localized language models can be powerful as well <a href=""https://search.ai.wiki/microsoft-claims-that-small-localized-language-models-can-be-powerful-as-well/"" re..."
