# PubPulse laboratory

First, a little setup to use the database (output hidden, because it's noisy):

In [7]:
%%capture
!pip install psycopg2-binary
!pip install ipython-sql
!pip install -U sentence-transformers
!pip install SQLAlchemy --quiet

%load_ext sql
%sql $DATABASE_URL

Next, let's peek inside the database:

In [58]:
%%sql
SELECT COUNT(*) AS count  -- total number of rows in the statuses table
FROM statuses;

 * postgresql://postgres:***@db:5432/example
1 rows affected.


count
150098


In [59]:
%%sql
SELECT ingested_at
     , status ->> 'created_at'        AS created_at  -- created_at field of the status JSON, as text
     , status -> 'account' ->> 'acct' AS acct        -- acct field of the nested account object
     , url
  FROM statuses
 ORDER BY ingested_at DESC  -- most recently ingested rows first
 LIMIT 5;

 * postgresql://postgres:***@db:5432/example
5 rows affected.


ingested_at,created_at,acct,url
2024-03-24 16:42:51.812344+00:00,2024-03-24T16:38:29.992000+00:00,dollkitty@eepy.moe,https://eepy.moe/notes/9r9b546gxo9v04ia
2024-03-24 16:42:51.659410+00:00,2024-03-24T16:42:51+00:00,ForbesBR@mastodon.world,https://mastodon.world/@ForbesBR/112151663214249304
2024-03-24 16:42:50.844712+00:00,2024-03-24T16:42:48+00:00,nuopKINKclassics@mstdn.social,https://mstdn.social/@nuopKINKclassics/112151662979125991
2024-03-24 16:42:46.837840+00:00,2024-03-24T16:42:47+00:00,ThatOneSeong@mstdn.games,https://mstdn.games/@ThatOneSeong/112151662889239822
2024-03-24 16:42:46.520336+00:00,2024-03-24T16:42:46+00:00,MmeLibertine@laserdisc.party,https://laserdisc.party/@MmeLibertine/112151662842416557


In [60]:
%%sql
select  
    status->>'created_at' as created_at,
    status->'account'->>'acct' as acct,
    url
from statuses
where url='https://mefi.social/@rodneylives/112148538695972876'

 * postgresql://postgres:***@db:5432/example
1 rows affected.


created_at,acct,url
2024-03-24T03:28:15+00:00,rodneylives@mefi.social,https://mefi.social/@rodneylives/112148538695972876


In [61]:
# `os` is imported here because this cell reads os.environ; without it the
# cell raises NameError on a fresh kernel (Restart & Run All).
import os

import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.orm import Session

# Build a SQLAlchemy engine from the DATABASE_URL environment variable; the
# engine is reused by later cells.
engine = create_engine(os.environ["DATABASE_URL"])

# Pull a handful of raw rows into a DataFrame to see the table's columns.
sql_df = pd.read_sql(
    "SELECT * FROM statuses LIMIT 5",
    con=engine,
)

print(sql_df)

                                                 url  \
0  https://mastodon.social/@flexghost/11214591436...   
1  https://mastodon.social/@wikihow/1121459147568...   
2  https://energiewende.social/@ews/1121445151382...   
3  https://pubeurope.com/@europesays/112145915260...   
4    https://mastodon.world/@dbc3/112145916312802203   

                                              status  \
0  {'id': 112145914547710451, 'created_at': '2024...   
1  {'id': 112145914876336295, 'created_at': '2024...   
2  {'id': 112144515145163652, 'created_at': '2024...   
3  {'id': 112145915529377191, 'created_at': '2024...   
4  {'id': 112145916344817024, 'created_at': '2024...   

                       ingested_at  
0 2024-03-24 03:45:27.424886+00:00  
1 2024-03-24 03:45:27.424886+00:00  
2 2024-03-24 03:45:27.424886+00:00  
3 2024-03-24 03:45:27.424886+00:00  
4 2024-03-24 03:45:27.424886+00:00  


In [62]:
from collections import namedtuple

from sqlalchemy.sql import text

# Fetch the 500 most recently ingested statuses with the fields used below.
with engine.connect() as conn:
    stmt = text("""
        SELECT
            ingested_at,
            url,
            status->>'created_at' as created_at,
            status->'account'->>'acct' as acct,
            status->>'content' as content
        FROM statuses
        ORDER BY ingested_at DESC
        LIMIT 500;
    """)
    result = conn.execute(stmt)
    # Read the column names and rows while the connection is still open.
    # Consuming the result after the `with` block has released the
    # connection only works if the driver happens to have buffered
    # every row already.
    Record = namedtuple('Record', result.keys())
    records = [Record(*r) for r in result.fetchall()]

# Keep only rows that actually have content (drops NULL and empty strings).
texts = [r.content for r in records if r.content]

len(texts)

497

In [50]:
# `os` is imported here because this cell reads os.environ; without it the
# cell raises NameError on a fresh kernel (Restart & Run All).
import os

import pandas as pd
import requests

model_id = "sentence-transformers/all-MiniLM-L6-v2"
hf_token = os.environ["HF_TOKEN"]

api_url = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{model_id}"
headers = {"Authorization": f"Bearer {hf_token}"}

def query(texts, timeout=300):
    """POST `texts` to the hosted feature-extraction endpoint and return the decoded JSON.

    Args:
        texts: The inputs to send; passed through unchanged as the "inputs"
            field of the request body.
        timeout: Seconds to wait for the server before giving up. The
            request asks the API to wait for the model to load, so the
            default is generous.

    Returns:
        The JSON-decoded response body.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status, instead
            of handing an error payload to the caller as if it were data.
        requests.Timeout: If the server does not respond within `timeout`.
    """
    response = requests.post(
        api_url,
        headers=headers,
        json={"inputs": texts, "options": {"wait_for_model": True}},
        timeout=timeout,  # without a timeout a stalled request hangs the kernel
    )
    response.raise_for_status()
    return response.json()

output = query(texts)

# One row per input text, one column per value in the returned vector.
embeddings = pd.DataFrame(output)
embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
0,-0.1003,-0.010851,-0.034623,0.036346,-0.065682,-0.029552,-0.024287,0.074629,0.057447,0.07225,...,0.019523,-0.008885,-0.077388,0.037365,-0.005083,-0.020474,0.009549,-0.116174,-0.089271,-0.00739
1,-0.06791,0.00547,0.06797,0.113299,0.112527,-0.073462,-0.059319,-0.055771,0.063467,-0.044212,...,0.009749,-0.049348,0.019814,-0.046476,0.022843,0.073946,0.11018,-0.013547,-0.037518,0.109775
2,0.000719,-0.012539,0.032021,-0.067487,-0.041326,0.127709,0.119501,-0.049792,0.028721,0.042211,...,0.025907,0.011327,-0.026829,0.087655,-0.027822,0.047634,0.001941,0.003059,0.095492,-0.023399
3,0.037511,0.044581,0.030332,-0.023153,-0.030763,0.001008,0.040488,-0.050807,0.00516,-0.001805,...,-0.085664,0.060736,-0.001977,0.079599,0.026254,0.006632,-0.027923,0.020434,-0.070331,0.004485
4,-0.007013,0.092809,0.001108,-0.036603,-0.017564,-0.055933,0.060748,-0.061228,0.007798,0.028109,...,0.047596,0.056412,0.095493,0.099538,0.037703,0.076729,0.056135,0.107884,0.007432,-0.022197
5,-0.078623,-0.025328,0.028075,-0.027545,0.01982,0.006059,0.103441,0.074977,0.019548,0.009681,...,-0.061775,0.068855,0.08026,-0.000969,0.02586,-0.022743,0.055663,0.027918,-0.102274,0.032652
6,-0.037949,0.054235,-0.016899,0.048962,0.073815,-0.013273,0.045549,-0.047877,0.08072,-0.078653,...,-0.070494,-0.020681,0.065806,0.06522,0.006965,0.079274,0.078255,0.019477,0.049986,0.041979
7,-0.016637,0.057703,0.012327,0.03566,0.091406,-0.012396,-0.011756,-0.04741,0.036159,-0.065541,...,-0.060348,-0.020434,0.051089,0.013657,-0.011587,0.105816,0.021004,-0.043136,0.023047,0.022648
8,0.028436,0.060922,0.073959,0.072923,0.074735,0.026487,0.132479,-0.107002,0.017407,0.007162,...,-0.081334,0.016775,-0.034327,-0.012745,-0.023526,-0.043047,0.168033,-0.005798,-0.080661,0.02789
9,0.024459,0.062275,0.076008,0.070803,0.071658,0.028197,0.133511,-0.11091,0.017406,0.006326,...,-0.080693,0.014354,-0.02484,-0.009908,-0.021613,-0.044554,0.168824,-0.007658,-0.076766,0.031168


In [63]:
import pandas as pd
from sentence_transformers import SentenceTransformer

# Compute the embeddings in-process with the sentence-transformers library.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Encode every collected text; the result is wrapped in a DataFrame below
# purely for display.
embeddings = model.encode(texts)
embeddings_pd = pd.DataFrame(embeddings)

embeddings_pd

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
0,0.001576,-0.044549,0.068900,0.051478,0.065943,0.036167,-0.011818,-0.137685,0.020754,-0.082270,...,0.034994,0.034613,0.130006,-0.006049,0.028834,0.066540,0.064915,-0.031283,-0.028695,0.075974
1,-0.003909,0.025937,0.108607,0.061022,0.104501,-0.033058,0.008600,-0.030537,0.054267,-0.009156,...,-0.065552,0.028696,0.077573,0.036926,0.062552,0.076471,0.136393,-0.003233,0.019610,0.078161
2,-0.081343,-0.004263,0.008839,-0.015749,0.117912,-0.001043,0.084221,-0.052593,0.041261,-0.071807,...,-0.033437,-0.010702,0.055127,-0.113957,0.031162,0.052645,0.078290,-0.002862,0.011264,0.088883
3,-0.037027,0.086493,0.012666,0.075310,0.041972,-0.062141,-0.008178,-0.063790,0.094116,-0.068884,...,-0.028988,-0.061724,0.103738,0.077210,0.013356,0.084485,0.102253,0.001958,0.029453,0.053878
4,0.052143,-0.036124,0.079102,-0.039567,0.071045,0.057465,-0.001104,-0.058049,-0.085243,0.018180,...,0.012160,0.092444,0.026758,-0.007594,-0.002437,-0.017433,0.040831,-0.009671,0.013245,0.036506
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
492,0.001926,0.091240,-0.065020,0.001004,-0.074283,-0.018068,0.071776,-0.007106,0.010477,-0.023126,...,-0.030088,-0.041765,-0.016414,-0.066717,0.024376,0.032387,0.069944,0.014654,0.016021,-0.005407
493,-0.042539,-0.012765,0.039880,0.059286,0.092646,-0.029950,-0.003453,-0.053284,0.072983,-0.074738,...,-0.020359,-0.016952,0.097114,0.071252,-0.001465,0.043654,0.112718,0.037225,-0.022271,0.085373
494,0.033273,0.120777,0.044410,-0.025196,-0.071069,0.038629,0.042502,0.012607,0.008715,0.025479,...,0.047509,-0.008152,0.051308,-0.004399,0.000438,-0.007170,0.070383,0.048057,-0.044719,-0.004001
495,-0.068592,-0.000517,-0.038314,-0.017944,0.043208,0.036312,0.048263,0.000371,-0.020842,0.022704,...,-0.011334,0.013087,-0.051736,-0.059857,0.001293,0.059323,0.138017,-0.016872,-0.013370,0.064209
