# PubPulse laboratory

First, a little setup: install the dependencies and connect to the database (output hidden, because it's noisy):

In [None]:
%%capture
!pip install psycopg2-binary
!pip install ipython-sql
!pip install -U sentence-transformers
!pip install SQLAlchemy --quiet

%load_ext sql
%sql $DATABASE_URL

import sys
sys.path.append('/app')

Next, let's peek inside the database:

In [None]:
%%sql
SELECT COUNT(*) FROM statuses;

In [None]:
%%sql
SELECT ingested_at,
       status->>'created_at' AS created_at,
       status->'account'->>'acct' AS acct,
       url
  FROM statuses
 ORDER BY ingested_at DESC
 LIMIT 5;

In [None]:
%%sql
select  
    status->>'created_at' as created_at,
    status->'account'->>'acct' as acct,
    url
from statuses
where url='https://mefi.social/@rodneylives/112148538695972876'

In [None]:
import os
from sqlalchemy import create_engine
from sqlalchemy.orm import Session
import pandas as pd

# Build a SQLAlchemy engine from the connection string held in the
# DATABASE_URL environment variable (raises KeyError if it is unset).
engine = create_engine(os.environ["DATABASE_URL"])

# Pull a handful of rows into a DataFrame as a quick look at the table.
sql_df = pd.read_sql(
    "SELECT * FROM statuses LIMIT 5",
    con=engine,
)

# A bare last expression gets the notebook's rich table rendering;
# print() would only emit the plain-text repr.
sql_df

In [None]:
from collections import namedtuple

from sqlalchemy.sql import text

# The 500 most recently ingested statuses, with a few fields extracted
# from the JSON `status` column as text.
stmt = text("""
        SELECT
            ingested_at,
            url,
            status->>'created_at' as created_at,
            status->'account'->>'acct' as acct,
            status->>'content' as content
        FROM statuses
        ORDER BY ingested_at DESC
        LIMIT 500;
    """)

with engine.connect() as conn:
    result = conn.execute(stmt)
    # The result's cursor belongs to this connection, so read the column
    # names and all rows before the `with` block releases the connection.
    Record = namedtuple('Record', result.keys())
    records = [Record(*r) for r in result.fetchall()]

# Keep only the non-empty `content` values (drops NULL and empty strings).
texts = [r.content for r in records if r.content]

len(texts)

In [None]:
# Disabled experiment kept for reference: this whole cell is one string
# literal, so none of the code inside it runs (the notebook just displays
# the string). The quoted code would POST `texts` to the Hugging Face hosted
# feature-extraction endpoint for all-MiniLM-L6-v2, authenticating with the
# HF_TOKEN environment variable, and load the JSON response into a DataFrame.
"""
model_id = "sentence-transformers/all-MiniLM-L6-v2"
hf_token = os.environ["HF_TOKEN"]

import requests

api_url = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{model_id}"
headers = {"Authorization": f"Bearer {hf_token}"}

def query(texts):
    response = requests.post(api_url, headers=headers, json={"inputs": texts, "options":{"wait_for_model":True}})
    return response.json()

output = query(texts)

import pandas as pd
embeddings = pd.DataFrame(output)
embeddings
"""

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer

# Load the all-MiniLM-L6-v2 model and encode the collected status texts.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embeddings = model.encode(texts)

# Wrap the encoder output in a DataFrame; as the cell's last expression
# it is displayed by the notebook.
embeddings_pd = pd.DataFrame(data=embeddings)
embeddings_pd