In [1]:
import requests
import pandas as pd
import time
from typing import Optional, Tuple, List, Dict

PDS_URL  = "https://bsky.social"            # make sure this matches where your account lives
USERNAME = "timmyjunchen.bsky.social"        # include the “.bsky.social” suffix
PASSWORD = "Abcd1234" # use an App-Password if you have 2FA enabled
API_BASE = f"{PDS_URL}/xrpc/app.bsky.feed.searchPosts"

url     = f"{PDS_URL}/xrpc/com.atproto.server.createSession"
headers = {"Content-Type": "application/json"}
payload = {"identifier": USERNAME, "password": PASSWORD}

resp = requests.post(url, json=payload, headers=headers, timeout=10)

data = resp.json()
access_jwt = data["accessJwt"]
refresh_jwt = data.get("refreshJwt")  # for refreshing later, if needed

print("Logged in! Access JWT:", access_jwt)

Logged in! Access JWT: eyJ0eXAiOiJhdCtqd3QiLCJhbGciOiJFUzI1NksifQ.eyJzY29wZSI6ImNvbS5hdHByb3RvLmFjY2VzcyIsInN1YiI6ImRpZDpwbGM6enBsZzNpYmN5am5vZng2YTdqcWx1c3czIiwiaWF0IjoxNzQ2MjA2NzI0LCJleHAiOjE3NDYyMTM5MjQsImF1ZCI6ImRpZDp3ZWI6bGVjY2ludW0udXMtd2VzdC5ob3N0LmJza3kubmV0d29yayJ9.l1TTZhqI-283riMirBh27X6KWMzHvCQDJO3kvsuoZqGGc0TzFCjzbyCDtcb1hpSuzCLQe5NeE5mZJekVuHc6VA


In [50]:
def search_bluesky_posts(
    jwt: str,
    query: str,
    limit: int = 25,
    cursor: Optional[str] = None,
    sort: str = "latest",
    lang: Optional[str] = "en"
) -> Tuple[List[Dict], Optional[str]]:
    headers = {"Authorization": f"Bearer {jwt}"}
    params = {"q": query, "limit": limit, "sort": sort}
    if cursor:
        params["cursor"] = cursor
    if lang:
        params["lang"] = lang

    resp = requests.get(API_BASE, params=params, headers=headers, timeout=10)
    resp.raise_for_status()
    data = resp.json()

    posts = []
    for hit in data.get("posts", []):
        rec = hit
        posts.append({
            "uri":        rec["uri"],
            "author":     hit["author"]["handle"],
            "text":       rec["record"].get("text", ""),
            "created_at": rec["record"]["createdAt"],
            "reply_count":   rec["replyCount"],
            "like_count":    rec["likeCount"],
            "repost_count":  rec["repostCount"]
        })

    return posts, data.get("cursor")

In [62]:
import subprocess
import json

def scrape_twitter_cli(query: str, limit: int = 50) -> List[Dict]:
    """
    Calls `twscrape search <query> --limit <limit>`,
    parses JSON-per-line stdout into dicts.
    """
    cmd = ["twscrape", "search", query, f"--limit={str(limit)}"]
    completed = subprocess.run(cmd, capture_output=True, text=True, check=True)
    tweets = []
    for line in completed.stdout.splitlines():
        if not line.strip():
            continue
        rec = json.loads(line)
        rec.update({
            "platform": "twitter",
            "query":    query
        })
        tweets.append(rec)
    return tweets

In [76]:
def build_training_dataset(
    events_df: pd.DataFrame,
    twitter_limit: int,
    bluesky_limit: int,
    access_jwt: str
) -> pd.DataFrame:
    records = []
    for _, ev in events_df.iterrows():
        posts = []
        query = f"{ev['topic']} until:{ev['expiration_time']}"

        print("query: ", query)
        try:
            tw_recs = scrape_twitter_cli(query + " lang:en", limit=twitter_limit)
        except subprocess.CalledProcessError as e:
            print(f"twscrape failed for {query}: {e}")
            tw_recs = []
        print("twit num")
        print(len(tw_recs))
        for r in tw_recs:
            posts.append(r["rawContent"])

        bs_recs, _ = search_bluesky_posts(access_jwt, query, limit=bluesky_limit)
        print("bluesky num")
        print(len(bs_recs))
        for r in bs_recs:
            posts.append(r["text"])

        row = {f"post_{i+1}": txt for i, txt in enumerate(posts)}
        records.append(row)
        break
        time.sleep(80)  # throttle

    return pd.DataFrame(records)

In [15]:
KALSHI_EVENTS = "https://api.elections.kalshi.com/trade-api/v2/events"
culture_series = ["KXSPOTIFYD"]#, "KXGAMEAWARDS"]
events = []
for ticker in culture_series:
    cursor = None
    while True:
        params = dict(
            limit=200,
            cursor=cursor,
            series_ticker=ticker,
            with_nested_markets=True
        )
        payload = requests.get(KALSHI_EVENTS, params=params).json()
        for ev in payload["events"]:
            events.append({
                "series":      ticker,
                "title":       ev["title"],
                "topic":  ev["markets"][0]["no_sub_title"],
                "result": ev["markets"][0]["result"],
                "expiration_time": ev["markets"][0]["expiration_time"],
            })
        cursor = payload.get("cursor")
        if not cursor:
            break

events_df = pd.DataFrame(events)

In [16]:
events_df.iloc[::-1]

Unnamed: 0,series,title,topic,result,expiration_time
187,KXSPOTIFYD,"Top song on Spotify on Oct 28, 2024?",Taste,no,2024-11-11T14:00:00Z
186,KXSPOTIFYD,"Top song on Spotify on Oct 29, 2024?",St. Chroma,yes,2024-11-12T14:00:00Z
185,KXSPOTIFYD,"Top song on Spotify USA Chart on Oct 30, 2024?",That's so True,no,2024-11-13T14:00:00Z
184,KXSPOTIFYD,"Top song on Spotify USA Chart on Oct 31, 2024?",Sticky,no,2024-11-14T14:00:00Z
183,KXSPOTIFYD,"Top song on Spotify USA Chart on Nov 1, 2024?",That's So True,no,2024-11-15T14:00:00Z
...,...,...,...,...,...
4,KXSPOTIFYD,"Top USA Song on Spotify on Apr 29, 2025?",What Was That,no,2025-05-13T14:00:00Z
3,KXSPOTIFYD,"Top USA Song on Spotify on Apr 30, 2025?",tv off (feat. lefty gunplay),no,2025-05-14T14:00:00Z
2,KXSPOTIFYD,"Top USA Song on Spotify on May 1, 2025?",I Ain't Comin' Back (feat. Post Malone),no,2025-05-15T14:00:00Z
1,KXSPOTIFYD,Top USA Song on Spotify today?,Pink Pony Club,,2025-05-16T14:00:00Z


In [77]:
dataset_df = build_training_dataset(
        events_df.iloc[::-1],
        twitter_limit=20,
        bluesky_limit=20,
        access_jwt=access_jwt
    )

query:  Taste until:2024-11-11T14:00:00Z
twit num
32
bluesky num
19


In [78]:
dataset_df

Unnamed: 0,post_1,post_2,post_3,post_4,post_5,post_6,post_7,post_8,post_9,post_10,...,post_42,post_43,post_44,post_45,post_46,post_47,post_48,post_49,post_50,post_51
0,*connoisseur = an expert judge in matters of t...,@Videl_I_Think Have good taste*,@RobertM77033031 @KILLTOPARTY A man of refined...,@chr_i_s_t_ine I always knew you had the best ...,@orangie Orangie is a man of taste. Enjoy !,I am enjoying an Egg McMuffin @McDonalds follo...,@dominiiiqueeee Whatchu got a taste for?,Rather than allowing their criticism to deter ...,the privlage princess got a 0.01% taste of wha...,@Mrterrazzo @ChefGruel That picture almost let...,...,ITS OKAY I LOVE IT SM!!!!! i love ur daily sif...,Might as well. I heard it's like Yu-Gi-Oh! Due...,taste is 💯,Obviously Mary has amazing taste in Disney mov...,"it's also quite healthy, another one I like bu...",Someone needs better taste then. I'm mostly la...,I need more than a taste 😈,The Persimmon Tree\n31 July 2021 | E-rated | 5...,It may not solve the mystery of Road Rash 64 i...,Toothless: mmmmm you taste good


In [82]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd
import json
analyzer = SentimentIntensityAnalyzer()
texts = dataset_df.astype(str)

# 2) applymap over every cell, extracting only 'compound'
compound_df = texts.applymap(lambda txt: analyzer.polarity_scores(txt)['compound'])
compound_df

  compound_df = texts.applymap(lambda txt: analyzer.polarity_scores(txt)['compound'])


Unnamed: 0,post_1,post_2,post_3,post_4,post_5,post_6,post_7,post_8,post_9,post_10,...,post_42,post_43,post_44,post_45,post_46,post_47,post_48,post_49,post_50,post_51
0,0.296,0.4404,0.0,0.91,0.5411,0.9098,0.0,-0.8402,0.0258,0.5859,...,0.9649,0.8635,0.0,0.4548,0.9633,0.1027,0.4588,0.0,0.4027,0.128


In [83]:
final_df = compound_df.join(events_df["result"])

In [84]:
final_df

Unnamed: 0,post_1,post_2,post_3,post_4,post_5,post_6,post_7,post_8,post_9,post_10,...,post_43,post_44,post_45,post_46,post_47,post_48,post_49,post_50,post_51,result
0,0.296,0.4404,0.0,0.91,0.5411,0.9098,0.0,-0.8402,0.0258,0.5859,...,0.8635,0.0,0.4548,0.9633,0.1027,0.4588,0.0,0.4027,0.128,
