In [13]:
import requests
import pandas as pd
import time
from typing import Optional, Tuple, List, Dict

PDS_URL  = "https://bsky.social"            # make sure this matches where your account lives
USERNAME = "timmyjunchen.bsky.social"        # include the “.bsky.social” suffix
PASSWORD = "Abcd1234" # use an App-Password if you have 2FA enabled
API_BASE = f"{PDS_URL}/xrpc/app.bsky.feed.searchPosts"

url     = f"{PDS_URL}/xrpc/com.atproto.server.createSession"
headers = {"Content-Type": "application/json"}
payload = {"identifier": USERNAME, "password": PASSWORD}

resp = requests.post(url, json=payload, headers=headers, timeout=10)

data = resp.json()
access_jwt = data["accessJwt"]
refresh_jwt = data.get("refreshJwt")  # for refreshing later, if needed

print("Logged in! Access JWT:", access_jwt)

Logged in! Access JWT: eyJ0eXAiOiJhdCtqd3QiLCJhbGciOiJFUzI1NksifQ.eyJzY29wZSI6ImNvbS5hdHByb3RvLmFjY2VzcyIsInN1YiI6ImRpZDpwbGM6enBsZzNpYmN5am5vZng2YTdqcWx1c3czIiwiaWF0IjoxNzQ2MjEwNTQ5LCJleHAiOjE3NDYyMTc3NDksImF1ZCI6ImRpZDp3ZWI6bGVjY2ludW0udXMtd2VzdC5ob3N0LmJza3kubmV0d29yayJ9.Yzmyt5Un8cI2tc8-296VG8KJq46FLKJenYs5w69vBpe1iqYw6AilHil5EDNLDo_nGTROUdHnvODJt1N1Pc_R-w


In [14]:
def search_bluesky_posts(
    jwt: str,
    query: str,
    limit: int = 25,
    cursor: Optional[str] = None,
    sort: str = "latest",
    lang: Optional[str] = "en"
) -> Tuple[List[Dict], Optional[str]]:
    headers = {"Authorization": f"Bearer {jwt}"}
    params = {"q": query, "limit": limit, "sort": sort}
    if cursor:
        params["cursor"] = cursor
    if lang:
        params["lang"] = lang

    resp = requests.get(API_BASE, params=params, headers=headers, timeout=10)
    resp.raise_for_status()
    data = resp.json()

    posts = []
    for hit in data.get("posts", []):
        rec = hit
        posts.append({
            "uri":        rec["uri"],
            "author":     hit["author"]["handle"],
            "text":       rec["record"].get("text", ""),
            "created_at": rec["record"]["createdAt"],
            "reply_count":   rec["replyCount"],
            "like_count":    rec["likeCount"],
            "repost_count":  rec["repostCount"]
        })

    return posts, data.get("cursor")

In [16]:
import subprocess
import json

def scrape_twitter_cli(query: str, limit: int = 50) -> List[Dict]:
    """
    Calls `twscrape search <query> --limit <limit>`,
    parses JSON-per-line stdout into dicts.
    """
    cmd = ["twscrape", "search", query, f"--limit={str(limit)}"]
    completed = subprocess.run(cmd, capture_output=True, text=True, check=True)
    tweets = []
    for line in completed.stdout.splitlines():
        if not line.strip():
            continue
        rec = json.loads(line)
        rec.update({
            "platform": "twitter",
            "query":    query
        })
        tweets.append(rec)
    return tweets

In [None]:
def build_training_dataset(
    events_df: pd.DataFrame,
    twitter_limit: int,
    bluesky_limit: int,
    access_jwt: str
) -> pd.DataFrame:
    records = []
    num = 0
    for _, ev in events_df.iterrows():
        posts = []
        query = f"{ev['topic']} until:{ev['expiration_time']}"

        print("query: ", query)
        try:
            tw_recs = scrape_twitter_cli(query + " lang:en", limit=twitter_limit)
        except subprocess.CalledProcessError as e:
            print(f"twscrape failed for {query}: {e}")
            tw_recs = []
        print("twit num")
        print(len(tw_recs))
        for r in tw_recs:
            posts.append(r["rawContent"])

        bs_recs, _ = search_bluesky_posts(access_jwt, query, limit=bluesky_limit)
        print("bluesky num")
        print(len(bs_recs))
        for r in bs_recs:
            posts.append(r["text"])

        row = {f"post_{i+1}": txt for i, txt in enumerate(posts)}
        records.append(row)
        if num > 50:
            print("iteration: ", num)
            break
        time.sleep(150)  # throttle

    return pd.DataFrame(records)

In [39]:
KALSHI_EVENTS = "https://api.elections.kalshi.com/trade-api/v2/events"
culture_series = ["KXSPOTIFYD"]#, "KXGAMEAWARDS"]
events = []
for ticker in culture_series:
    cursor = None
    while True:
        params = dict(
            limit=200,
            cursor=cursor,
            series_ticker=ticker,
            with_nested_markets=True
        )
        payload = requests.get(KALSHI_EVENTS, params=params).json()
        for ev in payload["events"]:
            for market in ev["markets"]:
                events.append({
                    "series":      ticker,
                    "title":       ev["title"],
                    "topic":  market["no_sub_title"],
                    "result": market["result"],
                    "expiration_time": market["expiration_time"],
                    "yes_bid": market["yes_bid"],
                })
        cursor = payload.get("cursor")
        if not cursor:
            break

events_df = pd.DataFrame(events)

In [52]:
events_df = events_df.iloc[::-1]
events_df = events_df[events_df["yes_bid"] > 1]
events_df

Unnamed: 0,series,title,topic,result,expiration_time,yes_bid
2249,KXSPOTIFYD,"Top song on Spotify on Oct 29, 2024?",St. Chroma,yes,2024-11-12T14:00:00Z,95
2246,KXSPOTIFYD,"Top song on Spotify USA Chart on Oct 30, 2024?",St. Chroma,yes,2024-11-13T14:00:00Z,99
2240,KXSPOTIFYD,"Top song on Spotify USA Chart on Oct 31, 2024?",St. Chroma,no,2024-11-14T14:00:00Z,70
2237,KXSPOTIFYD,"Top song on Spotify USA Chart on Oct 31, 2024?",Sticky,no,2024-11-14T14:00:00Z,10
2236,KXSPOTIFYD,"Top song on Spotify USA Chart on Nov 1, 2024?","Darling, I",no,2024-11-15T14:00:00Z,2
...,...,...,...,...,...,...
116,KXSPOTIFYD,"Top USA Song on Spotify on Apr 26, 2025?",NOKIA,yes,2025-05-10T14:00:00Z,9
109,KXSPOTIFYD,"Top USA Song on Spotify on Apr 26, 2025?",What Was That,no,2025-05-10T14:00:00Z,2
30,KXSPOTIFYD,"Top USA Song on Spotify on May 2, 2025?",NOKIA,,2025-05-16T14:00:00Z,37
22,KXSPOTIFYD,"Top USA Song on Spotify on May 2, 2025?",luther (with sza),,2025-05-16T14:00:00Z,9


In [None]:
dataset_df = build_training_dataset(
        events_df,
        twitter_limit=20,
        bluesky_limit=20,
        access_jwt=access_jwt
    )

query:  APT. until:2024-11-11T14:00:00Z
twit num
36
bluesky num
20
query:  BIRDS OF A FEATHER until:2024-11-11T14:00:00Z
twit num
40
bluesky num
19
2258


In [33]:
dataset_df

Unnamed: 0,post_1,post_2,post_3,post_4,post_5,post_6,post_7,post_8,post_9,post_10,...,post_50,post_51,post_52,post_53,post_54,post_55,post_56,post_57,post_58,post_59
0,well now I need her on an actual APT remix,I’m screaming https://t.co/9crPBGagzJ,@rykyok furry feet 2 is peak though i think th...,"@JhayCripzzyy @xabierablume @chuubry not rlly,...",@Jukes_TV @elonmusk Rand Paul not exactly a te...,What folk are more apt to go native first?,🚨#WATCH: As Chaos and Shocked Travelers Witnes...,@asianjunkiecom My dad and brother sent me APT...,🚨🚨911 Emergency!\n 🚨🚨Posted in Self-Defense!\n...,will never get the obsession w numbers. tbh id...,...,aint even left my apt yet and I already wanna ...,Fool aid is such an apt descriptor.,Afternoon Bluesky. Very apt I joined yesterda...,I deep cleaned the fuck out of my apt. Like to...,arrrrgghh! got a new light up vanity for my ap...,"In Pa, they believe every word. Gas and rent w...",instructions unclear !!\n\ni did finally join ...,,,
1,@PInsider_ Birds of a feather obviously,"@AMAZlNGNATURE Birds of a feather, Ducks in a ...",Don’t be rash. There are pockets of love and p...,For those who are thinking about leaving Twitt...,@mikepompeo @SpeakerJohnson “Power of the Purs...,birds of a feather really do flock together th...,@GuntherEagleman Birds of a feather,"fuck it. idc if theres already one, when s6 en...",not birds of a feather playing on this,What kind of creature is this? 🦑🌊🤔 https://t.c...,...,"Horned Screamer, Amazonas, Colombia.\n\n#birds","How ironic ; or timely, that we all left toget...","❄️🌻 ironically, hrid gets along well with sky'...",I also dont want to be associated with that il...,My Grammy rankings:\n\n1) Good Luck Babe — Cha...,"A masked bandit (OK, a juvenile #Nuthatch) abo...","sometimes i flip in between of thinking, ""am i...","Birds of a feather flock together, unsurprisin...",Great Blue Heron hunting for food at low tide....,trump pretended to give a microphone a blowjob...


In [34]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd
import json
analyzer = SentimentIntensityAnalyzer()
texts = dataset_df.astype(str)

# 2) applymap over every cell, extracting only 'compound'
compound_df = texts.applymap(lambda txt: analyzer.polarity_scores(txt)['compound'])
compound_df

  compound_df = texts.applymap(lambda txt: analyzer.polarity_scores(txt)['compound'])


Unnamed: 0,post_1,post_2,post_3,post_4,post_5,post_6,post_7,post_8,post_9,post_10,...,post_50,post_51,post_52,post_53,post_54,post_55,post_56,post_57,post_58,post_59
0,0.2732,-0.3818,0.0,0.296,0.4404,0.0,-0.1779,0.3919,-0.3147,0.7943,...,0.0,-0.4404,0.9214,-0.25,-0.5728,0.5927,-0.3907,0.0,0.0,0.0
1,0.0,0.0,-0.4939,0.0,0.0,0.0,0.0,-0.5423,0.2023,0.0,...,0.0,-0.128,0.2023,-0.0572,0.9311,0.5267,0.7337,0.7269,0.4767,0.3612


In [42]:
compound_df["result"] = events_df.iloc[::-1]["result"][:2].values
compound_df["yes_bid"] = events_df.iloc[::-1]["yes_bid"][:2].values

In [43]:
compound_df

Unnamed: 0,post_1,post_2,post_3,post_4,post_5,post_6,post_7,post_8,post_9,post_10,...,post_52,post_53,post_54,post_55,post_56,post_57,post_58,post_59,result,yes_bid
0,0.2732,-0.3818,0.0,0.296,0.4404,0.0,-0.1779,0.3919,-0.3147,0.7943,...,0.9214,-0.25,-0.5728,0.5927,-0.3907,0.0,0.0,0.0,no,1
1,0.0,0.0,-0.4939,0.0,0.0,0.0,0.0,-0.5423,0.2023,0.0,...,0.2023,-0.0572,0.9311,0.5267,0.7337,0.7269,0.4767,0.3612,no,0


In [53]:
compound_df.to_csv("training_data.csv", index=False)