In [13]:
import requests
import pandas as pd
import time
from typing import Optional, Tuple, List, Dict

PDS_URL  = "https://bsky.social"            # make sure this matches where your account lives
USERNAME = "timmyjunchen.bsky.social"        # include the “.bsky.social” suffix
PASSWORD = "Abcd1234" # use an App-Password if you have 2FA enabled
API_BASE = f"{PDS_URL}/xrpc/app.bsky.feed.searchPosts"

url     = f"{PDS_URL}/xrpc/com.atproto.server.createSession"
headers = {"Content-Type": "application/json"}
payload = {"identifier": USERNAME, "password": PASSWORD}

resp = requests.post(url, json=payload, headers=headers, timeout=10)

data = resp.json()
access_jwt = data["accessJwt"]
refresh_jwt = data.get("refreshJwt")  # for refreshing later, if needed

print("Logged in! Access JWT:", access_jwt)

Logged in! Access JWT: eyJ0eXAiOiJhdCtqd3QiLCJhbGciOiJFUzI1NksifQ.eyJzY29wZSI6ImNvbS5hdHByb3RvLmFjY2VzcyIsInN1YiI6ImRpZDpwbGM6enBsZzNpYmN5am5vZng2YTdqcWx1c3czIiwiaWF0IjoxNzQ2MjEwNTQ5LCJleHAiOjE3NDYyMTc3NDksImF1ZCI6ImRpZDp3ZWI6bGVjY2ludW0udXMtd2VzdC5ob3N0LmJza3kubmV0d29yayJ9.Yzmyt5Un8cI2tc8-296VG8KJq46FLKJenYs5w69vBpe1iqYw6AilHil5EDNLDo_nGTROUdHnvODJt1N1Pc_R-w


In [14]:
def search_bluesky_posts(
    jwt: str,
    query: str,
    limit: int = 25,
    cursor: Optional[str] = None,
    sort: str = "latest",
    lang: Optional[str] = "en"
) -> Tuple[List[Dict], Optional[str]]:
    headers = {"Authorization": f"Bearer {jwt}"}
    params = {"q": query, "limit": limit, "sort": sort}
    if cursor:
        params["cursor"] = cursor
    if lang:
        params["lang"] = lang

    resp = requests.get(API_BASE, params=params, headers=headers, timeout=10)
    resp.raise_for_status()
    data = resp.json()

    posts = []
    for hit in data.get("posts", []):
        rec = hit
        posts.append({
            "uri":        rec["uri"],
            "author":     hit["author"]["handle"],
            "text":       rec["record"].get("text", ""),
            "created_at": rec["record"]["createdAt"],
            "reply_count":   rec["replyCount"],
            "like_count":    rec["likeCount"],
            "repost_count":  rec["repostCount"]
        })

    return posts, data.get("cursor")

In [16]:
import subprocess
import json

def scrape_twitter_cli(query: str, limit: int = 50) -> List[Dict]:
    """
    Calls `twscrape search <query> --limit <limit>`,
    parses JSON-per-line stdout into dicts.
    """
    cmd = ["twscrape", "search", query, f"--limit={str(limit)}"]
    completed = subprocess.run(cmd, capture_output=True, text=True, check=True)
    tweets = []
    for line in completed.stdout.splitlines():
        if not line.strip():
            continue
        rec = json.loads(line)
        rec.update({
            "platform": "twitter",
            "query":    query
        })
        tweets.append(rec)
    return tweets

In [57]:
def build_training_dataset(
    events_df: pd.DataFrame,
    twitter_limit: int,
    bluesky_limit: int,
    access_jwt: str
) -> pd.DataFrame:
    records = []
    num = 0
    for _, ev in events_df.iterrows():
        posts = []
        query = f"{ev['topic']} until:{ev['expiration_time']}"

        print("query: ", query)
        try:
            tw_recs = scrape_twitter_cli(query + " lang:en", limit=twitter_limit)
        except subprocess.CalledProcessError as e:
            print(f"twscrape failed for {query}: {e}")
            tw_recs = []
        print("twit num")
        print(len(tw_recs))
        for r in tw_recs:
            posts.append(r["rawContent"])

        bs_recs, _ = search_bluesky_posts(access_jwt, query, limit=bluesky_limit)
        print("bluesky num")
        print(len(bs_recs))
        for r in bs_recs:
            posts.append(r["text"])

        row = {f"post_{i+1}": txt for i, txt in enumerate(posts)}
        records.append(row)
        if num > 2:
            print("iteration: ", num)
            break
        num += 1
        time.sleep(10)  # throttle

    return pd.DataFrame(records)

In [39]:
KALSHI_EVENTS = "https://api.elections.kalshi.com/trade-api/v2/events"
culture_series = ["KXSPOTIFYD"]#, "KXGAMEAWARDS"]
events = []
for ticker in culture_series:
    cursor = None
    while True:
        params = dict(
            limit=200,
            cursor=cursor,
            series_ticker=ticker,
            with_nested_markets=True
        )
        payload = requests.get(KALSHI_EVENTS, params=params).json()
        for ev in payload["events"]:
            for market in ev["markets"]:
                events.append({
                    "series":      ticker,
                    "title":       ev["title"],
                    "topic":  market["no_sub_title"],
                    "result": market["result"],
                    "expiration_time": market["expiration_time"],
                    "yes_bid": market["yes_bid"],
                })
        cursor = payload.get("cursor")
        if not cursor:
            break

events_df = pd.DataFrame(events)

In [52]:
events_df = events_df.iloc[::-1]
events_df = events_df[events_df["yes_bid"] > 1]
events_df

Unnamed: 0,series,title,topic,result,expiration_time,yes_bid
2249,KXSPOTIFYD,"Top song on Spotify on Oct 29, 2024?",St. Chroma,yes,2024-11-12T14:00:00Z,95
2246,KXSPOTIFYD,"Top song on Spotify USA Chart on Oct 30, 2024?",St. Chroma,yes,2024-11-13T14:00:00Z,99
2240,KXSPOTIFYD,"Top song on Spotify USA Chart on Oct 31, 2024?",St. Chroma,no,2024-11-14T14:00:00Z,70
2237,KXSPOTIFYD,"Top song on Spotify USA Chart on Oct 31, 2024?",Sticky,no,2024-11-14T14:00:00Z,10
2236,KXSPOTIFYD,"Top song on Spotify USA Chart on Nov 1, 2024?","Darling, I",no,2024-11-15T14:00:00Z,2
...,...,...,...,...,...,...
116,KXSPOTIFYD,"Top USA Song on Spotify on Apr 26, 2025?",NOKIA,yes,2025-05-10T14:00:00Z,9
109,KXSPOTIFYD,"Top USA Song on Spotify on Apr 26, 2025?",What Was That,no,2025-05-10T14:00:00Z,2
30,KXSPOTIFYD,"Top USA Song on Spotify on May 2, 2025?",NOKIA,,2025-05-16T14:00:00Z,37
22,KXSPOTIFYD,"Top USA Song on Spotify on May 2, 2025?",luther (with sza),,2025-05-16T14:00:00Z,9


In [58]:
dataset_df = build_training_dataset(
        events_df,
        twitter_limit=20,
        bluesky_limit=20,
        access_jwt=access_jwt
    )

query:  St. Chroma until:2024-11-12T14:00:00Z
twit num
23
bluesky num
20
query:  St. Chroma until:2024-11-13T14:00:00Z
twit num
30
bluesky num
20
query:  St. Chroma until:2024-11-14T14:00:00Z
twit num
36
bluesky num
20
query:  Sticky until:2024-11-14T14:00:00Z
twit num
40
bluesky num
20
iteration:  3


In [59]:
dataset_df

Unnamed: 0,post_1,post_2,post_3,post_4,post_5,post_6,post_7,post_8,post_9,post_10,...,post_51,post_52,post_53,post_54,post_55,post_56,post_57,post_58,post_59,post_60
0,Welcome to life St Chroma!,Welcome to the world SPECIALZ https://t.co/B2Z...,@guicosz st chroma no B 💀,@tlprkive st. chroma,@Glory_be_Satori mf rose from the dead,damn itft is really good,@tylerthecreator as i navigate my own life and...,@tylerthecreator st chroma - with music i base...,@VampireGhuleh Unironically listen to St. Chro...,@tylerthecreator the beats transitions instrum...,...,,,,,,,,,,
1,@tylerthecreator could you get someone to get ...,i still wanna do a felix darling i edit.. but ...,@allishastlouis Ain't no way boy https://t.co/...,St. Chroma is truly incredible man. Tyler can ...,umm…… yay? https://t.co/ITgyOi2dm3,@beaches_p Weezer favs: Undone - The Sweater S...,@cutegashu congrats now look at this funny guy...,MISS MY MO+LI,@immaleakyoshit they're correct,@fazrabbit2 @Cosmicnoone Are you stupid it say...,...,,,,,,,,,,
2,"No matter what happens tomorrow, it'll still b...",@Jasp3r_0 I just thinks that st. Chroma sounds...,tight\n\nbut St. Chroma is better,Stars vs St. Chroma \n\nWhat is the better int...,Idk why but the song st. Chroma makes me want ...,@satosugaru okay but sticky goes soooo hard to...,@Jasp3r_0 St. Chroma,@bbokariswingz that’s a bit 🏳️‍🌈,Close but St. Chroma is better.,@cutegashu gashu what demons were you fighting,...,My friend and I are working on a film about th...,St Chroma walk cycle,I need St. Chroma injected into my veins,ST chroma,I already knew St. Chroma was gonna be my fav ...,1. ST CHROMA\n\n#furry #furryvr #vrchat #furry...,,,,
3,@Cruela_DeVil_ Yeah I opened it all and realis...,@TeTheGamer It’s $jam \nI feel it in my balls,🎭 Akool vs. Synthesia\n\nWhen it comes to AI A...,I really like playing mercy,Shitty sticky notes shouldn’t be on the market...,@cookitup31 Link broke already 🤦🏾‍♂️,@ape_descendent Dust sprayer and sticky tape,@larryelder Transitory over 3 years. Hmmm…..Th...,"@jadelaui Sure, but I have to warn you, there’...",@cmonJD stop why did we both tweet about mingy...,...,The original choreographer of the viral Sticky...,제피Pure Zephyr🐺🌛 (🔞Vtuber/ASMRtist) - 【 V - Zep...,slaps this sticky note to your big ass forehead,It did a great job of creating a really tense ...,"Better find a mop , it’s getting sticky in thi...",Some little guys I was sketching for a creatur...,Woke up and EXTREMELY horny. This fox alreedy ...,Oh I’m a cold and frosty lover too! Give me th...,"Nov 14, 2024\nWild Provisions Beer Project: 3p...",Glo’s verse on sticky >>>>>>


In [60]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd
import json
analyzer = SentimentIntensityAnalyzer()
texts = dataset_df.astype(str)

# 2) applymap over every cell, extracting only 'compound'
compound_df = texts.applymap(lambda txt: analyzer.polarity_scores(txt)['compound'])
compound_df

  compound_df = texts.applymap(lambda txt: analyzer.polarity_scores(txt)['compound'])


Unnamed: 0,post_1,post_2,post_3,post_4,post_5,post_6,post_7,post_8,post_9,post_10,...,post_51,post_52,post_53,post_54,post_55,post_56,post_57,post_58,post_59,post_60
0,0.5093,0.4588,-0.296,0.0,-0.6486,0.1263,0.8126,0.2585,0.0,0.7086,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.3412,0.34,0.2235,0.4404,0.5267,-0.2519,0.7766,-0.1531,0.0,-0.5267,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.0191,-0.4019,0.5927,0.4404,0.5859,0.7684,0.0,0.0,0.5927,-0.3612,...,0.7263,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0
3,-0.1635,0.0,0.7569,0.7674,-0.802,-0.4215,0.0,0.0,0.09,-0.296,...,0.6249,-0.6808,-0.5423,0.8682,-0.4389,0.1007,0.0,0.6239,0.0,0.0


In [63]:
compound_df["result"] = events_df["result"][:4].values
compound_df["yes_bid"] = events_df["yes_bid"][:4].values

In [64]:
compound_df

Unnamed: 0,post_1,post_2,post_3,post_4,post_5,post_6,post_7,post_8,post_9,post_10,...,post_53,post_54,post_55,post_56,post_57,post_58,post_59,post_60,result,yes_bid
0,0.5093,0.4588,-0.296,0.0,-0.6486,0.1263,0.8126,0.2585,0.0,0.7086,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,yes,95
1,-0.3412,0.34,0.2235,0.4404,0.5267,-0.2519,0.7766,-0.1531,0.0,-0.5267,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,yes,99
2,-0.0191,-0.4019,0.5927,0.4404,0.5859,0.7684,0.0,0.0,0.5927,-0.3612,...,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,no,70
3,-0.1635,0.0,0.7569,0.7674,-0.802,-0.4215,0.0,0.0,0.09,-0.296,...,-0.5423,0.8682,-0.4389,0.1007,0.0,0.6239,0.0,0.0,no,10


In [53]:
compound_df.to_csv("training_data.csv", index=False)