In [62]:
import requests
import pandas as pd
import time
from typing import Optional, Tuple, List, Dict

PDS_URL  = "https://bsky.social"            # make sure this matches where your account lives
USERNAME = "timmyjunchen.bsky.social"        # include the “.bsky.social” suffix
PASSWORD = "Abcd1234" # use an App-Password if you have 2FA enabled
API_BASE = f"{PDS_URL}/xrpc/app.bsky.feed.searchPosts"

url     = f"{PDS_URL}/xrpc/com.atproto.server.createSession"
headers = {"Content-Type": "application/json"}
payload = {"identifier": USERNAME, "password": PASSWORD}

resp = requests.post(url, json=payload, headers=headers, timeout=10)

data = resp.json()
access_jwt = data["accessJwt"]
refresh_jwt = data.get("refreshJwt")  # for refreshing later, if needed

print("Logged in! Access JWT:", access_jwt)

Logged in! Access JWT: eyJ0eXAiOiJhdCtqd3QiLCJhbGciOiJFUzI1NksifQ.eyJzY29wZSI6ImNvbS5hdHByb3RvLmFjY2VzcyIsInN1YiI6ImRpZDpwbGM6enBsZzNpYmN5am5vZng2YTdqcWx1c3czIiwiaWF0IjoxNzQ2OTYzMTE5LCJleHAiOjE3NDY5NzAzMTksImF1ZCI6ImRpZDp3ZWI6bGVjY2ludW0udXMtd2VzdC5ob3N0LmJza3kubmV0d29yayJ9.U1B8M8w_Uz_vlzoanZtEjlU4aZpdNc-_8YRKqpZXp5fFMZrumyW5U3MtydMzmlJ98WMj6zQRoIhHLdyH3H2SXw


In [5]:
def search_bluesky_posts(
    jwt: str,
    query: str,
    limit: int = 25,
    cursor: Optional[str] = None,
    sort: str = "latest",
    lang: Optional[str] = "en"
) -> Tuple[List[Dict], Optional[str]]:
    headers = {"Authorization": f"Bearer {jwt}"}
    params = {"q": query, "limit": limit, "sort": sort}
    if cursor:
        params["cursor"] = cursor
    if lang:
        params["lang"] = lang

    resp = requests.get(API_BASE, params=params, headers=headers, timeout=10)
    resp.raise_for_status()
    data = resp.json()

    posts = []
    for hit in data.get("posts", []):
        rec = hit
        posts.append({
            "uri":        rec["uri"],
            "author":     hit["author"]["handle"],
            "text":       rec["record"].get("text", ""),
            "created_at": rec["record"]["createdAt"],
            "reply_count":   rec["replyCount"],
            "like_count":    rec["likeCount"],
            "repost_count":  rec["repostCount"]
        })

    return posts, data.get("cursor")

In [6]:
import subprocess
import json

def scrape_twitter_cli(query: str, limit: int = 50) -> List[Dict]:
    """
    Calls `twscrape search <query> --limit <limit>`,
    parses JSON-per-line stdout into dicts.
    """
    cmd = ["twscrape", "search", query, f"--limit={str(limit)}"]
    completed = subprocess.run(cmd, capture_output=True, text=True, check=True)
    tweets = []
    for line in completed.stdout.splitlines():
        if not line.strip():
            continue
        rec = json.loads(line)
        rec.update({
            "platform": "twitter",
            "query":    query
        })
        tweets.append(rec)
    return tweets

In [17]:
import praw
import os
from datetime import datetime

def search_reddit_posts(query: str, date: str, limit: int = 20) -> List[Dict]:
    if isinstance(date, str):
        # parse ISO8601 with trailing Z
        date = datetime.strptime(date, "%Y-%m-%dT%H:%M:%SZ")

    reddit = praw.Reddit(
        client_id=os.getenv("REDDIT_CLIENT_ID"),
        client_secret=os.getenv("REDDIT_CLIENT_SECRET"),
        user_agent=os.getenv("REDDIT_USER_AGENT"),
    )

    matched = []
    for submission in reddit.subreddit("all").search(
        query,
        sort="top",           # get top posts first
        time_filter="all",    # consider all time
        limit=100           # fetch up to 100
    ):
        created = datetime.fromtimestamp(submission.created_utc)
        if created < date:
            matched.append({
                "title":      submission.title,
                "text":       submission.selftext,
                "upvotes":    submission.score,
                "comments":   submission.num_comments,
                "created_at": created,
            })
            if len(matched) >= limit:
                break  # we’ve got our 20

    return matched

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd
import re
analyzer = SentimentIntensityAnalyzer()

def build_training_dataset(
    events_df: pd.DataFrame,
    # twitter_limit: int,
    bluesky_limit: int,
    reddit_limit:int,
    access_jwt: str,
    max_retries: int = 5,
) -> pd.DataFrame:
    
    # empty = pd.DataFrame(columns=[f"post_{i+1}" for i in range(70)] + ["result","yes_bid"])
    # empty.to_csv("training_data.csv", index=False)

    num = 0
    for _, ev in events_df.iterrows():
        if num < 100:
            num += 1
            continue
        # 1) gather raw texts
        raw_texts = []

        is_negated = bool(re.match(r"(?i)^\s*Not:\s*", ev["topic"]))
        is_season   = bool(re.search(r"(?i)Season\s*\d+", ev["topic"]))

        clean_topic = re.sub(r"\s*\([^)]*\)", "", ev["topic"])

        #Remove Not... and negate result
        clean_topic = re.sub(r"(?i)^\s*Not\s*", "", clean_topic)

        #Remove ...:Season 1
        clean_topic  = re.sub(r"(?i):\s*Season\s*\d+", "", clean_topic).strip()

        #Remove by 'artist'
        clean_topic = re.sub(r"\s+by\s+.*$", "", clean_topic, flags=re.IGNORECASE)

        if 'app' in clean_topic.lower() or is_season:
            query = f"{clean_topic} until:{ev['expiration_time']}"
        else:
            query = f"{clean_topic} song until:{ev['expiration_time']}"
        
        print(f"Iteration: {num}")
        print(query)
        # — Twitter —
        # try:
        #     tw_recs = scrape_twitter_cli(query + " lang:en", limit=twitter_limit)
        # except subprocess.CalledProcessError as e:
        #     print(f"Twitter scrape failed for {query}: {e}")
        #     tw_recs = []
        # raw_texts += [r["rawContent"] for r in tw_recs]
        
        # — Bluesky —
        retries = 0
        while retries < max_retries:
            try:
                bs_recs, _ = search_bluesky_posts(access_jwt, query, limit=bluesky_limit)
            except Exception as e:
                print(f"Bluesky error: {e}")
                bs_recs = []
            if bs_recs:
                break
            retries += 1
            print(f"  → Bluesky empty, retry {retries}/{max_retries} in 10 s")
            time.sleep(10)

        raw_texts += [r["text"] for r in bs_recs]
        
        # — Reddit —
        retries = 0
        while retries < max_retries:
            try:
                rd_recs = search_reddit_posts(query, ev.expiration_time, limit=reddit_limit)
            except Exception as e:
                print(f"Reddit error: {e}")
                rd_recs = []
            if rd_recs:
                break
            retries += 1
            print(f"  → Reddit empty, retry {retries}/{max_retries} in 10 s")
            time.sleep(10)
        raw_texts += [
            (r["title"] + "\n\n" + r["text"]).strip()
            for r in rd_recs
        ]
        
        # 2) compute compound sentiment scores
        compounds = [analyzer.polarity_scores(txt)["compound"] 
                     for txt in raw_texts]
        
        print(f"Num of posts: {len(compounds)}; Bluesky: {len(bs_recs)}; Reddit: {len(rd_recs)}")
        
        orig_result = ev["result"]
        if is_negated:
            # assuming 'yes'/'no' strings; adjust if yours are booleans or other labels
            flipped = "no" if str(orig_result).lower() == "yes" else "yes"
        else:
            flipped = orig_result
        
        row_df = pd.DataFrame([{
            **{f"post_{i+1}": compounds[i] for i in range(len(compounds))},
            "result": flipped,
            "yes_bid": ev["yes_bid"]
        }])
        
        row_df.to_csv("training_data.csv", mode="a", header=False, index=False)
        num += 1
        time.sleep(10)

In [None]:
KALSHI_EVENTS = "https://api.elections.kalshi.com/trade-api/v2/events"
culture_series = ["KXTOPSONG", "KXNETFLIXRANKSHOW", "KXAPPRANKFREE"] #KXSPOTIFYD
events = []
for ticker in culture_series:
    cursor = None
    while True:
        params = dict(
            limit=200,
            cursor=cursor,
            series_ticker=ticker,
            with_nested_markets=True
        )
        payload = requests.get(KALSHI_EVENTS, params=params).json()
        for ev in payload.get("events", []):
            for market in ev["markets"]:
                events.append({
                    "series":      ticker,
                    "title":       ev["title"],
                    "topic":  market["no_sub_title"],
                    "result": market["result"],
                    "expiration_time": market["expiration_time"],
                    "yes_bid": market["yes_bid"],
                    "category": market["category"],
                })
        cursor = payload.get("cursor")
        if not cursor:
            break

events_df = pd.DataFrame(events)

In [None]:
events_df = events_df.iloc[::-1]
events_df = events_df[events_df["yes_bid"] > 0]
events_df.head(83)

Unnamed: 0,series,title,topic,result,expiration_time,yes_bid,category
5,KXTOPSONG,"Billboard Hot 100 #1 on May 24, 2025 chart?",Ordinary,,2025-06-07T14:00:00Z,43,
7,KXTOPSONG,"Billboard Hot 100 #1 on May 24, 2025 chart?",Luther,,2025-06-07T14:00:00Z,41,
13,KXTOPSONG,"Billboard Hot 100 #1 on May 17, 2025 chart?",luther,,2025-05-31T14:00:00Z,94,
51,KXTOPSONG,"Billboard Hot 100 #1 on Apr 26, 2025 chart?",luther,yes,2025-05-10T14:00:00Z,97,
57,KXTOPSONG,"Billboard Hot 100 #1 on Apr 19, 2025 chart?",luther,yes,2025-05-03T14:00:00Z,99,
...,...,...,...,...,...,...,...
482,KXTOPSONG,"Billboard Hot 100 #1 on Jan 27, 2024?",Not Lovin on Me,no,2024-02-10T15:00:00Z,1,
490,KXTOPSONG,"Billboard Hot 100 #1 on Jan 20, 2024?",Not Lovin On Me,yes,2024-02-03T15:00:00Z,95,
495,KXTOPSONG,"Billboard Hot 100 #1 on Jan 13, 2024?",Not Lovin On Me,yes,2024-01-27T15:00:00Z,97,
498,KXTOPSONG,"Billboard Hot 100 #1 on Jan 6, 2024?",Not Rockin' Around The Christmas Tree,yes,2024-01-20T15:00:00Z,80,


In [87]:
dataset_df = build_training_dataset(
        events_df,
        # twitter_limit=20,
        bluesky_limit=20,
        reddit_limit=20,
        access_jwt=access_jwt
    )

UnboundLocalError: cannot access local variable 'query' where it is not associated with a value

In [59]:
dataset_df

Unnamed: 0,post_1,post_2,post_3,post_4,post_5,post_6,post_7,post_8,post_9,post_10,...,post_51,post_52,post_53,post_54,post_55,post_56,post_57,post_58,post_59,post_60
0,Welcome to life St Chroma!,Welcome to the world SPECIALZ https://t.co/B2Z...,@guicosz st chroma no B 💀,@tlprkive st. chroma,@Glory_be_Satori mf rose from the dead,damn itft is really good,@tylerthecreator as i navigate my own life and...,@tylerthecreator st chroma - with music i base...,@VampireGhuleh Unironically listen to St. Chro...,@tylerthecreator the beats transitions instrum...,...,,,,,,,,,,
1,@tylerthecreator could you get someone to get ...,i still wanna do a felix darling i edit.. but ...,@allishastlouis Ain't no way boy https://t.co/...,St. Chroma is truly incredible man. Tyler can ...,umm…… yay? https://t.co/ITgyOi2dm3,@beaches_p Weezer favs: Undone - The Sweater S...,@cutegashu congrats now look at this funny guy...,MISS MY MO+LI,@immaleakyoshit they're correct,@fazrabbit2 @Cosmicnoone Are you stupid it say...,...,,,,,,,,,,
2,"No matter what happens tomorrow, it'll still b...",@Jasp3r_0 I just thinks that st. Chroma sounds...,tight\n\nbut St. Chroma is better,Stars vs St. Chroma \n\nWhat is the better int...,Idk why but the song st. Chroma makes me want ...,@satosugaru okay but sticky goes soooo hard to...,@Jasp3r_0 St. Chroma,@bbokariswingz that’s a bit 🏳️‍🌈,Close but St. Chroma is better.,@cutegashu gashu what demons were you fighting,...,My friend and I are working on a film about th...,St Chroma walk cycle,I need St. Chroma injected into my veins,ST chroma,I already knew St. Chroma was gonna be my fav ...,1. ST CHROMA\n\n#furry #furryvr #vrchat #furry...,,,,
3,@Cruela_DeVil_ Yeah I opened it all and realis...,@TeTheGamer It’s $jam \nI feel it in my balls,🎭 Akool vs. Synthesia\n\nWhen it comes to AI A...,I really like playing mercy,Shitty sticky notes shouldn’t be on the market...,@cookitup31 Link broke already 🤦🏾‍♂️,@ape_descendent Dust sprayer and sticky tape,@larryelder Transitory over 3 years. Hmmm…..Th...,"@jadelaui Sure, but I have to warn you, there’...",@cmonJD stop why did we both tweet about mingy...,...,The original choreographer of the viral Sticky...,제피Pure Zephyr🐺🌛 (🔞Vtuber/ASMRtist) - 【 V - Zep...,slaps this sticky note to your big ass forehead,It did a great job of creating a really tense ...,"Better find a mop , it’s getting sticky in thi...",Some little guys I was sketching for a creatur...,Woke up and EXTREMELY horny. This fox alreedy ...,Oh I’m a cold and frosty lover too! Give me th...,"Nov 14, 2024\nWild Provisions Beer Project: 3p...",Glo’s verse on sticky >>>>>>


In [60]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd
import json
analyzer = SentimentIntensityAnalyzer()
texts = dataset_df.astype(str)

# 2) applymap over every cell, extracting only 'compound'
compound_df = texts.applymap(lambda txt: analyzer.polarity_scores(txt)['compound'])
compound_df

  compound_df = texts.applymap(lambda txt: analyzer.polarity_scores(txt)['compound'])


Unnamed: 0,post_1,post_2,post_3,post_4,post_5,post_6,post_7,post_8,post_9,post_10,...,post_51,post_52,post_53,post_54,post_55,post_56,post_57,post_58,post_59,post_60
0,0.5093,0.4588,-0.296,0.0,-0.6486,0.1263,0.8126,0.2585,0.0,0.7086,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.3412,0.34,0.2235,0.4404,0.5267,-0.2519,0.7766,-0.1531,0.0,-0.5267,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.0191,-0.4019,0.5927,0.4404,0.5859,0.7684,0.0,0.0,0.5927,-0.3612,...,0.7263,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0
3,-0.1635,0.0,0.7569,0.7674,-0.802,-0.4215,0.0,0.0,0.09,-0.296,...,0.6249,-0.6808,-0.5423,0.8682,-0.4389,0.1007,0.0,0.6239,0.0,0.0


In [63]:
compound_df["result"] = events_df["result"][:4].values
compound_df["yes_bid"] = events_df["yes_bid"][:4].values

In [64]:
compound_df

Unnamed: 0,post_1,post_2,post_3,post_4,post_5,post_6,post_7,post_8,post_9,post_10,...,post_53,post_54,post_55,post_56,post_57,post_58,post_59,post_60,result,yes_bid
0,0.5093,0.4588,-0.296,0.0,-0.6486,0.1263,0.8126,0.2585,0.0,0.7086,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,yes,95
1,-0.3412,0.34,0.2235,0.4404,0.5267,-0.2519,0.7766,-0.1531,0.0,-0.5267,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,yes,99
2,-0.0191,-0.4019,0.5927,0.4404,0.5859,0.7684,0.0,0.0,0.5927,-0.3612,...,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,no,70
3,-0.1635,0.0,0.7569,0.7674,-0.802,-0.4215,0.0,0.0,0.09,-0.296,...,-0.5423,0.8682,-0.4389,0.1007,0.0,0.6239,0.0,0.0,no,10


In [53]:
compound_df.to_csv("training_data.csv", index=False)

In [66]:
import csv

input_csv  = "training_data.csv"
output_csv = "training_data_reordered.csv"

with open(input_csv,  newline="", encoding="utf-8") as fin, \
     open(output_csv, "w", newline="", encoding="utf-8") as fout:
    
    reader = csv.reader(fin)
    writer = csv.writer(fout)

    # 1) Reorder the header row
    header      = next(reader)
    new_header  = header[-2:] + header[:-2]
    writer.writerow(new_header)

    # 2) For each data row, move its last two fields to the front
    for row in reader:
        if len(row) >= 2:
            new_row = row[-2:] + row[:-2]
        else:
            # edge case: fewer than 2 fields—just write as-is
            new_row = row
        writer.writerow(new_row)