In [1]:
# cell 1=== Grab yesterday's Ethereum-trading Reddit posts (UTC) -> CSV ===
# Dependencies: praw, pandas, python-dotenv (will be installed automatically)
import os, sys, time, re, datetime as dt
from datetime import timezone
from typing import List

def _pip_install(pkgs: List[str]):
    import importlib, subprocess
    for p in pkgs:
        try:
            importlib.import_module(p.split("==")[0])
        except Exception:
            subprocess.check_call([sys.executable, "-m", "pip", "install", p])

_pip_install(["praw", "pandas", "python-dotenv"])

from dotenv import load_dotenv
load_dotenv()  # Load .env from the project root; if your notebook is in a subfolder, os.chdir to root first
import praw
import pandas as pd

# Read credentials (from .env)
# os.getenv returns None for an unset variable; only USER_AGENT has a fallback value.
CLIENT_ID     = os.getenv("REDDIT_CLIENT_ID")
CLIENT_SECRET = os.getenv("REDDIT_CLIENT_SECRET")
USER_AGENT    = os.getenv("REDDIT_USER_AGENT", "eth-sentiment-bot/0.1")

# Fail fast on a missing or empty value. Because USER_AGENT has a default, its check
# can only fire when REDDIT_USER_AGENT is explicitly set to an empty string.
if not CLIENT_ID or not CLIENT_SECRET or not USER_AGENT:
    raise RuntimeError("Missing Reddit credentials: set REDDIT_CLIENT_ID / REDDIT_CLIENT_SECRET / REDDIT_USER_AGENT in .env")

# Subreddits & keywords (adjust as needed)
SUBREDDITS = ["ethereum", "ethfinance", "ethtrader", "CryptoCurrency", "defi", "ethdev"]
# Each entry is a regex fragment; all fragments are OR-ed into one case-insensitive pattern.
# The trade/trading/trader alternation is wrapped in a group so that BOTH word
# boundaries apply to every alternative (ungrouped, "|" binds looser than "\b",
# which matched e.g. "trademark", "nontrading" and "subtrader").
KEYWORDS = [
    r"\beth\b", r"\bethereum\b", r"\beth/usdt\b", r"\bethusd\b", r"\bethusdt\b",
    r"\bspot\b", r"\bfutures?\b", r"\bperps?\b", r"\b(?:trades?|trading|traders?)\b",
    r"\bposition\b", r"\blong\b", r"\bshort\b", r"\bentry\b", r"\bexit\b",
    r"\bmarket\b", r"\border\b", r"\bliquidation\b", r"\bhedge\b", r"\bleverage\b"
]
# Group every fragment before joining so a fragment containing "|" cannot leak
# its alternation into its neighbours.
KW_REGEX = re.compile("|".join(f"(?:{kw})" for kw in KEYWORDS), flags=re.IGNORECASE)

MAX_PER_SUBREDDIT = 5000   # cap on matched posts kept per subreddit
REQUEST_SLEEP = 0.5        # seconds to pause after each matched post
SAVE_DIR = "./data/reddit/yesterday"

# Yesterday (UTC) time window: [day_start, day_end), where day_end is today's UTC midnight
now_utc = dt.datetime.now(timezone.utc)
day_end = now_utc.replace(hour=0, minute=0, second=0, microsecond=0)
day_start = day_end - dt.timedelta(days=1)
D = day_start.date()
print(f"[UTC window] {day_start.isoformat()} -> {day_end.isoformat()} (D={D})")

# Reddit client (read-only)
# Only app credentials are passed (no username/password), and read_only is set
# explicitly below.
reddit = praw.Reddit(
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET,
    user_agent=USER_AGENT,
    # NOTE(review): presumably the max seconds PRAW will wait out a rate limit — confirm in PRAW config docs
    ratelimit_seconds=5,
)
reddit.read_only = True

def _to_utc(ts: float) -> dt.datetime:
    return dt.datetime.fromtimestamp(ts, tz=timezone.utc)

def _match_eth_trading(title: str, selftext: str) -> bool:
    return bool(KW_REGEX.search(f"{title or ''}\n{selftext or ''}"))

# Collect
# For each subreddit, iterate its `.new()` listing and keep keyword-matching posts
# created inside [day_start, day_end). Matches from all subreddits accumulate in `rows`.
rows = []
os.makedirs(SAVE_DIR, exist_ok=True)

for sub in SUBREDDITS:
    print(f"\n[collect] r/{sub} ...")
    count = 0  # matched posts for this subreddit (skipped posts are not counted)
    try:
        for submission in reddit.subreddit(sub).new(limit=None):
            # A missing created_utc falls back to epoch 0, which is < day_start and ends the scan.
            created = _to_utc(getattr(submission, "created_utc", 0.0))
            if created >= day_end:
                continue  # created at/after today's UTC midnight: too new, keep going
            if created < day_start:
                # NOTE(review): stopping at the first too-old post assumes the listing is newest-first — confirm
                break

            if not _match_eth_trading(submission.title, submission.selftext):
                continue

            # One flat record per matched submission; these keys become the CSV columns.
            rows.append(dict(
                id=submission.id,
                subreddit=submission.subreddit.display_name,
                created_utc=created.isoformat(),
                title=submission.title or "",
                selftext=submission.selftext or "",
                url=submission.url or "",
                is_self=bool(submission.is_self),
                author=str(submission.author) if submission.author else None,
                score=int(submission.score or 0),
                upvote_ratio=float(submission.upvote_ratio or 0.0),
                num_comments=int(submission.num_comments or 0),
                over_18=bool(getattr(submission, "over_18", False)),
                stickied=bool(getattr(submission, "stickied", False)),
                crosspost_parent=getattr(submission, "crosspost_parent", None),
                permalink=f"https://www.reddit.com{submission.permalink}" if getattr(submission, "permalink", None) else "",
                fetched_at=dt.datetime.now(timezone.utc).isoformat(),
                fetch_window_start=day_start.isoformat(),
                fetch_window_end=day_end.isoformat(),
                fetch_version="submissions_only_v1"
            ))
            count += 1
            if count % 200 == 0:
                print(f"  r/{sub}: matched {count} ...")
            if count >= MAX_PER_SUBREDDIT:
                print(f"  r/{sub}: hit MAX_PER_SUBREDDIT={MAX_PER_SUBREDDIT}, stop.")
                break
            # Only reached after a matched row; skipped posts `continue` before this point.
            time.sleep(REQUEST_SLEEP)
    except Exception as e:
        # Best-effort: a failing subreddit is reported and the remaining ones still run.
        # Rows already appended for this subreddit are kept.
        print(f"  [warn] subreddit {sub} failed: {e}")

# Deduplicate and save CSV
df = pd.DataFrame(rows)
csv_path = os.path.join(SAVE_DIR, f"reddit_eth_submissions_{D}.csv")

if not df.empty:
    # For each id keep the row with the highest score, then the latest fetch time.
    df = df.sort_values(by=["id", "score", "fetched_at"], ascending=[True, False, False])
    df = df.drop_duplicates(subset=["id"], keep="first").reset_index(drop=True)
    df["is_crosspost"] = df["crosspost_parent"].notna()
    df.to_csv(csv_path, index=False, encoding="utf-8")

    created_times = pd.to_datetime(df["created_utc"])
    dt_min = created_times.min()
    dt_max = created_times.max()
    seen_subs = ', '.join(sorted(df['subreddit'].unique()))
    print("\n[summary]")
    print(f"  window (UTC): {day_start.isoformat()} → {day_end.isoformat()}")
    print(f"  collected rows: {len(df)}  (after de-dup)")
    print(f"  time coverage:  {dt_min} → {dt_max}")
    print(f"  subreddits:     {seen_subs}")
    print(f"  saved csv:      {csv_path}")

    # Preview top 10
    preview_cols = ["created_utc","subreddit","score","num_comments","title"]
    ranked = df[preview_cols].sort_values(["score","num_comments"], ascending=False)
    preview = ranked.head(10)
    print("\n[top 10 by score/num_comments]")
    for _, post in preview.iterrows():
        print(f"- [{post['created_utc']}] r/{post['subreddit']} | score={post['score']} com={post['num_comments']} | {post['title'][:140]}")
else:
    # Nothing matched: no CSV is written.
    print("\n[summary] No matched submissions found. You can tweak keywords/subreddits and try again.")


[UTC window] 2025-09-26T00:00:00+00:00 -> 2025-09-27T00:00:00+00:00 (D=2025-09-26)

[collect] r/ethereum ...

[collect] r/ethfinance ...

[collect] r/ethtrader ...

[collect] r/CryptoCurrency ...

[collect] r/defi ...

[collect] r/ethdev ...

[summary]
  window (UTC): 2025-09-26T00:00:00+00:00 → 2025-09-27T00:00:00+00:00
  collected rows: 32  (after de-dup)
  time coverage:  2025-09-26 00:00:56+00:00 → 2025-09-26 23:02:28+00:00
  subreddits:     CryptoCurrency, defi, ethdev, ethereum, ethfinance, ethtrader
  saved csv:      ./data/reddit/yesterday\reddit_eth_submissions_2025-09-26.csv

[top 10 by score/num_comments]
- [2025-09-26T05:01:29+00:00] r/ethereum | score=133 com=162 | Daily General Discussion September 26, 2025
- [2025-09-26T10:16:53+00:00] r/ethtrader | score=125 com=47 | Whales accumulate while retail panics, same old story for ETH.
- [2025-09-26T18:44:29+00:00] r/ethtrader | score=107 com=14 | ethereum co founder sells $6M but whales scoop up $1.6B whats really happening h

In [2]:
# cell 2=== Normalize yesterday's grabbed CSV to your legacy KEEP_FIELDS schema ===
# Input:  data/reddit/yesterday/reddit_eth_submissions_YYYY-MM-DD.csv
# Output: data/reddit/processed/reddit_eth_standard_YYYY-MM-DD.csv
# Purpose: Map/fill the newly grabbed columns into the legacy 17-field schema so downstream scoring/aggregation code can be reused

import os
import pandas as pd
import datetime as dt

# Your legacy KEEP_FIELDS (from sentiment.ipynb)
# The output frame is reduced to exactly these columns, in this order, further below.
KEEP_FIELDS = [
    "id", "author", "subreddit",
    "created_utc", "created", "created_time_utc",
    "title", "selftext", "body",
    "url", "permalink",
    "score", "upvote_ratio", "num_comments", "num_crossposts",
    "over_18", "is_self"
]

# Yesterday's UTC filename (aligned with the grabbing script)
# D is recomputed from the current clock here, not read from the grab cell.
D = (dt.datetime.now(dt.timezone.utc) - dt.timedelta(days=1)).date()
in_dir  = os.path.join("data", "reddit", "yesterday")
in_csv  = os.path.join(in_dir, f"reddit_eth_submissions_{D}.csv")

out_dir = os.path.join("data", "reddit", "processed")
os.makedirs(out_dir, exist_ok=True)
out_csv = os.path.join(out_dir, f"reddit_eth_standard_{D}.csv")

# No fallback in this cell: the exact dated input file must exist.
if not os.path.exists(in_csv):
    raise FileNotFoundError(f"找不到输入文件: {os.path.abspath(in_csv)}")

df = pd.read_csv(in_csv)

# ---- Field mapping/filling (minimal changes, reuse existing columns whenever possible) ----
out = pd.DataFrame()

# Directly mapped columns (if missing, use defaults)
def pick(col, default=None):
    """Return ``df[col]``, or a Series of *default* aligned to ``df`` if the column is absent.

    Always returning a Series keeps the call sites uniform: they chain
    ``.fillna(...)`` / ``.astype(...)`` on the result, which raised
    ``AttributeError`` when a bare scalar default was returned.
    """
    if col in df.columns:
        return df[col]
    return pd.Series([default] * len(df), index=df.index, name=col)

out["id"]         = pick("id")
out["author"]     = pick("author")
out["subreddit"]  = pick("subreddit")
out["created_utc"]= pick("created_utc")  # grabbed as ISO string

# Because the legacy flow had both created_utc and created/created_time_utc:
# For compatibility: created = timezone-naive UTC wall-clock string (tz_convert(None)
# converts to UTC and drops the tz info); created_time_utc = same as created_utc
ts = pd.to_datetime(out["created_utc"], errors="coerce", utc=True)
out["created"]          = ts.dt.tz_convert(None).dt.strftime("%Y-%m-%d %H:%M:%S")
out["created_time_utc"] = out["created_utc"]

out["title"]    = pick("title", "")
out["selftext"] = pick("selftext", "")
out["body"]     = ""  # legacy schema for comments; we grabbed submissions, so leave blank

out["url"]       = pick("url", "")
out["permalink"] = pick("permalink", "")

out["score"]        = pick("score", 0).fillna(0).astype("Int64")
out["upvote_ratio"] = pick("upvote_ratio", 0.0).fillna(0.0)
out["num_comments"] = pick("num_comments", 0).fillna(0).astype("Int64")

# num_crossposts: not explicitly grabbed; approximate via presence of crosspost_parent
# (1 = this post is itself a crosspost, 0 = it is not). Every branch yields Int64 so the
# column dtype does not depend on which input columns happen to exist.
if "num_crossposts" in df.columns:
    out["num_crossposts"] = df["num_crossposts"].fillna(0).astype("Int64")
elif "crosspost_parent" in df.columns:
    out["num_crossposts"] = df["crosspost_parent"].notna().astype("Int64")
else:
    # Neither column present: df.get(...) would return None and crash on .notna()
    out["num_crossposts"] = pd.Series(0, index=out.index, dtype="Int64")

out["over_18"] = pick("over_18", False).fillna(False).astype(bool)
out["is_self"] = pick("is_self", False).fillna(False).astype(bool)

# Keep only the legacy KEEP_FIELDS order and names
out = out[KEEP_FIELDS]

# De-duplicate (by id): rows are ordered so the highest score comes first within each id
ranked = out.sort_values(by=["id", "score"], ascending=[True, False])
out = ranked.drop_duplicates(subset="id").reset_index(drop=True)

# Save CSV
out.to_csv(out_csv, index=False, encoding="utf-8-sig")

# Summary
for label, value in (
    ("[normalize] input file :", os.path.abspath(in_csv)),
    ("[normalize] output file:", os.path.abspath(out_csv)),
    ("[normalize] rows       :", len(out)),
    ("[normalize] columns    :", list(out.columns)),
):
    print(label, value)
print(out[["created_utc","subreddit","score","num_comments","title"]].head(5))


[normalize] input file : C:\Users\Jimmy\Desktop\760\data\reddit\yesterday\reddit_eth_submissions_2025-09-26.csv
[normalize] output file: C:\Users\Jimmy\Desktop\760\data\reddit\processed\reddit_eth_standard_2025-09-26.csv
[normalize] rows       : 32
[normalize] columns    : ['id', 'author', 'subreddit', 'created_utc', 'created', 'created_time_utc', 'title', 'selftext', 'body', 'url', 'permalink', 'score', 'upvote_ratio', 'num_comments', 'num_crossposts', 'over_18', 'is_self']
                 created_utc       subreddit  score  num_comments  \
0  2025-09-26T00:00:56+00:00       ethtrader      9            53   
1  2025-09-26T00:01:00+00:00  CryptoCurrency     21           495   
2  2025-09-26T01:03:38+00:00  CryptoCurrency     37            23   
3  2025-09-26T01:06:58+00:00          ethdev      5             2   
4  2025-09-26T02:13:10+00:00  CryptoCurrency      0            14   

                                               title  
0  Daily General Discussion - September 26, 2025 .

In [3]:
# === Cell 3: Clean processed file -> cleaned/reddit_eth_standard_{D}_clean.csv ===
import os, re, glob
import pandas as pd
import datetime as dt
from datetime import timezone

# ---- Auto-locate input file ----
# Prefer yesterday's (UTC) processed file; otherwise fall back to the last
# reddit_eth_standard_*.csv in name order and take D from its filename.
D = (dt.datetime.now(timezone.utc) - dt.timedelta(days=1)).date()
default_path = f"data/reddit/processed/reddit_eth_standard_{D}.csv"
if not os.path.exists(default_path):
    cand = sorted(glob.glob("data/reddit/processed/reddit_eth_standard_*.csv"))
    if not cand:
        raise FileNotFoundError("找不到 processed 文件：data/reddit/processed/reddit_eth_standard_*.csv")
    INPUT_CSV = cand[-1]
    # Sync D: text after the last "_" and before the first "." of the filename
    D = os.path.basename(INPUT_CSV).rsplit("_", 1)[-1].split(".", 1)[0]
else:
    INPUT_CSV = default_path

OUT_DIR   = "cleaned"
MAX_LEN   = 20000   # raw text is truncated to this many characters before cleaning
MIN_LEN   = 5       # rows whose cleaned text is shorter than this are dropped
os.makedirs(OUT_DIR, exist_ok=True)

def clean_text(text: str) -> str:
    """Normalize one post's text for sentiment scoring.

    Missing values become "". Otherwise, in order: markdown links are reduced to
    their label, URLs and HTML tags are blanked, every character other than ASCII
    letters/digits/whitespace is blanked, whitespace is collapsed, and the result
    is stripped and lower-cased.
    """
    if pd.isna(text):
        return ""
    # (pattern, replacement, flags), applied in this order
    passes = (
        (r"\[([^\]]+)\]\(([^)]+)\)", r"\1", 0),          # markdown link -> plain text
        (r"http\S+|www\.\S+", " ", re.IGNORECASE),       # URLs
        (r"<[^>]+>", " ", 0),                            # HTML tags
        (r"[^A-Za-z0-9\s]", " ", 0),                     # keep only letters, numbers, spaces
        (r"\s+", " ", 0),                                # collapse whitespace runs
    )
    cleaned = str(text)
    for pattern, replacement, flags in passes:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)
    return cleaned.strip().lower()

# Read
try:
    df = pd.read_csv(INPUT_CSV, low_memory=False)
except UnicodeDecodeError:
    # Retry with latin-1 when the file cannot be decoded with the default encoding
    df = pd.read_csv(INPUT_CSV, encoding="latin-1", low_memory=False)

# Ensure text columns
if "title" not in df.columns:
    raise ValueError("CSV 缺少 'title' 列")
if "selftext" not in df.columns: df["selftext"] = ""
if "body" not in df.columns: df["body"] = ""

# Drop extraneous columns
df = df.drop(columns=[c for c in df.columns if str(c).startswith("Unnamed")], errors="ignore")
df = df.dropna(subset=["title"])

# Concatenate raw text
# fillna("") must run BEFORE astype(str): astype(str) turns NaN into the literal
# string "nan", so empty selftext/body cells (read back from CSV as NaN) would
# otherwise append the word "nan" to the text that gets scored.
df["text_raw"] = (df["title"].fillna("").astype(str) + " " + df["selftext"].fillna("").astype(str))
# Title + selftext is essentially empty: fall back to appending the body column
mask_short = df["text_raw"].str.len().fillna(0) < 3
df.loc[mask_short, "text_raw"] = df.loc[mask_short, "text_raw"] + " " + df.loc[mask_short, "body"].fillna("").astype(str)

# Truncate & clean
df["text_raw"] = df["text_raw"].astype(str).str.slice(0, MAX_LEN)
df["text_clean"] = df["text_raw"].map(clean_text)
df = df[df["text_clean"].str.len() >= MIN_LEN].copy()

# Parse time (supports ISO or numeric timestamp)
def parse_created_any(s):
    """Parse a Series of creation times into timezone-aware UTC timestamps.

    Accepts ISO-8601 strings, epoch numbers (seconds or milliseconds, as numbers or
    numeric strings), or an already-datetime Series. Unparseable entries become NaT.

    Numeric input is detected first: ``pd.to_datetime`` reads bare numbers as
    nanoseconds since the epoch instead of failing, so an "ISO first, numeric on
    failure" order never reaches the numeric branch and yields 1970 dates.
    """
    # Already datetimes: to_numeric would turn these into raw nanosecond integers.
    if pd.api.types.is_datetime64_any_dtype(s):
        return pd.to_datetime(s, errors="coerce", utc=True)

    non_null = s.notna().sum()
    c = pd.to_numeric(s, errors="coerce")
    if non_null and c.notna().sum() > 0.5 * non_null:
        # Mostly numeric: values above 1e12 are taken as milliseconds, otherwise seconds
        unit = "ms" if (c.dropna().median() > 10**12) else "s"
        return pd.to_datetime(c, unit=unit, errors="coerce", utc=True)

    return pd.to_datetime(s, errors="coerce", utc=True)

# Only derive created_time_utc when the input does not already carry it;
# an existing column is used as-is (not re-parsed here).
if "created_time_utc" not in df.columns:
    if "created_utc" in df.columns:
        df["created_time_utc"] = parse_created_any(df["created_utc"])
    elif "created" in df.columns:
        df["created_time_utc"] = parse_created_any(df["created"])
    else:
        df["created_time_utc"] = pd.NaT

# year_month
# "YYYY-MM" bucket taken from the timestamp when any is present; otherwise pulled from a
# "source_file" column (first 20YY-MM / 20YY_MM pattern found), else left as None.
if df["created_time_utc"].notna().any():
    df["year_month"] = pd.to_datetime(df["created_time_utc"]).dt.strftime("%Y-%m")
elif "source_file" in df.columns:
    df["year_month"] = df["source_file"].str.extract(r'((20\d{2})[-_](\d{2}))')[0]
else:
    df["year_month"] = None

# De-duplicate
before = len(df)
if "id" in df.columns:
    # Keep the highest-scoring row per id, the same rule the normalize cell applies.
    # A plain ascending sort would keep the lowest-scoring duplicate instead.
    if "score" in df.columns:
        df = df.sort_values(["id", "score"], ascending=[True, False])
    else:
        df = df.sort_values(["id"])
    df = df.drop_duplicates("id")
else:
    # No id column: de-duplicate on whichever of these identifying columns exist
    keys = [k for k in ["title","created_time_utc","subreddit"] if k in df.columns]
    df = df.drop_duplicates(subset=keys) if keys else df.drop_duplicates()
after = len(df)

# Save
# Output name = input basename + "_clean.csv", written under OUT_DIR
base = os.path.splitext(os.path.basename(INPUT_CSV))[0]
out_path = os.path.join(OUT_DIR, f"{base}_clean.csv")
df.to_csv(out_path, index=False, encoding="utf-8-sig")

print("✅ Cleaning completed")
print(" - Input :", os.path.abspath(INPUT_CSV))
print(" - Output:", os.path.abspath(out_path))
print(f" - Records: {after} (before de-dup {before})")
print(" - Key columns present:", [c for c in ["text_raw","text_clean","created_time_utc","year_month"] if c in df.columns])
df.head(3)


✅ 清洗完成
 - 输入: C:\Users\Jimmy\Desktop\760\data\reddit\processed\reddit_eth_standard_2025-09-26.csv
 - 输出: C:\Users\Jimmy\Desktop\760\cleaned\reddit_eth_standard_2025-09-26_clean.csv
 - 记录数: 32（去重前 32）
 - 关键列存在： ['text_raw', 'text_clean', 'created_time_utc', 'year_month']


Unnamed: 0,id,author,subreddit,created_utc,created,created_time_utc,title,selftext,body,url,permalink,score,upvote_ratio,num_comments,num_crossposts,over_18,is_self,text_raw,text_clean,year_month
0,1nqmorm,AutoModerator,ethtrader,2025-09-26T00:00:56+00:00,2025-09-26 00:00:56,2025-09-26T00:00:56+00:00,"Daily General Discussion - September 26, 2025 ...",Welcome to the Daily General Discussion thread...,,https://www.reddit.com/r/ethtrader/comments/1n...,https://www.reddit.com/r/ethtrader/comments/1n...,9,1.0,53,0,False,True,"Daily General Discussion - September 26, 2025 ...",daily general discussion september 26 2025 utc...,2025-09
1,1nqmotq,AutoModerator,CryptoCurrency,2025-09-26T00:01:00+00:00,2025-09-26 00:01:00,2025-09-26T00:01:00+00:00,"Daily Crypto Discussion - September 26, 2025 (...",**Welcome to the Daily Crypto Discussion threa...,,https://www.reddit.com/r/CryptoCurrency/commen...,https://www.reddit.com/r/CryptoCurrency/commen...,21,0.89,495,0,False,True,"Daily Crypto Discussion - September 26, 2025 (...",daily crypto discussion september 26 2025 gmt ...,2025-09
2,1nqo0l7,LazyJury,CryptoCurrency,2025-09-26T01:03:38+00:00,2025-09-26 01:03:38,2025-09-26T01:03:38+00:00,BTC Perpetual Trading Goes Live on Cardano,Strike Finance has officially gone live with B...,,https://app.strikefinance.org/perpetuals/btc,https://www.reddit.com/r/CryptoCurrency/commen...,37,0.81,23,0,False,False,BTC Perpetual Trading Goes Live on Cardano Str...,btc perpetual trading goes live on cardano str...,2025-09


In [4]:
# === Cell 4: Score the cleaned file with VADER and write to master posts_scores_{D}.csv ===
import os, pandas as pd, datetime as dt
from datetime import timezone

# Input (output from previous cell)
D = (dt.datetime.now(timezone.utc) - dt.timedelta(days=1)).date()
clean_in = f"cleaned/reddit_eth_standard_{D}_clean.csv"
if not os.path.exists(clean_in):
    # If you used the "latest file fallback", the name may not match yesterday's date; find latest *_clean.csv
    import glob, re
    cand = sorted(glob.glob("cleaned/*_clean.csv"))
    if not cand:
        raise FileNotFoundError("No cleaned file found under cleaned/*_clean.csv")
    clean_in = cand[-1]
    # Sync D with the file actually used, so the master table written below is named
    # after the data's date rather than after yesterday's.
    date_in_name = re.search(r"(\d{4}-\d{2}-\d{2})", os.path.basename(clean_in))
    if date_in_name:
        D = date_in_name.group(1)

# Read
df = pd.read_csv(clean_in)

# Select base columns for the master table (only those present in the input)
base_cols = [c for c in [
    "id","subreddit","created_time_utc","title","selftext","body",
    "text_clean","score","num_comments","upvote_ratio","permalink","url"
] if c in df.columns]
dfb = df[base_cols].copy()

# VADER
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
# Download the lexicon only when nltk cannot find it locally
try:
    _ = nltk.data.find("sentiment/vader_lexicon.zip")
except LookupError:
    nltk.download("vader_lexicon")

# Score text_clean when present; otherwise build a temporary column from
# title/selftext/body, skipping missing values.
text_col = "text_clean" if "text_clean" in dfb.columns else None
if text_col is None:
    def _join(*xs): return " ".join([str(x) for x in xs if pd.notna(x)])
    dfb["__text_tmp__"] = [_join(t, s, b) for t, s, b in zip(
        dfb.get("title",""), dfb.get("selftext",""), dfb.get("body","")
    )]
    text_col = "__text_tmp__"

sia = SentimentIntensityAnalyzer()
# Only the "compound" value of the polarity scores is stored
dfb["vader"] = dfb[text_col].map(lambda t: sia.polarity_scores(str(t))["compound"])

# Save master table (you can append s1…s5 columns later)
# Written to a .tmp file first, then moved over the target with os.replace.
os.makedirs("data/reddit/scored", exist_ok=True)
master_p = f"data/reddit/scored/posts_scores_{D}.csv"
tmp_p = master_p + ".tmp"
dfb.to_csv(tmp_p, index=False, encoding="utf-8-sig")
os.replace(tmp_p, master_p)

print("✅ VADER scoring completed & master file saved:", os.path.abspath(master_p))
print(" - Columns:", list(dfb.columns))
print(" - Rows   :", len(dfb))
dfb[["id","vader","title"]].head(3)


✅ VADER scoring completed & master file saved: C:\Users\Jimmy\Desktop\760\data\reddit\scored\posts_scores_2025-09-26.csv
 - Columns: ['id', 'subreddit', 'created_time_utc', 'title', 'selftext', 'body', 'text_clean', 'score', 'num_comments', 'upvote_ratio', 'permalink', 'url', 'vader']
 - Rows   : 32


Unnamed: 0,id,vader,title
0,1nqmorm,0.9509,"Daily General Discussion - September 26, 2025 ..."
1,1nqmotq,0.9801,"Daily Crypto Discussion - September 26, 2025 (..."
2,1nqo0l7,0.25,BTC Perpetual Trading Goes Live on Cardano


In [5]:
# =========================
# cell 5: Score—— meta-llama-3.1-8b-instruct——s1
# =========================
# Dependencies: pip install openai pandas numpy tqdm nest_asyncio

import os, re, glob, time, asyncio
import numpy as np
import pandas as pd
import nest_asyncio

nest_asyncio.apply()
from tqdm.auto import tqdm
from openai import AsyncOpenAI
import datetime as dt
from datetime import timezone

# -------- LM Studio basic configuration --------
LMSTUDIO_BASE_URL = "http://127.0.0.1:1234/v1"
LMSTUDIO_API_KEY = "lm-studio"
MODEL_NAME = "meta-llama-3.1-8b-instruct"  # API identifier shown in LM Studio (right-side Info)
OUT_COL = "s1"  # Output column for this model (your first small model)

# -------- Master table path (must already contain the 'vader' column) --------
D = (dt.datetime.now(timezone.utc) - dt.timedelta(days=1)).date()
INPUT_CSV = f"data/reddit/scored/posts_scores_{D}.csv"  # Master table (VADER written in previous step)
OUTPUT_CSV = INPUT_CSV  # In-place atomic write

if not os.path.exists(INPUT_CSV):
    # Fallback: pick the newest posts_scores_*.csv under the scored directory
    cands = sorted(glob.glob("data/reddit/scored/posts_scores_*.csv"), key=os.path.getmtime)
    if not cands:
        # f-string so the message names the dated file instead of a literal "{D}"
        raise FileNotFoundError(f"Master table posts_scores_{D}.csv not found. Please complete the VADER step first.")
    INPUT_CSV = OUTPUT_CSV = cands[-1]

TEXT_COL = "text_clean"  # Prefer the cleaned text
VADER_COL = "vader"  # Can be used to select “hard cases” only (optional)

# -------- Speed & control parameters --------
MAX_TEXT_LEN = 256  # Truncate to avoid overly long context
VADER_EDGE = None  # If set, only score samples with |vader| < threshold; None = score all
CONCURRENCY = 8  # Concurrency
RETRY = 3  # Attempts per text before giving up

# —— Prompt (note: curly braces must be escaped as double braces in Python format strings) ——
PROMPT = (
    "Classify the sentiment as exactly one token from {{POS, NEU, NEG}}.\n"
    "Text:\n{text}\n"
    "Answer:"
)
# Restrict outputs to POS/NEU/NEG (llama.cpp grammar)
# NOTE(review): sent via extra_body; whether the server honours it is not visible here — confirm
GRAMMAR = r'root ::= "POS" | "NEU" | "NEG"'


def map_label_to_score(label: str) -> float:
    """Map a model reply to a score: POS -> 1.0, NEU -> 0.0, NEG -> -1.0.

    The first whitespace-separated token (case-insensitive) wins when it is one of
    the three labels. Otherwise the reply is searched for the substrings POS, NEG,
    NEU in that order. Anything unrecognized, including None/empty, yields 0.0.
    """
    scores = {"POS": 1.0, "NEU": 0.0, "NEG": -1.0}
    normalized = (label or "").strip().upper()
    tokens = normalized.split()
    if tokens and tokens[0] in scores:
        return scores[tokens[0]]
    # Substring fallback; the order defines precedence when several tags appear
    for tag in ("POS", "NEG", "NEU"):
        if tag in normalized:
            return scores[tag]
    return 0.0


# -------- Read master table & select samples to score --------
try:
    df = pd.read_csv(INPUT_CSV, low_memory=False)
except UnicodeDecodeError:
    # Retry with latin-1 when the file cannot be decoded with the default encoding
    df = pd.read_csv(INPUT_CSV, encoding="latin-1", low_memory=False)

# Text column fallback: if no text_clean, join title+selftext+body
if TEXT_COL not in df.columns:
    # Joins the non-missing parts with single spaces
    def _join(*xs): return " ".join([str(x) for x in xs if pd.notna(x)])


    df["__text_tmp__"] = [_join(t, s, b) for t, s, b in zip(
        df.get("title", ""), df.get("selftext", ""), df.get("body", "")
    )]
    TEXT_COL = "__text_tmp__"

# Ensure the output column exists (initialize with NaN)
if OUT_COL not in df.columns:
    df[OUT_COL] = np.nan

# Rows still NaN in OUT_COL are the ones to score, so re-running only fills gaps
mask = df[OUT_COL].isna()
if VADER_EDGE is not None and VADER_COL in df.columns:
    mask &= df[VADER_COL].abs() < VADER_EDGE  # only (re)score ambiguous samples

todo = df[mask].copy()
if todo.empty:
    print(f"No new samples to score ({OUT_COL} already exists or filtered by threshold). File: {INPUT_CSV}")
else:
    # De-duplicate texts to reduce requests
    todo["__txt"] = todo[TEXT_COL].astype(str).str.slice(0, MAX_TEXT_LEN)
    # Use .groups (text -> index LABELS), not .indices (text -> POSITIONS within `todo`).
    # The write-back addresses rows with df.loc, which is label-based; positions only
    # coincide with labels when every row of df is being scored, so on a partial
    # re-run .indices would write scores onto the wrong rows.
    groups = todo.groupby("__txt").groups
    unique_texts = list(groups.keys())
    print(f"Master table: {os.path.basename(INPUT_CSV)}")
    print(f"Unique texts to score: {len(unique_texts)} (raw samples {len(todo)} / total {len(df)})")

    aclient = AsyncOpenAI(base_url=LMSTUDIO_BASE_URL, api_key=LMSTUDIO_API_KEY)


    async def classify_text(t: str) -> float:
        """Ask the model to label one text and return its numeric score.

        Makes up to RETRY attempts, pausing 0.6s * (attempt number) between them.
        Returns NaN when every attempt raises.
        """
        prompt = PROMPT.format(text=t)
        failures = 0
        while failures < RETRY:
            try:
                reply = await aclient.chat.completions.create(
                    model=MODEL_NAME,
                    messages=[{"role": "user", "content": prompt}],
                    temperature=0,
                    max_tokens=1,
                    top_p=1,
                    stop=["\n"],
                    extra_body={"grammar": GRAMMAR}
                )
                return map_label_to_score(reply.choices[0].message.content)
            except Exception:
                failures += 1
                if failures == RETRY:
                    return np.nan
                await asyncio.sleep(0.6 * failures)


    async def run_all(texts, concurrency=8):
        """Classify every text with at most *concurrency* requests in flight.

        Returns a dict mapping each text to the score from classify_text; a tqdm
        bar advances as each request finishes.
        """
        gate = asyncio.Semaphore(concurrency)

        async def _guarded(text):
            # The semaphore is held only for the duration of the request itself
            async with gate:
                outcome = await classify_text(text)
            return text, outcome

        pending = [asyncio.create_task(_guarded(text)) for text in texts]
        results = {}
        with tqdm(total=len(pending), desc=f"{MODEL_NAME} → {OUT_COL}", unit="req") as pbar:
            for finished in asyncio.as_completed(pending):
                text, outcome = await finished
                results[text] = outcome
                pbar.update(1)
        return results


    # Drive the coroutine to completion on the current event loop
    # (nest_asyncio.apply() was called when this cell's imports ran).
    loop = asyncio.get_event_loop()
    scores_map = loop.run_until_complete(run_all(unique_texts, CONCURRENCY))

    # Fill back & atomic save
    # NOTE(review): df.loc is label-based, so `groups` must hold df index labels — confirm how it was built
    for t, idxs in groups.items():
        df.loc[idxs, OUT_COL] = scores_map.get(t, np.nan)

    # Write to a .tmp file first, then move it over the target with os.replace
    tmp_out = OUTPUT_CSV + ".tmp"
    df.to_csv(tmp_out, index=False, encoding="utf-8-sig")
    os.replace(tmp_out, OUTPUT_CSV)

    print(f"✅ Column {OUT_COL} written back -> {OUTPUT_CSV}")


Master table: posts_scores_2025-09-26.csv
Unique texts to score: 31 (raw samples 32 / total 32)


meta-llama-3.1-8b-instruct → s1:   0%|          | 0/31 [00:00<?, ?req/s]

✅ Column s1 written back -> data/reddit/scored/posts_scores_2025-09-26.csv


In [7]:
# cell6 score——google/gemma-2-9b——s2===
import os, re, glob, asyncio, numpy as np, pandas as pd
import nest_asyncio;

nest_asyncio.apply()
from tqdm.auto import tqdm
from openai import AsyncOpenAI
import datetime as dt
from datetime import timezone

LMSTUDIO_BASE_URL = "http://127.0.0.1:1234/v1"
LMSTUDIO_API_KEY = "lm-studio"
MODEL_NAME = "google/gemma-2-9b"
OUT_COL = "s2"

# — Configuration: score all rows, lower concurrency, increase retries —
VADER_EDGE = None  # no filtering
DEDUP_UNIQUE_TEXTS = True  # de-duplicate texts (faster; failures affect all identical texts)
MAX_TEXT_LEN = 256
CONCURRENCY = 4  # lower for stability (tune as needed)
RETRY = 5  # higher for stability
FALLBACK_TO_VADER_SIGN = True  # if still NaN, fallback to sign of VADER

# — Paths —
# Prefer yesterday's (UTC) master table; otherwise the most recently modified one
D = (dt.datetime.now(timezone.utc) - dt.timedelta(days=1)).date()
INPUT_CSV = f"data/reddit/scored/posts_scores_{D}.csv"
if not os.path.exists(INPUT_CSV):
    cands = sorted(glob.glob("data/reddit/scored/posts_scores_*.csv"), key=os.path.getmtime)
    if not cands: raise FileNotFoundError("Master table posts_scores_*.csv not found")
    INPUT_CSV = cands[-1]
OUTPUT_CSV = INPUT_CSV  # results are written back into the same file

df = pd.read_csv(INPUT_CSV, low_memory=False)

# Text column fallback
TEXT_COL = "text_clean"
if TEXT_COL not in df.columns:
    # Joins the non-missing parts with single spaces
    def _join(*xs): return " ".join([str(x) for x in xs if pd.notna(x)])


    df["__text_tmp__"] = [_join(t, s, b) for t, s, b in zip(
        df.get("title", ""), df.get("selftext", ""), df.get("body", "")
    )]
    TEXT_COL = "__text_tmp__"

# Prepare rows to score
# Rows still NaN in OUT_COL are the ones to score, so re-running only fills gaps
if OUT_COL not in df.columns: df[OUT_COL] = np.nan
mask = df[OUT_COL].isna()
if (VADER_EDGE is not None) and ("vader" in df.columns):
    mask &= df["vader"].abs() < VADER_EDGE

todo = df.loc[mask].copy()
if todo.empty:
    print(f"No samples to score ({OUT_COL} already exists or filtered by threshold). File: {INPUT_CSV}")
else:
    todo["__txt"] = todo[TEXT_COL].astype(str).str.slice(0, MAX_TEXT_LEN)

    if DEDUP_UNIQUE_TEXTS:
        # Use .groups (text -> index LABELS), not .indices (text -> POSITIONS within
        # `todo`): write_back addresses rows with df.loc, which is label-based, and
        # positions only coincide with labels when every row of df is being scored.
        groups = todo.groupby("__txt").groups
        work_items = list(groups.keys())  # unique texts


        def write_back(scores_map):
            """Copy each unique text's score onto every df row that carries that text."""
            for t, idxs in groups.items():
                df.loc[idxs, OUT_COL] = scores_map.get(t, np.nan)
    else:
        work_items = list(todo.index)  # row indices


        def write_back(scores_map):
            """Write one score per row label."""
            for idx, sc in scores_map.items():
                df.loc[idx, OUT_COL] = sc

    print(f"Master table: {os.path.basename(INPUT_CSV)} | to score: {len(work_items)} (dedup={DEDUP_UNIQUE_TEXTS})")

    # Doubled braces are literal braces; {text} is filled in by str.format
    PROMPT = (
        "Classify the sentiment as exactly one token from {{POS, NEU, NEG}}.\n"
        "Text:\n{text}\n"
        "Answer:"
    )
    GRAMMAR = r'root ::= "POS" | "NEU" | "NEG"'


    def map_label_to_score(label: str) -> float:
        """Translate a model reply into 1.0 (POS), 0.0 (NEU) or -1.0 (NEG).

        The leading token decides when it is exactly one of the labels
        (case-insensitive). Otherwise a substring search applies, with POS taking
        precedence over NEG. Everything else, including None/empty, maps to 0.0.
        """
        text = (label or "").strip().upper()
        head = re.split(r"\s+", text)[0] if text else ""
        if head == "POS":
            return 1.0
        if head == "NEU":
            return 0.0
        if head == "NEG":
            return -1.0
        # Leading token is not a bare label: look for a tag anywhere in the reply
        if "POS" in text:
            return 1.0
        if "NEG" in text:
            return -1.0
        # A NEU substring and an unrecognized reply both score 0.0
        return 0.0


    aclient = AsyncOpenAI(base_url=LMSTUDIO_BASE_URL, api_key=LMSTUDIO_API_KEY)


    async def ask(text) -> float:
        """Ask the model to label one text and return its numeric score.

        Makes up to RETRY attempts with a linear backoff (0.7s, 1.4s, ...) between
        them. Returns NaN when every attempt raises, so the caller can retry or fall
        back; there is no pause after the final failed attempt.
        """
        msg = PROMPT.format(text=text)
        for attempt in range(RETRY):
            try:
                r = await aclient.chat.completions.create(
                    model=MODEL_NAME,
                    messages=[{"role": "user", "content": msg}],
                    temperature=0,
                    max_tokens=1,
                    top_p=1,
                    stop=["\n"],
                    extra_body={"grammar": GRAMMAR}
                )
                return map_label_to_score(r.choices[0].message.content)
            except Exception:
                if attempt == RETRY - 1:
                    # Out of attempts: give up now instead of sleeping first
                    return np.nan
                # linear backoff
                await asyncio.sleep(0.7 * (attempt + 1))
        # Only reached when RETRY <= 0 (no attempt was made)
        return np.nan


    async def run_batch(items, desc):
        """Score every work item with at most CONCURRENCY requests in flight.

        An item is the text itself when DEDUP_UNIQUE_TEXTS is set, otherwise a row
        label whose text is looked up in ``todo``. Returns {item: score}; *desc*
        labels the progress bar.
        """
        gate = asyncio.Semaphore(CONCURRENCY)

        async def one(item):
            if DEDUP_UNIQUE_TEXTS:
                text = item
            else:
                text = todo.at[item, "__txt"]
            async with gate:
                outcome = await ask(text)
            return item, outcome

        pending = [asyncio.create_task(one(entry)) for entry in items]
        results = {}
        with tqdm(total=len(pending), desc=desc, unit="req") as pbar:
            for finished in asyncio.as_completed(pending):
                key, outcome = await finished
                results[key] = outcome
                pbar.update(1)
        return results


    loop = asyncio.get_event_loop()
    scores = loop.run_until_complete(run_batch(work_items, f"{MODEL_NAME} → {OUT_COL}"))

    # Second pass: retry failed (NaN) items with reduced concurrency
    failed = [k for k, v in scores.items() if (v is None) or (isinstance(v, float) and np.isnan(v))]
    if failed:
        print(f"⚠️ {len(failed)} failed in the first pass, reducing concurrency and retrying…")
        # Floor at 1: a floor of 2 would RAISE concurrency when it was configured as 1
        CONCURRENCY = max(1, CONCURRENCY // 2)
        retry_scores = loop.run_until_complete(run_batch(failed, f"retry {MODEL_NAME} → {OUT_COL}"))
        scores.update(retry_scores)

    # Write back
    write_back(scores)

    # Final fallback: remaining NaNs -> sign(vader) or 0
    nan_rows = df[OUT_COL].isna()
    still_nan = nan_rows.sum()
    if still_nan and FALLBACK_TO_VADER_SIGN:
        if "vader" in df.columns:
            print(f"⚠️ {still_nan} rows still NaN; filling with sign of VADER.")
            sign = np.sign(df["vader"].fillna(0.0))
        else:
            # No vader column: df.get("vader", 0.0) would hand back a float and crash on .fillna
            print(f"⚠️ {still_nan} rows still NaN and no 'vader' column; filling with 0.")
            sign = pd.Series(0.0, index=df.index)
        df.loc[nan_rows, OUT_COL] = sign[nan_rows]

    # Write to a .tmp file first, then move it over the target with os.replace
    tmp = OUTPUT_CSV + ".tmp"
    df.to_csv(tmp, index=False, encoding="utf-8-sig")
    os.replace(tmp, OUTPUT_CSV)
    print(f"✅ Column {OUT_COL} written back -> {OUTPUT_CSV}")


Master table: posts_scores_2025-09-26.csv | to score: 31 (dedup=True)


google/gemma-2-9b → s2:   0%|          | 0/31 [00:00<?, ?req/s]

✅ Column s2 written back -> data/reddit/scored/posts_scores_2025-09-26.csv


In [8]:
# cell 7 score——qwen2.5-7b-instruct-1m——s3

import os, re, glob, asyncio, numpy as np, pandas as pd
import nest_asyncio;

nest_asyncio.apply()
from tqdm.auto import tqdm
from openai import AsyncOpenAI
import datetime as dt

# ---- Paths (relative to project root) ----
# Target "yesterday" in UTC. The date is taken from a timezone-aware now()
# because datetime.utcnow() is deprecated since Python 3.12.
D = (dt.datetime.now(dt.timezone.utc).date() - dt.timedelta(days=1))
MASTER_DIR = "data/reddit/scored"
PREF = os.path.join(MASTER_DIR, f"posts_scores_{D}.csv")

if os.path.exists(PREF):
    MASTER_PATH = PREF
else:
    # No file for yesterday: fall back to the most recently modified master table.
    cands = sorted(glob.glob(os.path.join(MASTER_DIR, "posts_scores_*.csv")), key=os.path.getmtime)
    if not cands:
        raise FileNotFoundError(
            "Master table not found: data/reddit/scored/posts_scores_*.csv. Run the VADER step first.")
    MASTER_PATH = cands[-1]

# ---- LM Studio configuration ----
LMSTUDIO_BASE_URL = "http://127.0.0.1:1234/v1"
LMSTUDIO_API_KEY = "lm-studio"
MODEL_NAME = "qwen2.5-7b-instruct-1m"  # ← set to the API identifier shown in LM Studio (right panel)
OUT_COL = "s3"  # master-table column that receives this model's scores

# ---- Scoring parameters (full coverage) ----
TEXT_COL = "text_clean"  # column holding the text to classify
MAX_TEXT_LEN = 256       # texts are cut to this many characters before being sent
CONCURRENCY = 8          # upper bound on simultaneous in-flight requests
RETRY = 3                # attempts per text before the score is left as NaN

# The doubled braces survive str.format() as literal "{POS, NEU, NEG}".
PROMPT = (
    "Classify the sentiment as exactly one token from {{POS, NEU, NEG}}.\n"
    "Text:\n{text}\n"
    "Answer:"
)
# Sent with each request as extra_body["grammar"].
GRAMMAR = r'root ::= "POS" | "NEU" | "NEG"'


def map_label_to_score(label: str) -> float:
    """Translate a model reply into a sentiment score.

    The reply is trimmed and upper-cased. If its first whitespace-separated
    token is exactly POS / NEU / NEG the score is 1.0 / 0.0 / -1.0. Otherwise
    the whole reply is searched for those markers in the order POS, NEG, NEU.
    Anything unrecognized (including None or an empty reply) scores 0.0.
    """
    exact_scores = {"POS": 1.0, "NEU": 0.0, "NEG": -1.0}
    reply = (label or "").strip().upper()
    head = re.split(r"\s+", reply)[0] if reply else ""
    if head in exact_scores:
        return exact_scores[head]
    # Substring fallback; the order matters when several markers appear.
    for marker, value in (("POS", 1.0), ("NEG", -1.0), ("NEU", 0.0)):
        if marker in reply:
            return value
    return 0.0


# ---- Read master table & select samples to score (full; fill missing only) ----
df = pd.read_csv(MASTER_PATH, low_memory=False)

# Fallback for text column: when TEXT_COL is absent, build the text from
# whichever of title / selftext / body exist.
if TEXT_COL not in df.columns:
    def _join(*xs): return " ".join([str(x) for x in xs if pd.notna(x)])


    # Use only the columns that are present. The old df.get(col, "") default
    # handed an empty string to zip(), which then produced zero rows and made
    # the column assignment fail as soon as one of the three was missing.
    _present = [c for c in ("title", "selftext", "body") if c in df.columns]
    if _present:
        df["__text_tmp__"] = [_join(*vals) for vals in zip(*(df[c] for c in _present))]
    else:
        df["__text_tmp__"] = ""
    TEXT_COL = "__text_tmp__"

# Ensure output column exists
if OUT_COL not in df.columns:
    df[OUT_COL] = np.nan

# Only fill missing values: rows that already have a score are left alone.
mask = df[OUT_COL].isna()
todo = df.loc[mask].copy()

if todo.empty:
    print(f"No samples to score ({OUT_COL} already exists with no missing). File: {MASTER_PATH}")
else:
    # De-duplicate by text to speed up requests
    todo["__txt"] = todo[TEXT_COL].astype(str).str.slice(0, MAX_TEXT_LEN)
    # Map each unique text to the df index LABELS of its rows. GroupBy.indices
    # (used before) yields positions within `todo`; those equal df's labels only
    # when every row is unscored, so a partial re-run wrote scores onto the
    # wrong rows through df.loc.
    groups = todo.groupby("__txt").groups
    unique_texts = list(groups.keys())
    print(f"Target file: {MASTER_PATH}")
    print(f"Unique texts to score: {len(unique_texts)} (raw {len(todo)} / total {len(df)})")

    aclient = AsyncOpenAI(base_url=LMSTUDIO_BASE_URL, api_key=LMSTUDIO_API_KEY)


    async def classify_text(t: str) -> float:
        """Request one label for `t` and return its numeric score.

        Makes up to RETRY attempts, pausing 0.6 s * attempt number between
        them, and returns NaN when every attempt raises.
        """
        msg = PROMPT.format(text=t)
        for attempt in range(RETRY):
            try:
                # NOTE(review): max_tokens=1 could cut a label that the model
                # tokenizes into several pieces — confirm for this model.
                r = await aclient.chat.completions.create(
                    model=MODEL_NAME,
                    messages=[{"role": "user", "content": msg}],
                    temperature=0,
                    max_tokens=1,
                    top_p=1,
                    stop=["\n"],
                    extra_body={"grammar": GRAMMAR}
                )
                return map_label_to_score(r.choices[0].message.content)
            except Exception:
                # Best effort: a failing request must not abort the whole batch.
                if attempt == RETRY - 1:
                    return np.nan
                await asyncio.sleep(0.6 * (attempt + 1))
        # Reached only when RETRY <= 0; keep the float contract.
        return np.nan


    async def run_all(texts, concurrency=8):
        """Score every text with at most `concurrency` requests in flight.

        Returns a dict mapping each text to its score.
        """
        sem = asyncio.Semaphore(concurrency)
        results = {}

        async def bound_task(t):
            async with sem:
                sc = await classify_text(t)
            return t, sc

        tasks = [asyncio.create_task(bound_task(t)) for t in texts]
        with tqdm(total=len(tasks), desc=f"{MODEL_NAME} → {OUT_COL}", unit="req") as pbar:
            for fut in asyncio.as_completed(tasks):
                t, sc = await fut
                results[t] = sc
                pbar.update(1)
        return results


    loop = asyncio.get_event_loop()
    scores_map = loop.run_until_complete(run_all(unique_texts, CONCURRENCY))

    # Write back & atomically persist (same file)
    for t, idxs in groups.items():
        df.loc[idxs, OUT_COL] = scores_map.get(t, np.nan)

    tmp_out = MASTER_PATH + ".tmp"
    df.to_csv(tmp_out, index=False, encoding="utf-8-sig")
    os.replace(tmp_out, MASTER_PATH)

    print(f"✅ Column {OUT_COL} written back -> {MASTER_PATH}")


  D = (dt.datetime.utcnow().date() - dt.timedelta(days=1))


Target file: data/reddit/scored\posts_scores_2025-09-26.csv
Unique texts to score: 31 (raw 32 / total 32)


qwen2.5-7b-instruct-1m → s3:   0%|          | 0/31 [00:00<?, ?req/s]

✅ Column s3 written back -> data/reddit/scored\posts_scores_2025-09-26.csv


In [9]:
# =========================
# cell8 score mistralai/mistral-7b-instruct-v0.3-s4
import os, re, glob, asyncio, numpy as np, pandas as pd
import nest_asyncio;

nest_asyncio.apply()
from tqdm.auto import tqdm
from openai import AsyncOpenAI
import datetime as dt

# ---- Paths (relative to project root) ----
# Target "yesterday" in UTC. The date is taken from a timezone-aware now()
# because datetime.utcnow() is deprecated since Python 3.12.
D = (dt.datetime.now(dt.timezone.utc).date() - dt.timedelta(days=1))
MASTER_DIR = "data/reddit/scored"
PREF = os.path.join(MASTER_DIR, f"posts_scores_{D}.csv")

if os.path.exists(PREF):
    MASTER_PATH = PREF
else:
    # No file for yesterday: fall back to the most recently modified master table.
    cands = sorted(
        glob.glob(os.path.join(MASTER_DIR, "posts_scores_*.csv")),
        key=os.path.getmtime
    )
    if not cands:
        # Previously an empty list hit [-1] and surfaced as a bare IndexError.
        raise FileNotFoundError(
            "Master table not found: data/reddit/scored/posts_scores_*.csv. Run the VADER step first.")
    MASTER_PATH = cands[-1]

# ---- LM Studio configuration (set MODEL_NAME to your LM Studio API identifier) ----
LMSTUDIO_BASE_URL = "http://127.0.0.1:1234/v1"
LMSTUDIO_API_KEY = "lm-studio"
MODEL_NAME = "mistralai/mistral-7b-instruct-v0.3"  # ← change to the API identifier shown in LM Studio (right panel)
OUT_COL = "s4"  # master-table column that receives this model's scores

# ---- Scoring parameters ----
TEXT_COL = "text_clean"  # column holding the text to classify
MAX_TEXT_LEN = 256       # texts are cut to this many characters before being sent
CONCURRENCY = 8          # upper bound on simultaneous in-flight requests
RETRY = 3                # attempts per text before the score is left as NaN
DEDUP_UNIQUE_TEXTS = True  # True: score each unique text once (faster); False: score every row

# The doubled braces survive str.format() as literal "{POS, NEU, NEG}".
PROMPT = (
    "Classify the sentiment as exactly one token from {{POS, NEU, NEG}}.\n"
    "Text:\n{text}\n"
    "Answer:"
)
# Sent with each request as extra_body["grammar"].
GRAMMAR = r'root ::= "POS" | "NEU" | "NEG"'


def map_label_to_score(label: str) -> float:
    """Convert a model reply to a score: POS -> 1.0, NEU -> 0.0, NEG -> -1.0.

    The first whitespace-separated token of the trimmed, upper-cased reply is
    checked for an exact match. Failing that, a reply containing "POS" scores
    1.0, otherwise one containing "NEG" scores -1.0. Everything else
    (including None or an empty reply) scores 0.0.
    """
    cleaned = (label or "").strip().upper()
    if not cleaned:
        return 0.0

    leading = re.split(r"\s+", cleaned)[0]
    if leading == "POS":
        return 1.0
    if leading == "NEU":
        return 0.0
    if leading == "NEG":
        return -1.0

    # Loose match on the full reply; POS takes precedence over NEG.
    if "POS" in cleaned:
        return 1.0
    if "NEG" in cleaned:
        return -1.0
    # A reply containing "NEU" and a reply with no marker at all both score 0.0.
    return 0.0


# ---- Read master table & select rows to score (fill missing only) ----
df = pd.read_csv(MASTER_PATH, low_memory=False)

# When TEXT_COL is absent, build the text from whichever of
# title / selftext / body exist.
if TEXT_COL not in df.columns:
    def _join(*xs): return " ".join([str(x) for x in xs if pd.notna(x)])


    # Use only the columns that are present. The old df.get(col, "") default
    # handed an empty string to zip(), which then produced zero rows and made
    # the column assignment fail as soon as one of the three was missing.
    _present = [c for c in ("title", "selftext", "body") if c in df.columns]
    if _present:
        df["__text_tmp__"] = [_join(*vals) for vals in zip(*(df[c] for c in _present))]
    else:
        df["__text_tmp__"] = ""
    TEXT_COL = "__text_tmp__"

# Make sure the output column exists so the missing-value filter below works.
if OUT_COL not in df.columns:
    df[OUT_COL] = np.nan

# Rows that already have a score are left alone.
todo = df.loc[df[OUT_COL].isna()].copy()
if todo.empty:
    print(f"No samples to score ({OUT_COL} already exists with no missing). File: {MASTER_PATH}")
else:
    # Truncated text that is actually sent to the model (needed in both modes).
    todo["__txt"] = todo[TEXT_COL].astype(str).str.slice(0, MAX_TEXT_LEN)

    if DEDUP_UNIQUE_TEXTS:
        # Map each unique text to the df index LABELS of its rows.
        # GroupBy.indices (used before) yields positions within `todo`; those
        # equal df's labels only when every row is unscored, so a partial
        # re-run wrote scores onto the wrong rows through df.loc.
        groups = todo.groupby("__txt").groups
        work_items = list(groups.keys())  # set of unique texts


        def write_back(scores_map):
            """Copy each unique text's score onto every df row sharing that text."""
            for t, idxs in groups.items():
                df.loc[idxs, OUT_COL] = scores_map.get(t, np.nan)
    else:
        work_items = list(todo.index)  # set of row indices


        def write_back(scores_map):
            """Store one score per row, keyed by df index label."""
            for idx, sc in scores_map.items():
                df.loc[idx, OUT_COL] = sc

    print(f"Target file: {MASTER_PATH}")
    print(f"To score: {len(work_items)} (dedup={DEDUP_UNIQUE_TEXTS})")

    aclient = AsyncOpenAI(base_url=LMSTUDIO_BASE_URL, api_key=LMSTUDIO_API_KEY)


    async def classify_text(text: str) -> float:
        """Request one label for `text` and return its numeric score.

        Makes up to RETRY attempts, pausing 0.6 s * attempt number between
        them, and returns NaN when every attempt raises.
        """
        msg = PROMPT.format(text=text)
        for attempt in range(RETRY):
            try:
                # NOTE(review): max_tokens=1 could cut a label that the model
                # tokenizes into several pieces — confirm for this model.
                r = await aclient.chat.completions.create(
                    model=MODEL_NAME,
                    messages=[{"role": "user", "content": msg}],
                    temperature=0,
                    max_tokens=1,
                    top_p=1,
                    stop=["\n"],
                    extra_body={"grammar": GRAMMAR}
                )
                return map_label_to_score(r.choices[0].message.content)
            except Exception:
                # Best effort: a failing request must not abort the whole batch.
                if attempt == RETRY - 1:
                    return np.nan
                await asyncio.sleep(0.6 * (attempt + 1))
        # Reached only when RETRY <= 0; keep the float contract.
        return np.nan


    async def run_all(items):
        """Score all work items with at most CONCURRENCY requests in flight.

        Items are unique texts in dedup mode and row index labels otherwise.
        Returns a dict mapping each item to its score.
        """
        sem = asyncio.Semaphore(CONCURRENCY)
        results = {}

        async def run_one(item):
            text = item if DEDUP_UNIQUE_TEXTS else todo.at[item, "__txt"]
            async with sem:
                sc = await classify_text(text)
            return item, sc

        tasks = [asyncio.create_task(run_one(x)) for x in items]
        with tqdm(total=len(tasks), desc=f"{MODEL_NAME} → {OUT_COL}", unit="req") as pbar:
            for fut in asyncio.as_completed(tasks):
                k, sc = await fut
                results[k] = sc
                pbar.update(1)
        return results


    loop = asyncio.get_event_loop()
    scores_map = loop.run_until_complete(run_all(work_items))

    # Write back & atomic persist
    write_back(scores_map)
    tmp_out = MASTER_PATH + ".tmp"
    df.to_csv(tmp_out, index=False, encoding="utf-8-sig")
    os.replace(tmp_out, MASTER_PATH)

    print(f"✅ Column {OUT_COL} written back -> {MASTER_PATH}")


  D = (dt.datetime.utcnow().date() - dt.timedelta(days=1))


Target file: data/reddit/scored\posts_scores_2025-09-26.csv
To score: 31 (dedup=True)


mistralai/mistral-7b-instruct-v0.3 → s4:   0%|          | 0/31 [00:00<?, ?req/s]

✅ Column s4 written back -> data/reddit/scored\posts_scores_2025-09-26.csv


In [10]:
# =========================
# cell 9 score nous-hermes-2-mistral-7b-dpo——s5

import os, re, glob, asyncio, numpy as np, pandas as pd
import nest_asyncio;

nest_asyncio.apply()
from tqdm.auto import tqdm
from openai import AsyncOpenAI
import datetime as dt

# ---- Paths (relative to project root) ----
# Target "yesterday" in UTC. The date is taken from a timezone-aware now()
# because datetime.utcnow() is deprecated since Python 3.12.
D = (dt.datetime.now(dt.timezone.utc).date() - dt.timedelta(days=1))
MASTER_DIR = "data/reddit/scored"
PREF = os.path.join(MASTER_DIR, f"posts_scores_{D}.csv")

if os.path.exists(PREF):
    MASTER_PATH = PREF
else:
    # No file for yesterday: fall back to the most recently modified master table.
    cands = sorted(
        glob.glob(os.path.join(MASTER_DIR, "posts_scores_*.csv")),
        key=os.path.getmtime
    )
    if not cands:
        # Previously an empty list hit [-1] and surfaced as a bare IndexError.
        raise FileNotFoundError(
            "Master table not found: data/reddit/scored/posts_scores_*.csv. Run the VADER step first.")
    MASTER_PATH = cands[-1]

# ---- LM Studio configuration (set MODEL_NAME to your LM Studio API identifier) ----
LMSTUDIO_BASE_URL = "http://127.0.0.1:1234/v1"
LMSTUDIO_API_KEY = "lm-studio"
MODEL_NAME = "nous-hermes-2-mistral-7b-dpo"  # ← change to the API identifier shown in LM Studio (right panel)
OUT_COL = "s5"  # master-table column that receives this model's scores

# ---- Scoring parameters ----
TEXT_COL = "text_clean"  # column holding the text to classify
MAX_TEXT_LEN = 256       # texts are cut to this many characters before being sent
CONCURRENCY = 8          # upper bound on simultaneous in-flight requests
RETRY = 3                # attempts per text before the score is left as NaN
DEDUP_UNIQUE_TEXTS = True  # True: score each unique text once (faster); False: score every row

# The doubled braces survive str.format() as literal "{POS, NEU, NEG}".
PROMPT = (
    "Classify the sentiment as exactly one token from {{POS, NEU, NEG}}.\n"
    "Text:\n{text}\n"
    "Answer:"
)
# Sent with each request as extra_body["grammar"].
GRAMMAR = r'root ::= "POS" | "NEU" | "NEG"'


def map_label_to_score(label: str) -> float:
    """Return 1.0 / 0.0 / -1.0 for a POS / NEU / NEG model reply.

    An exact match on the first whitespace-separated token (after trimming
    and upper-casing) wins. Otherwise the reply is scanned for "POS" and then
    "NEG". Replies with neither marker, as well as None or empty replies,
    score 0.0.
    """
    by_token = {"POS": 1.0, "NEU": 0.0, "NEG": -1.0}
    normalized = (label or "").strip().upper()
    tokens = re.split(r"\s+", normalized) if normalized else [""]
    if tokens[0] in by_token:
        return by_token[tokens[0]]
    # First marker found in the reply decides; the default of 0.0 also covers
    # a reply that merely contains "NEU".
    return next(
        (value for marker, value in (("POS", 1.0), ("NEG", -1.0)) if marker in normalized),
        0.0,
    )


# ---- Read master table & select rows to score (fill missing only) ----
df = pd.read_csv(MASTER_PATH, low_memory=False)

# When TEXT_COL is absent, build the text from whichever of
# title / selftext / body exist.
if TEXT_COL not in df.columns:
    def _join(*xs): return " ".join([str(x) for x in xs if pd.notna(x)])


    # Use only the columns that are present. The old df.get(col, "") default
    # handed an empty string to zip(), which then produced zero rows and made
    # the column assignment fail as soon as one of the three was missing.
    _present = [c for c in ("title", "selftext", "body") if c in df.columns]
    if _present:
        df["__text_tmp__"] = [_join(*vals) for vals in zip(*(df[c] for c in _present))]
    else:
        df["__text_tmp__"] = ""
    TEXT_COL = "__text_tmp__"

# Make sure the output column exists so the missing-value filter below works.
if OUT_COL not in df.columns:
    df[OUT_COL] = np.nan

# Rows that already have a score are left alone.
todo = df.loc[df[OUT_COL].isna()].copy()
if todo.empty:
    print(f"No samples to score ({OUT_COL} already exists with no missing). File: {MASTER_PATH}")
else:
    # Truncated text that is actually sent to the model (needed in both modes).
    todo["__txt"] = todo[TEXT_COL].astype(str).str.slice(0, MAX_TEXT_LEN)

    if DEDUP_UNIQUE_TEXTS:
        # Map each unique text to the df index LABELS of its rows.
        # GroupBy.indices (used before) yields positions within `todo`; those
        # equal df's labels only when every row is unscored, so a partial
        # re-run wrote scores onto the wrong rows through df.loc.
        groups = todo.groupby("__txt").groups
        work_items = list(groups.keys())


        def write_back(scores_map):
            """Copy each unique text's score onto every df row sharing that text."""
            for t, idxs in groups.items():
                df.loc[idxs, OUT_COL] = scores_map.get(t, np.nan)
    else:
        work_items = list(todo.index)


        def write_back(scores_map):
            """Store one score per row, keyed by df index label."""
            for idx, sc in scores_map.items():
                df.loc[idx, OUT_COL] = sc

    print(f"Target file: {MASTER_PATH}")
    print(f"To score: {len(work_items)} (dedup={DEDUP_UNIQUE_TEXTS})")

    aclient = AsyncOpenAI(base_url=LMSTUDIO_BASE_URL, api_key=LMSTUDIO_API_KEY)


    async def classify_text(text: str) -> float:
        """Request one label for `text` and return its numeric score.

        Makes up to RETRY attempts, pausing 0.6 s * attempt number between
        them, and returns NaN when every attempt raises.
        """
        msg = PROMPT.format(text=text)
        for attempt in range(RETRY):
            try:
                # NOTE(review): max_tokens=1 could cut a label that the model
                # tokenizes into several pieces — confirm for this model.
                r = await aclient.chat.completions.create(
                    model=MODEL_NAME,
                    messages=[{"role": "user", "content": msg}],
                    temperature=0,
                    max_tokens=1,
                    top_p=1,
                    stop=["\n"],
                    extra_body={"grammar": GRAMMAR}
                )
                return map_label_to_score(r.choices[0].message.content)
            except Exception:
                # Best effort: a failing request must not abort the whole batch.
                if attempt == RETRY - 1:
                    return np.nan
                await asyncio.sleep(0.6 * (attempt + 1))
        # Reached only when RETRY <= 0; keep the float contract.
        return np.nan


    async def run_all(items):
        """Score all work items with at most CONCURRENCY requests in flight.

        Items are unique texts in dedup mode and row index labels otherwise.
        Returns a dict mapping each item to its score.
        """
        sem = asyncio.Semaphore(CONCURRENCY)
        results = {}

        async def run_one(item):
            text = item if DEDUP_UNIQUE_TEXTS else todo.at[item, "__txt"]
            async with sem:
                sc = await classify_text(text)
            return item, sc

        tasks = [asyncio.create_task(run_one(x)) for x in items]
        with tqdm(total=len(tasks), desc=f"{MODEL_NAME} → {OUT_COL}", unit="req") as pbar:
            for fut in asyncio.as_completed(tasks):
                k, sc = await fut
                results[k] = sc
                pbar.update(1)
        return results


    loop = asyncio.get_event_loop()
    scores_map = loop.run_until_complete(run_all(work_items))

    # Write back & atomic persist
    write_back(scores_map)
    tmp_out = MASTER_PATH + ".tmp"
    df.to_csv(tmp_out, index=False, encoding="utf-8-sig")
    os.replace(tmp_out, MASTER_PATH)

    print(f"✅ Column {OUT_COL} written back -> {MASTER_PATH}")


  D = (dt.datetime.utcnow().date() - dt.timedelta(days=1))


Target file: data/reddit/scored\posts_scores_2025-09-26.csv
To score: 31 (dedup=True)


nous-hermes-2-mistral-7b-dpo → s5:   0%|          | 0/31 [00:00<?, ?req/s]

✅ Column s5 written back -> data/reddit/scored\posts_scores_2025-09-26.csv


In [11]:
# =========================
# Weighted daily -> 1min forward-fill (vader + s1..s5)
# Aggregate from data/reddit/scored to daily values; expand to 1-minute with forward-fill
# =========================
# pip install pandas numpy

import os, glob, math
import pandas as pd
import numpy as np

# ---- Paths ----
SCORED_DIR = "data/reddit/scored"
OUT_DIR    = "data/reddit/weighted"
os.makedirs(OUT_DIR, exist_ok=True)

# ---- Read all posts_scores_*.csv ----
paths = sorted(glob.glob(os.path.join(SCORED_DIR, "posts_scores_*.csv")))
if not paths:
    raise FileNotFoundError("No posts_scores_*.csv found under data/reddit/scored/. Please generate the master table first.")

dfs = []
for p in paths:
    try:
        dfp = pd.read_csv(p, low_memory=False)
    except UnicodeDecodeError:
        # Retry with latin-1 for files that do not decode with the default encoding.
        dfp = pd.read_csv(p, encoding="latin-1", low_memory=False)
    dfp["__source"] = os.path.basename(p)  # remember which file each row came from
    dfs.append(dfp)

df = pd.concat(dfs, ignore_index=True)

# ---- Sentiment columns: support both naming styles (s1..s5 or sent_s1..sent_s5); normalize to s1..s5 ----
sent_cols = []
if "vader" in df.columns:
    sent_cols.append("vader")

# Prefer s1..s5; if missing, map sent_s1..sent_s5 -> s1..s5
for k in ["s1","s2","s3","s4","s5"]:
    if k in df.columns:
        sent_cols.append(k)
    elif f"sent_{k}" in df.columns:
        df[k] = pd.to_numeric(df[f"sent_{k}"], errors="coerce")
        sent_cols.append(k)

if not sent_cols:
    raise ValueError("No sentiment columns found (expect at least one among vader + s1..s5).")

# ---- Timestamps (prefer created_time_utc, else created_utc seconds) ----
if "created_time_utc" in df.columns:
    t = pd.to_datetime(df["created_time_utc"], errors="coerce", utc=True)
elif "created_utc" in df.columns:
    t = pd.to_datetime(df["created_utc"], unit="s", errors="coerce", utc=True)
else:
    raise ValueError("Requires 'created_time_utc' or 'created_utc' column.")

# Drop rows whose timestamp failed to parse or falls before 2005.
df["created_time_utc"] = t
df = df.dropna(subset=["created_time_utc"])
df = df[df["created_time_utc"] >= "2005-01-01"]

# ---- De-duplicate (by id if available) ----
if "id" in df.columns:
    df = df.sort_values("created_time_utc").drop_duplicates("id", keep="last")

# ---- Weights: log(1+score) + 0.5*log(1+num_comments) ----
if "score" in df.columns:
    score = pd.to_numeric(df["score"], errors="coerce").fillna(0).clip(lower=0)
else:
    # No score column: zero score weight for every post. The old
    # df.get("score", 0) returned a bare int here and the chained .fillna raised.
    score = pd.Series(0.0, index=df.index)
if "num_comments" in df.columns:
    numc = pd.to_numeric(df["num_comments"], errors="coerce").fillna(0).clip(lower=0)
    weight = np.log1p(score) + 0.5 * np.log1p(numc)
else:
    weight = np.log1p(score)
df["__w"] = weight

# ---- Cast sentiment columns to numeric and clip to [-1, 1] ----
for c in sent_cols:
    df[c] = pd.to_numeric(df[c], errors="coerce").clip(-1, 1)

# ---- Daily aggregation (UTC day): weighted average; if total weight=0, fall back to simple mean ----
df["date_utc"] = df["created_time_utc"].dt.date

def wavg(series: pd.Series, w: pd.Series) -> float:
    """Weighted mean of `series` over the rows that actually have a value.

    Args:
        series: Sentiment values; NaN marks a row with no score.
        w: Weights with the same index as `series`.

    Returns:
        sum(value * weight) / sum(weight) over the rows with a value. When
        those weights sum to zero the plain mean of the values is returned,
        and when no row has a value the result is NaN.
    """
    s = series.astype(float)
    w = w.astype(float)
    scored = s.notna()
    if not scored.any():
        # Nothing to average; report missing rather than a misleading 0.0.
        return float("nan")
    # Only rows with a value may contribute weight. Counting the weights of NaN
    # rows in the denominator (as before) pulled the average toward zero.
    s, w = s[scored], w[scored]
    den = np.nansum(w)
    return float(np.nansum(s * w) / den) if den > 0 else float(s.mean())

# One record per UTC day: the date plus a weighted mean for every sentiment column.
daily_rows = [
    {"date_utc": day, **{col: wavg(grp[col], grp["__w"]) for col in sent_cols}}
    for day, grp in df.groupby("date_utc")
]
daily = pd.DataFrame(daily_rows).sort_values("date_utc").reset_index(drop=True)

# ---- Expand to 1-minute and forward-fill (merge_asof for stability) ----
# 1) Stamp each daily row at 00:00 UTC of its day
daily_ff = (
    daily.assign(ts_day=pd.to_datetime(daily["date_utc"]).dt.tz_localize("UTC"))
    .sort_values("ts_day")
    .reset_index(drop=True)
)

# 2) Minute grid from 00:00 of the first day through 23:59 of the last day (UTC)
start = daily_ff["ts_day"].min()
end   = daily_ff["ts_day"].max() + pd.Timedelta(days=1) - pd.Timedelta(minutes=1)
minute_df = pd.DataFrame({"ts": pd.date_range(start=start, end=end, freq="min", tz="UTC")})

# 3) Backward as-of join: every minute picks up the latest daily row stamped at
#    or before it, so days without data inherit the previous day's values.
#    NOTE(review): a day's value is built from all of that day's posts yet is
#    applied from 00:00 of the same day — confirm this is acceptable downstream.
joined = pd.merge_asof(
    minute_df.sort_values("ts"),
    daily_ff[["ts_day"] + sent_cols].sort_values("ts_day"),
    left_on="ts",
    right_on="ts_day",
    direction="backward",
)

# 4) Keep only the minute timestamp and the sentiment columns
minute_df = joined[["ts"] + sent_cols]

# ---- Save ----
daily_out  = os.path.join(OUT_DIR, "sentiment_daily_vader_s1_s5.csv")
minute_out = os.path.join(OUT_DIR, "sentiment_1min_vader_s1_s5.csv")

# The daily file uses the same "ts" column (tz-aware UTC midnight) as the minute file.
daily_out_df = daily.rename(columns={"date_utc": "ts"})
daily_out_df["ts"] = pd.to_datetime(daily_out_df["ts"]).dt.tz_localize("UTC")

daily_out_df.to_csv(daily_out, index=False, encoding="utf-8")
minute_df.to_csv(minute_out, index=False, encoding="utf-8")

first_day = daily_out_df['ts'].min().date()
last_day = daily_out_df['ts'].max().date()
print("✅ Done.")
print(f"Days   : {first_day} ~ {last_day}")
print(f"Daily  : {daily_out}")
print(f"1-min  : {minute_out}")
print("\nPreview (daily):")
display(daily_out_df.head(3))
print("\nPreview (1-min):")
display(minute_df.head(3))



✅ Done.
Days   : 2025-09-24 ~ 2025-09-26
Daily  : data/reddit/weighted\sentiment_daily_vader_s1_s5.csv
1-min  : data/reddit/weighted\sentiment_1min_vader_s1_s5.csv

Preview (daily):


Unnamed: 0,ts,vader,s1,s2,s3,s4,s5
0,2025-09-24 00:00:00+00:00,0.4048,-0.23979,0.035424,-0.152483,-0.232451,-0.167206
1,2025-09-26 00:00:00+00:00,0.489822,-0.225582,-0.042975,-0.204605,-0.303092,-0.252197



Preview (1-min):


Unnamed: 0,ts,vader,s1,s2,s3,s4,s5
0,2025-09-24 00:00:00+00:00,0.4048,-0.23979,0.035424,-0.152483,-0.232451,-0.167206
1,2025-09-24 00:01:00+00:00,0.4048,-0.23979,0.035424,-0.152483,-0.232451,-0.167206
2,2025-09-24 00:02:00+00:00,0.4048,-0.23979,0.035424,-0.152483,-0.232451,-0.167206
