In [None]:
# STEP 1 — SCRAPE POSTS
# Outputs: ip_posts.csv (plus ip_posts.jsonl when OUT_JSONL is set)

# Inspired by https://github.com/fferegrino/r-worldnews-live-threads-ukraine/blob/main/download_threads.ipynb

import datetime as dt
import os
import warnings

# Reddit API keys
# Read from the environment so real secrets never have to be saved inside the
# notebook. The "..." fallbacks are placeholders only: export the variables
# (or replace the fallbacks locally) before running the scrape.
CLIENT_ID     = os.environ.get("REDDIT_CLIENT_ID", "...")
CLIENT_SECRET = os.environ.get("REDDIT_CLIENT_SECRET", "...")
USER_AGENT    = os.environ.get("REDDIT_USER_AGENT", "...")

# Date window (inclusive start, inclusive end)
DATE_START = dt.date(2020, 1, 1)
# Current date in UTC (not local time), because to_epoch() interprets every
# date as a UTC day when it builds the window bounds.
DATE_END   = dt.datetime.now(dt.timezone.utc).date()  # or a fixed date like dt.date(2025, 9, 1)

# Subreddits & keywords
SUBREDDITS = ["IsraelPalestine", "worldnews", "Palestine", "Judaism"]
# Matched case-insensitively as substrings of "title + selftext" (see keyword_hit)
KEYWORDS = [
    "Israel","Palestine","Gaza","West Bank","Hamas","IDF",
    "ceasefire","hostages","Hezbollah","settlements","UNRWA","rafah"
]

# Output files
OUT_CSV   = "ip_posts.csv"
OUT_JSONL = "ip_posts.jsonl"  # set to "" / None to skip the JSONL copy

# Politeness
SLEEP_BETWEEN = 0.4  # seconds between API reads


In [2]:
import os, time, json, csv
import pandas as pd
from dataclasses import dataclass, asdict
from typing import Optional, List, Iterable, Tuple

import praw

@dataclass
class Row:
    """One Reddit submission flattened to the columns that are written to disk.

    The declaration order of the fields is significant: FIELDS is derived from
    it and is used as the CSV column order.
    """
    id: str                           # submission id, used for de-duplication
    created_utc: int                  # creation time, compared against to_epoch() bounds
    subreddit: str
    author: Optional[str]             # None when no author name is available
    title: Optional[str]
    selftext: Optional[str]
    score: Optional[int]
    num_comments: Optional[int]
    link_flair_text: Optional[str]
    url: Optional[str]
    permalink: Optional[str]

# Column names in declaration order (the dataclass field mapping is keyed by name).
FIELDS = list(Row.__dataclass_fields__)

def to_epoch(d: dt.date, end_of_day=False) -> int:
    """Convert a calendar date to a Unix timestamp in whole seconds, reading the date as UTC.

    With ``end_of_day=False`` the result is 00:00:00 of that day; with
    ``end_of_day=True`` it is 23:59:59, the last whole second of the day.
    """
    hour, minute, second = (23, 59, 59) if end_of_day else (0, 0, 0)
    moment = dt.datetime(d.year, d.month, d.day, hour, minute, second, tzinfo=dt.timezone.utc)
    return int(moment.timestamp())

# Global scrape window as inclusive UTC epoch-second bounds:
# first second of DATE_START .. last second of DATE_END.
EPOCH_START = to_epoch(DATE_START)
EPOCH_END = to_epoch(DATE_END, True)

def flair_keep(sr: str, flair: Optional[str]) -> bool:
    """Decide whether a post's flair qualifies it for the given subreddit.

    Paper-style filtering: r/worldnews and r/Judaism posts are kept only when
    their flair (surrounding whitespace ignored, case-sensitive) equals the
    subreddit's required value. Every other subreddit is kept unconditionally.
    The subreddit name itself is compared case-insensitively.
    """
    required_flair = {
        "worldnews": "Israel/Palestine",
        "judaism": "Israel Megathread",
    }.get(sr.lower())
    if required_flair is None:
        # No flair rule for this subreddit.
        return True
    return (flair or "").strip() == required_flair

def keyword_hit(title: Optional[str], selftext: Optional[str],
                keywords: Optional[Iterable[str]] = None) -> bool:
    """Return True when any keyword occurs in the post's title or body.

    Matching is a case-insensitive substring test against
    ``"<title> <selftext>"``; ``None`` for either text is treated as empty.

    Args:
        title: Post title, or None.
        selftext: Post body, or None.
        keywords: Terms to look for. Defaults to the module-level KEYWORDS
            list when omitted, which is the original behaviour.

    Returns:
        True if at least one keyword is found, otherwise False (including
        when the keyword collection is empty).
    """
    blob = f"{title or ''} {selftext or ''}".lower()
    terms = KEYWORDS if keywords is None else keywords
    return any(k.lower() in blob for k in terms)

def ensure_csv(path: str):
    """Make sure the CSV at ``path`` exists and begins with the header row.

    The header (the FIELDS column names) is written when the file is missing
    or is an existing zero-byte file. A file that already has content is left
    untouched. Without the zero-byte case, rows appended later would land in a
    header-less CSV that can no longer be read back by column name.
    """
    if os.path.exists(path) and os.path.getsize(path) > 0:
        return
    with open(path, "w", encoding="utf-8", newline="") as f:
        csv.DictWriter(f, fieldnames=FIELDS).writeheader()

def append_csv(path: str, rows: Iterable[dict]):
    """Append every dict in ``rows`` to ``path`` as one CSV record each.

    Columns follow the order of FIELDS. The file is opened in append mode and
    no header is written here.
    """
    with open(path, "a", encoding="utf-8", newline="") as handle:
        csv.DictWriter(handle, fieldnames=FIELDS).writerows(rows)

def append_jsonl(path: str, rows: Iterable[dict]):
    """Append every dict in ``rows`` to ``path`` as one JSON document per line.

    Non-ASCII characters are written as-is (UTF-8) rather than escaped.
    """
    with open(path, "a", encoding="utf-8") as sink:
        sink.writelines(
            f"{json.dumps(record, ensure_ascii=False)}\n" for record in rows
        )

def load_seen_ids(path: str) -> set:
    """Return the set of post ids already stored in the CSV at ``path``.

    A missing or zero-byte file yields an empty set. The ``id`` column is read
    with pandas first; if that raises for any reason, the file is re-read with
    the standard ``csv`` module instead.

    Ids are always returned as the exact strings found in the file.

    Raises:
        KeyError: from the fallback reader when the file has no ``id`` column.
    """
    if not os.path.exists(path) or os.path.getsize(path) == 0:
        return set()
    try:
        # dtype=str + keep_default_na=False: ids are opaque strings. With
        # pandas' default type inference a column such as "0123", "1e5" is
        # parsed as numbers ("123.0", "100000.0") and ids like "null"/"nan"
        # are read as missing values, so already-saved posts would no longer
        # be recognised as seen.
        id_column = pd.read_csv(
            path, usecols=["id"], dtype=str, keep_default_na=False
        )["id"]
        return set(id_column)
    except Exception:
        # Best-effort fallback; newline="" lets the csv module handle line
        # breaks inside quoted fields itself.
        with open(path, "r", encoding="utf-8", newline="") as f:
            return {row["id"] for row in csv.DictReader(f)}

def init_reddit():
    """Create the PRAW client from the credentials in the Config cell.

    Returns:
        A ``praw.Reddit`` instance built from CLIENT_ID, CLIENT_SECRET and
        USER_AGENT, with ``check_for_async=False``.

    Raises:
        AssertionError: if any credential is empty or still a placeholder
            ("..." as shipped in the Config cell, or "YOUR_CLIENT_ID").
    """
    placeholders = {"...", "YOUR_CLIENT_ID"}
    credentials = (CLIENT_ID, CLIENT_SECRET, USER_AGENT)
    filled = all(credentials) and not any(
        str(value).strip() in placeholders for value in credentials
    )
    if not filled:
        # Raised explicitly (same exception type as the former `assert`) so the
        # check also runs when Python is started with -O.
        raise AssertionError("Fill CLIENT_ID/CLIENT_SECRET/USER_AGENT in the Config cell.")
    return praw.Reddit(client_id=CLIENT_ID, client_secret=CLIENT_SECRET, user_agent=USER_AGENT, check_for_async=False)


In [None]:
def _row_if_wanted(s, sr_name: str, t0: int, t1: int) -> Optional[Row]:
    """Turn submission ``s`` into a Row, or return None if it falls outside [t0, t1] or fails the flair/keyword filters."""
    cu = int(getattr(s, "created_utc", 0) or 0)
    if cu < t0 or cu > t1:
        return None
    flair = getattr(s, "link_flair_text", None)
    if not flair_keep(sr_name, flair):
        return None
    title = getattr(s, "title", "") or ""
    selftext = getattr(s, "selftext", "") or ""
    if not keyword_hit(title, selftext):
        return None
    return Row(
        id=s.id,
        created_utc=cu,
        subreddit=str(getattr(s, "subreddit", "")),
        # str(author) gives the name; an absent/empty author becomes None
        author=str(getattr(s, "author", "") or "") or None,
        title=title,
        selftext=selftext,
        score=int(getattr(s, "score", 0) or 0),
        num_comments=int(getattr(s, "num_comments", 0) or 0),
        link_flair_text=flair,
        url=getattr(s, "url", None),
        permalink=getattr(s, "permalink", None),
    )


def search_month_window(sr_name: str, month_start: dt.date, top_limit: int = 2000) -> List[Row]:
    """
    Collect the posts of one subreddit that were created within one calendar month.

    The month containing ``month_start`` is clamped to [DATE_START, DATE_END];
    if nothing of it remains, an empty list is returned. Three listings are
    tried in order, and the function returns as soon as one of them completes
    with at least one row collected:

      A) time-bounded search via legacy cloudsearch (``timestamp:t0..t1``),
      B) 'new', scrolled newest-first until a post older than the window,
      C) 'top(all)' limited to ``top_limit`` posts (not exhaustive, but gets
         salient posts).

    A submission is kept only if it lies inside the window and passes
    ``flair_keep`` and ``keyword_hit``.

    A listing that raises is reported with a ``RuntimeWarning`` and the next
    one is tried. Rows gathered before the failure are kept, and every post id
    appears at most once in the result.

    Args:
        sr_name: Subreddit name without the ``r/`` prefix.
        month_start: Any date inside the month to scan (normally its first day).
        top_limit: Maximum number of posts requested from the 'top(all)' fallback.

    Returns:
        The matching posts as Row objects; possibly empty.
    """
    reddit = init_reddit()
    rows: List[Row] = []

    # month bounds
    if month_start.month == 12:
        month_end = dt.date(month_start.year + 1, 1, 1) - dt.timedelta(days=1)
    else:
        month_end = dt.date(month_start.year, month_start.month + 1, 1) - dt.timedelta(days=1)

    # clamp to global
    if month_start < DATE_START:
        month_start = DATE_START
    if month_end > DATE_END:
        month_end = DATE_END
    if month_start > month_end:
        return rows

    t0 = to_epoch(month_start, end_of_day=False)
    t1 = to_epoch(month_end, end_of_day=True)

    sr = reddit.subreddit(sr_name)

    # (label, listing factory, newest_first). Factories are lambdas so that an
    # error raised while creating a listing is handled like one raised while
    # iterating it. newest_first marks the listing where iteration can stop at
    # the first post older than the window.
    strategies = (
        ("cloudsearch",
         lambda: sr.search(f"timestamp:{t0}..{t1}", sort="new", syntax="cloudsearch", limit=None),
         False),
        ("new", lambda: sr.new(limit=None), True),
        # top(all) is not chronological; we can't break early safely
        ("top(all)", lambda: sr.top(time_filter="all", limit=top_limit), False),
    )

    seen_ids = set()  # prevents duplicates when a failed strategy left partial rows
    for label, open_listing, newest_first in strategies:
        try:
            for s in open_listing():
                if newest_first and int(getattr(s, "created_utc", 0) or 0) < t0:
                    break  # older than our month
                row = _row_if_wanted(s, sr_name, t0, t1)
                if row is not None and row.id not in seen_ids:
                    seen_ids.add(row.id)
                    rows.append(row)
        except Exception as exc:
            # Best-effort: keep going with the next strategy, but make the
            # failure visible instead of silently returning fewer posts.
            warnings.warn(
                f"r/{sr_name}: '{label}' listing failed "
                f"({type(exc).__name__}: {exc}); skipping this strategy",
                RuntimeWarning,
                stacklevel=2,
            )
            continue
        if rows:
            return rows

    return rows


In [4]:
def _month_starts(first: dt.date, last: dt.date) -> List[dt.date]:
    """Return the first day of every calendar month from ``first``'s month through ``last``'s month."""
    starts = []
    year, month = first.year, first.month
    while (year, month) <= (last.year, last.month):
        starts.append(dt.date(year, month, 1))
        year, month = (year + 1, 1) if month == 12 else (year, month + 1)
    return starts


def run_official_only():
    """Scrape every configured subreddit month by month and append new posts to disk.

    For each subreddit in SUBREDDITS and each month between DATE_START and
    DATE_END, ``search_month_window`` is called. Posts whose id is already in
    OUT_CSV (or was written earlier in this run) are skipped. The rest are
    appended to OUT_CSV and, when OUT_JSONL is set, to OUT_JSONL as well.
    Progress is printed per subreddit and per month that produced new posts.

    The function pauses SLEEP_BETWEEN seconds after every month query,
    including queries that returned nothing, since those also hit the API.
    """
    ensure_csv(OUT_CSV)
    seen = load_seen_ids(OUT_CSV)

    # build month starts from DATE_START → DATE_END
    months = _month_starts(DATE_START, DATE_END)

    total = 0
    for sr in SUBREDDITS:
        print(f"[subreddit] r/{sr}")
        for m in months:
            rows = search_month_window(sr, m)
            # dedupe against existing file + within this batch
            deduped = []
            for r in rows:
                if r.id not in seen:
                    deduped.append(asdict(r))
                    seen.add(r.id)
            if deduped:
                append_csv(OUT_CSV, deduped)
                if OUT_JSONL:
                    append_jsonl(OUT_JSONL, deduped)
                total += len(deduped)
                print(f"  [{m.strftime('%Y-%m')}] wrote {len(deduped)} posts")
            # Politeness pause after every query, not only after productive ones
            time.sleep(SLEEP_BETWEEN)
    print(f"[done] total posts written: {total}")

# Start the scrape; progress is printed per subreddit and per month with new posts.
run_official_only()


[subreddit] r/IsraelPalestine
  [2020-05] wrote 2 posts
  [2020-07] wrote 2 posts
  [2020-08] wrote 3 posts
  [2020-09] wrote 3 posts
  [2020-12] wrote 1 posts
  [2021-01] wrote 1 posts
  [2021-02] wrote 2 posts
  [2021-03] wrote 2 posts
  [2021-04] wrote 2 posts
  [2021-05] wrote 38 posts
  [2021-06] wrote 26 posts
  [2021-07] wrote 13 posts
  [2021-08] wrote 4 posts
  [2021-09] wrote 3 posts
  [2021-10] wrote 2 posts
  [2021-11] wrote 2 posts
  [2022-01] wrote 1 posts
  [2022-02] wrote 2 posts
  [2022-03] wrote 3 posts
  [2022-04] wrote 2 posts
  [2022-05] wrote 7 posts
  [2022-06] wrote 4 posts
  [2022-07] wrote 4 posts
  [2022-08] wrote 3 posts
  [2022-09] wrote 1 posts
  [2022-10] wrote 1 posts
  [2023-01] wrote 1 posts
  [2023-02] wrote 1 posts
  [2023-04] wrote 2 posts
  [2023-05] wrote 3 posts
  [2023-06] wrote 1 posts
  [2023-07] wrote 2 posts
  [2023-08] wrote 1 posts
  [2023-09] wrote 1 posts
  [2023-10] wrote 161 posts
  [2023-11] wrote 98 posts
  [2023-12] wrote 52 posts
 