In [None]:
# STEP 2 — FETCH COMMENTS
# Inputs: ip_posts.csv
# Outputs: ip_comments.csv (plus ip_comments.jsonl when COMMENTS_JSONL is set)

# %pip install praw pandas tqdm
# Inspired by:
#   https://praw.readthedocs.io/en/stable/tutorials/comments.html
#   https://www.kaggle.com/code/gpreda/collect-and-update-data-on-reddit
#   https://github.com/loganblackstad/Universal-Reddit-Scraper/tree/master (download_threads.ipynb)

import os

# Reddit API keys (same as Step 1).
# Read from the environment so real credentials are never saved into the notebook;
# the "..." defaults are placeholders only and must not be used for a real run.
CLIENT_ID     = os.environ.get("REDDIT_CLIENT_ID", "...")
CLIENT_SECRET = os.environ.get("REDDIT_CLIENT_SECRET", "...")
USER_AGENT    = os.environ.get("REDDIT_USER_AGENT", "...")

# Input from Step 1
POSTS_CSV = "ip_posts.csv"   # produced by Step 1

# Outputs for Step 2 (comments)
COMMENTS_CSV   = "ip_comments.csv"
COMMENTS_JSONL = "ip_comments.jsonl"   # set to "" / None to skip the JSONL copy

# Politeness / limits
SLEEP_BETWEEN_REQUESTS = 0.4      # seconds to pause after each submission
MAX_SUBMISSIONS_THIS_RUN = None   # 200 to test or None for all


In [None]:
import time, csv, json
import pandas as pd
from dataclasses import dataclass, asdict
from typing import Optional, List, Iterable, Dict
from tqdm.auto import tqdm
import praw

@dataclass
class CommentRow:
    """One Reddit comment, flattened into the record written to the comments CSV/JSONL.

    The field order below is also the column order of the output file
    (see ``COM_FIELDS``).
    """
    id: str                          # comment id; used as the de-duplication key
    created_utc: int                 # creation timestamp, stored as an integer
    subreddit: str
    link_id: str                     # e.g., t3_<sid>
    parent_id: str
    author: Optional[str]            # None when no author name is available
    body: Optional[str]
    score: Optional[int]
    submission_id: str               # id of the submission the comment was fetched from
    submission_flair: Optional[str]  # flair inherited from the parent submission

# Column order for the CSV writer: the dataclass field names, in declaration order.
COM_FIELDS = list(CommentRow.__dataclass_fields__)

def ensure_csv(path: str, fields: List[str]):
    """Make sure the CSV at ``path`` exists and starts with a header row.

    A header built from ``fields`` is written when the file is missing *or*
    exists but is empty (zero bytes). An empty file left behind by an aborted
    run would otherwise receive appended data rows with no header. A file that
    already has content is left untouched.

    Args:
        path: Location of the CSV file.
        fields: Column names, in output order.
    """
    if os.path.exists(path) and os.path.getsize(path) > 0:
        return
    # newline="" lets the csv module control line endings itself.
    with open(path, "w", encoding="utf-8", newline="") as f:
        csv.DictWriter(f, fieldnames=fields).writeheader()

def append_csv(path: str, rows: Iterable[dict], fields: List[str]):
    """Append ``rows`` to the CSV at ``path`` without writing a header.

    Each dict becomes one line, with columns ordered by ``fields``.
    Nothing is opened or written when ``rows`` is falsy (e.g. an empty list).
    """
    if rows:
        with open(path, "a", encoding="utf-8", newline="") as out:
            csv.DictWriter(out, fieldnames=fields).writerows(rows)

def append_jsonl(path: str, rows: Iterable[dict]):
    """Append ``rows`` to ``path`` as JSON Lines (one JSON object per line).

    Non-ASCII characters are written as-is rather than escaped.
    Does nothing when ``rows`` or ``path`` is falsy.
    """
    if rows and path:
        with open(path, "a", encoding="utf-8") as out:
            out.writelines(json.dumps(rec, ensure_ascii=False) + "\n" for rec in rows)

def load_seen_ids(path: str, id_field: str) -> set:
    """Return the set of ids already stored in the CSV at ``path``.

    Used to resume a previous run without writing duplicate rows.

    Args:
        path: CSV file to scan. A missing or empty file yields an empty set.
        id_field: Name of the column that holds the ids.

    Returns:
        A set of id strings exactly as they appear in the file. Blank ids and
        rows without the column are ignored.
    """
    if not os.path.exists(path) or os.path.getsize(path) == 0:
        return set()
    try:
        # dtype=str + na_filter=False keep ids verbatim: without them pandas may
        # parse "007" as 7, "1e5" as 100000.0, or "null"/"NA" as a missing value.
        seen = set(
            pd.read_csv(path, usecols=[id_field], dtype=str, na_filter=False)[id_field]
        )
    except Exception:
        # Best-effort fallback (e.g. the column is absent or pandas cannot parse
        # the file): scan with the stdlib reader instead of failing the run.
        seen = set()
        with open(path, "r", encoding="utf-8", newline="") as f:
            for row in csv.DictReader(f):
                value = row.get(id_field)
                if value is not None:  # don't record the literal string "None"
                    seen.add(str(value))
    seen.discard("")  # a blank cell is not an id
    return seen

def init_reddit():
    """Build the PRAW client from the credentials in the Config cell.

    Returns:
        A ``praw.Reddit`` instance created with ``check_for_async=False``.

    Raises:
        AssertionError: If any credential is empty or still a placeholder.
    """
    # Both the template placeholder and the "..." used in the Config cell count
    # as "not filled in"; previously "..." slipped through the check.
    placeholders = {"YOUR_CLIENT_ID", "..."}
    credentials = (CLIENT_ID, CLIENT_SECRET, USER_AGENT)
    # Explicit raise (not `assert`) so the check still runs under `python -O`.
    if not all(credentials) or placeholders.intersection(credentials):
        raise AssertionError("Fill CLIENT_ID/CLIENT_SECRET/USER_AGENT in the Config cell.")
    return praw.Reddit(
        client_id=CLIENT_ID,
        client_secret=CLIENT_SECRET,
        user_agent=USER_AGENT,
        check_for_async=False,
    )


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import pandas as pd

# Load posts from Step 1
posts = pd.read_csv(POSTS_CSV)

# Keep only what we need
if "id" not in posts.columns:
    raise ValueError("ip_posts.csv must have a column named 'id' (the submission id).")

# Create a flair map (submission_id -> link_flair_text).
# Missing flair is stored as None; a plain .astype(str) would turn NaN into the
# literal string "nan", which then leaks into the comment rows.
if "link_flair_text" in posts.columns:
    flair_map = {
        sid: (None if pd.isna(flair) else str(flair))
        for sid, flair in zip(posts["id"].astype(str), posts["link_flair_text"])
    }
else:
    # safe default if Step 1 didn't include flair
    flair_map = dict.fromkeys(posts["id"].astype(str))

# De-duplicate submissions (skipping rows with no id, which would otherwise become
# the bogus id "nan") and optionally cap how many to fetch in this run
submission_ids = posts["id"].dropna().astype(str).drop_duplicates().tolist()
if MAX_SUBMISSIONS_THIS_RUN is not None:
    submission_ids = submission_ids[:int(MAX_SUBMISSIONS_THIS_RUN)]

len(submission_ids), submission_ids[:5]


(3056, ['gf34wp', 'gnle2w', 'ho0fmu', 'hm3cjv', 'i3c9om'])

In [5]:
reddit = init_reddit()

# Resume support: make sure the output has a header, then load ids already on disk
# so a re-run only appends comments that were not written before.
ensure_csv(COMMENTS_CSV, COM_FIELDS)
seen_comment_ids = load_seen_ids(COMMENTS_CSV, "id")

FLUSH_EVERY = 2000  # number of buffered rows that triggers a write to disk

total_written = 0
skipped_comments = 0  # comments whose fields could not be converted into a row
batch = []


def _flush_batch() -> int:
    """Write the buffered rows to the CSV (and JSONL, if enabled); return how many."""
    rows = list(batch)
    # Empty the buffer before writing so a failed write is never retried into
    # duplicate rows by the final flush below.
    batch.clear()
    if rows:
        append_csv(COMMENTS_CSV, rows, COM_FIELDS)
        if COMMENTS_JSONL:
            append_jsonl(COMMENTS_JSONL, rows)
    return len(rows)


try:
    for sid in tqdm(submission_ids, desc="Submissions"):
        try:
            subm = reddit.submission(id=sid)
            # Do NOT expand more comments (fast path)
            subm.comments.replace_more(limit=0)
        except Exception as e:
            print(f"[warn] failed to prep submission {sid}: {e}")
            # Pause on the failure path too, so repeated errors don't hammer the API.
            time.sleep(SLEEP_BETWEEN_REQUESTS)
            continue

        inherited_flair = flair_map.get(sid)

        # Flatten whatever is already loaded (top-level + any preloaded replies)
        try:
            flat = list(subm.comments.list())
        except Exception as e:
            print(f"[warn] listing comments failed for {sid}: {e}")
            flat = []

        for c in flat:
            try:
                cid = str(getattr(c, "id", ""))
                if not cid or cid in seen_comment_ids:
                    continue

                row = CommentRow(
                    id=cid,
                    created_utc=int(getattr(c, "created_utc", 0) or 0),
                    subreddit=str(getattr(c, "subreddit", "")),
                    link_id=str(getattr(c, "link_id", "")),   # e.g., t3_<sid>
                    parent_id=str(getattr(c, "parent_id", "")),
                    author=str(getattr(c, "author", "") or "") or None,
                    body=getattr(c, "body", None),
                    score=int(getattr(c, "score", 0) or 0),
                    submission_id=sid,
                    submission_flair=inherited_flair,
                )
                batch.append(asdict(row))
                seen_comment_ids.add(cid)
            except Exception:
                # Best-effort: one malformed comment must not abort the run,
                # but count it so the loss is visible in the summary.
                skipped_comments += 1
                continue

        # Write periodically
        if len(batch) >= FLUSH_EVERY:
            total_written += _flush_batch()

        time.sleep(SLEEP_BETWEEN_REQUESTS)
finally:
    # Flush any remaining rows, even if the loop was interrupted or raised,
    # so up to FLUSH_EVERY already-fetched comments are not lost.
    total_written += _flush_batch()

if skipped_comments:
    print(f"[warn] skipped {skipped_comments} comments that could not be converted")

print(f"[done] wrote {total_written} comments to {COMMENTS_CSV}" + (f" and {COMMENTS_JSONL}" if COMMENTS_JSONL else ""))

Submissions: 100%|██████████| 3056/3056 [2:09:37<00:00,  2.55s/it]     

[done] wrote 140588 comments to ip_comments.csv and ip_comments.jsonl





In [6]:
import pandas as pd

# Sanity check: reload the comments file from disk, show its first five rows,
# and report the total row count. The count covers every run that appended to
# the file, not only the most recent one.
try:
    dfc = pd.read_csv(COMMENTS_CSV)
    display(dfc.head())
    print(f"Total comments in file: {len(dfc):,}")
except Exception as err:
    # Preview only: report the problem instead of failing the notebook.
    print("Couldn't preview comments CSV:", err)


Unnamed: 0,id,created_utc,subreddit,link_id,parent_id,author,body,score,submission_id,submission_flair
0,fprlg3x,1588856974,IsraelPalestine,t3_gf34wp,t3_gf34wp,RosintheBow3,I think Israelis are critical of their governm...,32,gf34wp,
1,fprapg5,1588848450,IsraelPalestine,t3_gf34wp,t3_gf34wp,samtony234,There are many critical Israelis. You have the...,17,gf34wp,
2,fprla8t,1588856871,IsraelPalestine,t3_gf34wp,t3_gf34wp,JeffB1517,"> Like all other things, criticisms of Israel...",12,gf34wp,
3,fprymsf,1588864482,IsraelPalestine,t3_gf34wp,t3_gf34wp,zidbutt21,Would you mind translating the Hebrew article ...,3,gf34wp,
4,fpta3xd,1588887532,IsraelPalestine,t3_gf34wp,t3_gf34wp,ShabbatShalomSamurai,I’ve been in Israel the last five months and I...,3,gf34wp,


Total comments in file: 550,629
