# Reddit Data Collector
This notebook pulls hot and keyword-search posts from Reddit using PRAW, removes duplicate posts, and exports the result to CSV.
Author: Manvir Kaur


In [None]:

import pandas as pd
from typing import List, Dict, Any

# Keys of every collected row dict; the CSV export writes its columns in this order.
FIELDNAMES = [
    # Text and engagement numbers (str, int, float, int).
    "title", "score", "upvote_ratio", "num_comments",
    # Where the post came from (both str).
    "author", "subreddit",
    # Links (both str).
    "url", "permalink",
    # Post details: int timestamp, bool, str body (cut at 500 chars + "..."), str, str.
    "created_utc", "is_self", "selftext", "flair", "domain",
    # Search term that found the post; None for rows not produced by a search.
    "search_query",
]

def _to_int(x):
    try:
        return int(x)
    except Exception:
        return None

def _row_from_post(post, subreddit_name: str, search_query: str | None) -> Dict[str, Any]:
    """Flatten one post object into a row dict whose keys match ``FIELDNAMES``.

    ``post.author`` and ``post.selftext`` are read directly; every other post
    attribute is read with ``getattr`` and falls back to ``None`` when absent.
    ``subreddit_name`` is stored as given, and a falsy ``search_query`` is
    stored as ``None``.
    """
    poster = post.author
    author_name = poster.name if poster else None

    body = post.selftext
    if not isinstance(body, str):
        body = None
    elif len(body) > 500:
        # Keep the first 500 characters and mark the cut with an ellipsis.
        body = body[:500] + "..."

    # The post's permalink is a site-relative path; make it absolute when present.
    relative_link = getattr(post, "permalink", None)
    if relative_link:
        full_link = "https://reddit.com" + relative_link
    else:
        full_link = None

    row = {
        "title": getattr(post, "title", None),
        "score": getattr(post, "score", None),
        "upvote_ratio": getattr(post, "upvote_ratio", None),
        "num_comments": getattr(post, "num_comments", None),
        "author": author_name,
        "subreddit": subreddit_name,
        "url": getattr(post, "url", None),
        "permalink": full_link,
        "created_utc": _to_int(getattr(post, "created_utc", None)),
        "is_self": getattr(post, "is_self", None),
        "selftext": body,
        "flair": getattr(post, "link_flair_text", None),
        "domain": getattr(post, "domain", None),
        "search_query": search_query or None,
    }
    return row

def _normalize_subs(subreddit_name) -> List[str]:
    if isinstance(subreddit_name, str):
        subs = [subreddit_name]
    else:
        subs = subreddit_name
    return [s.strip().lstrip("r/") for s in subs if isinstance(s, str) and s.strip()]

# ---- Task 1: HOT posts
def fetch_hot_posts(subreddit_name, limit=50) -> List[Dict[str,Any]]:
    """Collect row dicts from the hot listing of one or more subreddits.

    ``subreddit_name`` (a name or an iterable of names) is cleaned with
    ``_normalize_subs``. For each resulting subreddit the hot listing is
    requested with ``limit`` from the module-level ``reddit`` client, which
    must be defined before this function is called. Rows carry
    ``search_query=None``. Progress is printed per subreddit.
    """
    rows: List[Dict[str,Any]] = []
    for sub in _normalize_subs(subreddit_name):
        print(f"⬇️  r/{sub}: collecting HOT (limit={limit}) ...")
        already_collected = len(rows)
        listing = reddit.subreddit(sub).hot(limit=limit)
        rows.extend(_row_from_post(post, sub, search_query=None) for post in listing)
        # The per-subreddit count is however many rows this listing added.
        count = len(rows) - already_collected
        print(f"✅ Collected {count} hot posts from r/{sub}.")
    return rows

# ---- Task 2: keyword-based search
def search_posts(query: str, subreddit_name, limit=50) -> List[Dict[str,Any]]:
    """Collect row dicts for posts matching ``query`` in one or more subreddits.

    ``subreddit_name`` (a name or an iterable of names) is cleaned with
    ``_normalize_subs``. Each subreddit is searched through the module-level
    ``reddit`` client with ``sort="new"`` and the given ``limit``; the client
    must be defined before this function is called. Every row records
    ``query`` as its ``search_query``. Progress is printed per subreddit.
    """
    rows: List[Dict[str,Any]] = []
    for sub in _normalize_subs(subreddit_name):
        print(f"🔎 r/{sub}: searching '{query}' (limit={limit}) ...")
        already_collected = len(rows)
        matches = reddit.subreddit(sub).search(query=query, sort="new", limit=limit)
        rows.extend(_row_from_post(post, sub, search_query=query) for post in matches)
        # The per-subreddit count is however many rows this search added.
        count = len(rows) - already_collected
        print(f"✅ Collected {count} search posts from r/{sub}.")
    return rows

# ---- Task 3: data export to CSV
def export_posts_to_csv(rows: List[Dict[str,Any]], out_path: str = "reddit_data.csv") -> bool:
    """Write collected post rows to a CSV file, dropping repeated permalinks.

    Args:
        rows: Row dicts, e.g. as built by ``_row_from_post``.
        out_path: Destination path of the CSV file.

    Returns:
        ``False`` when ``rows`` is empty (nothing is written), else ``True``.
    """
    if not rows:
        print("⚠️ No rows to export.")
        return False
    df = pd.DataFrame(rows)

    # Keep only known columns, in FIELDNAMES order; columns absent from the
    # data are simply skipped.
    df = df[[c for c in FIELDNAMES if c in df.columns]]

    if "permalink" in df.columns:
        before = len(df)
        # Only rows that actually have a permalink can be duplicates of one
        # another. drop_duplicates alone treats all missing permalinks as the
        # same value and would collapse those rows into a single one.
        repeated = df["permalink"].notna() & df.duplicated(subset="permalink", keep="first")
        df = df[~repeated]
        after = len(df)
        print(f"🧹 Deduplicated by permalink: {before} → {after}")
    else:
        # Without the key column there is nothing to deduplicate on.
        print("⚠️ No 'permalink' column; skipping deduplication.")

    df.to_csv(out_path, index=False)
    print(f"📁 Saved {len(df)} rows to '{out_path}' (no index).")
    return True

# ---------------- main usage example ----------------
# Subreddits to collect from.
SUBS = ["education", "teachers", "college"]

hot_rows = fetch_hot_posts(SUBS, limit=50)
search_rows = search_posts("homework", SUBS, limit=25)
all_rows = hot_rows + search_rows

# export_posts_to_csv returns False when there was nothing to write, so only
# report success when a file was actually produced.
if export_posts_to_csv(all_rows, out_path="reddit_data.csv"):
    print("🎉 Data collection completed successfully!")
else:
    print("⚠️ Data collection finished, but no CSV was written.")
