In [1]:
# STEP 4 — PREPROCESS FOR TOPICS
# Input : merged_posts_comments.csv
# Output: merged_preprocessed_for_topics.csv

import re, pandas as pd

SRC = "merged_posts_comments.csv"
DST = "merged_preprocessed_for_topics.csv"

JUNK = {"[deleted]","[removed]","deleted","removed","nan","none","null","amp"}

URL_RE         = re.compile(r"(https?://\S+|www\.\S+)")
REDDIT_TAGS_RE = re.compile(r"(?<!\w)(r/|u/)\w+")
HTML_AMP_RE    = re.compile(r"&amp;")
MULTISPACE_RE  = re.compile(r"\s+")
NONALPHA_RE    = re.compile(r"[^a-zA-Z\s]")

def clean_for_topics(x: str) -> str:
    if not isinstance(x, str): return ""
    t = x
    t = URL_RE.sub(" ", t)
    t = HTML_AMP_RE.sub("&", t)
    t = REDDIT_TAGS_RE.sub(" ", t)
    t = t.lower()
    t = NONALPHA_RE.sub(" ", t)
    t = MULTISPACE_RE.sub(" ", t).strip()
    return t

df = pd.read_csv(SRC, low_memory=False)
df["clean_for_topics"] = df["text"].fillna("").map(clean_for_topics)

# drop obvious junk rows
def is_junk(s: str) -> bool:
    ss = s.strip().lower()
    return (ss in JUNK) or (len(ss) < 3)

before = len(df)
df = df[~df["clean_for_topics"].map(is_junk)].copy()
df = df[df["clean_for_topics"].str.split().map(len) >= 3]
after = len(df)

# keep useful cols
keep = ["id","kind","subreddit","dt","text","clean_for_topics","link_flair_text","score","permalink"]
keep = [k for k in keep if k in df.columns]
df[keep].to_csv(DST, index=False)

print(f"[ok] wrote {after:,} rows → {DST} (dropped {before-after:,})")


[ok] wrote 447,012 rows → merged_preprocessed_for_topics.csv (dropped 38,380)
