In [None]:
# Clone the repo if it is not already present, then pull the latest changes
%cd /content
!test -d ADS509_FinalProject || git clone https://github.com/mitosisgg/ADS509_FinalProject.git
%cd /content/ADS509_FinalProject
!git pull

In [None]:
import json_to_csv_clean as j2c
from pathlib import Path

In [None]:
import json
import re
from pathlib import Path
import pandas as pd

# Input: directory inside the cloned repo that is scanned for raw JSON files.
# Outputs: the combined CSV is written to the repo (REPO_OUT) and to a second
# copy under /content/drive (DRIVE_OUT) — presumably the mounted Google Drive;
# confirm the mount exists before running.
RAW_DIR  = Path("/content/ADS509_FinalProject/data/raw")
REPO_OUT = Path("/content/ADS509_FinalProject/articles.csv")
DRIVE_OUT = Path("/content/drive/MyDrive/Data/articles.csv")

# Regex helpers
# CAT_RX captures the run of ASCII letters directly before "_articles_".
CAT_RX = re.compile(r"([A-Za-z]+)_articles_", re.IGNORECASE)
# TRUNC_RX matches a trailing "[+<digits> chars]" marker, including any
# whitespace around it, at the very end of a string.
TRUNC_RX = re.compile(r"\s*\[\+\d+\s+chars\]\s*$")

def pick_category(path: Path) -> str:
    """Return the lowercase category token found in *path*'s file name.

    The token is whatever ``CAT_RX`` captures from ``path.name``. If the
    pattern does not match, ``"unknown"`` is returned instead.
    """
    found = CAT_RX.search(path.name)
    if found is None:
        return "unknown"
    return found.group(1).lower()

def clean_content(s: str) -> str:
    """Strip the trailing truncation marker matched by ``TRUNC_RX`` from *s*.

    Any value that is not a ``str`` is mapped to the empty string.
    """
    return TRUNC_RX.sub("", s) if isinstance(s, str) else ""

def load_items(path: Path) -> list:
    """Load the article records stored in one JSON file.

    Two layouts are accepted: a top-level JSON array, which is returned
    as-is, or a JSON object whose ``"articles"`` value is an array, in which
    case that array is returned. Any other layout yields an empty list.

    Args:
        path: Location of the JSON file to read.

    Returns:
        The list of records, or ``[]`` when the file has neither layout.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # "utf-8-sig" reads plain UTF-8 unchanged but also strips a leading BOM,
    # which json.load would otherwise reject as invalid JSON.
    with open(path, encoding="utf-8-sig") as f:
        data = json.load(f)
    if isinstance(data, list):
        return data
    if isinstance(data, dict):
        articles = data.get("articles")
        if isinstance(articles, list):
            return articles
    return []

def main():
    """Convert the raw article JSON files into one de-duplicated CSV.

    Every ``*_articles_*.json`` file in ``RAW_DIR`` is loaded and reduced to
    the columns ``category``, ``title``, ``description``, ``content`` and
    ``content_len``. The per-file tables are concatenated, rows repeating an
    earlier (title, description) pair are dropped, and the result is written
    to both ``REPO_OUT`` and ``DRIVE_OUT``. A short summary is printed.

    Raises:
        SystemExit: If ``RAW_DIR`` is missing, contains no matching files, or
            every matching file yields no records.
    """
    # Fail fast when there is nothing to read.
    if not RAW_DIR.exists():
        raise SystemExit(f"Input folder not found: {RAW_DIR}")

    sources = sorted(RAW_DIR.glob("*_articles_*.json"))
    if not sources:
        raise SystemExit(f"No *_articles_*.json files found in {RAW_DIR}")

    # Build one tidy table per input file; files without records are skipped.
    tables = []
    for src in sources:
        records = load_items(src)
        if not records:
            continue

        flat = pd.json_normalize(records, sep=".")
        # A column missing from `flat` comes back as None from .get() and is
        # broadcast across the rows by the DataFrame constructor.
        table = pd.DataFrame({
            "category":    [pick_category(src)] * len(flat),
            "title":       flat.get("title"),
            "description": flat.get("description"),
            "content":     flat.get("content"),
        })
        text = table["content"].fillna("").astype(str).map(clean_content)
        table["content"] = text
        table["content_len"] = text.str.len()
        tables.append(table)

    if not tables:
        raise SystemExit("All inputs parsed to empty; no rows to write.")

    # Stack everything, keeping the first row for each (title, description).
    merged = pd.concat(tables, ignore_index=True)
    merged = merged.drop_duplicates(subset=["title", "description"], keep="first")

    # Write the repo copy first, then the Drive copy.
    for target in (REPO_OUT, DRIVE_OUT):
        target.parent.mkdir(parents=True, exist_ok=True)
        merged.to_csv(target, index=False, encoding="utf-8")

    # Report what was written, with a quick look at the data.
    n_rows, n_cols = merged.shape
    print(f"Wrote {REPO_OUT} (repo copy)")
    print(f"Wrote {DRIVE_OUT} (Drive backup)")
    print(f"Rows={n_rows}  Cols={n_cols}")
    print("\ncontent_len summary:")
    print(merged["content_len"].describe().to_string())
    print("\nPreview of first 5 rows:")
    print(merged[["category", "title"]].head(5).to_string(index=False))

# Run the conversion only when this code executes as the top-level module,
# not when it is imported.
if __name__ == "__main__":
    main()