In [5]:
import os
import glob
import pandas as pd

# ---------- Paths ----------
# Everything is resolved relative to one project folder: raw CSVs are read
# from "data_raw" and results are written to "data_processed".
BASE_DIR = r"C:\PG, IELTS, DOCS\research paper\poetry project"
RAW_DIR, OUT_DIR = (
    os.path.join(BASE_DIR, folder) for folder in ("data_raw", "data_processed")
)
OUTPUT_FILE = os.path.join(OUT_DIR, "ratings_master.csv")

# Create the output folder up front; exist_ok makes re-running harmless.
os.makedirs(OUT_DIR, exist_ok=True)


def normalize_poem_key(name: str) -> str:
    """Reduce a poem name to a lowercase, punctuation-free lookup key.

    The value is converted to ``str``, trimmed and lowercased. Every
    character that is neither alphanumeric nor whitespace is treated as a
    word separator, and runs of whitespace collapse to a single space.
    """
    lowered = str(name).strip().lower()

    # Ellipses ("…", "...", ". . .") and dashes ("—", "-") need no special
    # casing: like all other punctuation they simply become separators here.
    separated = "".join(
        [" " if not char.isalnum() and not char.isspace() else char for char in lowered]
    )

    # split() without arguments drops leading/trailing/repeated whitespace.
    words = separated.split()
    return " ".join(words)


def _build_participant_frame(df, participant_id):
    """Return one participant's ratings in the master-file column layout.

    Args:
        df: Ratings table read from a participant CSV; must contain the
            "PoemName" column.
        participant_id: Identifier stored in the "participant_id" column.

    Returns:
        A new DataFrame with PoemNameRaw, PoemKey, participant_id and the
        rating columns (absent ones filled with ``pd.NA``), with "AA"
        renamed to "aesthetic_appeal".
    """
    names = df["PoemName"]

    out = pd.DataFrame()
    out["PoemNameRaw"] = names.astype(str)
    # Only normalize names that are actually present. Stringifying a missing
    # value yields the literal text "nan", which would otherwise survive the
    # empty-key filter and be saved as a poem called "nan".
    out["PoemKey"] = (
        out["PoemNameRaw"]
        .where(names.notna())
        .map(normalize_poem_key, na_action="ignore")
    )
    out["participant_id"] = participant_id

    # Keep these if present
    for col in ["PoemType", "Block", "AA", "Imagery", "Moved", "Originality", "Creativity"]:
        out[col] = df[col] if col in df.columns else pd.NA

    # Rename AA now for consistency
    return out.rename(columns={"AA": "aesthetic_appeal"})


def main():
    """Merge all participant rating CSVs in RAW_DIR into one master CSV.

    Each ``*.csv`` file in RAW_DIR is read as one participant (the file name
    without extension becomes the participant id). Files that cannot be read
    or that lack the "PoemName"/"AA" columns are reported and skipped. Rows
    whose poem name is missing or normalizes to an empty key are dropped.
    The combined table is written to OUTPUT_FILE and a short summary is
    printed. Nothing is written when no usable file is found.
    """
    # Every CSV in RAW_DIR is a candidate; files that are not rating sheets
    # are weeded out below by the required-column check.
    csv_files = sorted(glob.glob(os.path.join(RAW_DIR, "*.csv")))

    if not csv_files:
        print("No CSV files found in:", RAW_DIR)
        return

    all_rows = []
    print("Found rating files:", len(csv_files))

    # Expected columns:
    # PoemName, PoemType, Block, AA, Imagery, Moved, Originality, Creativity
    required = ["PoemName", "AA"]

    for f in csv_files:
        fname = os.path.basename(f)
        participant_id = os.path.splitext(fname)[0]  # e.g., P101

        # Deliberately best-effort: one unreadable file must not abort the
        # whole merge, so any read failure is reported and the file skipped.
        try:
            df = pd.read_csv(f, encoding="cp1252")
        except Exception as e:
            print(f"Skipping {fname} (read error): {e}")
            continue

        if not all(col in df.columns for col in required):
            print(f"Skipping {fname} (missing columns)")
            continue

        out = _build_participant_frame(df, participant_id)
        all_rows.append(out)
        print(f"{fname} -> {len(out)} rows")

    # pd.concat raises ValueError on an empty list, which happens when every
    # file was skipped above.
    if not all_rows:
        print("No usable rating files found in:", RAW_DIR)
        return

    ratings_df = pd.concat(all_rows, ignore_index=True)

    # Drop rows whose key is missing or empty
    keys = ratings_df["PoemKey"]
    keep = keys.notna() & (keys.astype(str).str.strip() != "")
    dropped = int((~keep).sum())
    ratings_df = ratings_df[keep].copy()
    if dropped:
        print(f"Dropped {dropped} rows with missing/empty poem names")

    ratings_df.to_csv(OUTPUT_FILE, index=False)

    print("\nDone!")
    print("Saved ratings master to:", OUTPUT_FILE)
    print("Total rows:", len(ratings_df))
    print("Participants:", ratings_df['participant_id'].nunique())
    print("\nPreview:")
    print(ratings_df.head())


# Run the merge only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()

Found rating files: 51
P101.csv -> 210 rows
P102.csv -> 210 rows
P103.csv -> 210 rows
P104.csv -> 210 rows
P105.csv -> 210 rows
P106.csv -> 210 rows
P107.csv -> 210 rows
P108.csv -> 210 rows
P109.csv -> 210 rows
P110.csv -> 210 rows
P111.csv -> 210 rows
P112.csv -> 210 rows
P113.csv -> 210 rows
P114.csv -> 210 rows
P115.csv -> 210 rows
P116.csv -> 210 rows
P117.csv -> 210 rows
P118.csv -> 210 rows
P119.csv -> 210 rows
P120.csv -> 210 rows
P121.csv -> 210 rows
P122.csv -> 210 rows
P123.csv -> 210 rows
P124.csv -> 210 rows
P125.csv -> 210 rows
P126.csv -> 210 rows
P127.csv -> 210 rows
P128.csv -> 210 rows
P129.csv -> 210 rows
P130.csv -> 210 rows
P131.csv -> 210 rows
P132.csv -> 210 rows
P133.csv -> 210 rows
P134.csv -> 210 rows
P135.csv -> 210 rows
P136.csv -> 210 rows
P137.csv -> 210 rows
P138.csv -> 210 rows
P139.csv -> 210 rows
P140.csv -> 210 rows
P141.csv -> 210 rows
P142.csv -> 210 rows
P143.csv -> 210 rows
P144.csv -> 210 rows
P145.csv -> 210 rows
P146.csv -> 210 rows
P147.csv ->