In [1]:
import os
import pandas as pd

# ---------- Paths ----------
# Project root; every other path below is derived from it.
BASE_DIR = r"C:\PG, IELTS, DOCS\research paper\poetry project"

# Inputs are read from, and the merged output is written to, the same
# "data_processed" folder. Two names are kept so either side can be
# re-pointed later without touching the code that uses them.
IN_DIR = OUT_DIR = os.path.join(BASE_DIR, "data_processed")

# Input CSVs.
LOOKUP_FILE = os.path.join(IN_DIR, "poem_lookup_full.csv")
RATINGS_FILE = os.path.join(IN_DIR, "ratings_master.csv")

# Output CSV produced by main().
OUTPUT_FILE = os.path.join(OUT_DIR, "poetry_dataset_merged.csv")


def main():
    """Merge participant ratings with the poem lookup table and save the result.

    Reads ``RATINGS_FILE`` and ``LOOKUP_FILE``, left-joins the lookup's
    ``text``, ``PoemType`` and ``Block`` columns onto every rating row via
    ``PoemKey``, writes the merged table to ``OUTPUT_FILE`` and prints a
    short match report followed by a preview of the first rows.

    Behaviour worth knowing:

    * If the lookup contains the same ``PoemKey`` more than once, only the
      first occurrence is used and a warning is printed. Without this, the
      left join would emit one output row per duplicate and silently inflate
      the ratings.
    * When the ratings file also has ``PoemType`` / ``Block``, the lookup
      value wins and the ratings value is used only where the lookup has
      none. The temporary ``*_rating`` / ``*_lookup`` columns are dropped so
      the saved file has a single column for each.
    * A rating row counts as "matched" when it ended up with a non-missing
      ``text`` value.

    Raises:
        FileNotFoundError: if either input CSV does not exist.
        KeyError: if the lookup lacks one of the columns ``PoemKey``,
            ``text``, ``PoemType`` or ``Block``, or the ratings lack
            ``PoemKey``.
    """
    lookup = pd.read_csv(LOOKUP_FILE)
    ratings = pd.read_csv(RATINGS_FILE)

    lookup_cols = lookup[["PoemKey", "text", "PoemType", "Block"]]

    # A repeated key on the lookup side would multiply rating rows in the
    # left join, so keep one lookup row per PoemKey and say so explicitly.
    dup_mask = lookup_cols.duplicated(subset="PoemKey", keep="first")
    dup_count = int(dup_mask.sum())
    if dup_count:
        print(
            f"Warning: ignored {dup_count} duplicate PoemKey row(s) in lookup "
            "(kept first occurrence)."
        )
        lookup_cols = lookup_cols.loc[~dup_mask]

    # Merge on normalized key
    merged = ratings.merge(
        lookup_cols,
        on="PoemKey",
        how="left",
        suffixes=("_rating", "_lookup"),
    )

    # Use lookup type/block where available (cleaner source). The suffixed
    # columns only exist when both files carry the column; once coalesced
    # they are redundant and are removed from the output.
    for col in ("PoemType", "Block"):
        from_lookup = f"{col}_lookup"
        from_rating = f"{col}_rating"
        if from_lookup in merged.columns:
            merged[col] = merged[from_lookup].combine_first(merged[from_rating])
            merged = merged.drop(columns=[from_lookup, from_rating])

    # Keep a clean column order
    preferred_cols = [
        "participant_id",
        "PoemKey",
        "PoemNameRaw",
        "text",
        "PoemType",
        "Block",
        "aesthetic_appeal",
        "Imagery",
        "Moved",
        "Originality",
        "Creativity",
    ]

    # Preferred columns first (when present), any extra columns at the end
    # in their existing order.
    leading = [c for c in preferred_cols if c in merged.columns]
    trailing = [c for c in merged.columns if c not in preferred_cols]
    merged = merged[leading + trailing]

    # Check merge success
    total_rows = len(merged)
    matched_rows = merged["text"].notna().sum()
    unmatched_rows = total_rows - matched_rows
    match_rate = matched_rows / total_rows * 100 if total_rows else 0

    # The output folder is configured separately from the input folder, so
    # make sure it exists before writing.
    os.makedirs(os.path.dirname(OUTPUT_FILE) or ".", exist_ok=True)
    merged.to_csv(OUTPUT_FILE, index=False)

    print("Done!")
    print("Saved merged dataset to:", OUTPUT_FILE)
    print("Total rows:", total_rows)
    print("Matched text rows:", matched_rows)
    print("Unmatched rows:", unmatched_rows)
    print(f"Match rate: {match_rate:.2f}%")

    print("\nPreview:")
    print(merged.head())


# Run the merge only when this file is executed directly (script or notebook
# cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

Done!
Saved merged dataset to: C:\PG, IELTS, DOCS\research paper\poetry project\data_processed\poetry_dataset_merged.csv
Total rows: 10710
Matched text rows: 10710
Unmatched rows: 0
Match rate: 100.00%

Preview:
  participant_id                                            PoemKey  \
0           P101             aim camera adjust aperture snap a shot   
1           P101    council votes new bike lanes cyclists celebrate   
2           P101  in soap bubbles again and again his face is br...   
3           P101   whalebone from a beach near savoonga winter rain   
4           P101  having no thought we ve come to see them dogwo...   

                                         PoemNameRaw  \
0          aim camera \nadjust aperture\nsnap a shot   
1  Council votes\nnew bike lanes\ncyclists celebrate   
2  in soap bubbles\nagain and again\nhis face is ...   
3  whalebone\nfrom a beach near Savoonga—\nwinter...   
4  having no thought\nwe've come to see them—\ndo...   

                        