In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# ---------- Paths ----------
# Project root on the local machine; every input and output below lives in
# its "data_processed" sub-folder.
BASE_DIR = r"C:\PG, IELTS, DOCS\research paper\poetry project"
_PROCESSED_DIR = os.path.join(BASE_DIR, "data_processed")

# Input: ratings table read by main() (grouped per participant).
RATINGS_FILE = os.path.join(_PROCESSED_DIR, "poetry_features_with_scores_v1.csv")
# Input: participant profile table that the affinities are merged into.
PROFILES_FILE = os.path.join(_PROCESSED_DIR, "participant_profiles_step1.csv")
# Output: participant table with the assigned taste cluster.
OUT_FILE = os.path.join(_PROCESSED_DIR, "participant_taste_clusters_step1.csv")


def safe_corr(x: pd.Series, y: pd.Series) -> float:
    """Return the correlation of *x* and *y*, or NaN when it is undefined.

    A correlation needs variation on both sides, so if either series holds
    at most one distinct value (constant or empty) NaN is returned instead
    of delegating to ``Series.corr``.

    Args:
        x: First series.
        y: Second series.

    Returns:
        ``x.corr(y)`` when both series have more than one distinct value,
        otherwise ``np.nan``.
    """
    both_vary = x.nunique() > 1 and y.nunique() > 1
    return x.corr(y) if both_vary else np.nan


def _participant_affinity(df: pd.DataFrame) -> pd.DataFrame:
    """Build one row per participant with their order/surprise affinity.

    Each affinity is ``safe_corr`` between the participant's
    ``aesthetic_appeal`` values and the ``order_score`` / ``surprise_score``
    column of the same rows.
    """
    rows = [
        {
            "participant_id": pid,
            "order_affinity": safe_corr(g["aesthetic_appeal"], g["order_score"]),
            "surprise_affinity": safe_corr(g["aesthetic_appeal"], g["surprise_score"]),
        }
        for pid, g in df.groupby("participant_id")
    ]
    # Explicit columns so the frame has the expected schema even when the
    # ratings table contains no participants.
    return pd.DataFrame(
        rows, columns=["participant_id", "order_affinity", "surprise_affinity"]
    )


def _assign_taste_clusters(part: pd.DataFrame, features: list, n_clusters: int):
    """Standardize *features* of *part* and return a KMeans label per row.

    Raises:
        ValueError: if any feature column contains missing values (named in
            the message), since KMeans cannot be fitted on NaN input.
    """
    X = part[features]

    # Fail early with the offending column names instead of a generic error
    # raised from inside the estimator.
    nan_cols = X.columns[X.isna().any()].tolist()
    if nan_cols:
        raise ValueError(
            f"Cannot cluster: missing values in feature column(s) {nan_cols}"
        )

    X_scaled = StandardScaler().fit_transform(X)
    # Fixed random_state keeps cluster assignments reproducible across runs.
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    return kmeans.fit_predict(X_scaled)


def _print_cluster_report(part: pd.DataFrame, features: list) -> None:
    """Print cluster sizes and the per-cluster mean of each feature."""
    print("\n=== Cluster sizes ===")
    print(part["taste_cluster"].value_counts().sort_index())

    print("\n=== Cluster profile (means) ===")
    print(part.groupby("taste_cluster")[features].mean().round(3))


def main(n_clusters: int = 4):
    """Cluster participants into taste segments and save the result.

    Reads the ratings table (``RATINGS_FILE``) and the participant profiles
    (``PROFILES_FILE``), adds per-participant order/surprise affinities to
    the profiles, runs KMeans on the standardized feature set, writes the
    profiles plus a ``taste_cluster`` column to ``OUT_FILE`` and prints a
    short summary.

    Args:
        n_clusters: Number of taste segments to fit (defaults to 4).

    Raises:
        ValueError: if a clustering feature contains missing values.
    """
    df = pd.read_csv(RATINGS_FILE)
    profiles = pd.read_csv(PROFILES_FILE)

    # --- Build participant affinity metrics and attach them to the profiles ---
    affinity = _participant_affinity(df)
    part = profiles.merge(affinity, on="participant_id", how="left")

    # Undefined correlations (and participants absent from the ratings table)
    # are treated as "no affinity".
    affinity_cols = ["order_affinity", "surprise_affinity"]
    part[affinity_cols] = part[affinity_cols].fillna(0)

    # Single source of truth for the features: used both for clustering and
    # for the per-cluster profile report.
    cluster_features = [
        "avg_beauty_given",
        "beauty_std",
        "avg_rating_C",
        "avg_rating_H",
        "avg_rating_S",
        "pref_H_minus_C",
        "pref_S_minus_C",
        "order_affinity",
        "surprise_affinity",
    ]

    part["taste_cluster"] = _assign_taste_clusters(part, cluster_features, n_clusters)

    # Save
    part.to_csv(OUT_FILE, index=False)

    print("Done!")
    print("Saved participant taste clusters to:", OUT_FILE)
    print("Participants:", len(part))

    _print_cluster_report(part, cluster_features)

    print("\nDone!")

# Run the clustering pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Done!
Saved participant taste clusters to: C:\PG, IELTS, DOCS\research paper\poetry project\data_processed\participant_taste_clusters_step1.csv
Participants: 51

=== Cluster sizes ===
taste_cluster
0    14
1    23
2     5
3     9
Name: count, dtype: int64

=== Cluster profile (means) ===
               avg_beauty_given  beauty_std  avg_rating_C  avg_rating_H  \
taste_cluster                                                             
0                         3.901       1.953         1.994         5.039   
1                         4.388       1.202         3.584         4.918   
2                         4.670       0.629         4.571         4.766   
3                         3.010       1.263         2.433         3.446   

               avg_rating_S  pref_H_minus_C  pref_S_minus_C  order_affinity  \
taste_cluster                                                                 
0                     4.670           3.045           2.677           0.006   
1                     4

