In [1]:
import os
import math
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# ---------- Paths ----------
# Project root. Defaults to the original local folder; set the POETRY_PROJECT_DIR
# environment variable to point at another location without editing this cell.
BASE_DIR = os.environ.get(
    "POETRY_PROJECT_DIR",
    r"C:\PG, IELTS, DOCS\research paper\poetry project",
)
# CSV read by main() as the ratings table (joined on participant_id).
RATINGS_FILE = os.path.join(BASE_DIR, "data_processed", "poetry_features_with_scores_v1.csv")
# CSV read by main() for the participant_id -> taste_cluster mapping.
PARTICIPANT_SEG_FILE = os.path.join(BASE_DIR, "data_processed", "participant_taste_clusters_step1.csv")


def eval_rf_segment(
    seg_df: pd.DataFrame,
    feature_cols,
    target_col="aesthetic_appeal",
    *,
    min_rows=200,
    min_target_levels=3,
    test_size=0.2,
    random_state=42,
):
    """Fit a random-forest regressor on one segment and score it on a hold-out split.

    Rows with a missing value in any feature or in the target are dropped
    before anything else happens.

    Parameters
    ----------
    seg_df : pd.DataFrame
        Rows belonging to a single segment; must contain ``feature_cols``
        and ``target_col``.
    feature_cols : sequence of str
        Predictor column names. Any sequence is accepted (list, tuple, ...).
    target_col : str
        Column to predict.
    min_rows : int, keyword-only
        Minimum number of complete rows required to attempt modeling.
    min_target_levels : int, keyword-only
        Minimum number of distinct target values required.
    test_size : float, keyword-only
        Fraction of rows held out for evaluation.
    random_state : int, keyword-only
        Seed used for both the split and the forest.

    Returns
    -------
    tuple
        ``(r2, mae, rmse, importance_df)`` where ``importance_df`` has columns
        ``feature`` and ``importance`` sorted by descending importance, or
        ``(None, None, None, None)`` when the segment fails the data guard.
    """
    # Normalise to a list so that `feature_cols + [target_col]` also works for
    # tuples (tuple + list raises TypeError).
    feature_cols = list(feature_cols)

    # dropna() already returns a new frame, so no extra copy is needed.
    model_df = seg_df[feature_cols + [target_col]].dropna()
    y = model_df[target_col]

    # Guard: need enough data and target variation
    if len(model_df) < min_rows or y.nunique() < min_target_levels:
        return None, None, None, None

    X = model_df[feature_cols]

    # NOTE(review): this is a plain row-wise random split. If several rows share
    # an identical feature vector (e.g. the same poem rated by several
    # participants -- TODO confirm), they can land on both sides of the split
    # and inflate the hold-out scores; a grouped split may be more appropriate.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    rf = RandomForestRegressor(
        n_estimators=300,
        random_state=random_state,
        n_jobs=-1,
        min_samples_leaf=2,
    )
    rf.fit(X_train, y_train)
    preds = rf.predict(X_test)

    r2 = r2_score(y_test, preds)
    mae = mean_absolute_error(y_test, preds)
    rmse = math.sqrt(mean_squared_error(y_test, preds))

    imp_df = pd.DataFrame({
        "feature": feature_cols,
        "importance": rf.feature_importances_,
    }).sort_values("importance", ascending=False)

    return r2, mae, rmse, imp_df


def main():
    """Load ratings and cluster labels, then model each taste cluster separately.

    Reads RATINGS_FILE and PARTICIPANT_SEG_FILE, attaches each participant's
    ``taste_cluster`` to the ratings rows, prints summary counts, and for every
    cluster prints the hold-out R2 / MAE / RMSE and the top feature importances
    returned by ``eval_rf_segment``.

    Raises
    ------
    pandas.errors.MergeError
        If a participant is mapped to more than one cluster row after
        de-duplication (the join would otherwise duplicate rating rows).
    """
    df = pd.read_csv(RATINGS_FILE)
    part_seg = pd.read_csv(PARTICIPANT_SEG_FILE)

    # Keep only participant_id + cluster label for merge
    part_seg = part_seg[["participant_id", "taste_cluster"]].drop_duplicates()

    # validate="many_to_one" makes pandas check that participant_id is unique
    # on the cluster side, so conflicting labels fail loudly instead of
    # silently multiplying rating rows.
    merged = df.merge(
        part_seg, on="participant_id", how="left", validate="many_to_one"
    )

    feature_cols = [
        "num_words",
        "avg_word_length",
        "line_length_mean",
        "line_length_variance",
        "syllables_total",
        "syllables_per_line_mean",
        "syllables_per_line_variance",
        "word_entropy",
        "char_entropy",
        "order_score",
        "surprise_score",
    ]

    print("Rows after merge:", len(merged))
    print("Participants:", merged["participant_id"].nunique())
    print("\nRows by taste_cluster:")
    print(merged["taste_cluster"].value_counts().sort_index())

    # value_counts() and the loop below both ignore missing labels, so report
    # them explicitly rather than letting those rows disappear unnoticed.
    unassigned = int(merged["taste_cluster"].isna().sum())
    if unassigned:
        print(f"\nWarning: {unassigned} rows have no taste_cluster and are excluded from modeling.")

    for cluster_id in sorted(merged["taste_cluster"].dropna().unique()):
        seg_df = merged[merged["taste_cluster"] == cluster_id].copy()

        print("\n" + "=" * 70)
        print(f"TASTE CLUSTER {int(cluster_id)}")
        print("=" * 70)
        print(f"Rows: {len(seg_df)}")
        print(f"Participants in cluster: {seg_df['participant_id'].nunique()}")

        r2, mae, rmse, imp_df = eval_rf_segment(seg_df, feature_cols)

        # eval_rf_segment returns all-None when the segment fails its data guard.
        if r2 is None:
            print("Not enough data or target variation for stable modeling.")
            continue

        print(f"R2   : {r2:.4f}")
        print(f"MAE  : {mae:.4f}")
        print(f"RMSE : {rmse:.4f}")

        print("\nTop feature importances:")
        print(imp_df.head(8).to_string(index=False))

    print("\nDone!")


# Entry point: run the analysis when this code is executed directly
# (the call is skipped when the code is imported as a module).
if __name__ == "__main__":
    main()

Rows after merge: 10710
Participants: 51

Rows by taste_cluster:
taste_cluster
0    2940
1    4830
2    1050
3    1890
Name: count, dtype: int64

TASTE CLUSTER 0
Rows: 2940
Participants in cluster: 14
R2   : 0.4560
MAE  : 1.2025
RMSE : 1.5372

Top feature importances:
                feature  importance
        avg_word_length    0.345808
           char_entropy    0.159553
         surprise_score    0.129230
            order_score    0.123245
              num_words    0.045425
       line_length_mean    0.043242
        syllables_total    0.036323
syllables_per_line_mean    0.036290

TASTE CLUSTER 1
Rows: 4830
Participants in cluster: 23
R2   : 0.2331
MAE  : 0.8942
RMSE : 1.1318

Top feature importances:
                    feature  importance
            avg_word_length    0.360463
             surprise_score    0.151110
               char_entropy    0.146585
                order_score    0.106970
            syllables_total    0.044545
    syllables_per_line_mean    0.043388
syl

In [2]:
## cluster 0
# The most style-selective readers were also the most mathematically predictable:
# they strongly prefer H/S over C,
# they reward surprise,
# and they appear to use a relatively stable internal rule for beauty.

## cluster 1
#This group is more open and balanced, so their ratings are less driven by a single style rule.
#The largest audience segment was moderately predictable: they reward structure and surprise, but with broader tolerance.

## cluster 2 
# Generalist readers were the hardest to model
# because they rated most poems positively, reducing the variance needed for predictive learning.
# This segment rates almost everything high,
# has low variance,
# and does not discriminate much between styles, so there is less signal to learn.

## cluster 3
#Tough critics may rely on deeper semantic or emotional criteria beyond surface structure and entropy