In [1]:
import os
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# ---------- Paths ----------
# Project root. Defaults to the original local folder; set the
# POETRY_PROJECT_DIR environment variable to point at another checkout
# instead of editing this cell.
BASE_DIR = os.environ.get(
    "POETRY_PROJECT_DIR",
    r"C:\PG, IELTS, DOCS\research paper\poetry project",
)
# CSV that main() loads with pd.read_csv.
IN_FILE = os.path.join(BASE_DIR, "data_processed", "poetry_features_with_scores_v1.csv")
# CSV that main() writes the clustered poem table to.
OUT_FILE = os.path.join(BASE_DIR, "data_processed", "poem_style_clusters_step1.csv")


# Columns that identify one poem in the participant-level ratings table.
_POEM_KEYS = ["PoemKey", "PoemType", "text"]

# Poem-level features carried over unchanged (first value per poem), in output order.
_POEM_FEATURES = [
    "num_words",
    "avg_word_length",
    "line_length_mean",
    "line_length_variance",
    "syllables_total",
    "syllables_per_line_mean",
    "syllables_per_line_variance",
    "word_entropy",
    "char_entropy",
    "order_score",
    "surprise_score",
]

# Subset of the poem-level features that is fed to KMeans.
_CLUSTER_FEATURES = [
    "num_words",
    "avg_word_length",
    "line_length_variance",
    "syllables_per_line_variance",
    "word_entropy",
    "order_score",
    "surprise_score",
]

# Columns summarised per cluster in the printed profile table.
_PROFILE_COLS = ["aesthetic_appeal", "order_score", "surprise_score", "word_entropy", "num_words"]


def _aggregate_poems(ratings_df):
    """Collapse the ratings table to one row per poem.

    ``aesthetic_appeal`` is averaged over the rows of each poem; every other
    feature keeps the first value found in the group.

    Raises:
        KeyError: if any required column is absent from ``ratings_df``.
    """
    required = _POEM_KEYS + ["aesthetic_appeal"] + _POEM_FEATURES
    missing = [col for col in required if col not in ratings_df.columns]
    if missing:
        # Same exception type pandas would raise, but naming every missing column at once.
        raise KeyError(f"Input data is missing required column(s): {missing}")

    agg_spec = {"aesthetic_appeal": "mean"}
    agg_spec.update({col: "first" for col in _POEM_FEATURES})

    # NOTE: with pandas' default dropna=True, rows whose key columns contain
    # NaN are excluded from the grouped result.
    return ratings_df.groupby(_POEM_KEYS, as_index=False).agg(agg_spec)


def _assign_style_clusters(poem_df, n_clusters, random_state):
    """Standardise the clustering features and return one KMeans label per poem.

    Raises:
        ValueError: if a clustering feature contains NaN, or if there are fewer
            poems than requested clusters.
    """
    features = poem_df[_CLUSTER_FEATURES]

    # Fail early with a readable message instead of deep inside scikit-learn.
    nan_cols = features.columns[features.isna().any()].tolist()
    if nan_cols:
        raise ValueError(f"Clustering features contain missing values: {nan_cols}")
    if len(features) < n_clusters:
        raise ValueError(
            f"Need at least {n_clusters} poems to form {n_clusters} clusters, got {len(features)}"
        )

    # Put features on a common scale so no single one dominates the distance.
    scaled = StandardScaler().fit_transform(features)
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state, n_init=10)
    return kmeans.fit_predict(scaled)


def _print_report(poem_df, out_file):
    """Print the save location, cluster sizes, cluster means and PoemType mix."""
    print("Done!")
    print("Saved clustered poems to:", out_file)
    print("Unique poems:", len(poem_df))

    print("\n=== Cluster sizes ===")
    print(poem_df["style_cluster"].value_counts().sort_index())

    print("\n=== Cluster profile (means) ===")
    print(poem_df.groupby("style_cluster")[_PROFILE_COLS].mean().round(3))

    print("\n=== PoemType mix by cluster ===")
    print(pd.crosstab(poem_df["style_cluster"], poem_df["PoemType"]))


def main(in_file=None, out_file=None, n_clusters=4, random_state=42):
    """Cluster poems by style features and save the labelled poem table.

    Reads the ratings CSV, reduces it to one row per poem, assigns a
    ``style_cluster`` label with KMeans on standardised features, writes the
    result to CSV and prints a short summary.

    Args:
        in_file: path of the input CSV; defaults to the module-level ``IN_FILE``.
        out_file: path of the output CSV; defaults to the module-level ``OUT_FILE``.
        n_clusters: number of KMeans clusters (4 = simple starting point).
        random_state: seed passed to KMeans for reproducible labels.

    Raises:
        KeyError: if the input CSV lacks a required column.
        ValueError: if clustering features contain NaN or there are too few poems.
    """
    # Resolve defaults at call time so the module constants stay the single source of truth.
    in_file = IN_FILE if in_file is None else in_file
    out_file = OUT_FILE if out_file is None else out_file

    df = pd.read_csv(in_file)

    # One row per poem (ratings repeat across participants)
    poem_df = _aggregate_poems(df)
    poem_df["style_cluster"] = _assign_style_clusters(poem_df, n_clusters, random_state)

    # Save (create the target folder first so to_csv cannot fail on a missing directory)
    out_dir = os.path.dirname(out_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    poem_df.to_csv(out_file, index=False)

    _print_report(poem_df, out_file)


# Run the pipeline only when this code is executed directly; importing it as a
# module defines main() without triggering any file I/O.
if __name__ == "__main__":
    main()

Done!
Saved clustered poems to: C:\PG, IELTS, DOCS\research paper\poetry project\data_processed\poem_style_clusters_step1.csv
Unique poems: 210

=== Cluster sizes ===
style_cluster
0    53
1    51
2    32
3    74
Name: count, dtype: int64

=== Cluster profile (means) ===
               aesthetic_appeal  order_score  surprise_score  word_entropy  \
style_cluster                                                                
0                         3.373       -0.142          -0.646         2.578   
1                         4.581        0.345           0.336         3.306   
2                         4.502       -1.072           1.332         3.090   
3                         3.941        0.328          -0.345         2.867   

               num_words  
style_cluster             
0                  6.038  
1                 10.275  
2                  9.000  
3                  7.500  

=== PoemType mix by cluster ===
PoemType        C   H   S
style_cluster            
0           

