In [1]:
import os
import pandas as pd

# ---------- Paths ----------
# Project root; the input and output CSVs both live in its "data_processed"
# sub-directory.
BASE_DIR = r"C:\PG, IELTS, DOCS\research paper\poetry project"
_PROCESSED_DIR = os.path.join(BASE_DIR, "data_processed")
# Feature table read by main().
IN_FILE = os.path.join(_PROCESSED_DIR, "poetry_features_step3_entropy.csv")
# Destination for the feature table with the added z-score and score columns.
OUT_FILE = os.path.join(_PROCESSED_DIR, "poetry_features_with_scores_v1.csv")


def zscore(series: pd.Series) -> pd.Series:
    """Standardize *series* to zero mean and unit sample standard deviation.

    Args:
        series: Numeric values to standardize.

    Returns:
        A Series on the same index as *series*. When the standard deviation
        is zero or undefined (constant, single-element, empty or all-NaN
        input) every entry is ``0.0`` instead of dividing by zero/NaN.
    """
    std = series.std()
    if pd.isna(std) or std == 0:
        # No spread to scale by. Use a float scalar (not a list of ints) so
        # the fallback has the same float dtype and name as the normal path,
        # and so empty input yields a well-typed empty Series.
        return pd.Series(0.0, index=series.index, name=series.name)
    return (series - series.mean()) / std


def main():
    """Score the poetry feature table and write the result to OUT_FILE.

    Reads IN_FILE, adds four z-scored feature columns plus the derived
    ``order_score`` and ``surprise_score`` columns, saves the table as CSV
    and prints a short preview and summary.
    """
    df = pd.read_csv(IN_FILE)

    # Standardized column name -> raw feature column it is derived from.
    standardized = {
        "z_line_var": "line_length_variance",
        "z_syll_var": "syllables_per_line_variance",
        "z_word_entropy": "word_entropy",
        "z_avg_word_len": "avg_word_length",
    }
    for z_name, raw_name in standardized.items():
        df[z_name] = zscore(df[raw_name])

    line_var = df["z_line_var"]
    syll_var = df["z_syll_var"]

    # ---- V1 scores ----
    # Order = structural regularity + lexical simplicity: the mean of the
    # negated line-variance, syllable-variance and word-length z-scores.
    df["order_score"] = (-line_var + -syll_var + -df["z_avg_word_len"]) / 3

    # Surprise = lexical entropy + structural variation: the mean of the
    # word-entropy, line-variance and syllable-variance z-scores.
    df["surprise_score"] = (df["z_word_entropy"] + line_var + syll_var) / 3

    df.to_csv(OUT_FILE, index=False)

    print("Done!")
    print("Saved scored dataset to:", OUT_FILE)
    print("Rows:", len(df))

    preview_columns = [
        "participant_id",
        "PoemType",
        "aesthetic_appeal",
        "order_score",
        "surprise_score",
    ]
    print("\nPreview:")
    print(df[preview_columns].head())

    score_summary = df[["order_score", "surprise_score"]].describe()
    print("\nScore summary:")
    print(score_summary.round(3))


# Run the scoring pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Done!
Saved scored dataset to: C:\PG, IELTS, DOCS\research paper\poetry project\data_processed\poetry_features_with_scores_v1.csv
Rows: 10710

Preview:
  participant_id PoemType  aesthetic_appeal  order_score  surprise_score
0           P101        C                 2     0.501598       -0.460442
1           P101        C                 3    -0.296353       -0.076552
2           P101        S                 5     0.781384       -0.262208
3           P101        H                 5    -1.396046        1.458690
4           P101        H                 5     0.426593        0.357655

Score summary:
       order_score  surprise_score
count    10710.000       10710.000
mean        -0.000          -0.000
std          0.624           0.770
min         -3.551          -1.426
25%         -0.285          -0.525
50%          0.124          -0.205
75%          0.416           0.361
max          1.011           4.276
