In [1]:
import os
import re
import math
from collections import Counter

import pandas as pd

# ---------- Paths ----------
# Project root; both CSVs live in its "data_processed" sub-folder.
BASE_DIR = r"C:\PG, IELTS, DOCS\research paper\poetry project"
_PROCESSED_DIR = os.path.join(BASE_DIR, "data_processed")

# CSV read by main().
IN_FILE = os.path.join(_PROCESSED_DIR, "poetry_features_step2_syllables.csv")
# CSV written by main() (input columns plus the three new feature columns).
OUT_FILE = os.path.join(_PROCESSED_DIR, "poetry_features_step3_entropy.csv")


# ---------- Helpers ----------
def tokenize(text: str):
    """Split ``text`` into lowercase word tokens.

    A token is a maximal run of ASCII letters and apostrophes, so
    contractions such as "don't" stay in one piece. Digits, punctuation and
    non-ASCII characters act as separators.

    Args:
        text: The text to tokenize. Non-string values are converted with
            ``str()``; missing values (``None`` or a float NaN) are treated
            as empty text.

    Returns:
        A list of lowercase tokens in order of appearance (possibly empty).
    """
    # Missing values would otherwise be stringified into the bogus words
    # "none" / "nan" (NaN is what pandas yields for an empty CSV cell).
    if text is None or (isinstance(text, float) and math.isnan(text)):
        return []

    candidates = re.findall(r"[A-Za-z']+", str(text).lower())
    # A run consisting only of apostrophes is a stray quote mark, not a word.
    return [token for token in candidates if token.strip("'")]


def split_lines(text: str):
    """Return the non-blank lines of ``text`` with surrounding whitespace removed.

    Args:
        text: The text to split on ``"\\n"``. Non-string values are converted
            with ``str()``; missing values (``None`` or a float NaN) are
            treated as empty text.

    Returns:
        A list of stripped, non-empty lines in their original order.
    """
    # Without this guard a missing value would be stringified and returned
    # as the single fake line "None" / "nan".
    if text is None or (isinstance(text, float) and math.isnan(text)):
        return []

    stripped = (raw.strip() for raw in str(text).split("\n"))
    return [line for line in stripped if line]


def shannon_entropy(items):
    """Compute the Shannon entropy, in bits, of the empirical distribution of ``items``.

    Args:
        items: Any iterable of hashable values (list, string, generator, ...).
            ``None`` and empty iterables are treated as "no observations".

    Returns:
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of each
        distinct item, as a non-negative float. Returns ``0.0`` when there
        are no items or only one distinct item.
    """
    counts = Counter(items)
    # Derive the total from the counts instead of len(items) so that
    # generators and other non-sized iterables are supported too.
    total = sum(counts.values())
    if total == 0:
        return 0.0

    probs = [count / total for count in counts.values()]
    entropy = -sum(p * math.log2(p) for p in probs)
    # With a single distinct item the sum is 0.0 and its negation is IEEE
    # negative zero, which would be written out as "-0.0"; normalise it.
    return entropy if entropy > 0 else 0.0


def word_entropy(text: str) -> float:
    """Return the Shannon entropy of the word tokens found in ``text``.

    The text is tokenized with ``tokenize`` and the resulting token list is
    scored with ``shannon_entropy``.
    """
    tokens = tokenize(text)
    return shannon_entropy(tokens)


def char_entropy(text: str) -> float:
    """Return the Shannon entropy of the letters in ``text``.

    Only alphabetic characters (``str.isalpha``) are counted, lowercased so
    that case does not matter; spaces, digits and punctuation are ignored.

    Args:
        text: The text to score. Non-string values are converted with
            ``str()``; missing values (``None`` or a float NaN) are treated
            as empty text.

    Returns:
        The entropy computed by ``shannon_entropy`` over the letters, or
        ``0.0`` for missing text.
    """
    # A missing value must not be scored as the letters of "None" / "nan".
    if text is None or (isinstance(text, float) and math.isnan(text)):
        return 0.0

    letters = [ch.lower() for ch in str(text) if ch.isalpha()]
    return shannon_entropy(letters)


def line_ending_novelty(text: str) -> float:
    """Return the share of distinct line-ending words in ``text``.

    Each non-blank line (see ``split_lines``) is tokenized with ``tokenize``
    and its last token is taken as the line's end word; lines that yield no
    tokens are skipped.

    Returns:
        Number of distinct end words divided by the number of end words
        (1.0 = every line ends differently), or ``0.0`` when no line
        contributes an end word.
    """
    tokens_per_line = (tokenize(line) for line in split_lines(text))
    finals = [tokens[-1] for tokens in tokens_per_line if tokens]

    if len(finals) == 0:
        return 0.0

    distinct = set(finals)
    return len(distinct) / len(finals)


# ---------- Main ----------
def main():
    """Add the entropy features to the dataset and save the result.

    Reads ``IN_FILE``, computes ``word_entropy``, ``char_entropy`` and
    ``line_ending_novelty`` from the ``text`` column, writes the augmented
    table to ``OUT_FILE`` (without the index) and prints a short preview.

    Raises:
        KeyError: If the ``text`` column, or one of the preview columns
            (``participant_id``, ``PoemType``, ``aesthetic_appeal``), is
            missing from the input file.
    """
    df = pd.read_csv(IN_FILE)

    # Empty CSV cells are loaded as float NaN, which str() would turn into
    # the literal text "nan". Score such rows as empty text instead; the
    # original "text" column itself is left untouched.
    texts = df["text"].map(lambda value: "" if pd.isna(value) else value)

    df["word_entropy"] = texts.apply(word_entropy)
    df["char_entropy"] = texts.apply(char_entropy)
    df["line_ending_novelty"] = texts.apply(line_ending_novelty)

    df.to_csv(OUT_FILE, index=False)

    print("Done!")
    print("Saved feature dataset to:", OUT_FILE)
    print("Rows:", len(df))
    print("\nPreview:")
    print(df[[
        "participant_id", "PoemType", "aesthetic_appeal",
        "word_entropy", "char_entropy", "line_ending_novelty"
    ]].head())


# Run the feature pipeline only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Done!
Saved feature dataset to: C:\PG, IELTS, DOCS\research paper\poetry project\data_processed\poetry_features_step3_entropy.csv
Rows: 10710

Preview:
  participant_id PoemType  aesthetic_appeal  word_entropy  char_entropy  \
0           P101        C                 2      2.807355      3.604030   
1           P101        C                 3      2.807355      3.710582   
2           P101        S                 5      3.121928      3.708146   
3           P101        H                 5      3.000000      3.726558   
4           P101        H                 5      3.459432      3.760892   

   line_ending_novelty  
0                  1.0  
1                  1.0  
2                  1.0  
3                  1.0  
4                  1.0  


In [None]:
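# word_entropy → Shannon entropy (in bits) of the poem's word distribution: how unpredictable the wording is
# char_entropy → Shannon entropy (in bits) of the poem's letter distribution: character-level complexity
# line_ending_novelty → proportion of distinct end words across the 3 lines (1.0 = every line ends with a different word)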