In [1]:
pip install pronouncing

Collecting pronouncing
  Downloading pronouncing-0.2.0.tar.gz (17 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting cmudict>=0.4.0 (from pronouncing)
  Obtaining dependency information for cmudict>=0.4.0 from https://files.pythonhosted.org/packages/fe/1c/580ae2d5b3eef081305b4b8bd89f7e7945c4d6351c10069ecd2590da1310/cmudict-1.1.3-py3-none-any.whl.metadata
  Downloading cmudict-1.1.3-py3-none-any.whl.metadata (3.7 kB)
Downloading cmudict-1.1.3-py3-none-any.whl (939 kB)
   ---------------------------------------- 0.0/939.7 kB ? eta -:--:--
   ---------------------- ---------------- 532.5/939.7 kB 16.8 MB/s eta 0:00:01
   --------------------------------------- 939.7/939.7 kB 14.8 MB/s eta 0:00:00
Building wheels for collected packages: pronouncing
  Building wheel for pronouncing (setup.py): started
  Building wheel for pronouncing (setup.py): finished with status 'done'
  Created wheel for pronouncing: filename=pronouncing

In [2]:
import os
import re
import numpy as np
import pandas as pd
import pronouncing

# ---------- Paths ----------
# The project root can be overridden with the POETRY_PROJECT_DIR environment
# variable so the notebook runs on other machines; when the variable is not
# set, the original local path is used.
BASE_DIR = os.environ.get(
    "POETRY_PROJECT_DIR",
    r"C:\PG, IELTS, DOCS\research paper\poetry project",
)
# Input: feature table from the previous step. Output: same table plus the
# syllable features added by this notebook.
IN_FILE = os.path.join(BASE_DIR, "data_processed", "poetry_features_step1_full.csv")
OUT_FILE = os.path.join(BASE_DIR, "data_processed", "poetry_features_step2_syllables.csv")


# ---------- Helpers ----------
def tokenize(text: str):
    """Split ``text`` into lowercase word tokens.

    A token is a run of ASCII letters and straight apostrophes, so
    contractions such as "don't" stay in one piece.

    Args:
        text: Text to tokenize; non-string values are coerced with ``str()``.

    Returns:
        A list of lowercase tokens in order of appearance. Tokens that consist
        only of apostrophes are dropped because they are not words.
    """
    lowered = str(text).lower()
    # A typographic apostrophe between two letters is a contraction mark
    # ("don\u2019t"); map it to "'" so the word is not split into "don" + "t".
    # Curly quotes at word edges are left alone so quoted words keep
    # tokenizing to the bare word.
    lowered = re.sub(r"(?<=[a-z])[\u2019\u02bc](?=[a-z])", "'", lowered)
    return [tok for tok in re.findall(r"[A-Za-z']+", lowered) if tok.strip("'")]


def split_lines(text: str):
    """Split ``text`` into its non-blank lines, each stripped of whitespace.

    Args:
        text: Poem text. ``None`` and float NaN (how pandas represents an
            empty CSV cell) are treated as "no text". Any other non-string
            value is coerced with ``str()``.

    Returns:
        A list of stripped, non-empty lines; empty when there is no text.
    """
    # NaN is the only float that is not equal to itself. Without this guard a
    # missing value would become the literal text "nan" / "None" and be
    # counted as a one-word line.
    if text is None or (isinstance(text, float) and text != text):
        return []
    # Accept \n, \r\n and bare \r line endings.
    pieces = re.split(r"\r\n|\r|\n", str(text))
    stripped = (piece.strip() for piece in pieces)
    return [line for line in stripped if line]


def syllable_count_word(word: str) -> int:
    """Return the number of syllables in ``word``.

    The word is looked up with ``pronouncing.phones_for_word`` and the first
    listed pronunciation is counted with ``pronouncing.syllable_count``. If
    the word as given is not found, the lookup is retried without surrounding
    apostrophes (quoted words, plural possessives). Words still not found
    fall back to counting groups of consecutive vowel letters.

    Args:
        word: A single token, typically produced by ``tokenize``.

    Returns:
        The syllable count. ``0`` for a token with nothing left once
        apostrophes are stripped; at least ``1`` for any other word.
    """
    core = word.strip("'")
    if not core:
        # Empty or apostrophe-only token: not a word, so no syllables.
        return 0

    # Try the token as written first (the dictionary may list forms with a
    # leading apostrophe), then the form without surrounding apostrophes.
    candidates = (word,) if core == word else (word, core)
    for candidate in candidates:
        phones = pronouncing.phones_for_word(candidate)
        if phones:
            return pronouncing.syllable_count(phones[0])

    # Fallback heuristic for words not found in CMU dict
    vowels = re.findall(r"[aeiouy]+", word.lower())
    return max(1, len(vowels))


def syllables_in_line(line: str) -> int:
    """Add up the syllable counts of every token found in ``line``."""
    total = 0
    for token in tokenize(line):
        total += syllable_count_word(token)
    return total


def line_syllable_counts(text: str):
    """Return one syllable count per non-blank line of ``text``, in order."""
    return list(map(syllables_in_line, split_lines(text)))


def syllables_total(text: str) -> int:
    """Return the total number of syllables across all lines of ``text``."""
    per_line = line_syllable_counts(text)
    grand_total = sum(per_line)
    return int(grand_total)


def syllables_per_line_mean(text: str) -> float:
    """Return the mean syllables per line of ``text`` (0.0 if it has no lines)."""
    counts = line_syllable_counts(text)
    if not counts:
        return 0.0
    return float(np.mean(counts))


def syllables_per_line_variance(text: str) -> float:
    """Return the variance of syllables per line of ``text``.

    Uses ``np.var`` with its default arguments. Returns 0.0 when ``text`` has
    no lines.
    """
    counts = line_syllable_counts(text)
    if not counts:
        return 0.0
    return float(np.var(counts))


# ---------- Main ----------
def main():
    """Add syllable features to the step-1 feature table and save the result.

    Reads ``IN_FILE``, derives three columns from the ``text`` column
    (``syllables_total``, ``syllables_per_line_mean``,
    ``syllables_per_line_variance``), writes the table to ``OUT_FILE`` without
    the index, and prints a short summary with a preview.

    Raises:
        KeyError: If the input table has no ``text`` column.
    """
    df = pd.read_csv(IN_FILE)

    # Split and syllable-count each poem once, then derive all three features
    # from the same per-line counts instead of redoing the work per feature.
    per_line = df["text"].apply(line_syllable_counts)

    # Add syllable features
    df["syllables_total"] = per_line.apply(lambda counts: int(sum(counts)))
    df["syllables_per_line_mean"] = per_line.apply(
        lambda counts: float(np.mean(counts)) if counts else 0.0
    )
    df["syllables_per_line_variance"] = per_line.apply(
        lambda counts: float(np.var(counts)) if counts else 0.0
    )

    df.to_csv(OUT_FILE, index=False)

    print("Done!")
    print("Saved feature dataset to:", OUT_FILE)
    print("Rows:", len(df))
    print("\nPreview:")
    preview_cols = [
        "participant_id", "PoemType", "aesthetic_appeal",
        "num_words", "line_length_mean",
        "syllables_total", "syllables_per_line_mean", "syllables_per_line_variance"
    ]
    # The file is already saved at this point, so a missing optional column
    # should not make the preview fail; show whichever columns are present.
    available = [col for col in preview_cols if col in df.columns]
    print(df[available].head())


# Run the pipeline only when this code is executed directly (as a script or
# as a notebook cell), not when it is imported from another module.
if __name__ == "__main__":
    main()

Done!
Saved feature dataset to: C:\PG, IELTS, DOCS\research paper\poetry project\data_processed\poetry_features_step2_syllables.csv
Rows: 10710

Preview:
  participant_id PoemType  aesthetic_appeal  num_words  line_length_mean  \
0           P101        C                 2          7          2.333333   
1           P101        C                 3          7          2.333333   
2           P101        S                 5         10          3.333333   
3           P101        H                 5          8          2.666667   
4           P101        H                 5         11          3.666667   

   syllables_total  syllables_per_line_mean  syllables_per_line_variance  
0               12                 4.000000                     0.666667  
1               12                 4.000000                     2.000000  
2               14                 4.666667                     0.222222  
3               14                 4.666667                     2.888889  
4             