In [1]:
import os
import re
import numpy as np
import pandas as pd

# ---------- Paths ----------
# Absolute project root (Windows-style path).
BASE_DIR = r"C:\PG, IELTS, DOCS\research paper\poetry project"
# The merged input and the feature output both live in this subfolder.
_PROCESSED_DIR = os.path.join(BASE_DIR, "data_processed")
IN_FILE = os.path.join(_PROCESSED_DIR, "poetry_dataset_merged.csv")
OUT_FILE = os.path.join(_PROCESSED_DIR, "poetry_features_step1_full.csv")


# ---------- Helpers ----------
def tokenize(text: str) -> list:
    """Split *text* into lowercase word tokens.

    A token is a maximal run of ASCII letters and apostrophes, so a
    contraction such as "don't" stays whole while digits, punctuation and
    whitespace act as separators.

    Missing values (``None``, ``NaN``, ``pd.NA``) yield an empty list. Without
    this guard ``str()`` would turn them into the literal text
    "none"/"nan"/"<na>", which would then be counted as a real word.

    Args:
        text: Raw text. Non-string scalars are stringified before matching.

    Returns:
        The tokens in order of appearance (possibly empty).
    """
    # Only scalars are probed: pd.isna() on a list/array returns an array,
    # whose truth value is ambiguous.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return []
    return re.findall(r"[A-Za-z']+", str(text).lower())


def split_lines(text: str) -> list:
    """Return the non-blank lines of *text*, each stripped of outer whitespace.

    The text is split on "\\n"; lines that are empty after stripping are
    dropped. (The original note states that real line breaks are preserved
    in the merged dataset, which is why a plain newline split is used.)

    Missing values (``None``, ``NaN``, ``pd.NA``) yield an empty list. Without
    this guard ``str()`` would turn them into a single bogus line such as
    "nan".

    Args:
        text: Raw text. Non-string scalars are stringified before splitting.

    Returns:
        The stripped, non-empty lines in their original order.
    """
    # Only scalars are probed: pd.isna() on a list/array returns an array,
    # whose truth value is ambiguous.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return []
    # Strip once per line, then keep whatever is left non-empty.
    stripped = (raw.strip() for raw in str(text).split("\n"))
    return [line for line in stripped if line]


def num_words(text: str) -> int:
    """Return how many word tokens ``tokenize`` finds in *text*."""
    tokens = tokenize(text)
    return len(tokens)


def avg_word_length(text: str) -> float:
    """Return the mean number of characters per word token in *text*.

    Tokens come from ``tokenize``; text that produces no tokens yields
    ``0.0`` instead of dividing by zero.
    """
    tokens = tokenize(text)
    token_count = len(tokens)
    if token_count == 0:
        return 0.0
    total_chars = sum(map(len, tokens))
    return total_chars / token_count


def num_lines(text: str) -> int:
    """Return how many lines ``split_lines`` extracts from *text*."""
    kept_lines = split_lines(text)
    return len(kept_lines)


def line_length_mean(text: str) -> float:
    """Return the average number of word tokens per line of *text*.

    Lines come from ``split_lines`` and are measured with ``tokenize``.
    Text with no lines yields ``0.0``.
    """
    words_per_line = [len(tokenize(row)) for row in split_lines(text)]
    return float(np.mean(words_per_line)) if words_per_line else 0.0


def line_length_variance(text: str) -> float:
    """Return the variance of the per-line word-token counts of *text*.

    Lines come from ``split_lines`` and are measured with ``tokenize``.
    The variance is computed by ``np.var`` with its default arguments.
    Text with no lines yields ``0.0``.
    """
    words_per_line = [len(tokenize(row)) for row in split_lines(text)]
    return float(np.var(words_per_line)) if words_per_line else 0.0


# ---------- Main ----------
def main():
    """Read the merged poetry CSV, append text features, and save the result.

    Loads ``IN_FILE``, derives five feature columns from the ``text`` column,
    writes the table to ``OUT_FILE`` without the index, then prints a short
    summary and a preview of the first rows.
    """
    frame = pd.read_csv(IN_FILE)

    # (column name, extractor) pairs; applied in this order so the new
    # columns are appended in the same sequence.
    feature_extractors = (
        ("num_words", num_words),
        ("avg_word_length", avg_word_length),
        ("num_lines", num_lines),
        ("line_length_mean", line_length_mean),
        ("line_length_variance", line_length_variance),
    )
    for column, extractor in feature_extractors:
        frame[column] = frame["text"].apply(extractor)

    # Persist the enriched table.
    frame.to_csv(OUT_FILE, index=False)

    # Identifying columns followed by the freshly computed features.
    preview_columns = [
        "participant_id", "PoemType", "aesthetic_appeal",
        "num_words", "avg_word_length", "num_lines",
        "line_length_mean", "line_length_variance",
    ]
    print("Done!")
    print("Saved feature dataset to:", OUT_FILE)
    print("Rows:", len(frame))
    print("\nPreview:")
    print(frame[preview_columns].head())


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

Done!
Saved feature dataset to: C:\PG, IELTS, DOCS\research paper\poetry project\data_processed\poetry_features_step1_full.csv
Rows: 10710

Preview:
  participant_id PoemType  aesthetic_appeal  num_words  avg_word_length  \
0           P101        C                 2          7         4.571429   
1           P101        C                 3          7         5.857143   
2           P101        S                 5         10         4.100000   
3           P101        H                 5          8         5.125000   
4           P101        H                 5         11         4.363636   

   num_lines  line_length_mean  line_length_variance  
0          3          2.333333              0.222222  
1          3          2.333333              0.222222  
2          3          3.333333              0.222222  
3          3          2.666667              2.888889  
4          3          3.666667              0.888889  
