In [1]:
import os
import math
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# ---------- Paths ----------
# Project root folder. The default is the original author's local directory;
# set the POETRY_PROJECT_DIR environment variable to point somewhere else so
# the notebook can run on another machine without editing this cell.
# An unset or empty variable falls back to the default.
BASE_DIR = os.environ.get("POETRY_PROJECT_DIR") or r"C:\PG, IELTS, DOCS\research paper\poetry project"
# CSV with the engineered features and score columns that main() loads.
IN_FILE = os.path.join(BASE_DIR, "data_processed", "poetry_features_with_scores_v1.csv")


def eval_model(name, model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split, score it on the test split, print a report.

    Parameters
    ----------
    name : str
        Label used in the printed report header.
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    X_train, y_train
        Training features and targets passed to ``model.fit``.
    X_test, y_test
        Held-out features passed to ``model.predict`` and the true targets the
        predictions are scored against.

    Returns
    -------
    tuple
        ``(model, preds)``: the same model object after fitting, and its
        predictions for ``X_test``.
    """
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    # All three metrics are computed before anything is printed.
    score_r2 = r2_score(y_test, preds)
    score_mae = mean_absolute_error(y_test, preds)
    # RMSE is the square root of the mean squared error.
    score_rmse = math.sqrt(mean_squared_error(y_test, preds))

    # One print call, one report line per argument.
    print(
        f"\n=== {name} ===",
        f"R2   : {score_r2:.4f}",
        f"MAE  : {score_mae:.4f}",
        f"RMSE : {score_rmse:.4f}",
        sep="\n",
    )

    return model, preds


def main():
    """Run the v1 baseline experiment: predict ``aesthetic_appeal`` from text features.

    Steps, in order:
      1. Load ``IN_FILE``, keep the 11 feature columns plus the target, and drop
         every row that has a missing value in any of those columns.
      2. Make a random 80/20 train/test split (``random_state=42``).
      3. Fit and score a ``LinearRegression`` through ``eval_model`` and print its
         coefficients, largest first.
      4. Fit and score a ``RandomForestRegressor`` through ``eval_model`` and
         print its feature importances, largest first.

    Everything is reported with ``print``; nothing is returned.
    """
    target_col = "aesthetic_appeal"

    # Features (v1)
    feature_cols = [
        "num_words",
        "avg_word_length",
        "line_length_mean",
        "line_length_variance",
        "syllables_total",
        "syllables_per_line_mean",
        "syllables_per_line_variance",
        "word_entropy",
        "char_entropy",
        "order_score",
        "surprise_score",
    ]

    raw_df = pd.read_csv(IN_FILE)

    # Restrict to the modelling columns first so that missing values in
    # unrelated columns do not cause rows to be dropped.
    model_df = raw_df[feature_cols + [target_col]].dropna().copy()
    X, y = model_df[feature_cols], model_df[target_col]

    print("Rows used:", len(model_df))
    print("Features used:", len(feature_cols))
    print("Target:", target_col)

    # Random split baseline; both models are scored on the same split.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    split = (X_train, X_test, y_train, y_test)

    def _ranked_table(value_name, values):
        # Pair each feature with its value and render the rows, largest value first.
        table = pd.DataFrame({"feature": feature_cols, value_name: values})
        return table.sort_values(value_name, ascending=False).to_string(index=False)

    # 1) Linear regression baseline
    lr_model, _ = eval_model("Linear Regression", LinearRegression(), *split)

    # Coefficients (interpretability). The features are not standardised in
    # this function, so magnitudes are on each feature's raw scale.
    print("\nLinear Regression coefficients (direction + strength):")
    print(_ranked_table("coefficient", lr_model.coef_))

    # 2) Random Forest baseline
    forest = RandomForestRegressor(
        n_estimators=300,
        random_state=42,
        n_jobs=-1,
        min_samples_leaf=2,
    )
    rf_model, _ = eval_model("Random Forest Regressor", forest, *split)

    # Feature importances (non-linear importance)
    print("\nRandom Forest feature importances:")
    print(_ranked_table("importance", rf_model.feature_importances_))

    print("\nDone!")


# Entry point: run the full baseline when this code is executed directly
# (it is skipped if the code is imported as a module).
if __name__ == "__main__":
    main()

Rows used: 10710
Features used: 11
Target: aesthetic_appeal

=== Linear Regression ===
R2   : 0.0930
MAE  : 1.2792
RMSE : 1.5743

Linear Regression coefficients (direction + strength):
                    feature   coefficient
    syllables_per_line_mean  2.337047e+13
                  num_words  5.162115e+12
               word_entropy  1.311782e+12
               char_entropy -1.883071e-01
             surprise_score -1.265036e+12
syllables_per_line_variance -4.104203e+12
            avg_word_length -4.998130e+12
       line_length_variance -5.770799e+12
            syllables_total -7.790156e+12
           line_length_mean -1.548635e+13
                order_score -1.551984e+13

=== Random Forest Regressor ===
R2   : 0.2169
MAE  : 1.1802
RMSE : 1.4628

Random Forest feature importances:
                    feature  importance
            avg_word_length    0.340697
               char_entropy    0.163255
             surprise_score    0.154367
                order_score    0.119703


In [2]:
# The linear baseline shows signs of multicollinearity, caused by engineered composite features and overlapping structural metrics,
# so I rely mainly on the Random Forest for feature importance and treat the linear coefficients only as a diagnostic baseline.

In [3]:
# Giant coefficients (like 2.3e+13) are a symptom of multicollinearity:
# - num_words and line_length_mean are tightly linked (all poems have 3 lines)
# - syllables_total and syllables_per_line_mean are linked in the same way (with 3 lines per poem, the per-line mean is just the total divided by 3)
# - order_score and surprise_score are built from other features
# As a result, Linear Regression struggles to assign clean, stable coefficients to these features.