In [8]:
from src.data_loader import load_full_data, load_filtered_data
from src.features import get_features_and_target
from src.models import train_linear, train_ridge, train_lasso
from src.utils import split_data
from src.evaluation import evaluate_model, print_coefficients, cross_validate_model, bootstrap_prediction_intervals, kfold_bootstrap_ci_analysis
import pandas as pd

## Linear Regression (authentic cadence, notes only)

In [3]:
# --- Load truncated stems of authentic cadence ---
filepath = 'data/truncated_stems.csv'
df = load_filtered_data(filepath, prefix="HC")
X, y = get_features_and_target(df, feature_steps=range(1,7), target_col='step7')

# --- Split data ---
x_train, y_train, x_val, y_val, x_test, y_test = split_data(X, y)

# --- Train model ---
model = train_linear(x_train, y_train)

# --- Evaluate model ---
evaluate_model(model, x_train, y_train, x_val, y_val, x_test, y_test)

# --- Print coefficients ---
print_coefficients(model)

# --- 5-fold cross-validation ---
cross_validate_model(X, y, k=5)

ci_df = kfold_bootstrap_ci_analysis(
    model_type="linear",
    X=X,
    y=y,
    k=5,
    n_bootstraps=500,
    alpha=0.05,
    plot=False
)

***** Evaluation Results *****
Training set mean squared error: 7.3774
Training set r-squared score: 0.1518
Validation set mean squared error: 12.1147
Validation set r-squared score: -2.8636
Testing set mean squared error: 6.6147
Testing set r-squared score: 0.0637

Learned coefficients (weights for each step):
Step 1: 0.0204
Step 2: -0.0118
Step 3: 0.0379
Step 4: -0.3298
Step 5: 0.0316
Step 6: 0.1828

***** 5-Fold Cross-Validation (Linear) Results *****
Fold 1: MSE = 20.7250, R² = -1.3075
Fold 2: MSE = 27.5917, R² = -0.7690
Fold 3: MSE = 12.5057, R² = -0.6966
Fold 4: MSE = 16.2414, R² = -0.7736
Fold 5: MSE = 13.8226, R² = -3.7817

***** Average over 5 folds *****
Average MSE: 18.1773
Average R²: -1.4657

Overall coverage across all folds: 48.89% within 95% CI


## Linear Regression (authentic cadence, notes and key)

In [15]:
list(range(1,7) +  9)

TypeError: unsupported operand type(s) for +: 'range' and 'int'

In [22]:
# --- Load truncated stems of authentic cadence ---
filepath = 'data/truncated_stems.csv'
df = load_filtered_data(filepath, prefix="HC")
# --- Add key feature ---
KEY_TO_MIDI = {
    'C': 60, 'C_sharp': 61, 'D_flat': 61, 'D': 62, 'E_flat': 63, 'E': 64,
    'F': 65, 'F_sharp': 66, 'G': 67, 'A_flat': 68, 'A': 69, 'B_flat': 70, 'B': 71
}
df['key'] = df['key'].map(KEY_TO_MIDI)

# --- Extract features and target ---
X_array, y = get_features_and_target(df, feature_steps=range(1, 7), target_col='step7')
X = pd.DataFrame(X_array, columns=[f'step{i}' for i in range(1, 7)])
X['key_midi'] = df['key']
X['V_midi'] = df['V_chord_root']

print(X.head())
# --- Split data ---
x_train, y_train, x_val, y_val, x_test, y_test = split_data(X, y)

# --- Train model ---
model = train_linear(x_train, y_train)

# --- Evaluate model ---
evaluate_model(model, x_train, y_train, x_val, y_val, x_test, y_test)

# --- Print coefficients ---
print_coefficients(model)

# --- 5-fold cross-validation ---
cross_validate_model(X, y, k=5)

ci_df = kfold_bootstrap_ci_analysis(
    model_type="linear",
    X=X,
    y=y,
    k=5,
    n_bootstraps=500,
    alpha=0.05,
    plot=False
)

   step1  step2  step3  step4  step5  step6  key_midi  V_midi
0     62     69     66     64     57     61      62.0    69.0
1     67     65     64     62     60     59      60.0    67.0
2     64     59     68     69     66     63       NaN     NaN
3     70     63     68     67     65     62       NaN     NaN
4     61     68     63     68     60     68      64.0    71.0


KeyError: "None of [Index([29, 36, 39, 32, 40,  5, 26, 17, 14, 28, 34, 16, 13, 38, 35,  1, 15, 44,\n       11, 41, 23,  4, 18,  0, 24,  7, 30, 31, 43,  9, 10,  6, 27, 19,  3,  2],\n      dtype='int64')] are in the [columns]"

## Linear Regression (all cadences, notes only)

In [3]:
# --- Load truncated stems of all cadences ---
filepath = 'data/truncated_stems.csv'
df = load_full_data(filepath)
X, y = get_features_and_target(df, feature_steps=range(1,7), target_col='step7')

# --- Split data ---
x_train, y_train, x_val, y_val, x_test, y_test = split_data(X, y)

# --- Train model ---
model = train_linear(x_train, y_train)

# --- Evaluate model ---
evaluate_model(model, x_train, y_train, x_val, y_val, x_test, y_test)

# --- Print coefficients ---
print_coefficients(model)

# --- 5-fold cross-validation ---
cross_validate_model(X, y, k=5)

ci_df = kfold_bootstrap_ci_analysis(
    model_type="linear",
    X=X,
    y=y,
    k=5,
    n_bootstraps=500,
    alpha=0.05,
    plot=False
)

***** Evaluation Results *****
Training set mean squared error: 5.6634
Training set r-squared score: 0.2112
Validation set mean squared error: 10.2998
Validation set r-squared score: -0.2790
Testing set mean squared error: 8.6078
Testing set r-squared score: 0.2082

Learned coefficients (weights for each step):
Step 1: 0.0133
Step 2: -0.0184
Step 3: -0.1234
Step 4: -0.2153
Step 5: 0.0817
Step 6: 0.2319

***** 5-Fold Cross-Validation (Linear) Results *****
Fold 1: MSE = 15.5600, R² = -0.3990
Fold 2: MSE = 12.7756, R² = -0.7904
Fold 3: MSE = 12.8228, R² = -1.2000
Fold 4: MSE = 17.9676, R² = -2.3728
Fold 5: MSE = 8.2513, R² = -0.0406

***** Average over 5 folds *****
Average MSE: 13.4755
Average R²: -0.9605

Overall coverage across all folds: 40.00% within 95% CI


## Ridge Regression (authentic cadence, notes only)

In [4]:
# --- Load truncated stems of authentic cadence ---
filepath = 'data/truncated_stems.csv'
df = load_filtered_data(filepath)
X, y = get_features_and_target(df, feature_steps=range(1,7), target_col='step7')

# --- Split data ---
x_train, y_train, x_val, y_val, x_test, y_test = split_data(X, y)

# --- Train model ---
model = train_ridge(x_train, y_train, alpha=1.0)

# --- Evaluate model ---
evaluate_model(model, x_train, y_train, x_val, y_val, x_test, y_test)

# --- Print coefficients ---
print_coefficients(model)

# --- 5-fold cross-validation ---
cross_validate_model(X, y, k=5, model_type="ridge", alpha=1.0)

ci_df = kfold_bootstrap_ci_analysis(
    model_type="ridge",
    X=X,
    y=y,
    k=5,
    n_bootstraps=500,
    alpha=0.05,
    plot=False
)

***** Evaluation Results *****
Training set mean squared error: 7.6793
Training set r-squared score: 0.2272
Validation set mean squared error: 6.2976
Validation set r-squared score: -0.1631
Testing set mean squared error: 8.7092
Testing set r-squared score: -0.5411

Learned coefficients (weights for each step):
Step 1: 0.0072
Step 2: -0.0285
Step 3: -0.0547
Step 4: -0.3580
Step 5: 0.0635
Step 6: 0.2785

***** 5-Fold Cross-Validation (Ridge) Results *****
Fold 1: MSE = 20.7225, R² = -1.3072
Fold 2: MSE = 27.6016, R² = -0.7696
Fold 3: MSE = 12.5043, R² = -0.6965
Fold 4: MSE = 16.2126, R² = -0.7704
Fold 5: MSE = 13.8129, R² = -3.7783

***** Average over 5 folds *****
Average MSE: 18.1708
Average R²: -1.4644

Overall coverage across all folds: 48.89% within 95% CI


## Ridge Regression (all cadences, notes only)

In [5]:
# --- Load truncated stems of all cadences ---
filepath = 'data/truncated_stems.csv'
df = load_full_data(filepath)
X, y = get_features_and_target(df, feature_steps=range(1,7), target_col='step7')

# --- Split data ---
x_train, y_train, x_val, y_val, x_test, y_test = split_data(X, y)

# --- Train model ---
model = train_ridge(x_train, y_train, alpha=1.0)

# --- Evaluate model ---
evaluate_model(model, x_train, y_train, x_val, y_val, x_test, y_test)

# --- Print coefficients ---
print_coefficients(model)

# --- 5-fold cross-validation ---
cross_validate_model(X, y, k=5, model_type="ridge", alpha=1.0)

ci_df = kfold_bootstrap_ci_analysis(
    model_type="ridge",
    X=X,
    y=y,
    k=5,
    n_bootstraps=500,
    alpha=0.05,
    plot=False
)

***** Evaluation Results *****
Training set mean squared error: 6.0329
Training set r-squared score: 0.2457
Validation set mean squared error: 11.1367
Validation set r-squared score: -0.2184
Testing set mean squared error: 6.1125
Testing set r-squared score: -1.8013

Learned coefficients (weights for each step):
Step 1: -0.0146
Step 2: -0.0244
Step 3: -0.1317
Step 4: -0.1984
Step 5: 0.0409
Step 6: 0.3217

***** 5-Fold Cross-Validation (Ridge) Results *****
Fold 1: MSE = 15.5606, R² = -0.3990
Fold 2: MSE = 12.7722, R² = -0.7899
Fold 3: MSE = 12.8167, R² = -1.1989
Fold 4: MSE = 17.9668, R² = -2.3726
Fold 5: MSE = 8.2545, R² = -0.0410

***** Average over 5 folds *****
Average MSE: 13.4741
Average R²: -0.9603

Overall coverage across all folds: 37.78% within 95% CI


## Lasso Regression (authentic cadence, notes only)

In [6]:
# --- Load truncated stems of all cadences ---
filepath = 'data/truncated_stems.csv'
df = load_filtered_data(filepath)
X, y = get_features_and_target(df, feature_steps=range(1,7), target_col='step7')

# --- Split data ---
x_train, y_train, x_val, y_val, x_test, y_test = split_data(X, y)

# --- Train model ---
model = train_lasso(x_train, y_train, alpha=1.0)

# --- Evaluate model ---
evaluate_model(model, x_train, y_train, x_val, y_val, x_test, y_test)

# --- Print coefficients ---
print_coefficients(model)

# --- 5-fold cross-validation ---
cross_validate_model(X, y, k=5, model_type="lasso", alpha=1.0)

ci_df = kfold_bootstrap_ci_analysis(
    model_type="lasso",
    X=X,
    y=y,
    k=5,
    n_bootstraps=500,
    alpha=0.05,
    plot=False
)

***** Evaluation Results *****
Training set mean squared error: 16.8826
Training set r-squared score: -0.6518
Validation set mean squared error: 8.8756
Validation set r-squared score: -4.2763
Testing set mean squared error: 14.6054
Testing set r-squared score: -0.4969

Learned coefficients (weights for each step):
Step 1: 0.0000
Step 2: 0.0000
Step 3: 0.0317
Step 4: -0.0000
Step 5: 0.4309
Step 6: 0.5472

***** 5-Fold Cross-Validation (Lasso) Results *****
Fold 1: MSE = 19.6460, R² = -1.1874
Fold 2: MSE = 27.7218, R² = -0.7773
Fold 3: MSE = 12.8603, R² = -0.7447
Fold 4: MSE = 13.5627, R² = -0.4810
Fold 5: MSE = 13.3910, R² = -3.6324

***** Average over 5 folds *****
Average MSE: 17.4364
Average R²: -1.3646

Overall coverage across all folds: 40.00% within 95% CI


## Lasso Regression (all cadences, notes only)

In [7]:
# --- Load truncated stems of all cadences ---
filepath = 'data/truncated_stems.csv'
df = load_full_data(filepath)
X, y = get_features_and_target(df, feature_steps=range(1,7), target_col='step7')

# --- Split data ---
x_train, y_train, x_val, y_val, x_test, y_test = split_data(X, y)

# --- Train model ---
model = train_lasso(x_train, y_train, alpha=1.0)

# --- Evaluate model ---
evaluate_model(model, x_train, y_train, x_val, y_val, x_test, y_test)

# --- Print coefficients ---
print_coefficients(model)

# --- 5-fold cross-validation ---
cross_validate_model(X, y, k=5, model_type="lasso", alpha=1.0)

ci_df = kfold_bootstrap_ci_analysis(
    model_type="lasso",
    X=X,
    y=y,
    k=5,
    n_bootstraps=500,
    alpha=0.05,
    plot=False
)

***** Evaluation Results *****
Training set mean squared error: 11.1471
Training set r-squared score: -0.2922
Validation set mean squared error: 12.1455
Validation set r-squared score: -2.2575
Testing set mean squared error: 14.3249
Testing set r-squared score: -2.1546

Learned coefficients (weights for each step):
Step 1: 0.0310
Step 2: -0.0000
Step 3: 0.0046
Step 4: 0.0267
Step 5: 0.3700
Step 6: 0.5756

***** 5-Fold Cross-Validation (Lasso) Results *****
Fold 1: MSE = 15.2465, R² = -0.3708
Fold 2: MSE = 12.5669, R² = -0.7612
Fold 3: MSE = 11.0464, R² = -0.8952
Fold 4: MSE = 17.0015, R² = -2.1914
Fold 5: MSE = 8.0766, R² = -0.0186

***** Average over 5 folds *****
Average MSE: 12.7876
Average R²: -0.8474

Overall coverage across all folds: 31.11% within 95% CI
