In [None]:
# Load the autoreload extension and set it to mode 2, which re-imports
# edited modules before each cell runs.
%load_ext autoreload
%autoreload 2
# Interactive (ipympl) figures are currently disabled; uncomment to enable them.
# %matplotlib widget
# Keep the automatic post-mortem debugger switched off.
%pdb off

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import plotly
from IPython.display import display, HTML

# Enable offline plotly rendering inside the notebook.
plotly.offline.init_notebook_mode()

# Inject the MathJax loader script into the notebook output.
mathjax_loader = HTML(
    '<script type="text/javascript" async src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js?config=TeX-MML-AM_SVG"></script>'
)
display(mathjax_loader)

# Figure defaults: 140 dpi and a 6.4 x 4.8 base size shrunk by im_scaling.
im_scaling = .75
plt.rcParams.update({
    'figure.dpi': 140,
    'figure.figsize': [6.4 * im_scaling, 4.8 * im_scaling],
})

# Root directory used to build data paths in later cells; echoed for reference.
home_dir = "./"
display(home_dir)

# Start from a clean slate with no open figures.
plt.close('all')

# Load Data

In [None]:
# Directory with the multi-run outputs; plotFolder points at the same location.
multiRun_dir = f"{home_dir}/CHARLES/multiRuns/"
plotFolder = multiRun_dir

# The first two CSV columns become the row MultiIndex.
df = pd.read_csv(f"{multiRun_dir}/intEmulation.csv", index_col=[0,1])

# Target is the 'p-noInt-C_d' column; features are all columns whose name
# contains neither "C_d" nor "Type" (this also drops roomType / openingType).
ydf = df['p-noInt-C_d']
x_cols = [col for col in df.columns if "C_d" not in col and "Type" not in col]
xdf = df[x_cols]

# Split features and target per (roomType, openingType) combination.
# Keys are inserted in order of first appearance of each category, and
# combinations with no rows are left out.
x_data = {}
y_data = {}
room_types = df["roomType"].unique()
opening_types = df["openingType"].unique()
for roomType in room_types:
    is_room = df["roomType"] == roomType
    for openingType in opening_types:
        rows = is_room & (df["openingType"] == openingType)
        if not rows.any():
            continue
        x_data[(roomType, openingType)] = xdf.loc[rows]
        y_data[(roomType, openingType)] = ydf.loc[rows]

# Modeling

Rob's suggestions:
- Gaussian process (sklearn)
- linear regression
- bin into room types
- window flag
- process window and room types separately at first

## Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

# Per-group ordinary least squares. sklearn provides the coefficients and the
# training-set R²; statsmodels OLS on the same data provides the p-values.
lr_results = {}

for group_key, x in x_data.items():
    roomType, openingType = group_key
    y = y_data[group_key]

    # Nothing to fit for an empty group.
    if x.empty:
        print(f"No data for {roomType} - {openingType}")
        continue

    # sklearn fit: one coefficient per feature column plus an intercept.
    model = LinearRegression().fit(x, y)
    coeffs = {feat: coef for feat, coef in zip(x.columns, model.coef_)}
    intercept = model.intercept_
    r2 = model.score(x, y)

    # statsmodels needs an explicit constant column to estimate the intercept.
    X_sm = sm.add_constant(x)
    results_sm = sm.OLS(y, X_sm).fit()
    pvalues = results_sm.pvalues.to_dict()

    lr_results[group_key] = dict(
        coefficients=coeffs,
        intercept=intercept,
        r2=r2,
        pvalues=pvalues,
    )

    # Report: R², coefficients, intercept, then p-values ('*' marks p < 0.05).
    report = [f"{roomType} - {openingType}: R²={r2:.3f}"]
    report.extend(f"  {feat}: {coef:.3f}" for feat, coef in coeffs.items())
    report.append(f"  intercept: {intercept:.3f}")
    report.append("  p-values:")
    for var, pval in pvalues.items():
        sig = '*' if pval < 0.05 else ''
        report.append(f"    {var}: {pval:.3f}{sig}")
    print("\n".join(report))
    print()


In [None]:
# Summary of significant p-values across all (roomType, openingType) groups
from collections import Counter, defaultdict

# For every regressor, count the groups in which its OLS p-value falls below
# 0.05 and remember which groups those were.
sig_counts = Counter()
sig_keys = defaultdict(list)
for (roomType, openingType), res in lr_results.items():
    for var, pval in res['pvalues'].items():
        # 'const' is the intercept column added for statsmodels, not a feature.
        if var == 'const':
            continue
        if pval < 0.05:
            sig_counts[var] += 1
            sig_keys[var].append((roomType, openingType))

# Display results
print("Significant counts per variable (p < 0.05):")
for var, count in sig_counts.items():
    groups = sig_keys[var]
    print(f"{var}: {count} (groups: {groups})")

In [None]:
# Create DataFrame of variables ranked by p-value for each group
import pandas as pd

# For each group, list the regressors from smallest to largest p-value,
# leaving out the intercept term 'const' when present.
ranked = {
    key: (
        pd.Series(res['pvalues'])
        .drop('const', errors='ignore')
        .sort_values()
        .index
        .tolist()
    )
    for key, res in lr_results.items()
}

# One column per group; row i holds the variable with the (i+1)-th smallest p-value.
ranked_df = pd.DataFrame(ranked)
display(ranked_df)

In [None]:
# Ridge regression for each (roomType, openingType) group
from sklearn.linear_model import Ridge

# L2 penalty weight shared by every group; larger values shrink coefficients harder.
alpha = 10.0
ridge_results = {}

# NOTE(review): features are passed in unscaled, so the penalty weighs columns
# of different magnitude unevenly — confirm this is intended.
for group_key, X in x_data.items():
    roomType, openingType = group_key
    y = y_data[group_key]
    if X.empty:
        continue

    # Fit, then record per-feature coefficients, intercept and training-set R².
    ridge = Ridge(alpha=alpha).fit(X, y)
    coeffs = {feat: coef for feat, coef in zip(X.columns, ridge.coef_)}
    intercept = ridge.intercept_
    r2 = ridge.score(X, y)
    ridge_results[group_key] = dict(coefficients=coeffs, intercept=intercept, r2=r2)

    # Report for this group, followed by a blank separator line.
    lines = [f"Ridge α={alpha} | {roomType}-{openingType}: R²={r2:.3f}"]
    lines.extend(f"  {feat}: {coef:.3f}" for feat, coef in coeffs.items())
    lines.append(f"  intercept: {intercept:.3f}\n")
    print("\n".join(lines))

In [None]:
# Lasso regression for each (roomType, openingType) group
from sklearn.linear_model import Lasso

# L1 penalty weight shared by every group; larger values zero out more coefficients.
alpha_lasso = 0.1
lasso_results = {}

# NOTE(review): features are passed in unscaled, so the penalty weighs columns
# of different magnitude unevenly — confirm this is intended.
for group_key, X in x_data.items():
    roomType, openingType = group_key
    y = y_data[group_key]
    if X.empty:
        continue

    # Fit (with a raised iteration cap), then record coefficients, intercept
    # and training-set R².
    lasso = Lasso(alpha=alpha_lasso, max_iter=10000).fit(X, y)
    coeffs = {feat: coef for feat, coef in zip(X.columns, lasso.coef_)}
    intercept = lasso.intercept_
    r2 = lasso.score(X, y)
    lasso_results[group_key] = dict(coefficients=coeffs, intercept=intercept, r2=r2)

    # Report for this group, followed by a blank separator line.
    lines = [f"Lasso α={alpha_lasso} | {roomType}-{openingType}: R²={r2:.3f}"]
    lines.extend(f"  {feat}: {coef:.3f}" for feat, coef in coeffs.items())
    lines.append(f"  intercept: {intercept:.3f}\n")
    print("\n".join(lines))

In [None]:
# Summary of Lasso coefficient sparsity across groups
from collections import Counter, defaultdict

# For every feature, count the groups in which Lasso kept a non-zero
# coefficient and remember which groups those were.
nonzero_counts = Counter()
nonzero_keys = defaultdict(list)
for (roomType, openingType), res in lasso_results.items():
    for var, coef in res['coefficients'].items():
        if coef != 0:
            nonzero_counts[var] += 1
            nonzero_keys[var].append((roomType, openingType))

# Display results. The header must be an f-string so the alpha value is
# interpolated instead of printing the literal "{alpha_lasso}".
print(f"Non-zero coefficient counts per variable (Lasso α={alpha_lasso}):")
for var, count in nonzero_counts.items():
    groups = nonzero_keys[var]
    print(f"{var}: {count} (groups: {groups})")

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Work with the first (roomType, openingType) group in insertion order.
first_key = next(iter(x_data))
X, y = x_data[first_key], y_data[first_key]

# Two-stage split of the raw (unscaled) data:
#   stage 1 holds out 20% as the test set;
#   stage 2 keeps 87.5% of the remainder for training and the rest for dev
#   (0.875 * 0.8 = 0.7, i.e. roughly 70/10/20 train/dev/test).
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)
X_train, X_dev, y_train, y_dev = train_test_split(
    X_temp, y_temp, train_size=0.875, random_state=0
)

# Report the size of each split.
for label, part in (("Train:", X_train), ("Dev:  ", X_dev), ("Test: ", X_test)):
    print(f"{label} {part.shape[0]} samples")

In [None]:
from sklearn.preprocessing import (
    FunctionTransformer,
    StandardScaler
)
from sklearn.compose import ColumnTransformer

# 1. Partition the feature columns by name into three DISJOINT groups.
#    Trigonometric encodings are identified first so that a column such as
#    "orientation_sin" is not also treated as a raw angle, which would
#    transform it twice and duplicate its name in the output frame.
trig_cols = [c for c in X_train.columns
             if 'sin' in c.lower()
             or 'cos' in c.lower()]
ori_cols = [c for c in X_train.columns
            if ('orientation' in c.lower() or c == 'AofA')
            and c not in trig_cols]
# (To standard-scale every column instead, set both lists to [].)
# ori_cols = []
# trig_cols = []
# everything else gets standard scaling
other_cols = [c for c in X_train.columns
              if c not in ori_cols + trig_cols]

# 2. Build a ColumnTransformer
ct = ColumnTransformer([
    # Angle columns: shift by 180 and convert degrees → radians.
    # Assumes the angles are stored in degrees within [0, 360], which then
    # maps onto [-π, π] — TODO confirm against the data.
    ('A_scale',
     FunctionTransformer(lambda x: (x - 180) * (np.pi / 180.0), validate=False),
     ori_cols),

    # leave any sin/cos features untouched
    ('passthrough_trig', 'passthrough', trig_cols),

    # standard-scale the rest
    ('scale', StandardScaler(), other_cols),
], remainder='drop')  # drop any unexpected columns

# 3. Fit on TRAIN only, then apply the fitted transform to all splits
X_train_s = ct.fit_transform(X_train)
X_dev_s   = ct.transform(X_dev)
X_test_s  = ct.transform(X_test)

# Fit the target scaler on TRAIN only; StandardScaler expects a 2-D column,
# so reshape going in and flatten coming out.
scaler_y = StandardScaler().fit(y_train.values.reshape(-1,1))

y_train_s = scaler_y.transform(y_train.values.reshape(-1,1)).ravel()
y_dev_s   = scaler_y.transform(y_dev.values.reshape(-1,1)).ravel()
y_test_s  = scaler_y.transform(y_test.values.reshape(-1,1)).ravel()

# Wrap the transformed arrays back into DataFrames. The column order follows
# the order of the transformers listed above.
out_cols = ori_cols + trig_cols + other_cols
X_train_s = pd.DataFrame(X_train_s, columns=out_cols, index=X_train.index)
X_dev_s   = pd.DataFrame(X_dev_s,   columns=out_cols, index=X_dev.index)
X_test_s  = pd.DataFrame(X_test_s,  columns=out_cols, index=X_test.index)

In [None]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C, WhiteKernel
from sklearn.metrics import r2_score

# Gaussian-process regression on the scaled train/dev splits
# (X_train_s, y_train_s, X_dev_s, y_dev_s) prepared in the previous cells.

# Kernel = signal variance × RBF (one shared length-scale) + white observation noise.
signal_variance = C(1.0, (1e-3, 1e3))
smoothness = RBF(1.0, (1e-2, 1e2))
observation_noise = WhiteKernel(noise_level=1e-2, noise_level_bounds=(1e-5, 1e1))
kernel = signal_variance * smoothness + observation_noise

# Alternative kept for reference: one length-scale per feature.
# n_features = X_train_s.shape[1]
# kernel = (
#     C(1.0, (1e-3, 1e3))
#     * RBF(length_scale=[1.0]*n_features,  # ℓ₁, ℓ₂, …, ℓ_D
#           length_scale_bounds=(1e-3, 1e3))
#     + WhiteKernel(1e-2, (1e-5, 1e1))
# )

# Noise is modelled by the WhiteKernel, so no extra alpha is passed; the
# target was already standardised by hand, hence normalize_y=False.
gpr = GaussianProcessRegressor(
    kernel=kernel,
    normalize_y=False,
    n_restarts_optimizer=10,
    random_state=0,
)
gpr.fit(X_train_s, y_train_s)

# Dev-set score (in scaled target units) to check for over/underfitting.
y_dev_pred, y_dev_std = gpr.predict(X_dev_s, return_std=True)
r2_dev = r2_score(y_dev_s, y_dev_pred)
print(f"Dev R² = {r2_dev:.3f}")
print("Learned kernel:", gpr.kernel_)

# Final test-set evaluation; uncomment once the dev score is acceptable.
# NOTE(review): uses the scaled splits to match what the model was fit on.
# y_test_pred, y_test_std = gpr.predict(X_test_s, return_std=True)
# r2_test = r2_score(y_test_s, y_test_pred)
# print(f"Test R² = {r2_test:.3f}")

In [None]:
from sklearn.gaussian_process.kernels import (
    RBF,
    ConstantKernel as C,
    WhiteKernel,
    ExpSineSquared
)
from sklearn.gaussian_process import GaussianProcessRegressor

# Expected period of the cyclic inputs. The angle columns of X_train_s were
# converted to radians by the ColumnTransformer step, so one full revolution
# corresponds to 2π.
T_estimate = 2 * np.pi

kernel = (
    C(1.0, (1e-3, 1e3))                 # overall signal variance
    * (
        RBF(1.0, (1e-2, 1e2))           # smooth, non-periodic trends
        + ExpSineSquared(
            length_scale=1.0,
            periodicity=T_estimate,
            length_scale_bounds=(1e-1, 1e2),
            # let the optimiser move the period between half and double the estimate
            periodicity_bounds=(0.5 * T_estimate, 2 * T_estimate)
          )
      )
    + WhiteKernel(1e-2, (1e-5, 1e1))     # observation noise
)

gpr = GaussianProcessRegressor(
    kernel=kernel,
    normalize_y=True,       # let the GP center & scale the target itself
    n_restarts_optimizer=10,
    random_state=0
)

# Fit on the SCALED features: the 2π period and the length-scale bounds above
# only make sense for the radian-converted / standardised inputs, not for the
# raw X_train (angles in degrees). The raw target is used on purpose because
# normalize_y=True handles its scaling.
gpr.fit(X_train_s, y_train)

print("Optimized kernel:", gpr.kernel_)
print("Dev  R²:", gpr.score(X_dev_s, y_dev))