In [None]:
%load_ext autoreload
%autoreload 2
# %matplotlib widget
%pdb off

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import plotly
from IPython.display import display, HTML

# --- Notebook-wide display configuration ---------------------------------

# Switch plotly to offline/notebook rendering.
plotly.offline.init_notebook_mode()

# Load MathJax (SVG output) into the page via a <script> tag.
# NOTE(review): presumably needed for LaTeX in plotly figures — confirm.
mathjax_script = '<script type="text/javascript" async src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js?config=TeX-MML-AM_SVG"></script>'
display(HTML(mathjax_script))

# Matplotlib defaults: 140 dpi, and the stock 6.4 x 4.8 inch figure shrunk
# by `im_scaling`.
im_scaling = .75
plt.rcParams.update({
    'figure.dpi': 140,
    'figure.figsize': [6.4 * im_scaling, 4.8 * im_scaling],
})

# Root directory that the data paths in later cells are built from.
home_dir = "./"
display(home_dir)

# Start from a clean slate: close any figures left open in this kernel.
plt.close('all')

# Load Data

In [None]:
# Locations of the multi-run results (plots are written next to the data).
multiRun_dir = f"{home_dir}/CHARLES/multiRuns/"
plotFolder = f"{multiRun_dir}"

# Read the emulation table (first two CSV columns form the index) and keep
# only the rows whose `slAll` flag is False.
df = pd.read_csv(f"{multiRun_dir}/intEmulation.csv", index_col=[0,1])
df = df[df["slAll"] == False]

# Target: the 'p-noInt-C_d' column.
# Features: every column whose name contains neither "C_d" nor "Type".
ydf = df['p-noInt-C_d']
x_cols = [col for col in df.columns if "C_d" not in col and "Type" not in col]
xdf = df[x_cols]

# Optional log transform (disabled):
# ydf = np.log(ydf + .0001)
# xdf = np.log(xdf + .0001)

# Partition features/targets by (roomType, openingType); combinations that
# have no rows are skipped. Keys are inserted in order of first appearance
# of each roomType, then each openingType.
x_data, y_data = {}, {}
opening_types = df["openingType"].unique()
for roomType in df["roomType"].unique():
    in_room = df["roomType"] == roomType
    for openingType in opening_types:
        mask = in_room & (df["openingType"] == openingType)
        if not mask.any():
            continue
        x_data[(roomType, openingType)] = xdf[mask]
        y_data[(roomType, openingType)] = ydf[mask]

# Data Preprocessing

Split each group into train/dev/test sets and apply normalization

- [ ] Add a min-max scaler to normalize features to [-1, 1]
- [ ] Check that the train/dev/test splits have roughly the same distributions (plot the input data)

In [None]:
from sklearn.preprocessing import (
    FunctionTransformer,
    StandardScaler
)
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split


def _deg_to_centered_rad(angles):
    """Shift angles given in degrees by -180 and convert them to radians.

    For inputs in [0, 360] the result lies in [-pi, pi].
    """
    return (angles - 180) * (np.pi / 180.0)


def _centered_rad_to_deg(angles):
    """Inverse of `_deg_to_centered_rad`: centred radians back to degrees."""
    return angles * (180.0 / np.pi) + 180


# Dictionaries to store our preprocessed data, keyed by (roomType, openingType)
x_train_data = {}
x_dev_data = {}
x_test_data = {}
y_train_data = {}
y_dev_data = {}
y_test_data = {}

# Fitted transformers per group, kept so predictions can be mapped back later
column_transformers = {}
y_scalers = {}

# Process each group independently
for group, X in x_data.items():
    roomType, openingType = group
    y = y_data[group]

    print(f"Processing {roomType} - {openingType} group...")

    # 1. Split into train/dev/test = 70/10/20 %:
    #    hold out 20 % for test, then 87.5 % of the remaining 80 % is train.
    X_temp, X_test, y_temp, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0
    )
    X_train, X_dev, y_train, y_dev = train_test_split(
        X_temp, y_temp, train_size=0.875, random_state=0
    )

    # 2. Identify column groups by name.
    #    Trig columns are resolved first and excluded from the angle group, so
    #    a column matching both rules (e.g. a sin/cos of an orientation) is
    #    passed through once instead of being emitted twice and having the
    #    degree -> radian map applied to it.
    #    NOTE(review): substring matching on 'sin'/'cos' would also catch
    #    unrelated names containing those letters — confirm against the schema.
    trig_cols = [c for c in X_train.columns
                 if 'sin' in c.lower()
                 or 'cos' in c.lower()]
    ori_cols = [c for c in X_train.columns
                if ('orientation' in c.lower() or c == 'AofA')
                and c not in trig_cols]
    other_cols = [c for c in X_train.columns
                  if c not in ori_cols + trig_cols]

    # 3. Build a ColumnTransformer
    ct = ColumnTransformer([
        # degrees -> radians centred on 180 deg ([0, 360] -> [-pi, pi]);
        # named functions (not a lambda) so the transformer is picklable and
        # exposes inverse_transform
        ('A_scale',
         FunctionTransformer(
             func=_deg_to_centered_rad,
             inverse_func=_centered_rad_to_deg,
             validate=False,
             check_inverse=False,
         ),
         ori_cols),

        # leave any sin/cos features untouched
        ('passthrough_trig', 'passthrough', trig_cols),

        # standard-scale the rest
        ('scale', StandardScaler(), other_cols),
    ], remainder='drop')  # the three groups cover every column, so nothing is dropped

    # 4. Fit on TRAIN only, then transform all splits (no leakage from dev/test)
    X_train_s = ct.fit_transform(X_train)
    X_dev_s = ct.transform(X_dev)
    X_test_s = ct.transform(X_test)

    # 5. Fit the y-scaler on TRAIN only and apply it to every split.
    #    StandardScaler needs 2-D input, hence reshape(-1, 1) / ravel().
    scaler_y = StandardScaler().fit(y_train.values.reshape(-1, 1))

    y_train_s = scaler_y.transform(y_train.values.reshape(-1, 1)).ravel()
    y_dev_s = scaler_y.transform(y_dev.values.reshape(-1, 1)).ravel()
    y_test_s = scaler_y.transform(y_test.values.reshape(-1, 1)).ravel()

    # 6. Wrap back into DataFrames. The ColumnTransformer emits columns in
    #    transformer order, i.e. angle columns, trig columns, then the rest.
    out_cols = ori_cols + trig_cols + other_cols
    X_train_s = pd.DataFrame(X_train_s, columns=out_cols, index=X_train.index)
    X_dev_s = pd.DataFrame(X_dev_s, columns=out_cols, index=X_dev.index)
    X_test_s = pd.DataFrame(X_test_s, columns=out_cols, index=X_test.index)

    # Store preprocessed data
    x_train_data[group] = X_train_s
    x_dev_data[group] = X_dev_s
    x_test_data[group] = X_test_s
    y_train_data[group] = y_train_s
    y_dev_data[group] = y_dev_s
    y_test_data[group] = y_test_s

    # Store transformers for later inverse transformations
    column_transformers[group] = ct
    y_scalers[group] = scaler_y

    print(f"  Train: {X_train_s.shape[0]} samples")
    print(f"  Dev:   {X_dev_s.shape[0]} samples")
    print(f"  Test:  {X_test_s.shape[0]} samples")
    print()

# Print summary
print(f"Processed {len(x_train_data)} groups")

# Modeling

Rob Suggestions
- Gaussian process (sklearn)
- linear regression
- bin into room types
- window flag
- process window and room types separately at first

## Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

# Per-group ordinary least squares: sklearn supplies coefficients and R²,
# statsmodels supplies the per-coefficient p-values.
lr_results = {}

for group, X_train in x_train_data.items():
    roomType, openingType = group
    y_train = y_train_data[group]
    X_dev, y_dev = x_dev_data[group], y_dev_data[group]

    # Multivariable linear fit and its goodness of fit on train / dev
    model = LinearRegression().fit(X_train, y_train)
    r2_train = model.score(X_train, y_train)
    r2_dev = model.score(X_dev, y_dev)
    coeffs = dict(zip(X_train.columns, model.coef_))
    intercept = model.intercept_

    # Same regression through statsmodels (with an explicit constant column)
    # to obtain p-values for each term
    X_sm = sm.add_constant(X_train)
    results_sm = sm.OLS(y_train, X_sm).fit()
    pvalues = results_sm.pvalues.to_dict()

    lr_results[group] = dict(
        coefficients=coeffs,
        intercept=intercept,
        r2_train=r2_train,
        r2_dev=r2_dev,
        pvalues=pvalues,
    )

    # Report: fit quality, coefficients, then p-values ('*' marks p < 0.05)
    print(f"{roomType} - {openingType}: R²(train)={r2_train:.3f}, R²(dev)={r2_dev:.3f}")
    for feat, coef in coeffs.items():
        print(f"  {feat}: {coef:.3f}")
    print(f"  intercept: {intercept:.3f}")
    print("  p-values:")
    for var, pval in pvalues.items():
        if pval < 0.05:
            sig = '*'
        else:
            sig = ''
        print(f"    {var}: {pval:.3f}{sig}")
    print()

In [None]:
# Summary of significant p-values across all (roomType, openingType) groups
from collections import Counter, defaultdict

# Threshold below which a coefficient's p-value counts as significant
SIGNIFICANCE_LEVEL = 0.05

sig_counts = Counter()          # variable -> number of groups where it is significant
sig_keys = defaultdict(list)    # variable -> the groups where it is significant
for group, res in lr_results.items():
    for var, pval in res['pvalues'].items():
        # Skip the intercept term added by statsmodels
        if var == 'const':
            continue
        # NaN p-values compare False here and are therefore not counted
        if pval < SIGNIFICANCE_LEVEL:
            sig_counts[var] += 1
            sig_keys[var].append(group)

# Display results, most frequently significant variables first
print(f"Significant counts per variable (p < {SIGNIFICANCE_LEVEL}):")
for var, count in sig_counts.most_common():
    groups = sig_keys[var]
    print(f"{var}: {count} (groups: {groups})")

In [None]:
# Table of variable names ordered by p-value (smallest first), one column per group
import pandas as pd

# For every group: take the p-values, drop the intercept if present,
# sort ascending and keep only the variable names.
ranked = {
    key: (
        pd.Series(res['pvalues'])
        .drop('const', errors='ignore')
        .sort_values()
        .index
        .tolist()
    )
    for key, res in lr_results.items()
}

# Rows = rank position, columns = groups
ranked_df = pd.DataFrame(ranked)
display(ranked_df)

## Regularized Regression

In [None]:
# Ridge (L2-regularized) regression, fitted separately per (roomType, openingType) group
from sklearn.linear_model import Ridge

# Regularization strength; larger values shrink the coefficients more
alpha = 10.0
ridge_results = {}

for group, X_train in x_train_data.items():
    roomType, openingType = group
    y_train = y_train_data[group]
    X_dev, y_dev = x_dev_data[group], y_dev_data[group]

    # Fit, then score on both the training and the dev split
    ridge = Ridge(alpha=alpha).fit(X_train, y_train)
    r2_train = ridge.score(X_train, y_train)
    r2_dev = ridge.score(X_dev, y_dev)

    # Coefficients keyed by feature name, plus the intercept
    coeffs = dict(zip(X_train.columns, ridge.coef_))
    intercept = ridge.intercept_

    ridge_results[group] = dict(
        coefficients=coeffs,
        intercept=intercept,
        r2_train=r2_train,
        r2_dev=r2_dev,
    )

    # Report
    print(f"Ridge α={alpha} | {roomType}-{openingType}: R²(train)={r2_train:.3f}, R²(dev)={r2_dev:.3f}")
    for feat, coef in ridge_results[group]['coefficients'].items():
        print(f"  {feat}: {coef:.3f}")
    print(f"  intercept: {intercept:.3f}\n")

In [None]:
# Lasso (L1-regularized) regression, fitted separately per (roomType, openingType) group
from sklearn.linear_model import Lasso

# L1 penalty weight; larger values drive more coefficients to exactly zero
alpha_lasso = 0.01
lasso_results = {}

for group in x_train_data:
    roomType, openingType = group
    X_train, y_train = x_train_data[group], y_train_data[group]
    X_dev, y_dev = x_dev_data[group], y_dev_data[group]

    # max_iter is raised to 10000 to give coordinate descent room to converge
    lasso = Lasso(alpha=alpha_lasso, max_iter=10000)
    lasso.fit(X_train, y_train)

    # Train / dev goodness of fit
    r2_train = lasso.score(X_train, y_train)
    r2_dev = lasso.score(X_dev, y_dev)

    # Per-feature coefficients and the intercept
    coeffs = {feat: coef for feat, coef in zip(X_train.columns, lasso.coef_)}
    intercept = lasso.intercept_

    lasso_results[group] = {
        'coefficients': coeffs,
        'intercept': intercept,
        'r2_train': r2_train,
        'r2_dev': r2_dev,
    }

    # Report
    print(f"Lasso α={alpha_lasso} | {roomType}-{openingType}: R²(train)={r2_train:.3f}, R²(dev)={r2_dev:.3f}")
    for feat in coeffs:
        coef = coeffs[feat]
        print(f"  {feat}: {coef:.3f}")
    print(f"  intercept: {intercept:.3f}\n")

In [None]:
# How often does each feature survive the Lasso penalty across groups?
from collections import Counter, defaultdict

nonzero_counts = Counter()          # feature -> number of groups with a non-zero coefficient
nonzero_keys = defaultdict(list)    # feature -> those groups
for group, res in lasso_results.items():
    roomType, openingType = group
    # Treat |coef| <= 1e-6 as zero to absorb floating-point noise
    surviving = [
        var for var, coef in res['coefficients'].items() if abs(coef) > 1e-6
    ]
    nonzero_counts.update(surviving)
    for var in surviving:
        nonzero_keys[var].append((roomType, openingType))

# Display results, most frequently retained features first
print(f"Non-zero coefficient counts per variable (Lasso α={alpha_lasso}):")
for var, count in nonzero_counts.most_common():
    groups = nonzero_keys[var]
    print(f"{var}: {count} (groups: {groups})")

## Gaussian Process Regression

- Normalized RMSE
- Log likelihood

In [None]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C, WhiteKernel, DotProduct
from sklearn.metrics import r2_score

# Per-group GP results: fitted model, learned kernel and evaluation metrics
gp_results = {}

for group in x_train_data.keys():
    roomType, openingType = group
    X_train = x_train_data[group]
    y_train = y_train_data[group]
    X_dev = x_dev_data[group]
    y_dev = y_dev_data[group]

    print(f"Training GP for {roomType}-{openingType}...")

    # Define kernel: signal variance × RBF + noise term
    kernel = (
        C(1.0, (1e-3, 1e3))   # signal variance
        * RBF(1.0, (1e-2, 1e2))  # length-scale (single, shared by all features)
        # + DotProduct()      # global linear trend
        + WhiteKernel(
            noise_level=1e-2,
            noise_level_bounds=(1e-5, 1e1)
          )
    )

    # Instantiate GPR; 10 optimizer restarts to reduce the risk of a poor
    # local optimum of the marginal likelihood
    gpr = GaussianProcessRegressor(
        kernel=kernel,
        n_restarts_optimizer=10,
        random_state=0,
        normalize_y=False  # since we've already normalized
    )

    # Fit on the training set
    gpr.fit(X_train, y_train)

    # Validate on dev set
    y_dev_pred, y_dev_std = gpr.predict(X_dev, return_std=True)
    r2_dev = r2_score(y_dev, y_dev_pred)
    r2_train = gpr.score(X_train, y_train)
    # RMSE is in units of the scaled target (y was standardized on the
    # training split in the preprocessing cell)
    rmse_dev = float(np.sqrt(np.mean((y_dev - y_dev_pred) ** 2)))
    # Log marginal likelihood of the training data under the learned kernel
    log_ml = gpr.log_marginal_likelihood_value_

    # Store results (r2_train / r2_dev mirror the keys used by the linear,
    # ridge and lasso result dictionaries)
    gp_results[group] = {
        'model': gpr,
        'kernel': gpr.kernel_,
        'r2_train': r2_train,
        'r2_dev': r2_dev,
        'rmse_dev': rmse_dev,
        'mean_pred_std_dev': float(np.mean(y_dev_std)),
        'log_marginal_likelihood': log_ml,
    }

    print(f"Train R² = {r2_train:.3f}, Dev R² = {r2_dev:.3f}")
    print(f"Dev RMSE (scaled y) = {rmse_dev:.3f}, log marginal likelihood = {log_ml:.3f}")
    print("Learned kernel:", gpr.kernel_)
    print()

In [None]:
# Experiment: GP with an added periodic kernel component, on a single group
from sklearn.gaussian_process.kernels import (
    RBF,
    ConstantKernel as C,
    WhiteKernel,
    ExpSineSquared
)

# Use the first group stored in the preprocessed data
selected_group = list(x_train_data)[0]
roomType, openingType = selected_group
print(f"Testing complex kernel on {roomType}-{openingType}")

X_train, y_train = x_train_data[selected_group], y_train_data[selected_group]
X_dev, y_dev = x_dev_data[selected_group], y_dev_data[selected_group]

# Initial guess for the period of the periodic component
T_estimate = 2 * np.pi

# Kernel = variance * (RBF + periodic) + white noise
smooth_part = RBF(1.0, (1e-2, 1e2))               # smooth, non-periodic trends
periodic_part = ExpSineSquared(
    length_scale=1.0,
    periodicity=T_estimate,
    length_scale_bounds=(1e-1, 1e2),
    periodicity_bounds=(0.5 * T_estimate, 2 * T_estimate),  # period kept within [T/2, 2T]
)
signal_variance = C(1.0, (1e-3, 1e3))             # overall signal variance
noise_part = WhiteKernel(1e-2, (1e-5, 1e1))       # observation noise
kernel = signal_variance * (smooth_part + periodic_part) + noise_part

# Targets were already standardized, so normalize_y stays off
gpr = GaussianProcessRegressor(
    kernel=kernel,
    normalize_y=False,
    n_restarts_optimizer=10,
    random_state=0,
).fit(X_train, y_train)

print("Optimized kernel:", gpr.kernel_)
print("Train R²:", gpr.score(X_train, y_train))
print("Dev R²:", gpr.score(X_dev, y_dev))

## Model Comparison

In [None]:
# Side-by-side dev-set R² of every model family, per group
import pandas as pd

# Column label -> result dictionary produced by the corresponding cell
stores_by_label = {
    'Linear': lr_results,
    'Ridge': ridge_results,
    'Lasso': lasso_results,
    'GP': gp_results,
}

# One record per group: the two key fields followed by each model's dev R²
results = []
for roomType, openingType in x_train_data:
    record = {'roomType': roomType, 'openingType': openingType}
    for label, store in stores_by_label.items():
        record[label] = store[(roomType, openingType)]['r2_dev']
    results.append(record)

# Index the table by (roomType, openingType)
comparison_df = pd.DataFrame(results).set_index(['roomType', 'openingType'])

print("R² scores on dev set:")
display(comparison_df)

# Mean dev R² of each model across groups
print("\nAverage R² by model type:")
print(comparison_df.mean())

# Winning model (highest dev R²) per group
best_model = comparison_df.idxmax(axis=1)
print("\nBest model for each group:")
display(pd.DataFrame({'Best Model': best_model}))