In [None]:
%load_ext autoreload
%autoreload 2
# %matplotlib widget
%pdb off

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import plotly
from IPython.display import display, HTML

# Enable offline plotly rendering and inject MathJax so LaTeX labels render in HTML output.
plotly.offline.init_notebook_mode()
display(HTML(
    '<script type="text/javascript" async src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js?config=TeX-MML-AM_SVG"></script>'
))

# Default figure appearance: matplotlib's stock 6.4 x 4.8 size shrunk by im_scaling.
im_scaling = .75
plt.rcParams.update({
    'figure.dpi': 140,
    'figure.figsize': [6.4 * im_scaling, 4.8 * im_scaling],
})

# Root directory that all data paths below are built from.
home_dir = "./"
display(home_dir)

# Start from a clean slate when the cell is re-run.
plt.close('all')

# Load Data

In [None]:
# Directory holding the multi-run outputs; plotFolder currently points at the same place.
multiRun_dir = "{}/CHARLES/multiRuns/".format(home_dir)
plotFolder = str(multiRun_dir)

# Read the table with its first two columns as a MultiIndex,
# then keep only the rows whose `slAll` flag is False.
df = (
    pd.read_csv("{}/intEmulation.csv".format(multiRun_dir), index_col=[0, 1])
    .loc[lambda frame: frame["slAll"] == False]
)

In [None]:
# Target variable.
# ydf = df['p-noInt-C_d']
ydf = df['mean-mass_flux']

# Candidate inputs: every column whose name contains none of these substrings.
# ("Type" removes the roomType/windowType labels, which are used for grouping below.)
excluded_substrings = ["C_d", "Type", "slAll", "mass_flux", "sn_prod(abs(u))", "q_model"]
x_cols = [
    col for col in df.columns
    if not any(substring in col for substring in excluded_substrings)
]
# Re-add the two model-derived columns that the substring filter removed.
x_cols += ['p-noInt_optp0-C_d', 'p-noInt_optp0-q_model']

# Take an explicit copy: assigning a new column to a plain slice of `df`
# raises SettingWithCopyWarning and may not write to the frame we keep using.
xdf = df[x_cols].copy()
xdf['p-noInt_optp0-q_modelC_d'] = xdf['p-noInt_optp0-q_model'] * xdf['p-noInt_optp0-C_d']
display(xdf.columns.values)

# ydf = np.log(ydf + .0001)
# x_df = np.log(xdf + .0001)

# Split inputs/targets into one entry per (roomType, windowType) combination
# that actually occurs in the data.
x_data = {}
y_data = {}
for roomType in df["roomType"].unique():
    for windowType in df["windowType"].unique():
        rows = (df["roomType"] == roomType) & (df["windowType"] == windowType)
        if rows.any():
            x_data[(roomType, windowType)] = xdf[rows]
            y_data[(roomType, windowType)] = ydf[rows]

# Data Preprocessing

Split each group into train/dev/test sets and apply normalization

- [ ] Add a min-max scaler for normalization to [-1, 1]
- [ ] Check that the groups have roughly the same distributions -> plot the input data

In [None]:
from sklearn.preprocessing import (
    FunctionTransformer,
    StandardScaler,
    MinMaxScaler
)
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

# Dictionaries to store our preprocessed data, keyed by (roomType, windowType)
x_train_data = {}
x_dev_data = {}
x_test_data = {}
y_train_data = {}
y_dev_data = {}
y_test_data = {}

# Dictionaries for the fitted transformers, keyed by (roomType, windowType)
column_transformers = {}
y_scalers = {}

# Process each group independently: every group gets its own split and its own scalers
for group, X in x_data.items():
    roomType, windowType = group
    y = y_data[group]
    
    print(f"Processing {roomType} - {windowType} group...")
    
    # 1. Split into train/dev/test
    #    20% is held out for test; 87.5% of the remaining 80% goes to train,
    #    i.e. roughly 70% train / 10% dev / 20% test of the group.
    X_temp, X_test, y_temp, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0
    )
    X_train, X_dev, y_train, y_dev = train_test_split(
        X_temp, y_temp, train_size=0.875, random_state=0
    )
    
    # 2. Identify column groups by name
    #    NOTE(review): a column matching both rules (e.g. a sin/cos of an orientation)
    #    would land in both ori_cols and trig_cols and be emitted twice — confirm none exist.
    ori_cols = [c for c in X_train.columns
                if 'orientation' in c.lower()
                or c == 'AofA']
    trig_cols = [c for c in X_train.columns
                if 'sin' in c.lower()
                or 'cos' in c.lower()]
    other_cols = [c for c in X_train.columns
                if c not in ori_cols + trig_cols]
    
    # 3. Build a ColumnTransformer
    ct = ColumnTransformer([
        # convert degrees → [-1, 1] for orientation/AoA
        # (assumes the angles are given in degrees within [0, 360] — TODO confirm)
        ('A_scale',
         FunctionTransformer(lambda x: (x - 180) / 180.0, validate=False),
         ori_cols),
    
        # leave any sin/cos features untouched
        ('passthrough_trig', 'passthrough', trig_cols),
    
        # min-max scale the rest to [-1, 1] (this is NOT a StandardScaler)
        ('scale', MinMaxScaler(feature_range=(-1, 1)), other_cols),
        ], remainder='drop')  # drop any unexpected columns
    
    # 4. Fit on TRAIN, transform all splits (dev/test values may fall outside [-1, 1])
    X_train_s = ct.fit_transform(X_train)
    X_dev_s = ct.transform(X_dev)
    X_test_s = ct.transform(X_test)
    
    # 5. Manually fit + apply y-scaler on TRAIN only
    #    MinMaxScaler() is used with its default feature_range here, unlike the
    #    explicit (-1, 1) range used for the inputs above.
    scaler_y = MinMaxScaler().fit(y_train.values.reshape(-1,1))
    
    y_train_s = scaler_y.transform(y_train.values.reshape(-1,1)).ravel()
    y_dev_s = scaler_y.transform(y_dev.values.reshape(-1,1)).ravel()
    y_test_s = scaler_y.transform(y_test.values.reshape(-1,1)).ravel()
    
    # 6. (Optionally wrap back to DataFrame for convenience:)
    #    The column order must mirror the transformer order declared in step 3.
    out_cols = ori_cols + trig_cols + other_cols
    X_train_s = pd.DataFrame(X_train_s, columns=out_cols, index=X_train.index)
    X_dev_s = pd.DataFrame(X_dev_s, columns=out_cols, index=X_dev.index)
    X_test_s = pd.DataFrame(X_test_s, columns=out_cols, index=X_test.index)
    
    # Store preprocessed data (X as DataFrames, y as 1-D numpy arrays)
    x_train_data[group] = X_train_s
    x_dev_data[group] = X_dev_s
    x_test_data[group] = X_test_s
    y_train_data[group] = y_train_s
    y_dev_data[group] = y_dev_s
    y_test_data[group] = y_test_s
    
    # Store transformers for later inverse transformations
    column_transformers[group] = ct
    y_scalers[group] = scaler_y
    
    print(f"  Train: {X_train_s.shape[0]} samples")
    print(f"  Dev:   {X_dev_s.shape[0]} samples")
    print(f"  Test:  {X_test_s.shape[0]} samples")
    print()

# Print summary
print(f"Processed {len(x_train_data)} groups")

In [None]:
import seaborn as sns
# numpy must be bound to `np`: the previous `import numpy as pd` replaced the
# global pandas alias, breaking every `pd.DataFrame` / `pd.read_csv` call that
# runs after this cell (including re-runs of earlier cells).
import numpy as np
from matplotlib.gridspec import GridSpec

# Visualize feature distributions after normalization
import matplotlib.pyplot as plt

# Function to plot feature distributions
def plot_feature_distributions(data_dict, feature_groups=None, n_cols=3, figsize=(18, 15)):
    """
    Plot pooled histograms (with KDE) of features, one figure per feature group.

    For every feature, the values from all DataFrames in ``data_dict`` that
    contain that column are pooled into a single histogram, and the pooled
    mean/std/min/max are both annotated on the axes and printed.

    Parameters:
    -----------
    data_dict : dict
        Dictionary mapping (roomType, openingType) to DataFrames containing the features
    feature_groups : dict, optional
        Dictionary mapping group names to lists of feature names. If omitted,
        a single group with all columns of the first DataFrame is used.
    n_cols : int, optional
        Number of columns in the subplot grid
    figsize : tuple, optional
        Figure size used for each group's figure

    Returns:
    --------
    None
        Figures are shown with ``plt.show()``; nothing is returned.
    """
    # If no feature groups provided, create a default one with all features
    if feature_groups is None:
        # Get a sample dataframe to extract all column names
        sample_df = next(iter(data_dict.values()))
        feature_groups = {'All Features': sample_df.columns.tolist()}

    for group_name, features in feature_groups.items():
        print(f"\n--- {group_name} ---")

        # An empty group would request a 0-row GridSpec, which matplotlib rejects.
        if len(features) == 0:
            print("(no features in this group; skipping)")
            continue

        # Ceil-divide so the last, partially filled row still gets created
        n_rows = (len(features) + n_cols - 1) // n_cols

        # One figure per feature group
        fig = plt.figure(figsize=figsize)
        gs = GridSpec(n_rows, n_cols, figure=fig)

        for i, feature in enumerate(features):
            ax = fig.add_subplot(gs[i // n_cols, i % n_cols])

            # Pool this feature's values across every group that has the column
            per_group_values = [
                frame[feature].values
                for frame in data_dict.values()
                if feature in frame.columns
            ]

            # Nothing to plot: label the empty axes instead of crashing in np.min/np.max
            if not per_group_values or sum(len(v) for v in per_group_values) == 0:
                ax.set_title(f"{feature}")
                ax.text(0.5, 0.5, "no data", transform=ax.transAxes,
                        ha='center', va='center')
                print(f"{feature}: no data")
                continue

            all_values = np.concatenate(per_group_values)

            # Plot histogram
            sns.histplot(all_values, kde=True, ax=ax)
            ax.set_title(f"{feature}")
            ax.set_xlabel("")

            # Calculate and display statistics
            mean_val = np.mean(all_values)
            std_val = np.std(all_values)
            min_val = np.min(all_values)
            max_val = np.max(all_values)

            stats_text = f"Mean: {mean_val:.2f}\nStd: {std_val:.2f}\nMin: {min_val:.2f}\nMax: {max_val:.2f}"
            ax.text(0.05, 0.95, stats_text, transform=ax.transAxes,
                    verticalalignment='top', bbox=dict(boxstyle='round', alpha=0.1))

            # Print statistics
            print(f"{feature}: Mean={mean_val:.2f}, Std={std_val:.2f}, Min={min_val:.2f}, Max={max_val:.2f}")

        fig.suptitle(f"Distribution of {group_name}", fontsize=16)
        # Leave headroom at the top for the suptitle
        fig.tight_layout(rect=[0, 0, 1, 0.97])
        plt.show()

# Define feature groups from the columns of one reference group
reference_group = ('corner', 'xwindow_0-0')
reference_cols = x_train_data[reference_group].columns

ori_cols = [c for c in reference_cols
            if 'orientation' in c.lower()
            or c == 'AofA']

trig_cols = [c for c in reference_cols
             if 'sin' in c.lower()
             or 'cos' in c.lower()]

other_cols = [c for c in reference_cols
              if c not in ori_cols + trig_cols]

# Organize features by groups
feature_groups = {
    'Orientation Features': ori_cols,
    'Trigonometric Features': trig_cols,
    'Other Features': other_cols[:10],  # First 10 other features
    'More Other Features': other_cols[10:20],  # Next 10 other features
    'Remaining Features': other_cols[20:]  # Remaining features
}

# Plot distributions for training data
# plot_feature_distributions(x_train_data, feature_groups)

# Compare distributions across different room and opening types.
# The loop variable is deliberately NOT called `df`: at module level that would
# overwrite the dataset loaded at the top of the notebook.
for feature in ['EP_mag-noInt', 'EP_shear-noInt', 'sinAofA', 'cosAofA']:
    fig, ax = plt.subplots(figsize=(12, 6))
    for group, group_df in x_train_data.items():
        if feature in group_df.columns:
            sns.kdeplot(group_df[feature].values, label=f"{group[0]}-{group[1]}", ax=ax)
    ax.set_title(f"Distribution of {feature} across different room and opening types")
    ax.legend()
    plt.show()

# Modeling

Rob's suggestions:
- Gaussian process (sklearn)
- linear regression
- bin into room types
- window flag
- process window and room types separately at first

## Linear Regression

In [None]:
def calculate_linear_likelihood(linear_model, X, y):
    """
    Gaussian log likelihood of ``y`` under a fitted linear regression model.

    The noise variance is not taken from the model; it is the maximum-likelihood
    estimate computed from the residuals on the data passed in.

    Parameters:
    -----------
    linear_model : sklearn linear model (LinearRegression, Ridge, Lasso, etc.)
        The fitted linear regression model
    X : array-like, shape (n_samples, n_features)
        Input features
    y : array-like, shape (n_samples,)
        Target values

    Returns:
    --------
    log_likelihood : float
        Sum of the per-sample normal log densities
    """
    import numpy as np
    from scipy import stats

    fitted = linear_model.predict(X)
    errors = y - fitted

    # MLE of the noise variance: mean squared residual (divides by n, not n - p)
    sigma = np.sqrt(np.sum(errors**2) / len(y))

    pointwise = stats.norm.logpdf(y, loc=fitted, scale=sigma)
    return np.sum(pointwise)

def calculate_normalized_rmse(model, X, y, normalization='std'):
    """
    Compute the RMSE of ``model`` on (X, y) together with a normalized RMSE.

    Parameters:
    -----------
    model : trained regression model with predict method
        The model to evaluate
    X : array-like, shape (n_samples, n_features)
        Input features
    y : array-like, shape (n_samples,)
        Target values
    normalization : str, optional (default='std')
        Quantity the RMSE is divided by:
        - 'mean': mean of |y|
        - 'range': max(y) - min(y)
        - anything else (including the default 'std'): standard deviation of y

    Returns:
    --------
    rmse : float
        Root Mean Square Error
    nrmse : float
        Normalized Root Mean Square Error
    """
    import numpy as np

    predictions = model.predict(X)
    rmse = np.sqrt(np.mean((y - predictions)**2))

    # Pick the normalizing scale; unknown option strings fall back to 'std'
    if normalization == 'mean':
        scale = np.mean(np.abs(y))
    elif normalization == 'range':
        scale = np.max(y) - np.min(y)
    else:
        scale = np.std(y)

    return rmse, rmse / scale

def visualize_linear_model(linear_model, X, y, model_name="Linear Regression", feature_names=None, top_features=10):
    """
    Create a comprehensive visualization of linear model fit with feature importance.

    Produces, in order: a 2x2 diagnostic figure (prediction vs actual, residuals
    vs predicted, residual histogram, Q-Q plot); and, if the model exposes
    ``coef_``, a bar chart of the largest-magnitude coefficients plus a printed
    coefficient table. When ``X`` additionally supports ``corrwith`` (i.e. is a
    DataFrame), a feature/target correlation table and a correlation heatmap of
    the top features are shown as well.

    Parameters:
    -----------
    linear_model : sklearn linear model (LinearRegression, Ridge, Lasso, etc.)
        The fitted linear model
    X : DataFrame
        Input features as pandas DataFrame
    y : Series or array
        Target values
    model_name : str, optional
        Name of the model type for plot titles
    feature_names : list, optional
        Names of features (if not provided, will use X.columns)
    top_features : int, optional
        Number of top features to show in importance plot

    Returns:
    --------
    None
        All output is displayed (figures, printed/displayed tables).
    """
    import numpy as np
    import matplotlib.pyplot as plt
    import pandas as pd
    from sklearn.metrics import mean_squared_error, r2_score
    import seaborn as sns
    
    # Convert y to numpy array if it's a pandas Series
    if hasattr(y, 'values'):
        y_values = y.values
    else:
        y_values = np.array(y)
    
    # Get feature names from DataFrame if not provided
    if feature_names is None and hasattr(X, 'columns'):
        feature_names = X.columns.tolist()
    elif feature_names is None:
        feature_names = [f'Feature {i}' for i in range(X.shape[1])]
    
    # Get predictions
    y_pred = linear_model.predict(X)
    
    # Calculate metrics
    r2 = r2_score(y_values, y_pred)
    rmse = np.sqrt(mean_squared_error(y_values, y_pred))
    
    # Calculate residuals
    residuals = y_values - y_pred
    
    # Create figure with subplots
    fig = plt.figure(figsize=(16, 12))
    
    # 1. Prediction vs Actual plot
    ax1 = fig.add_subplot(221)
    ax1.scatter(y_values, y_pred, alpha=0.6)
    
    # Add perfect prediction line (y = x) spanning the joint range of both axes
    min_val = min(min(y_values), min(y_pred))
    max_val = max(max(y_values), max(y_pred))
    ax1.plot([min_val, max_val], [min_val, max_val], 'r--')
    
    ax1.set_xlabel('Actual Values')
    ax1.set_ylabel('Predicted Values')
    ax1.set_title(f'{model_name}: Prediction vs Actual\nR² = {r2:.4f}, RMSE = {rmse:.4f}')
    ax1.grid(True, alpha=0.3)
    
    # 2. Residuals vs Predicted
    ax2 = fig.add_subplot(222)
    ax2.scatter(y_pred, residuals, alpha=0.6)
    ax2.axhline(y=0, color='r', linestyle='--')
    ax2.set_xlabel('Predicted Values')
    ax2.set_ylabel('Residuals')
    ax2.set_title('Residuals vs Predicted')
    ax2.grid(True, alpha=0.3)
    
    # 3. Residual Distribution
    ax3 = fig.add_subplot(223)
    sns.histplot(residuals, kde=True, ax=ax3)
    ax3.axvline(x=0, color='r', linestyle='--')
    ax3.set_xlabel('Residual Value')
    ax3.set_ylabel('Frequency')
    ax3.set_title('Residual Distribution')
    ax3.grid(True, alpha=0.3)
    
    # 4. QQ Plot for Residuals (against a normal distribution)
    ax4 = fig.add_subplot(224)
    from scipy import stats
    stats.probplot(residuals, dist="norm", plot=ax4)
    ax4.set_title('Q-Q Plot of Residuals')
    ax4.grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.show()
    
    # Feature Importance from Coefficients
    # (coefficient magnitudes are only comparable when the inputs share a scale)
    if hasattr(linear_model, 'coef_'):
        # Get coefficients
        if linear_model.coef_.ndim > 1:
            coeffs = linear_model.coef_[0]  # For multi-output models: first output only
        else:
            coeffs = linear_model.coef_
        
        # Create DataFrame with features and coefficients
        coef_df = pd.DataFrame({
            'Feature': feature_names,
            'Coefficient': coeffs
        })
        
        # Add absolute coefficient for ranking
        coef_df['Abs_Coefficient'] = np.abs(coef_df['Coefficient'])
        
        # Sort by absolute coefficient
        coef_df = coef_df.sort_values('Abs_Coefficient', ascending=False)
        
        # Show top features
        top_coef = coef_df.head(top_features)
        
        # Plot coefficients (red = negative, blue = non-negative)
        # NOTE(review): recent seaborn releases warn when `palette` is passed
        # without `hue` — confirm against the installed seaborn version.
        plt.figure(figsize=(12, 8))
        colors = ['red' if x < 0 else 'blue' for x in top_coef['Coefficient']]
        bars = sns.barplot(x='Coefficient', y='Feature', data=top_coef, palette=colors)
        
        # Add a vertical line at x=0
        plt.axvline(x=0, color='black', linestyle='-', alpha=0.3)
        
        # Add value labels to the bars, nudged away from the bar end
        for i, v in enumerate(top_coef['Coefficient']):
            bars.text(v + (0.01 if v >= 0 else -0.01), i, f"{v:.4f}", 
                     va='center', ha='left' if v >= 0 else 'right')
        
        # The title always quotes `top_features`, even if fewer features exist
        plt.title(f'Top {top_features} Feature Coefficients in {model_name}')
        plt.tight_layout()
        plt.show()
        
        # Display coefficient table
        print("\nFeature Coefficient Ranking:")
        display(coef_df)
        
        # Feature correlation with target (DataFrame inputs only)
        if hasattr(X, 'corrwith'):
            print("\nFeature Correlation with Target:")
            corr_df = pd.DataFrame({
                'Feature': feature_names,
                'Correlation': X.corrwith(pd.Series(y_values, index=X.index)).values
            })
            corr_df['Abs_Correlation'] = np.abs(corr_df['Correlation'])
            corr_df = corr_df.sort_values('Abs_Correlation', ascending=False)
            display(corr_df)
            
            # Plot correlation heatmap for top features (at most 15, ranked by |coefficient|)
            # NOTE(review): indexes X by `feature_names`; custom names that are not
            # actual columns of X would raise a KeyError here.
            plt.figure(figsize=(12, 10))
            top_features_list = coef_df.head(min(15, len(feature_names)))['Feature'].tolist()
            X_top = X[top_features_list]
            
            # Add target to the correlation matrix
            X_with_y = X_top.copy()
            X_with_y['Target'] = y_values
            
            # Plot correlation heatmap
            sns.heatmap(X_with_y.corr(), annot=True, cmap='coolwarm', vmin=-1, vmax=1)
            plt.title(f'Correlation Heatmap of Top Features with Target ({model_name})')
            plt.tight_layout()
            plt.show()

In [None]:
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

# Per-group ordinary least squares: sklearn for fit/metrics, statsmodels for p-values.
lr_results = {}

for group in x_train_data.keys():
    roomType, openingType = group
    X_train = x_train_data[group]
    y_train = y_train_data[group]
    X_dev = x_dev_data[group]
    y_dev = y_dev_data[group]
    
    # Fit multivariable linear model (sklearn)
    model = LinearRegression()
    model.fit(X_train, y_train)
    
    # Collect coefficients per feature
    coeffs = dict(zip(X_train.columns, model.coef_))
    intercept = model.intercept_
    r2_train = model.score(X_train, y_train)
    r2_dev = model.score(X_dev, y_dev)
    LL_train = calculate_linear_likelihood(model, X_train, y_train)
    LL_dev = calculate_linear_likelihood(model, X_dev, y_dev)
    rmse_train, nrmse_train = calculate_normalized_rmse(model, X_train, y_train)
    rmse_dev, nrmse_dev = calculate_normalized_rmse(model, X_dev, y_dev)
    
    # Significance testing (statsmodels OLS); 'const' is the intercept term
    X_sm = sm.add_constant(X_train)
    results_sm = sm.OLS(y_train, X_sm).fit()
    pvalues = results_sm.pvalues.to_dict()
    
    # Store results. The nrmse_* keys now really hold the normalized RMSE
    # (they used to hold the raw RMSE); the raw values are kept under rmse_*.
    lr_results[group] = {
        'coefficients': coeffs,
        'intercept': intercept,
        'r2_train': r2_train,
        'r2_dev': r2_dev,
        'LL_train': LL_train,
        'LL_dev': LL_dev,
        'rmse_train': rmse_train,
        'rmse_dev': rmse_dev,
        'nrmse_train': nrmse_train,
        'nrmse_dev': nrmse_dev,
        'pvalues': pvalues
    }
    
    # Print summary
    print(f"{roomType} - {openingType}: R²(train)={r2_train:.3f}, R²(dev)={r2_dev:.3f}, LL(train)={LL_train:.3f}, LL(dev)={LL_dev:.3f}, nrmse(train)={nrmse_train:.3f}, nrmse(dev)={nrmse_dev:.3f}")
    for feat, coef in coeffs.items():
        print(f"  {feat}: {coef:.3f}")
    print(f"  intercept: {intercept:.3f}")
    # Print p-values with significance marker
    print("  p-values:")
    for var, pval in pvalues.items():
        sig = '*' if pval < 0.05 else ''
        print(f"    {var}: {pval:.3f}{sig}")
    print()

In [None]:
# Summary of significant p-values across all (roomType, openingType) groups
from collections import Counter, defaultdict

# For every variable, count in how many groups it is significant (p < 0.05)
# and remember which groups those were.
sig_counts = Counter()
sig_keys = defaultdict(list)
for (roomType, openingType), res in lr_results.items():
    for var, pval in res['pvalues'].items():
        # Skip intercept
        if var == 'const':
            continue
        if pval < 0.05:
            sig_counts[var] += 1
            sig_keys[var].append((roomType, openingType))

# Display results, most frequently significant variable first
# (same ordering as the Lasso sparsity summary).
print("Significant counts per variable (p < 0.05):")
for var, count in sig_counts.most_common():
    groups = sig_keys[var]
    print(f"{var}: {count} (groups: {groups})")

In [None]:
# Create DataFrame of variables ranked by p-value for each group
import pandas as pd

# For each group, list the variable names ordered from smallest to largest p-value.
ranked = {}
for key, res in lr_results.items():
    # The intercept ('const') is not a feature, so leave it out when present
    pvals = pd.Series(res['pvalues']).drop(labels='const', errors='ignore')
    ranked[key] = pvals.sort_values().index.tolist()

# One column per group; row i holds the variable with the (i+1)-th smallest p-value
ranked_df = pd.DataFrame(ranked)
display(ranked_df)

## Regularized Regression

In [None]:
# Ridge regression for each (roomType, openingType) group
from sklearn.linear_model import Ridge

ridge_results = {}
# Set regularization strength (alpha); increase for more shrinkage
alpha = 10.0

for group in x_train_data.keys():
    roomType, openingType = group
    X_train = x_train_data[group]
    y_train = y_train_data[group]
    X_dev = x_dev_data[group]
    y_dev = y_dev_data[group]
    
    # Fit Ridge model
    ridge = Ridge(alpha=alpha)
    ridge.fit(X_train, y_train)
    
    # Collect coefficients and intercept
    coeffs = dict(zip(X_train.columns, ridge.coef_))
    intercept = ridge.intercept_
    r2_train = ridge.score(X_train, y_train)
    r2_dev = ridge.score(X_dev, y_dev)
    # Evaluate the likelihood of THIS ridge fit. The previous code passed the
    # stale global `model` (the last LinearRegression from the OLS cell).
    LL_train = calculate_linear_likelihood(ridge, X_train, y_train)
    LL_dev = calculate_linear_likelihood(ridge, X_dev, y_dev)
    rmse_train, nrmse_train = calculate_normalized_rmse(ridge, X_train, y_train)
    rmse_dev, nrmse_dev = calculate_normalized_rmse(ridge, X_dev, y_dev)
    
    # nrmse_* hold the normalized RMSE; raw RMSE is kept separately under rmse_*
    ridge_results[group] = {
        'coefficients': coeffs,
        'intercept': intercept,
        'r2_train': r2_train,
        'r2_dev': r2_dev,
        'LL_train': LL_train,
        'LL_dev': LL_dev,
        'rmse_train': rmse_train,
        'rmse_dev': rmse_dev,
        'nrmse_train': nrmse_train,
        'nrmse_dev': nrmse_dev
    }

    # Display results
    print(f"Ridge α={alpha} | {roomType}-{openingType}: R²(train)={r2_train:.3f}, R²(dev)={r2_dev:.3f}, LL(train)={LL_train:.3f}, LL(dev)={LL_dev:.3f}, nrmse(train)={nrmse_train:.3f}, nrmse(dev)={nrmse_dev:.3f}")
    for feat, coef in coeffs.items():
        print(f"  {feat}: {coef:.3f}")
    print(f"  intercept: {intercept:.3f}\n")

In [None]:
# Lasso regression for each (roomType, openingType) group
from sklearn.linear_model import Lasso

lasso_results = {}
# Set L1 regularization strength; increase alpha for more sparsity
alpha_lasso = 0.01

for group in x_train_data.keys():
    roomType, openingType = group
    X_train = x_train_data[group]
    y_train = y_train_data[group]
    X_dev = x_dev_data[group]
    y_dev = y_dev_data[group]
    
    # Fit Lasso model
    lasso = Lasso(alpha=alpha_lasso, max_iter=10000)
    lasso.fit(X_train, y_train)
    
    # Collect coefficients and intercept
    coeffs = dict(zip(X_train.columns, lasso.coef_))
    intercept = lasso.intercept_
    r2_train = lasso.score(X_train, y_train)
    r2_dev = lasso.score(X_dev, y_dev)
    # Evaluate the likelihood of THIS lasso fit. The previous code passed the
    # stale global `model` (the last LinearRegression from the OLS cell).
    LL_train = calculate_linear_likelihood(lasso, X_train, y_train)
    LL_dev = calculate_linear_likelihood(lasso, X_dev, y_dev)
    rmse_train, nrmse_train = calculate_normalized_rmse(lasso, X_train, y_train)
    rmse_dev, nrmse_dev = calculate_normalized_rmse(lasso, X_dev, y_dev)
    
    # nrmse_* hold the normalized RMSE; raw RMSE is kept separately under rmse_*
    lasso_results[group] = {
        'coefficients': coeffs,
        'intercept': intercept,
        'r2_train': r2_train,
        'r2_dev': r2_dev,
        'LL_train': LL_train,
        'LL_dev': LL_dev,
        'rmse_train': rmse_train,
        'rmse_dev': rmse_dev,
        'nrmse_train': nrmse_train,
        'nrmse_dev': nrmse_dev
    }
    
    # Display results
    print(f"Lasso α={alpha_lasso} | {roomType}-{openingType}: R²(train)={r2_train:.3f}, R²(dev)={r2_dev:.3f}, LL(train)={LL_train:.3f}, LL(dev)={LL_dev:.3f}, nrmse(train)={nrmse_train:.3f}, nrmse(dev)={nrmse_dev:.3f}")
    for feat, coef in coeffs.items():
        print(f"  {feat}: {coef:.3f}")
    print(f"  intercept: {intercept:.3f}\n")

In [None]:
# Summary of Lasso coefficient sparsity across groups
from collections import Counter, defaultdict

# Tally, per variable, the groups in which Lasso kept a non-zero coefficient.
nonzero_counts = Counter()
nonzero_keys = defaultdict(list)
for (roomType, openingType), res in lasso_results.items():
    for var, coef in res['coefficients'].items():
        # Treat anything within 1e-6 of zero as zeroed-out (floating point slack)
        if not abs(coef) > 1e-6:
            continue
        nonzero_counts[var] += 1
        nonzero_keys[var].append((roomType, openingType))

# Display results, most frequently retained variable first
print(f"Non-zero coefficient counts per variable (Lasso α={alpha_lasso}):")
for var, count in nonzero_counts.most_common():
    groups = nonzero_keys[var]
    print(f"{var}: {count} (groups: {groups})")

In [None]:
# Refit the three linear models for one group and show their diagnostics.
group = ('corner', 'xwindow_0-0')  # Choose a group
X_train = x_train_data[group]
y_train = y_train_data[group]

# For linear regression model
linear_model = LinearRegression().fit(X_train, y_train)
visualize_linear_model(linear_model, X_train, y_train, model_name="Linear Regression")

# For Ridge model — reuse `alpha` from the Ridge cell so the plots describe
# the same model that was evaluated there
ridge_model = Ridge(alpha=alpha).fit(X_train, y_train)
visualize_linear_model(ridge_model, X_train, y_train, model_name="Ridge Regression")

# For Lasso model — reuse `alpha_lasso` and the same max_iter as the Lasso cell
lasso_model = Lasso(alpha=alpha_lasso, max_iter=10000).fit(X_train, y_train)
visualize_linear_model(lasso_model, X_train, y_train, model_name="Lasso Regression")

## Gaussian Process Regression

In [None]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C, WhiteKernel, DotProduct
from sklearn.metrics import r2_score


In [None]:
def _kernel_has_white_noise(kernel):
    """Return True if any term of a (possibly composite) kernel exposes `noise_level`."""
    pending = [kernel]
    while pending:
        node = pending.pop()
        if node is None:
            continue
        if hasattr(node, 'noise_level'):
            return True
        # Composite kernels keep their operands in k1/k2 (Sum, Product) or kernel (Exponentiation)
        for child_name in ('k1', 'k2', 'kernel'):
            pending.append(getattr(node, child_name, None))
    return False


def calculate_gpr_likelihood(gpr_model, X=None, y=None, use_kernel_noise=True):
    """
    Calculate log likelihood for a Gaussian Process Regressor.

    Without data, the model's training log marginal likelihood is returned.
    With data, the sum of per-sample Gaussian log densities of ``y`` under the
    predictive mean and standard deviation is returned (samples are treated as
    independent, i.e. predictive covariances between samples are ignored).

    Parameters:
    -----------
    gpr_model : GaussianProcessRegressor
        The fitted GPR model
    X : array-like, shape (n_samples, n_features), optional
        Input features. If None, assumes we want training set likelihood.
    y : array-like, shape (n_samples,), optional
        Target values. If None, assumes we want training set likelihood.
    use_kernel_noise : bool, default=True
        Whether to account for observation noise that is NOT already contained
        in the predictive standard deviation. If the fitted kernel contains a
        WhiteKernel term, its noise is already part of the predictive std and
        nothing is added (adding it again would double-count the noise).
        Otherwise a scalar ``gpr_model.alpha`` is added to the predictive
        variance. An array-valued ``alpha`` (per-training-sample noise) cannot
        be mapped onto new samples and is ignored.

    Returns:
    --------
    log_likelihood : float
        Log likelihood of the data under the model
    """
    import numpy as np
    from scipy import stats

    # Case 1: No data provided - return training log marginal likelihood
    if X is None or y is None:
        if hasattr(gpr_model, 'log_marginal_likelihood_value_'):
            return gpr_model.log_marginal_likelihood_value_
        return gpr_model.log_marginal_likelihood(gpr_model.kernel_.theta)

    # Case 2: Data provided - calculate likelihood for this dataset
    y_mean, y_std = gpr_model.predict(X, return_std=True)
    std_dev = y_std

    if use_kernel_noise and not _kernel_has_white_noise(getattr(gpr_model, 'kernel_', None)):
        # No WhiteKernel: the only noise term is the diagonal jitter `alpha`
        # used during fitting, which the predictive std does not include.
        alpha = getattr(gpr_model, 'alpha', None)
        if alpha is not None and np.ndim(alpha) == 0:
            std_dev = np.sqrt(np.asarray(y_std) ** 2 + float(alpha))

    # Calculate log likelihood assuming Gaussian noise
    log_likelihood = np.sum(stats.norm.logpdf(y, loc=y_mean, scale=std_dev))

    return log_likelihood

In [None]:
# Dictionary to store GP results, keyed by (roomType, openingType)
gp_results = {}

for group in x_train_data.keys():
    roomType, openingType = group
    X_train = x_train_data[group]
    y_train = y_train_data[group]
    X_dev = x_dev_data[group]
    y_dev = y_dev_data[group]
    
    print(f"Training GP for {roomType}-{openingType}...")
    
    # Define kernel: signal variance × RBF + noise term
    kernel = (
        C(1.0, (1e-3, 1e3))   # signal variance
        * RBF(1.0, (1e-2, 1e2))  # length-scale
        # + DotProduct()      # global linear trend
        + WhiteKernel(
            noise_level=1e-2,
            noise_level_bounds=(1e-5, 1e1)
          )
    )
    
    # Instantiate GPR
    gpr = GaussianProcessRegressor(
        kernel=kernel,
        n_restarts_optimizer=10,
        random_state=0,
        normalize_y=False  # since we've already normalized
    )
    
    # Fit on the training set
    gpr.fit(X_train, y_train)
    
    # Validate on dev set
    y_dev_pred, y_dev_std = gpr.predict(X_dev, return_std=True)
    r2_dev = r2_score(y_dev, y_dev_pred)
    r2_train = gpr.score(X_train, y_train)
    # NOTE: LL_train is the training log MARGINAL likelihood, while LL_dev is a
    # sum of per-sample predictive log densities — they are not the same quantity.
    LL_train = calculate_gpr_likelihood(gpr)
    LL_dev = calculate_gpr_likelihood(gpr, X_dev, y_dev)
    rmse_train, nrmse_train = calculate_normalized_rmse(gpr, X_train, y_train)
    rmse_dev, nrmse_dev = calculate_normalized_rmse(gpr, X_dev, y_dev)
    
    # Store results. nrmse_* hold the normalized RMSE ('nrmse_train' used to
    # hold the raw RMSE); the raw values are kept under rmse_*.
    gp_results[group] = {
        'model': gpr,
        'kernel': gpr.kernel_,
        'r2_dev': r2_dev,
        'r2_train': r2_train,
        'LL_dev': LL_dev,
        'LL_train': LL_train,
        'rmse_train': rmse_train,
        'rmse_dev': rmse_dev,
        'nrmse_train': nrmse_train,
        'nrmse_dev': nrmse_dev
    }
    
    print(f"Train R² = {r2_train:.3f}, Dev R² = {r2_dev:.3f}")
    print(f"Log-Likelihood Train: {LL_train :.3f}, Dev: {LL_dev:.3f}")
    print(f"RMSE Train: {rmse_train:.3f}, Dev: {rmse_dev:.3f}")
    print("Learned kernel:", gpr.kernel_)
    print()

In [None]:
def visualize_gpr_fit(gpr_model, X, y, feature_names=None, top_features=10):
    """
    Plot diagnostic figures for a fitted GPR model and rank its input features.

    Three groups of output are produced, all shown immediately:

    1. A 2x2 figure: predicted vs. actual (coloured by predictive std),
       sorted predictions with 95% intervals, residuals vs. predicted, and a
       histogram of the predictive standard deviation.
    2. If X has more than one column: a sensitivity-based importance ranking.
       Each feature is swept over 50 points between its observed min and max
       while all other features are held at their column mean; importance is
       the range (max - min) of the resulting predictions.
    3. If X is a DataFrame: a table of feature/target correlations and a
       correlation heatmap of up to 15 of the top-ranked features plus the
       target.

    Parameters:
    -----------
    gpr_model : GaussianProcessRegressor
        The fitted GPR model
    X : DataFrame or 2-D array
        Input features; DataFrame input enables the correlation outputs
    y : Series or array
        Target values
    feature_names : list, optional
        Names of features, one per column of X, in column order
        (if not provided, will use X.columns, else generic 'Feature i')
    top_features : int, optional
        Number of top features to show in importance plot

    Returns:
    --------
    None
        Figures are rendered with plt.show(); tables are rendered with the
        notebook-level `display` function.
    """
    import numpy as np
    import matplotlib.pyplot as plt
    import pandas as pd
    from sklearn.metrics import mean_squared_error
    import seaborn as sns

    # Convert y to numpy array if it's a pandas Series
    if hasattr(y, 'values'):
        y_values = y.values
    else:
        y_values = np.array(y)

    # Get feature names from DataFrame if not provided
    if feature_names is None and hasattr(X, 'columns'):
        feature_names = X.columns.tolist()
    elif feature_names is None:
        feature_names = [f'Feature {i}' for i in range(X.shape[1])]

    # Get predictions and uncertainty
    y_pred, y_std = gpr_model.predict(X, return_std=True)

    # Calculate metrics
    r2 = gpr_model.score(X, y_values)
    rmse = np.sqrt(mean_squared_error(y_values, y_pred))

    # Create figure with subplots
    fig = plt.figure(figsize=(16, 14))

    # 1. Prediction vs Actual plot
    ax1 = fig.add_subplot(221)
    sc = ax1.scatter(y_values, y_pred, alpha=0.6, c=y_std, cmap='viridis')
    fig.colorbar(sc, ax=ax1, label='Prediction Uncertainty')

    # Add perfect prediction line (vectorised min/max instead of Python loops)
    min_val = min(np.min(y_values), np.min(y_pred))
    max_val = max(np.max(y_values), np.max(y_pred))
    ax1.plot([min_val, max_val], [min_val, max_val], 'r--')

    ax1.set_xlabel('Actual Values')
    ax1.set_ylabel('Predicted Values')
    ax1.set_title(f'Prediction vs Actual\nR² = {r2:.4f}, RMSE = {rmse:.4f}')
    ax1.grid(True, alpha=0.3)

    # 2. Prediction vs Actual with Uncertainty
    ax2 = fig.add_subplot(222)
    # Sort by actual values for clearer visualization
    sort_idx = np.argsort(y_values)
    sample_pos = np.arange(len(y_values))
    ax2.errorbar(sample_pos, y_values[sort_idx], yerr=0, fmt='o', label='Actual', alpha=0.6)
    ax2.errorbar(sample_pos, y_pred[sort_idx], yerr=1.96*y_std[sort_idx],
                fmt='o', label='Predicted with 95% CI', alpha=0.6)
    ax2.set_xlabel('Sample Index (sorted by actual value)')
    ax2.set_ylabel('Value')
    ax2.set_title('GPR Predictions with Uncertainty')
    ax2.legend()
    ax2.grid(True, alpha=0.3)

    # 3. Residuals plot
    ax3 = fig.add_subplot(223)
    residuals = y_values - y_pred
    sc = ax3.scatter(y_pred, residuals, alpha=0.6, c=y_std, cmap='viridis')
    fig.colorbar(sc, ax=ax3, label='Prediction Uncertainty')
    ax3.axhline(y=0, color='r', linestyle='--')
    ax3.set_xlabel('Predicted Values')
    ax3.set_ylabel('Residuals')
    ax3.set_title('Residuals vs Predicted')
    ax3.grid(True, alpha=0.3)

    # 4. Uncertainty distribution
    ax4 = fig.add_subplot(224)
    ax4.hist(y_std, bins=20, alpha=0.6)
    ax4.set_xlabel('Prediction Standard Deviation')
    ax4.set_ylabel('Frequency')
    ax4.set_title('Distribution of Prediction Uncertainty')
    ax4.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Feature importance analysis
    if hasattr(X, 'values'):
        X_values = X.values
    else:
        X_values = X

    if X_values.shape[1] > 1:
        print("\n--- Feature Importance Analysis ---")

        # Estimate feature importance by varying each feature.
        # (No figure is opened here: the bar plot below creates its own, and
        # an extra plt.figure() would only emit an empty canvas.)
        importance = []
        n_grid = 50
        column_means = np.mean(X_values, axis=0)

        # Create a grid for each feature
        for i in range(X_values.shape[1]):
            feature_min = np.min(X_values[:, i])
            feature_max = np.max(X_values[:, i])

            # All features at their mean, feature i swept across its range
            X_grid = np.tile(column_means, (n_grid, 1))
            X_grid[:, i] = np.linspace(feature_min, feature_max, n_grid)

            # Predict on the same container type as X so the model sees the
            # same kind of input as in the predict(X) call above (intended to
            # avoid sklearn's feature-name mismatch warning).
            if hasattr(X, 'columns'):
                X_grid = pd.DataFrame(X_grid, columns=X.columns)

            # Predict across the grid
            y_grid = gpr_model.predict(X_grid)

            # Calculate importance as range of predictions
            importance.append(np.max(y_grid) - np.min(y_grid))

        # Create DataFrame for importance; its index is the column position
        importance_df = pd.DataFrame({
            'Feature': feature_names,
            'Importance': importance
        })

        # Sort by importance (index keeps the original column positions)
        importance_df = importance_df.sort_values('Importance', ascending=False)

        # Show top features
        top_importance = importance_df.head(top_features)

        # Plot top feature importance
        plt.figure(figsize=(12, 8))
        sns.barplot(x='Importance', y='Feature', data=top_importance)
        plt.title(f'Top {top_features} Feature Importance in GPR Model')
        plt.tight_layout()
        plt.show()

        # Display importance table
        print("\nFeature Importance Ranking:")
        display(importance_df)

        # Feature correlation with target
        if hasattr(X, 'corrwith'):
            print("\nFeature Correlation with Target:")
            corr_df = pd.DataFrame({
                'Feature': feature_names,
                'Correlation': X.corrwith(pd.Series(y_values, index=X.index)).values
            })
            corr_df['Abs_Correlation'] = np.abs(corr_df['Correlation'])
            corr_df = corr_df.sort_values('Abs_Correlation', ascending=False)
            display(corr_df)

            # Plot correlation heatmap for top features
            plt.figure(figsize=(12, 10))
            top_rows = importance_df.head(min(15, len(feature_names)))

            # Select columns by position rather than by label so that
            # caller-supplied feature_names that differ from X.columns do
            # not raise a KeyError; then label them with the feature names.
            X_with_y = X.iloc[:, top_rows.index.tolist()].copy()
            X_with_y.columns = top_rows['Feature'].tolist()

            # Add target to the correlation matrix
            X_with_y['Target'] = y_values

            # Plot correlation heatmap
            sns.heatmap(X_with_y.corr(), annot=True, cmap='coolwarm', vmin=-1, vmax=1)
            plt.title('Correlation Heatmap of Top Features with Target')
            plt.tight_layout()
            plt.show()

# Inspect the GP fit of one (roomType, windowType) group in detail.
group = ('corner', 'xwindow_0-0')  # replace with any other key of gp_results
X_train, y_train = x_train_data[group], y_train_data[group]
model = gp_results[group]['model']

# Diagnostics with the default number of ranked features in the bar plot
visualize_gpr_fit(model, X_train, y_train)

# Same diagnostics, bar plot limited to the 5 most important features
visualize_gpr_fit(model, X_train, y_train, top_features=5)

## Model Comparison

In [None]:
# Compare model performance on dev set across all groups
import pandas as pd

# Metrics to extract from each model's per-group result dict:
#   key              -> entry name in the result dict
#   higher_is_better -> direction used when picking the best model
#   description      -> heading printed above the table
metrics = {
    'R²': {'key': 'r2_dev', 'higher_is_better': True, 'description': 'R² scores on dev set (higher is better)'},
    'Log-Likelihood': {'key': 'LL_dev', 'higher_is_better': True, 'description': 'Log-likelihood scores on dev set (higher is better)'},
    'NRMSE': {'key': 'nrmse_dev', 'higher_is_better': False, 'description': 'Normalized RMSE scores on dev set (lower is better)'}
}

# Model types to compare
models = ['Linear', 'Ridge', 'Lasso', 'GP']
model_results = {'Linear': lr_results, 'Ridge': ridge_results, 'Lasso': lasso_results, 'GP': gp_results}

# Process all metrics; results_df maps metric name -> comparison table
results_df = {}
for metric_name, metric_info in metrics.items():
    metric_key = metric_info['key']
    higher_is_better = metric_info['higher_is_better']

    # Extract metrics for all groups and models (one row per group)
    metric_data = []
    for group in x_train_data.keys():
        roomType, windowType = group

        row_data = {'roomType': roomType, 'windowType': windowType}
        for model_name, model_result in model_results.items():
            row_data[model_name] = model_result[group][metric_key]

        metric_data.append(row_data)

    # Convert to DataFrame.
    # Deliberately NOT named `df`: that name holds the raw dataset loaded at
    # the top of the notebook, and rebinding it here would break any earlier
    # cell that is re-run after this one.
    metric_df = pd.DataFrame(metric_data).set_index(['roomType', 'windowType'])
    results_df[metric_name] = metric_df

    # Display results
    print(f"\n{metric_info['description']}:")
    display(metric_df)
    print(f"\nAverage {metric_name} by model type:")
    print(metric_df.mean())

    # Find best model for each group (column label of the row-wise extreme)
    if higher_is_better:
        best_model = metric_df.idxmax(axis=1)
    else:
        best_model = metric_df.idxmin(axis=1)

    print(f"\nBest model for each group ({metric_name}):")
    display(pd.DataFrame({f'Best Model ({metric_name})': best_model}))
