In [None]:
%load_ext autoreload
%autoreload 2
# %matplotlib widget
%pdb off

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import plotly
from IPython.display import display, HTML
import flowEmulationUtils as feUtils
import random

# Switch plotly to offline notebook mode and inject a MathJax <script> tag
# (presumably so LaTeX labels render inside plotly figures — confirm).
plotly.offline.init_notebook_mode()
display(
    HTML(
        '<script type="text/javascript" async src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js?config=TeX-MML-AM_SVG"></script>'
    )
)

# Close every open matplotlib figure before configuring defaults.
plt.close('all')

# Figure defaults: 140 dpi and a 6.4 x 4.8 inch base size shrunk by im_scaling.
im_scaling = .75
plt.rcParams.update({
    'figure.dpi': 140,
    'figure.figsize': [6.4 * im_scaling, 4.8 * im_scaling],
})

# Root directory from which the data paths in later cells are built.
home_dir = "./"
display(home_dir)
plt.close('all')

# Load Data

In [None]:
# Directory holding the multi-run emulation CSVs; plotFolder points at the same place.
multiRun_dir = f"{home_dir}/CHARLES/multiRuns/"
plotFolder = multiRun_dir

# Both tables are read with their first two CSV columns as a two-level index.
roomVentilationMI = pd.read_csv(
    f"{multiRun_dir}/roomVentilationMIEmulation.csv",
    index_col=[0, 1],
)
flowStatsMI = pd.read_csv(
    f"{multiRun_dir}/flowStatsMIEmulation.csv",
    index_col=[0, 1],
)

In [None]:
# Train/Dev/Test Assignment
# Draw one seeded uniform random number per row of roomVentilationMI and bucket it:
#   [0, 0.7) -> "train" (70 %), [0.7, 0.8) -> "dev" (10 %), [0.8, 1) -> "test" (20 %).
random.seed(42)  # For reproducibility
roomVentilationMI["split"] = roomVentilationMI.index.to_series().apply(lambda _: random.random())
roomVentilationMI["split"] = roomVentilationMI["split"].apply(lambda x: "train" if x < 0.7 else ("dev" if x < 0.8 else "test"))

# Columns whose name contains "windowKeys". The column set does not change
# while iterating, so select them once instead of once per row.
windowKeyCols = roomVentilationMI.columns[
    roomVentilationMI.columns.str.contains("windowKeys")
].tolist()

# Copy each row's split label onto the flowStatsMI rows of its windows, so all
# windows listed for the same (run, room) land in the same split.
# NOTE(review): a (run, windowKey) pair that is missing from flowStatsMI may be
# appended as a new, mostly-NaN row by this .loc assignment — confirm all keys exist.
for (run, room), row in roomVentilationMI.iterrows():
    windowKeys = row[windowKeyCols].dropna()
    for windowKey in windowKeys:
        flowStatsMI.loc[(run, windowKey), "split"] = row["split"]

In [None]:
df = flowStatsMI.copy()
# df = df[df["slAll"] == False]

# Sort every column into one of three buckets by substring match on its name:
#   pressure-like -> divided by WS**2
#   velocity-like -> divided by WS
#   anything else -> left untouched
# The pressure test runs first, so a name matching both sets is treated as pressure.
p_norm_cols, u_norm_cols, no_norm_cols = [], [], []
for col in df.columns:
    if any(tag in col for tag in ("-p0", "p_", "(p)", "p0meas", "u**2")):
        p_norm_cols.append(col)
    elif any(tag in col for tag in ("mag", "shear", "normal", "flux", "(u", "q_model")):
        u_norm_cols.append(col)
    else:
        no_norm_cols.append(col)

print(f"Normalizing p cols: {sorted(p_norm_cols)}")
print(f"Normalizing u cols: {sorted(u_norm_cols)}")
print(f"Not normalizing cols: {sorted(no_norm_cols)}")

# Row-wise division: each row is scaled by that row's own WS value.
df[p_norm_cols] = df[p_norm_cols].divide(df["WS"].pow(2), axis="index")
df[u_norm_cols] = df[u_norm_cols].divide(df["WS"], axis="index")

In [None]:
# For every column whose name contains "orientation", add two companion columns
# holding its sine and cosine; the new names swap "orientation" for "sin" / "cos".
# NOTE(review): np.sin / np.cos interpret their input as radians — confirm the
# orientation columns are not stored in degrees.
for col in [name for name in df.columns if "orientation" in name]:
    df[col.replace("orientation", "sin")] = np.sin(df[col])
    df[col.replace("orientation", "cos")] = np.cos(df[col])

In [None]:
# df['p-noInt_optp0-q_modelC_d'] = df['p-noInt_optp0-q_model'] * df['p-noInt_optp0-C_d']
# for col in df.columns:
#     if "mag" in col or "shear" in col or "normal" in col:
#         df[col] = df[col] / df['p-noInt_optp0-q_modelC_d']

# One 0/1 indicator column per opening category: 1 when the category name
# occurs as a substring of the row's openingType, else 0.
for opening_kind in ("skylight", "cross", "single", "dual", "corner"):
    df[opening_kind] = df['openingType'].apply(
        lambda label, kind=opening_kind: 1 if kind in label else 0
    )

# Sign of the modelled flow rate, with exact zeros counted as positive.
Sdelp = np.sign(df['p-noInt_optp0-q_model'])
Sdelp = Sdelp.mask(Sdelp == 0, 1)
df["Sdelp"] = Sdelp

# Boolean flags telling whether the shear term has the same sign as Sdelp
# (qIn: product > 0) or not (qOut: product <= 0).
signed_shear = df["EP_shear-noInt"] * df["Sdelp"]
df["EP_shear-noInt-qIn"] = signed_shear > 0
df["EP_shear-noInt-qOut"] = signed_shear <= 0

# Constant column, usable as a grouping key that selects every row.
df["all"] = True

In [None]:
# Regression target. Alternatives tried earlier are kept commented out.
# ydf = df['p-noInt-C_d'].copy()
ydf = df.loc[:, 'p-noInt_optp0Cd-C_d'].copy()
# ydf = df['flux'].copy()


# Earlier, filter-based feature selection (kept for reference):
# x_cols = df.columns.values
# x_cols = [col for col in x_cols if ("noInt" in col)]# or "EP_" in col)]
# x_cols = [col for col in x_cols if ("x-" not in col and "y-" not in col and "z-" not in col)]
# x_cols = [col for col in x_cols if ("_x" not in col and "_y" not in col and "_z" not in col)]
# x_cols = [col for col in x_cols if "run" not in col]
# x_cols = [col for col in x_cols if "q-" not in col]
# x_cols = [col for col in x_cols if "comp(u_avg,0)" not in col]
# x_cols = [col for col in x_cols if "comp(u_avg,2)" not in col]
# x_cols += ["skylight", "AofA", "sinAofA", "cosAofA", "slAll", "delT", "SS"]
# # x_cols += ["mean-mass_flux", "mean-sn_prod(abs(u))"]
# x_cols = [col for col in x_cols if "C_d" not in col]
# x_cols = [col for col in x_cols if "mass_flux" not in col]
# x_cols = [col for col in x_cols if "sn_prod(abs(u))" not in col]
# x_cols = [col for col in x_cols if "q_model" not in col]
# x_cols += ["Sdelp", "all", "Ri", "WS"]
# x_cols += ['p-noInt_optp0-C_d', 'p-noInt_optp0-q_model']
# x_cols += ['cross', 'single', 'dual', 'corner']

# Earlier hand-picked feature lists (kept for reference):
# x_cols = [ "skylight", "delT", "SS", "p-noInt_optp0-p0", "p_avg-noInt", "p-noInt_optp0-q_model", "EP_normal-noInt", "EP_shear-noInt", "EP_vel_orientation-noInt", "EPR_vel_orientation-noInt", "p_rms-noInt", "EPR_mag-noInt", "AofA", "Sdelp", "all"]
# x_cols = ["p-noInt_optp0-q_model", "skylight", "delT", "SS", "p-noInt_optp0-p0", "p_avg-noInt", "EP_normal-noInt", "EP_shear-noInt", "EP_vel_orientation-noInt", "p_rms-noInt", "Sdelp", "all", "EP_shear-noInt-qIn","EP_shear-noInt-qOut"]

# Active feature list: currently a single feature.
x_cols = ["p-noInt_optp0-q_model"]#, "Sdelp", "skylight", "delT", "SS", "EP_vel_sin", "EP_vel_cos", "EP_vel_orientation"]#, "EP_shear-noInt-qIn","EP_shear-noInt-qOut"]

# "split" rides along with the features so the preprocessing cell can build
# the train/dev/test masks from it; it is dropped there before scaling.
x_cols.append("split")
xdf = df.loc[:, x_cols].copy()

display(xdf.columns.values)

# ydf = np.log(ydf + .0001)
# x_df = np.log(xdf + .0001)

In [None]:
# xdf = xdf.map(lambda s: abs(s) if isinstance(s, (int, float)) else s)  # Ensure all numeric values are positive
# ydf = np.abs(ydf)

In [None]:
# Partition features (xdf) and target (ydf) into groups keyed by
# (first_level, second_level); combinations with no rows get no entry.
x_data = {}
y_data = {}
# first_level_col = "roomType"
first_level_col = "Sdelp"  # Use Sdelp to group data by sign of p-noInt_optp0-q_model
second_level_col = "skylight"
for first_level in df[first_level_col].unique():
    for second_level in df[second_level_col].unique():
        rows = (df[second_level_col] == second_level) & (df[first_level_col] == first_level)
        if not rows.any():
            continue
        x_data[(first_level, second_level)] = xdf[rows]
        y_data[(first_level, second_level)] = ydf[rows]



In [None]:
def recombine_grouped_data(grouped_data):
    """
    Stack the per-group pandas objects of a grouped dictionary back into one object.

    Parameters:
    -----------
    grouped_data : dict
        Dictionary mapping group keys to pandas DataFrames or Series.

    Returns:
    --------
    combined : DataFrame or Series
        Row-wise concatenation (axis=0) of the non-empty values in dictionary
        order, with their original indices preserved. An empty DataFrame is
        returned for an empty dictionary; if every value is empty, a copy of
        the last value is returned.
    """
    pieces = list(grouped_data.values())
    if not pieces:
        return pd.DataFrame()

    non_empty = [piece for piece in pieces if not piece.empty]
    if not non_empty:
        return pieces[-1].copy()

    # A single concat copies each group once; concatenating inside a loop
    # re-copies the accumulated frame on every iteration.
    return pd.concat(non_empty, axis=0)

# Data Preprocessing

Split each group into train/dev/test sets and apply normalization

- [ ] Add a min-max scaler for normalization to [-1, 1]
- [ ] Roughly same distributions -> plot input data

In [None]:
import numpy as np

def project_to_logspace(x):
    """
    Map data from [-1, 1] linearly onto [e^0, e^1] and return its natural log.

    Parameters:
    -----------
    x : scalar or numpy array
        Input data, expected to lie in the range [-1, 1]

    Returns:
    --------
    log_y : same shape as x
        log of the linearly remapped data; lies in [0, 1] when x lies in
        [-1, 1] (x = -1 gives 0, x = 1 gives 1)
    """
    # End points of the intermediate interval [e^0, e^1] = [1, e]
    lower = np.exp(0)
    upper = np.exp(1)

    # Affine map sending -1 -> lower and +1 -> upper
    half_span = (upper - lower) / 2
    remapped = half_span * (x + 1) + lower

    return np.log(remapped)

In [None]:
from sklearn.preprocessing import (
    FunctionTransformer,
    StandardScaler,
    MinMaxScaler
)
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

# Per-group containers, all keyed by the same (first_level, second_level)
# tuples as x_data / y_data.
# Scaled splits:
x_train_data, x_dev_data, x_test_data = {}, {}, {}
y_train_data, y_dev_data, y_test_data = {}, {}, {}

# Splits as they were before any scaling:
x_train_data_unscaled, x_dev_data_unscaled, x_test_data_unscaled = {}, {}, {}
y_train_data_unscaled, y_dev_data_unscaled, y_test_data_unscaled = {}, {}, {}

# Fitted transformers per group, kept for later inverse transformations
column_transformers = {}
y_scalers = {}


# Column groups, chosen by name:
#   ori_cols   - 'AofA' plus any name containing "orientation" (case-insensitive)
#   trig_cols  - any name containing "sin" or "cos" (case-insensitive)
#   other_cols - everything else except the "split" bookkeeping column
# NOTE(review): the substring test also sends a column called "single" to
# trig_cols — confirm that is intended if that flag is ever used as a feature.
ori_cols = [c for c in xdf.columns if c == 'AofA' or 'orientation' in c.lower()]
trig_cols = [c for c in xdf.columns if 'sin' in c.lower() or 'cos' in c.lower()]
other_cols = [c for c in xdf.columns if c not in (*ori_cols, *trig_cols, "split")]

# Process each group: split by the precomputed "split" label, fit the scalers
# on the training rows only, then apply them to all three splits.

# Column order of the ColumnTransformer output (transformers are applied in
# the order they are listed). It is the same for every group.
out_cols = ori_cols + trig_cols + other_cols

for group, X in x_data.items():
    first_level, second_level = group
    y = y_data[group]
    
    print(f"Processing {first_level} - {second_level} group...")
    
    train_mask = X['split'] == 'train'
    dev_mask = X['split'] == 'dev'
    test_mask = X['split'] == 'test'
    # Remove the split column from the features WITHOUT mutating the frame
    # stored in x_data: an in-place drop would make this cell fail with a
    # KeyError on 'split' the second time it is run.
    X = X.drop(columns=['split'])
    
    # 1. Split data into train/dev/test
    X_train, y_train = X[train_mask], y[train_mask]
    X_dev,   y_dev   = X[dev_mask],   y[dev_mask]
    X_test,  y_test  = X[test_mask],  y[test_mask]

    # Store the unscaled split data
    x_train_data_unscaled[group] = X_train.copy()
    x_dev_data_unscaled[group] = X_dev.copy()
    x_test_data_unscaled[group] = X_test.copy()
    y_train_data_unscaled[group] = y_train.copy()
    y_dev_data_unscaled[group] = y_dev.copy()
    y_test_data_unscaled[group] = y_test.copy()

    
    # 3. Build a ColumnTransformer
    #    - orientation columns: fixed affine map (x - 180) / 180
    #      NOTE(review): this maps [0, 360] onto [-1, 1], i.e. it presumes
    #      angles in degrees — confirm.
    #    - trig columns: passed through unchanged
    #    - all other columns: min-max scaled to [-1, 1]
    ct = ColumnTransformer(
        [
            ('A_scale',
            FunctionTransformer(func=lambda x: (x - 180)/180,
                                inverse_func=lambda x: x*180 + 180,
                                validate=False),
            ori_cols),
            ('passthrough_trig','passthrough',trig_cols),
            ('scale', MinMaxScaler(feature_range=(-1,1)), other_cols),
        ],
        remainder='drop')  # drop any unexpected columns
    
    # 4. Fit on TRAIN, transform all splits (dev/test values may therefore
    #    fall outside [-1, 1])
    X_train_s = ct.fit_transform(X_train)
    X_dev_s = ct.transform(X_dev)
    X_test_s = ct.transform(X_test)

    # #4.1 Optionally project to log space
    # X_train_s = project_to_logspace(X_train_s)
    # X_dev_s = project_to_logspace(X_dev_s)
    # X_test_s = project_to_logspace(X_test_s)
    
    # 5. Manually fit + apply y-scaler on TRAIN only (default MinMaxScaler
    #    range, i.e. [0, 1] on the training targets)
    scaler_y = MinMaxScaler().fit(y_train.values.reshape(-1,1))
    
    y_train_s = scaler_y.transform(y_train.values.reshape(-1,1)).ravel()
    y_dev_s = scaler_y.transform(y_dev.values.reshape(-1,1)).ravel()
    y_test_s = scaler_y.transform(y_test.values.reshape(-1,1)).ravel()

    # #5.1 Optionally project to log space
    # y_train_s = project_to_logspace(y_train_s)
    # y_dev_s = project_to_logspace(y_dev_s)
    # y_test_s = project_to_logspace(y_test_s)
    
    # 6. Wrap the scaled feature arrays back into DataFrames, restoring the
    #    column names and the original row index
    X_train_s = pd.DataFrame(X_train_s, columns=out_cols, index=X_train.index)
    X_dev_s = pd.DataFrame(X_dev_s, columns=out_cols, index=X_dev.index)
    X_test_s = pd.DataFrame(X_test_s, columns=out_cols, index=X_test.index)
    
    # Store preprocessed data (features as DataFrames, targets as 1-D arrays)
    x_train_data[group] = X_train_s
    x_dev_data[group] = X_dev_s
    x_test_data[group] = X_test_s
    y_train_data[group] = y_train_s
    y_dev_data[group] = y_dev_s
    y_test_data[group] = y_test_s
    
    # Store transformers for later inverse transformations
    column_transformers[group] = ct
    y_scalers[group] = scaler_y
    
    print(f"  Train: {X_train_s.shape[0]} samples")
    print(f"  Dev:   {X_dev_s.shape[0]} samples")
    print(f"  Test:  {X_test_s.shape[0]} samples")
    print()

# Print summary
print(f"Processed {len(x_train_data)} groups")

# Data Visualization

In [None]:
# from sklearn.decomposition import TruncatedSVD
# import seaborn as sns
# import numpy as np
# import pandas as pd
# import matplotlib.pyplot as plt

# # Combine all training data into one DataFrame
# X_train_combined = pd.concat(list(x_train_data.values()), axis=0)
# # Create a DataFrame with training targets
# y_train_combined = np.concatenate(list(y_train_data.values()), axis=0)

# # Perform SVD on the training feature matrix
# n_components = min(20, len(X_train_combined.columns))
# svd = TruncatedSVD(n_components=n_components, random_state=42)
# X_svd = svd.fit_transform(X_train_combined)

# # Plot explained variance ratio
# plt.figure(figsize=(10, 6))
# plt.plot(np.cumsum(svd.explained_variance_ratio_), marker='o')
# plt.xlabel('Number of Components')
# plt.ylabel('Cumulative Explained Variance')
# plt.title('Explained Variance by SVD Components (Training Data)')
# plt.grid(True, alpha=0.3)
# plt.axhline(y=0.95, color='r', linestyle='--', label='95% Explained Variance')
# plt.legend()
# plt.show()

# # Get feature importance based on SVD components
# feature_importance = np.abs(svd.components_).sum(axis=0)
# feature_importance = feature_importance / feature_importance.sum()  # Normalize

# # Create DataFrame with feature names and importance scores
# importance_df = pd.DataFrame({
#     'Feature': X_train_combined.columns,
#     'Importance': feature_importance
# })
# importance_df = importance_df.sort_values('Importance', ascending=False)

# # Plot top 15 features by importance
# plt.figure(figsize=(12, 8))
# sns.barplot(x='Importance', y='Feature', data=importance_df.head(15))
# plt.title('Top 15 Features by SVD Importance (Training Data)')
# plt.tight_layout()
# plt.show()

# # Weight feature importance by explained variance
# weighted_importance = np.zeros(svd.components_.shape[1])
# for i, var_ratio in enumerate(svd.explained_variance_ratio_):
#     weighted_importance += np.abs(svd.components_[i]) * var_ratio

# # Normalize
# weighted_importance = weighted_importance / weighted_importance.sum()

# # Create DataFrame with feature names and weighted importance scores
# weighted_importance_df = pd.DataFrame({
#     'Feature': X_train_combined.columns,
#     'Weighted_Importance': weighted_importance
# })
# weighted_importance_df = weighted_importance_df.sort_values('Weighted_Importance', ascending=False)

# # Plot top 15 features by weighted importance
# plt.figure(figsize=(12, 8))
# sns.barplot(x='Weighted_Importance', y='Feature', data=weighted_importance_df.head(15))
# plt.title('Top Features by Variance-Weighted Importance (Training Data)')
# plt.tight_layout()
# plt.show()

# # Look at the first 3 principal components and their feature contributions
# n_top_features = 10
# plt.figure(figsize=(15, 12))

# for i in range(min(3, n_components)):
#     plt.subplot(3, 1, i+1)
#     component = pd.Series(svd.components_[i], index=X_train_combined.columns)
#     component_df = pd.DataFrame({
#         'Feature': component.index,
#         'Loading': component.values
#     })
#     component_df = component_df.reindex(component_df.Loading.abs().sort_values(ascending=False).index)
    
#     # Plot top contributing features
#     sns.barplot(x='Loading', y='Feature', data=component_df.head(n_top_features))
#     plt.title(f'Top {n_top_features} Features in Component {i+1} (Explained Variance: {svd.explained_variance_ratio_[i]:.2%})')

# plt.tight_layout()
# plt.show()

# # Create a heatmap of the top features' correlation matrix
# top_features = importance_df.head(30)['Feature'].values
# correlation_matrix = X_train_combined[top_features].corr()

# plt.figure(figsize=(14, 12))
# sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0)
# plt.title('Correlation Matrix of Top Features (Training Data)')
# plt.tight_layout()
# plt.show()

# # Check how the first few SVD components correlate with the target variable
# svd_df = pd.DataFrame(X_svd, columns=[f'SVD_{i+1}' for i in range(n_components)])
# svd_df['target'] = y_train_combined

# plt.figure(figsize=(15, 10))
# for i in range(min(5, n_components)):
#     plt.subplot(2, 3, i+1)
#     plt.scatter(svd_df[f'SVD_{i+1}'], svd_df['target'], alpha=0.5)
#     plt.xlabel(f'SVD Component {i+1}')
#     plt.ylabel('y')
#     plt.title(f'Target vs SVD Component {i+1}')
#     plt.grid(True, alpha=0.3)

# plt.tight_layout()
# plt.show()

# print(f"SVD analysis completed on training data. Top 5 most important features:")
# display(importance_df.head(5))

In [None]:
# import seaborn as sns
# import numpy as np
# from scipy import stats

# # Create scatter plots of the target variable against the top 5 features from SVD analysis
# import matplotlib.pyplot as plt

# # Get the top 5 features from the SVD importance analysis
# top_features = importance_df.head(11)['Feature'].values
# print(f"Top 11 features by importance: {top_features}")

# # Create a figure with subplots for each feature
# fig, axes = plt.subplots(4, 3, figsize=(18, 12))
# axes = axes.flatten()

# # For storing correlation values
# correlations = {}

# # Plot each feature against the target
# for i, feature in enumerate(top_features):
#     ax = axes[i]
    
#     # Calculate correlation and p-value
#     corr, p_value = stats.pearsonr(X_train_combined[feature], y_train_combined)
#     correlations[feature] = (corr, p_value)
    
#     # Create scatter plot with regression line
#     sns.regplot(x=X_train_combined[feature], y=y_train_combined, ax=ax, scatter_kws={'alpha': 0.5}, line_kws={'color': 'red'})
    
#     # Add correlation information
#     ax.text(0.05, 0.95, f"Correlation: {corr:.3f}\np-value: {p_value:.3e}", 
#             transform=ax.transAxes, bbox=dict(facecolor='white', alpha=0.7))
    
#     ax.set_title(f"y vs {feature}")
#     ax.set_xlabel(feature)
#     ax.set_ylabel("y")
#     ax.grid(True, alpha=0.3)

# # Create a violin plot in the last subplot showing the distribution of the target
# sns.violinplot(y=y_train_combined, ax=axes[11], inner="quartile")
# axes[11].set_title("Distribution of y")
# axes[11].set_ylabel("y")
# axes[11].grid(True, alpha=0.3)

# # Add a horizontal line at y=0 for reference
# axes[5].axhline(y=0, color='r', linestyle='--')

# plt.tight_layout()
# plt.show()

# # Display correlation summary
# print("\nCorrelation summary:")
# for feature, (corr, p_value) in sorted(correlations.items(), key=lambda x: abs(x[1][0]), reverse=True):
#     sig = "***" if p_value < 0.001 else "**" if p_value < 0.01 else "*" if p_value < 0.05 else ""
#     print(f"{feature}: r = {corr:.3f} {sig} (p = {p_value:.3e})")

In [None]:
import seaborn as sns
# NumPy must keep the `np` alias: binding it to `pd` would shadow pandas for
# every later cell that uses `pd` at module level.
import numpy as np
from matplotlib.gridspec import GridSpec

# Visualize feature distributions after normalization
import matplotlib.pyplot as plt

# Function to plot feature distributions
def plot_feature_distributions(data_dict, feature_groups=None, n_cols=3, figsize=(18, 15)):
    """
    Plot distributions of features, pooled across all data groups.

    One figure is created per feature group, with one histogram (plus KDE)
    per feature. Summary statistics are drawn on each panel and printed.

    Parameters:
    -----------
    data_dict : dict
        Dictionary mapping (first_level, second_level) to DataFrames containing the features
    feature_groups : dict, optional
        Dictionary mapping group names to lists of feature names. Defaults to a
        single group holding every column of the first DataFrame in data_dict.
    n_cols : int, optional
        Number of columns in the grid
    figsize : tuple, optional
        Figure size

    Returns:
    --------
    None. Figures are created but not shown; call plt.show() if needed.
    """
    # If no feature groups provided, create a default one with all features
    if feature_groups is None:
        if not data_dict:
            print("No data to plot.")
            return
        # Get a sample dataframe to extract all column names
        sample_df = next(iter(data_dict.values()))
        feature_groups = {'All Features': sample_df.columns.tolist()}
    
    # For each feature group
    for group_name, features in feature_groups.items():
        print(f"\n--- {group_name} ---")
        
        # An empty group would ask GridSpec for zero rows, which raises
        if len(features) == 0:
            print("(no features in this group)")
            continue
        
        # Calculate how many rows we need (ceiling division)
        n_rows = (len(features) + n_cols - 1) // n_cols
        
        # Create subplot grid
        fig = plt.figure(figsize=figsize)
        gs = GridSpec(n_rows, n_cols, figure=fig)
        
        # For each feature in this group
        for i, feature in enumerate(features):
            ax = fig.add_subplot(gs[i // n_cols, i % n_cols])
            
            # Collect all values for this feature across all data groups
            all_values = []
            for frame in data_dict.values():
                if feature in frame.columns:
                    all_values.extend(frame[feature].values)
            
            # Nothing to plot: min/max of an empty list would raise
            if len(all_values) == 0:
                ax.set_title(f"{feature}")
                ax.set_xlabel("")
                ax.text(0.5, 0.5, "no data", transform=ax.transAxes,
                        horizontalalignment='center', verticalalignment='center')
                print(f"{feature}: no data")
                continue
            
            # Plot histogram
            sns.histplot(all_values, kde=True, ax=ax)
            ax.set_title(f"{feature}")
            ax.set_xlabel("")
            
            # Calculate and display statistics
            mean_val = np.mean(all_values)
            std_val = np.std(all_values)
            min_val = np.min(all_values)
            max_val = np.max(all_values)
            
            stats_text = f"Mean: {mean_val:.2f}\nStd: {std_val:.2f}\nMin: {min_val:.2f}\nMax: {max_val:.2f}"
            ax.text(0.05, 0.95, stats_text, transform=ax.transAxes, 
                    verticalalignment='top', bbox=dict(boxstyle='round', alpha=0.1))
            
            # Print statistics
            print(f"{feature}: Mean={mean_val:.2f}, Std={std_val:.2f}, Min={min_val:.2f}, Max={max_val:.2f}")
        
        # Leave the top 3 % of the figure free for the suptitle
        fig.suptitle(f"Distribution of {group_name}", fontsize=16)
        fig.tight_layout(rect=[0, 0, 1, 0.97])

# Organize features by groups
feature_groups = {
    'Orientation Features': ori_cols,
    'Trigonometric Features': trig_cols,
    'Other Features': other_cols[:10],  # First 10 other features
    'More Other Features': other_cols[10:20],  # Next 10 other features
    'Remaining Features': other_cols[20:]  # Remaining features
}

# Plot distributions for training data
# plot_feature_distributions(x_train_data, feature_groups)

# Compare the scaled training distribution of selected features across groups.
# A feature that is not a column of any group is skipped: plotting it would
# only produce an empty figure and a "no labels" legend warning.
for feature in ['EP_mag-noInt', 'EP_shear-noInt', 'EP_normal-noInt', 'sinAofA', 'cosAofA']:
    groups_with_feature = [
        key for key, frame in x_train_data.items() if feature in frame.columns
    ]
    if not groups_with_feature:
        print(f"Skipping {feature}: not present in the training features")
        continue

    plt.figure(figsize=(12, 6))
    for group in groups_with_feature:
        dfplot = x_train_data[group]
        sns.kdeplot(dfplot[feature].values, label=f"{group[0]}-{group[1]}")
    plt.title(f"Distribution of {feature} across different room and opening types")
    plt.legend()
    plt.show()

# Modeling

Rob's suggestions:
- Gaussian process (sklearn)
- linear regression
- bin into room types
- window flag
- process window and room types separately at first

## Linear Regression

In [None]:
def calculate_linear_likelihood(y, y_pred):
    """
    Average Gaussian log likelihood of observations given model predictions.

    The noise variance is not supplied; it is estimated from the residuals
    themselves as their mean square (the maximum-likelihood estimate).

    Parameters:
    -----------
    y : array-like, shape (n_samples,)
        Observed target values
    y_pred : array-like, shape (n_samples,)
        Predicted target values

    Returns:
    --------
    log_likelihood : float
        Mean (per-sample) log likelihood of y under N(y_pred, variance)
    """
    import numpy as np
    from scipy import stats

    # MLE of the noise variance: sum of squared residuals divided by n
    errors = y - y_pred
    sigma_squared = np.sum(errors**2) / len(y)
    sigma = np.sqrt(sigma_squared)

    # Per-sample log density, averaged so the value does not grow with n
    per_sample = stats.norm.logpdf(y, loc=y_pred, scale=sigma)
    return np.mean(per_sample)

def calculate_normalized_rmse(y, y_pred, normalization='std'):
    """
    Compute the RMSE of a set of predictions together with a normalized RMSE.

    Parameters:
    -----------
    y : array-like, shape (n_samples,)
        Observed target values
    y_pred : array-like, shape (n_samples,)
        Predicted target values
    normalization : str, optional (default='std')
        Quantity the RMSE is divided by:
        - 'mean': mean of the absolute observed values
        - 'range': max(y) - min(y)
        - anything else (including 'std'): standard deviation of y

    Returns:
    --------
    rmse : float
        Root Mean Square Error
    nrmse : float
        Normalized Root Mean Square Error
    """
    import numpy as np

    squared_error = (y - y_pred)**2
    rmse = np.sqrt(np.mean(squared_error))

    # Pick the normalizing scale of the observed values
    if normalization == 'mean':
        scale = np.mean(np.abs(y))
    elif normalization == 'range':
        scale = np.max(y) - np.min(y)
    else:  # default: 'std'
        scale = np.std(y)

    return rmse, rmse / scale

def visualize_linear_model(linear_model, X, y, y_pred=None, hue=None, style=None, model_name="Linear Regression", feature_names=None, top_features=10, y_transformer=None):
    """
    Create a comprehensive visualization of linear model fit with feature importance.

    Produces a 2x2 diagnostic figure (actual vs. predicted, residuals vs.
    actual, residual histogram, residual Q-Q plot). If the model exposes
    `coef_`, it also shows a bar chart and table of the coefficients, a table
    of feature/target correlations and a correlation heatmap.
    
    Parameters:
    -----------
    linear_model : sklearn linear model (LinearRegression, Ridge, Lasso, etc.)
        The fitted linear model
    X : DataFrame
        Input features as pandas DataFrame
    y : Series or array
        Target values
    y_pred : array, optional
        Predictions for X. When omitted, linear_model.predict(X) is used.
    hue : Series, optional
        Values used to color the scatter points; re-indexed with X.index,
        so its index must contain the index of X
    style : Series, optional
        Values used to vary the scatter markers; re-indexed like hue
    model_name : str, optional
        Name of the model type for plot titles
    feature_names : list, optional
        Names of features (if not provided, will use X.columns)
    top_features : int, optional
        Number of top features to show in importance plot
    y_transformer : fitted transformer, optional
        Object with an inverse_transform method (e.g. the scaler fitted on the
        target). When given, both the predictions and the targets are mapped
        back through it before metrics and plots are computed.

    Returns:
    --------
    None. Figures are shown and tables displayed as side effects.
    """
    import numpy as np
    import matplotlib.pyplot as plt
    import pandas as pd
    from sklearn.metrics import r2_score
    import seaborn as sns
    
    # Convert y to numpy array if it's a pandas Series
    if hasattr(y, 'values'):
        y_values = y.values
    else:
        y_values = np.array(y)

    # Align the optional hue/style series with the rows of X
    if hue is not None and hasattr(X, 'index'):
        hue = hue.loc[X.index]
    if style is not None and hasattr(X, 'index'):
        style = style.loc[X.index]
    
    # Get feature names from DataFrame if not provided
    if feature_names is None and hasattr(X, 'columns'):
        feature_names = X.columns.tolist()
    elif feature_names is None:
        feature_names = [f'Feature {i}' for i in range(X.shape[1])]
    
    # Get predictions
    if y_pred is None:
        y_pred = linear_model.predict(X)

    if y_transformer is not None:
        # Inverse transform predictions AND targets so that all metrics and
        # plots below are in the transformer's original units
        y_pred = y_transformer.inverse_transform(y_pred.reshape(-1, 1)).ravel()
        y_values = y_transformer.inverse_transform(y_values.reshape(-1, 1)).ravel()
    # Calculate metrics
    r2 = r2_score(y_values, y_pred)
    rmse, nrmse = calculate_normalized_rmse(y_values, y_pred)
    log_likelihood = calculate_linear_likelihood(y_values, y_pred)
    
    # Calculate residuals
    residuals = y_values - y_pred
    
    # Create figure with subplots
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    
    # 1. Actual vs Prediction plot (predictions on x, actual values on y)
    sns.scatterplot(x=y_pred, y=y_values, hue=hue, style=style, alpha=0.6, ax=axes[0, 0])
    
    # Add perfect prediction line spanning the joint range of both axes
    min_val = min(np.min(y_values), np.min(y_pred))
    max_val = max(np.max(y_values), np.max(y_pred))
    axes[0, 0].plot([min_val, max_val], [min_val, max_val], 'r--')
    
    axes[0, 0].set_xlabel('Predicted Values')
    axes[0, 0].set_ylabel('Actual Values')
    axes[0, 0].set_title(f'{model_name}: Actual vs Prediction\nR² = {r2:.4f}, NRMSE = {nrmse:.4f}, RMSE = {rmse:.4f}, Log Likelihood = {log_likelihood:.4f}')
    axes[0, 0].grid(True, alpha=0.3)
    
    # 2. Residuals vs Actual
    sns.scatterplot(x=y_values, y=residuals, hue=hue, style=style, alpha=0.6, ax=axes[0, 1])
    axes[0, 1].axhline(y=0, color='r', linestyle='--')
    axes[0, 1].set_xlabel('Actual Values')
    axes[0, 1].set_ylabel('Residuals')
    axes[0, 1].set_title('Residuals vs Actual')
    axes[0, 1].grid(True, alpha=0.3)
    
    # 3. Residual Distribution
    sns.histplot(residuals, kde=True, ax=axes[1, 0])
    axes[1, 0].axvline(x=0, color='r', linestyle='--')
    axes[1, 0].set_xlabel('Residual Value')
    axes[1, 0].set_ylabel('Frequency')
    axes[1, 0].set_title('Residual Distribution')
    axes[1, 0].grid(True, alpha=0.3)
    
    # 4. QQ Plot for Residuals
    from scipy import stats
    stats.probplot(residuals, dist="norm", plot=axes[1, 1])
    axes[1, 1].set_title('Q-Q Plot of Residuals')
    axes[1, 1].grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.show()
    
    # Feature Importance from Coefficients
    if hasattr(linear_model, 'coef_'):
        # Get coefficients
        if linear_model.coef_.ndim > 1:
            coeffs = linear_model.coef_[0]  # For multi-output models: first output only
        else:
            coeffs = linear_model.coef_
        
        # Create DataFrame with features and coefficients
        coef_df = pd.DataFrame({
            'Feature': feature_names,
            'Coefficient': coeffs
        })
        
        # Add absolute coefficient for ranking
        coef_df['Abs_Coefficient'] = np.abs(coef_df['Coefficient'])
        
        # Sort by absolute coefficient
        coef_df = coef_df.sort_values('Abs_Coefficient', ascending=False)
        
        # Show top features
        top_coef = coef_df.head(top_features)
        
        # Plot coefficients (red = negative, blue = non-negative)
        plt.figure(figsize=(12, 8))
        colors = ['red' if x < 0 else 'blue' for x in top_coef['Coefficient']]
        bars = sns.barplot(x='Coefficient', y='Feature', data=top_coef, palette=colors)
        
        # Add a vertical line at x=0
        plt.axvline(x=0, color='black', linestyle='-', alpha=0.3)
        
        # Add value labels to the bars, placed just beyond each bar's end
        for i, v in enumerate(top_coef['Coefficient']):
            bars.text(v + (0.01 if v >= 0 else -0.01), i, f"{v:.4f}", 
                     va='center', ha='left' if v >= 0 else 'right')
        
        plt.title(f'Top {top_features} Feature Coefficients in {model_name}')
        plt.tight_layout()
        plt.show()
        
        # Display coefficient table
        print("\nFeature Coefficient Ranking:")
        display(coef_df)
        
        # Feature correlation with target (only when X is a DataFrame)
        if hasattr(X, 'corrwith'):
            print("\nFeature Correlation with Target:")
            corr_df = pd.DataFrame({
                'Feature': feature_names,
                'Correlation': X.corrwith(pd.Series(y_values, index=X.index)).values
            })
            corr_df['Abs_Correlation'] = np.abs(corr_df['Correlation'])
            corr_df = corr_df.sort_values('Abs_Correlation', ascending=False)
            display(corr_df)
            
            # Plot correlation heatmap for top features (at most 15)
            plt.figure(figsize=(12, 10))
            top_features_list = coef_df.head(min(15, len(feature_names)))['Feature'].tolist()
            X_top = X[top_features_list]
            
            # Add target to the correlation matrix
            X_with_y = X_top.copy()
            X_with_y['Target'] = y_values
            
            # Plot correlation heatmap
            sns.heatmap(X_with_y.corr(), annot=True, cmap='coolwarm', vmin=-1, vmax=1)
            # f-string is required here so the model name is interpolated
            plt.title(f'Correlation Heatmap of Top Features with Target ({model_name})')
            plt.tight_layout()
            plt.show()

In [None]:
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

# Ordinary least-squares fit per group: one sklearn model for prediction/metrics
# and one statsmodels OLS fit (same data plus a constant column) for p-values.
lr_results = {}

for group in x_train_data.keys():
    first_level, second_level = group
    X_train = x_train_data[group]#[["p-noInt_optp0-q_model"]]
    y_train = y_train_data[group]
    X_dev = x_dev_data[group]#[["p-noInt_optp0-q_model"]]
    y_dev = y_dev_data[group]
    
    # Fit multivariable linear model (sklearn)
    model = LinearRegression(fit_intercept=True)
    model.fit(X_train, y_train)
    
    # Predict once per split; every metric below reuses these arrays
    y_hat_train = model.predict(X_train)
    y_hat_dev = model.predict(X_dev)
    
    # Collect coefficients per feature
    coeffs = dict(zip(X_train.columns, model.coef_))
    intercept = model.intercept_
    r2_train = model.score(X_train, y_train)
    r2_dev = model.score(X_dev, y_dev)
    LL_train = calculate_linear_likelihood(y_train, y_hat_train)
    LL_dev = calculate_linear_likelihood(y_dev, y_hat_dev)
    # calculate_normalized_rmse returns the pair (rmse, nrmse)
    rmse_train, nrmse_train = calculate_normalized_rmse(y_train, y_hat_train)
    rmse_dev, nrmse_dev = calculate_normalized_rmse(y_dev, y_hat_dev)
    
    # Significance testing (statsmodels OLS); the intercept shows up as 'const'
    X_sm = sm.add_constant(X_train)
    results_sm = sm.OLS(y_train, X_sm).fit()
    pvalues = results_sm.pvalues.to_dict()
    
    # Store results. The 'nrmse_*' keys hold the normalized error (they used to
    # receive the raw RMSE); the raw RMSE is kept under its own 'rmse_*' keys.
    lr_results[group] = {
        'coefficients': coeffs,
        'intercept': intercept,
        'r2_train': r2_train,
        'r2_dev': r2_dev,
        'LL_train': LL_train,
        'LL_dev': LL_dev,
        'nrmse_train': nrmse_train,
        'nrmse_dev': nrmse_dev,
        'rmse_train': rmse_train,
        'rmse_dev': rmse_dev,
        'pvalues': pvalues,
        'model': model,
    }
    
    # Print summary
    print(f"{first_level} - {second_level}: R²(train)={r2_train:.3f}, R²(dev)={r2_dev:.3f}, LL(train)={LL_train:.3f}, LL(dev)={LL_dev:.3f}, nrmse(train)={nrmse_train:.3f}, nrmse(dev)={nrmse_dev:.3f}, rmse(train)={rmse_train:.3f}, rmse(dev)={rmse_dev:.3f}")
    for feat, coef in coeffs.items():
        print(f"  {feat}: {coef:.3f}")
    print(f"  intercept: {intercept:.3f}")
    # Print p-values with significance marker ('*' means p < 0.05)
    print("  p-values:")
    for var, pval in pvalues.items():
        sig = '*' if pval < 0.05 else ''
        print(f"    {var}: {pval:.3f}{sig}")
    print()

In [None]:
# Summary of significant p-values across all (roomType, openingType) groups
from collections import Counter, defaultdict

# sig_counts: variable -> number of groups in which it is significant (p < 0.05)
# sig_keys:   variable -> list of the (first_level, second_level) groups involved
sig_counts = Counter()
sig_keys = defaultdict(list)
for (first_level, second_level), res in lr_results.items():
    for var, pval in res['pvalues'].items():
        # Skip intercept (statsmodels names the added constant column 'const')
        if var == 'const':
            continue
        if pval < 0.05:
            sig_counts[var] += 1
            sig_keys[var].append((first_level, second_level))

# Display results, most frequently significant variable first
# (same ordering as the Lasso sparsity summary).
print("Significant counts per variable (p < 0.05):")
for var, count in sig_counts.most_common():
    groups = sig_keys[var]
    print(f"{var}: {count} (groups: {groups})")

In [None]:
# Rank the regression variables of every group by their OLS p-value
import pandas as pd

# group key -> variable names ordered from smallest to largest p-value;
# the intercept ('const') is dropped when present.
ranked = {
    key: (
        pd.Series(res['pvalues'])
        .drop('const', errors='ignore')
        .sort_values()
        .index
        .tolist()
    )
    for key, res in lr_results.items()
}

# One column per group, one row per rank position
ranked_df = pd.DataFrame(ranked)
display(ranked_df)

## Regularized Regression

In [None]:
# Ridge regression for each (roomType, openingType) group
from sklearn.linear_model import Ridge

ridge_results = {}
# Set regularization strength (alpha); increase for more shrinkage
alpha = 1

for group in x_train_data.keys():
    first_level, second_level = group
    X_train = x_train_data[group]
    y_train = y_train_data[group] 
    X_dev = x_dev_data[group]
    y_dev = y_dev_data[group]
    
    # Fit Ridge model
    ridge = Ridge(alpha=alpha, fit_intercept=True)
    ridge.fit(X_train, y_train)
    
    # Predict once per split; every metric below reuses these arrays
    y_hat_train = ridge.predict(X_train)
    y_hat_dev = ridge.predict(X_dev)
    
    # Collect coefficients and intercept
    coeffs = dict(zip(X_train.columns, ridge.coef_))
    intercept = ridge.intercept_
    r2_train = ridge.score(X_train, y_train)
    r2_dev = ridge.score(X_dev, y_dev)
    LL_train = calculate_linear_likelihood(y_train, y_hat_train)
    LL_dev = calculate_linear_likelihood(y_dev, y_hat_dev)
    # calculate_normalized_rmse returns the pair (rmse, nrmse)
    rmse_train, nrmse_train = calculate_normalized_rmse(y_train, y_hat_train)
    rmse_dev, nrmse_dev = calculate_normalized_rmse(y_dev, y_hat_dev)
    
    # 'nrmse_train' now holds the normalized error (it used to receive the raw
    # RMSE, unlike 'nrmse_dev'); the raw RMSE is kept under 'rmse_*'.
    ridge_results[group] = {
        'coefficients': coeffs, 
        'intercept': intercept, 
        'r2_train': r2_train,
        'r2_dev': r2_dev,
        'LL_train': LL_train,
        'LL_dev': LL_dev,
        'nrmse_train': nrmse_train,
        'nrmse_dev': nrmse_dev,
        'rmse_train': rmse_train,
        'rmse_dev': rmse_dev,
        'model': ridge
    }

    # Display results
    print(f"Ridge α={alpha} | {first_level}-{second_level}: R²(train)={r2_train:.3f}, R²(dev)={r2_dev:.3f}, LL(train)={LL_train:.3f}, LL(dev)={LL_dev:.3f}, nrmse(train)={nrmse_train:.3f}, nrmse(dev)={nrmse_dev:.3f}")
    for feat, coef in coeffs.items():
        print(f"  {feat}: {coef:.3f}")
    print(f"  intercept: {intercept:.3f}\n")

In [None]:
# Lasso regression for each (roomType, openingType) group
from sklearn.linear_model import Lasso

lasso_results = {}
# Set L1 regularization strength; increase alpha for more sparsity
alpha_lasso = 0.01

for group in x_train_data.keys():
    first_level, second_level = group
    X_train = x_train_data[group]
    y_train = y_train_data[group]
    X_dev = x_dev_data[group]
    y_dev = y_dev_data[group]
    
    # Fit Lasso model
    lasso = Lasso(alpha=alpha_lasso, max_iter=10000, fit_intercept=True)
    lasso.fit(X_train, y_train)
    
    # Predict once per split; every metric below reuses these arrays
    y_hat_train = lasso.predict(X_train)
    y_hat_dev = lasso.predict(X_dev)
    
    # Collect coefficients and intercept
    coeffs = dict(zip(X_train.columns, lasso.coef_))
    intercept = lasso.intercept_
    r2_train = lasso.score(X_train, y_train)
    r2_dev = lasso.score(X_dev, y_dev)
    LL_train = calculate_linear_likelihood(y_train, y_hat_train)
    LL_dev = calculate_linear_likelihood(y_dev, y_hat_dev)
    # calculate_normalized_rmse returns the pair (rmse, nrmse)
    rmse_train, nrmse_train = calculate_normalized_rmse(y_train, y_hat_train)
    rmse_dev, nrmse_dev = calculate_normalized_rmse(y_dev, y_hat_dev)
    
    # 'nrmse_train' now holds the normalized error (it used to receive the raw
    # RMSE, unlike 'nrmse_dev'); the raw RMSE is kept under 'rmse_*'.
    lasso_results[group] = {
        'coefficients': coeffs, 
        'intercept': intercept, 
        'r2_train': r2_train,
        'r2_dev': r2_dev,
        'LL_train': LL_train,
        'LL_dev': LL_dev,
        'nrmse_train': nrmse_train,
        'nrmse_dev': nrmse_dev,
        'rmse_train': rmse_train,
        'rmse_dev': rmse_dev,
        'model': lasso
    }
    
    # Display results
    print(f"Lasso α={alpha_lasso} | {first_level}-{second_level}: R²(train)={r2_train:.3f}, R²(dev)={r2_dev:.3f}, LL(train)={LL_train:.3f}, LL(dev)={LL_dev:.3f}, nrmse(train)={nrmse_train:.3f}, nrmse(dev)={nrmse_dev:.3f}")
    for feat, coef in coeffs.items():
        print(f"  {feat}: {coef:.3f}")
    print(f"  intercept: {intercept:.3f}\n")

In [None]:
# How often does each variable survive the L1 penalty, and in which groups?
from collections import Counter, defaultdict

nonzero_counts = Counter()          # variable -> number of groups keeping it
nonzero_keys = defaultdict(list)    # variable -> groups in which it is kept
for group_key, res in lasso_results.items():
    first_level, second_level = group_key
    # Treat |coef| <= 1e-6 as zero to absorb floating-point noise
    kept = [var for var, coef in res['coefficients'].items() if abs(coef) > 1e-6]
    nonzero_counts.update(kept)
    for var in kept:
        nonzero_keys[var].append((first_level, second_level))

# Report, most frequently retained variable first
print(f"Non-zero coefficient counts per variable (Lasso α={alpha_lasso}):")
for var, count in nonzero_counts.most_common():
    groups = nonzero_keys[var]
    print(f"{var}: {count} (groups: {groups})")

In [None]:
def getQFromCd(results, x_data, split, y_scaler=None):
    """
    Turn per-group model predictions into the "p-noInt_optp0-q_model" flow column.

    Each group's model predicts on that group's features; the predictions are
    written to a "C_d" column on the rows of ``flowStatsMI`` belonging to
    ``split`` and passed through ``feUtils.update_flow_and_ventilation``.

    Parameters
    ----------
    results : dict
        group -> result dict containing a fitted estimator under ``'model'``.
    x_data : dict
        group -> feature DataFrame for the requested split; its index is used
        to align predictions with ``flowStatsMI``.
    split : str
        Value of the ``"split"`` column selecting rows of ``flowStatsMI`` and
        ``roomVentilationMI`` (e.g. "train", "dev").
    y_scaler : dict, optional
        group -> fitted target transformer exposing ``inverse_transform``.
        When given, predictions are mapped back through it; when ``None``
        (models fitted on untransformed targets) predictions are used as-is.

    Returns
    -------
    pandas.Series
        The "p-noInt_optp0-q_model" column of the updated flow table.

    Raises
    ------
    ValueError
        If the predictions do not cover exactly the rows of the split.
    """
    cd_parts = []
    for group, res in results.items():
        X = x_data[group]
        cd_hat = np.asarray(res['model'].predict(X)).ravel()
        if y_scaler is not None:
            # Undo the target transform only when the caller supplied one;
            # previously the global y_scalers was applied unconditionally,
            # which also (wrongly) rescaled predictions of unscaled models.
            cd_hat = y_scaler[group].inverse_transform(cd_hat.reshape(-1, 1)).ravel()
        # Keep the row labels so the values can be aligned instead of relying
        # on the group iteration order matching the row order of flowStatsMI.
        cd_parts.append(pd.Series(cd_hat, index=X.index))
    cd_pred = pd.concat(cd_parts)

    dfq_pred = flowStatsMI[flowStatsMI["split"] == split].copy()
    if len(cd_pred) != len(dfq_pred) or not dfq_pred.index.isin(cd_pred.index).all():
        raise ValueError(
            f"Predictions ({len(cd_pred)} rows) do not cover the {len(dfq_pred)} "
            f"rows of split '{split}'"
        )
    dfq_pred["C_d"] = cd_pred.reindex(dfq_pred.index)
    roomVentilationMI_pred = roomVentilationMI[roomVentilationMI["split"] == split].copy()

    dfq_pred, roomVentilationMI_pred = feUtils.update_flow_and_ventilation(dfq_pred, roomVentilationMI_pred, optTypes = ["optp0"])
    y_pred = dfq_pred["p-noInt_optp0-q_model"]

    return y_pred

In [None]:
# For linear regression model
# Diagnostic plots for one group's fitted linear model on the training split:
# first against the model's own target, then against the reference "flux".
# group = ('corner', 'xwindow_0-0')  # Choose a group
# group = ('cross', 'zwindow')  # Choose a group
group = (True, True)  # Choose a group
X_train = x_train_data[group]
y_train = y_train_data[group]
X_dev = x_dev_data[group]
y_dev = y_dev_data[group]
# Target transformer fitted for this group; handed to the plot helper below
y_transformer = y_scalers[group]


# Plot 1: y_pred=None, so predictions presumably come from the model itself
# inside visualize_linear_model (its definition is not in this cell) — TODO confirm.
visualize_linear_model(lr_results[group]["model"], X_train, y_train, y_pred=None, style=df['openingType'], hue=df['roomType'], model_name="Linear Regression", y_transformer=y_transformer)
# visualize_linear_model(ridge_results[group]["model"], X_train, y_train, y_pred=None, style=df['openingType'], hue=df['roomType'], model_name="Ridge Regression", y_transformer=y_transformer)
# visualize_linear_model(lasso_results[group]["model"], X_train, y_train, y_pred=None, style=df['openingType'], hue=df['roomType'], model_name="Lasso Regression", y_transformer=y_transformer)


# Reference values: the "flux" column for exactly the rows of this group's training set
y_ref = flowStatsMI.loc[X_train.index, "flux"]

# Plot 2: predictions converted by getQFromCd for the whole training split,
# then restricted to this group's rows and compared with y_ref.
y_pred = getQFromCd(lr_results, x_train_data, "train", y_scalers)
y_pred = y_pred.loc[X_train.index]  # split out indexes
visualize_linear_model(lr_results[group]["model"], X_train, y_ref, y_pred=y_pred, style=df['openingType'], hue=df['roomType'], model_name="Linear Regression", y_transformer=None)

# y_pred = getQFromCd(ridge_results, x_train_data, "train", y_scalers)
# y_pred = y_pred.loc[X_train.index]  # split out indexes
# visualize_linear_model(ridge_results[group]["model"], X_train, y_ref, y_pred=y_pred, style=df['openingType'], hue=df['roomType'], model_name="Ridge Regression", y_transformer=None)

# y_pred = getQFromCd(lasso_results, x_train_data, "train", y_scalers)
# y_pred = y_pred.loc[X_train.index]  # split out indexes
# visualize_linear_model(lasso_results[group]["model"], X_train, y_ref, y_pred=y_pred, style=df['openingType'], hue=df['roomType'], model_name="Lasso Regression", y_transformer=None)

In [None]:
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

# Same per-group OLS fit as the scaled variant, but on the *_unscaled data.
# Per-group predictions for train/dev/test are kept for the room-level summary.
lr_results_unscaled = {}
y_pred_train = {}
y_pred_dev = {}
y_pred_test = {}

# Iterate the dict that is actually indexed in the loop body
for group in x_train_data_unscaled.keys():
    print(group)
    first_level, second_level = group
    X_train = x_train_data_unscaled[group]#[["p-noInt_optp0-q_model"]]
    y_train = y_train_data_unscaled[group]
    X_dev = x_dev_data_unscaled[group]#[["p-noInt_optp0-q_model"]]
    y_dev = y_dev_data_unscaled[group]
    X_test = x_test_data_unscaled[group]
    y_test = y_test_data_unscaled[group]
    
    # Fit multivariable linear model (sklearn)
    model = LinearRegression(fit_intercept=True)
    model.fit(X_train, y_train)
    
    # Predict once per split; metrics and the stored Series reuse these arrays
    y_hat_train = model.predict(X_train)
    y_hat_dev = model.predict(X_dev)
    y_hat_test = model.predict(X_test)
    
    # Collect coefficients per feature
    coeffs = dict(zip(X_train.columns, model.coef_))
    intercept = model.intercept_
    r2_train = model.score(X_train, y_train)
    r2_dev = model.score(X_dev, y_dev)
    LL_train = calculate_linear_likelihood(y_train, y_hat_train)
    LL_dev = calculate_linear_likelihood(y_dev, y_hat_dev)
    # calculate_normalized_rmse returns the pair (rmse, nrmse)
    rmse_train, nrmse_train = calculate_normalized_rmse(y_train, y_hat_train)
    rmse_dev, nrmse_dev = calculate_normalized_rmse(y_dev, y_hat_dev)
    
    # Significance testing (statsmodels OLS); the intercept shows up as 'const'
    X_sm = sm.add_constant(X_train)
    results_sm = sm.OLS(y_train, X_sm).fit()
    pvalues = results_sm.pvalues.to_dict()
    
    # Store results. The 'nrmse_*' keys hold the normalized error (they used to
    # receive the raw RMSE); the raw RMSE is kept under its own 'rmse_*' keys.
    lr_results_unscaled[group] = {
        'coefficients': coeffs,
        'intercept': intercept,
        'r2_train': r2_train,
        'r2_dev': r2_dev,
        'LL_train': LL_train,
        'LL_dev': LL_dev,
        'nrmse_train': nrmse_train,
        'nrmse_dev': nrmse_dev,
        'rmse_train': rmse_train,
        'rmse_dev': rmse_dev,
        'pvalues': pvalues,
        'model': model,
    }
    
    # Print summary
    print(f"{first_level} - {second_level}: R²(train)={r2_train:.3f}, R²(dev)={r2_dev:.3f}, LL(train)={LL_train:.3f}, LL(dev)={LL_dev:.3f}, nrmse(train)={nrmse_train:.3f}, nrmse(dev)={nrmse_dev:.3f}, rmse(train)={rmse_train:.3f}, rmse(dev)={rmse_dev:.3f}")
    for feat, coef in coeffs.items():
        print(f"  {feat}: {coef:.3f}")
    print(f"  intercept: {intercept:.3f}")
    # Print p-values with significance marker ('*' means p < 0.05)
    print("  p-values:")
    for var, pval in pvalues.items():
        sig = '*' if pval < 0.05 else ''
        print(f"    {var}: {pval:.3f}{sig}")
    print()
    
    y_pred_train[group] = pd.Series(y_hat_train, index=X_train.index)
    y_pred_dev[group] = pd.Series(y_hat_dev, index=X_dev.index)
    y_pred_test[group] = pd.Series(y_hat_test, index=X_test.index)

# From here on X_*/y_* refer to ALL groups recombined, not to the last loop group
X_train = recombine_grouped_data(x_train_data_unscaled)
X_dev = recombine_grouped_data(x_dev_data_unscaled)
y_train = recombine_grouped_data(y_train_data_unscaled)
y_dev = recombine_grouped_data(y_dev_data_unscaled)

# if fitting for C_d, get predictions from the model
y_pred_train_all = getQFromCd(lr_results_unscaled, x_train_data_unscaled, "train")
y_pred_train_all = y_pred_train_all.loc[X_train.index]  # split out indexes
y_pred_dev_all = getQFromCd(lr_results_unscaled, x_dev_data_unscaled, "dev")
y_pred_dev_all = y_pred_dev_all.loc[X_dev.index]

# NOTE(review): `model` is the estimator of the LAST group from the loop above,
# while X_train / X_dev cover every group. Predictions are passed explicitly via
# y_pred, but anything the helper derives from `model` itself (e.g. coefficient
# plots) describes only that last group — confirm this is intended.
y_ref = flowStatsMI.loc[X_train.index, "flux"]
visualize_linear_model(model, X_train, y_ref, y_pred=y_pred_train_all, style=df['openingType'], hue=df['AofA'], model_name="Linear Regression (Unscaled)", top_features=10)
y_ref = flowStatsMI.loc[X_dev.index, "flux"]
visualize_linear_model(model, X_dev, y_ref, y_pred=y_pred_dev_all, style=df['openingType'], hue=df['roomType'], model_name="Linear Regression (Unscaled)", top_features=10)

In [None]:
# Predicted vs. reference flux on the training split, one panel per
# (Ri, skylight) combination, followed by an overall RMSE / NRMSE.
WV_train = df[df["split"] == "train"].copy()
# y_pred_train_all = recombine_grouped_data(y_pred_train)
y_pred_train_all = getQFromCd(lr_results_unscaled, x_train_data_unscaled, "train")
y_pred_train_all = y_pred_train_all.loc[WV_train.index]  # split out indexes

# Model flow rate divided by the "WS" column
WV_train["q-model"] = y_pred_train_all
WV_train["q-model"] = WV_train["q-model"] / WV_train["WS"]

n_rows, n_cols = 2, 3
fig, axs = plt.subplots(n_rows, n_cols, figsize=(24, 12), dpi=140, sharex=True, sharey=True)
axs = axs.flatten()  # Flatten for easier indexing
Ri_values = sorted(WV_train["Ri"].unique())

# Define combinations of (Ri, skylight flag); top row True, bottom row False.
# Requires at least three distinct Ri values.
combinations = [
    (Ri_values[2],  True), 
    (Ri_values[1],  True), 
    (Ri_values[0],  True), 
    (Ri_values[2],  False),
    (Ri_values[1],  False),
    (Ri_values[0],  False),
]


# Value to normalize by
rho = 1.225

# Create plots for each combination
for i, (ri_val, sl_val) in enumerate(combinations):
    # Filter data for this combination
    plotdf = WV_train.copy()
    plotdf = plotdf[np.isclose(plotdf["Ri"], ri_val)]
    plotdf = plotdf[plotdf["skylight"] == sl_val]
    
    # Scatter of model prediction (x) against reference flux (y)
    x_var = "q-model"
    y_var = "flux"
    sns.scatterplot(data=plotdf, x=x_var, y=y_var, hue="roomType", style="slAll", alpha=0.6, ax=axs[i])
    # add 1:1 line spanning the joint data range
    min_val = min(plotdf[x_var].min(), plotdf[y_var].min())
    max_val = max(plotdf[x_var].max(), plotdf[y_var].max())
    axs[i].plot([min_val, max_val], [min_val, max_val], 'r--', label='1:1 Line')
    
    # Customize the subplot: x label on the bottom row, y label on the left
    # column. (The previous `i >= 5` / `i % 5` tests belonged to a 2x5 grid.)
    axs[i].set_title(f"Ri={ri_val:.4f}, {'Skylight' if sl_val else 'Window'}", fontsize=14)
    axs[i].set_xlabel(x_var if i >= n_cols * (n_rows - 1) else "", fontsize=12)
    axs[i].set_ylabel(y_var if i % n_cols == 0 else "", fontsize=12)
    
    # Keep a single legend on the first subplot
    if i == 0:
        axs[i].legend(title="QOI", loc='upper right')
    else:
        # get_legend() returns None when nothing was drawn with a legend
        legend = axs[i].get_legend()
        if legend is not None:
            legend.remove()  # Remove redundant legends

# Add overall title
fig.suptitle("Ventilation Metrics by Room Type, Richardson Number, Steady State, and Skylights", fontsize=16, y=0.98)
plt.tight_layout(rect=[0, 0, 1, 0.96])  # Make room for suptitle

rmse, nrmse  = calculate_normalized_rmse(WV_train["flux"], WV_train["q-model"], normalization='std')
print(f"Overall NRMSE: {nrmse:.4f}, RMSE: {rmse:.4f}")

In [None]:
# Aggregate the per-window model predictions to one value per (run, room).
y_pred_train_all = recombine_grouped_data(y_pred_train)
y_pred_dev_all = recombine_grouped_data(y_pred_dev)
y_pred_test_all = recombine_grouped_data(y_pred_test)

y_pred = pd.concat([y_pred_train_all, y_pred_dev_all, y_pred_test_all], axis=0)

# Columns listing the windows of each room (loop-invariant, so computed once)
windowKeyCols = roomVentilationMI.columns[
    roomVentilationMI.columns.str.contains("windowKeys")
].tolist()

q_model_norm = []
for (run, room), row in roomVentilationMI.iterrows():
    windowKeys = row[windowKeyCols].dropna()
    q_pred = 0.0
    for windowKey in windowKeys:
        # Half the absolute predicted flow of every window of the room.
        # NOTE(review): the 1/2 presumably avoids counting in- and outflow
        # twice — confirm.
        q_pred += np.abs(y_pred.loc[[(run, windowKey)]].values[0]) / 2
    q_model_norm.append(q_pred)

# Values are in iterrows() order, i.e. row order; assigning a float array keeps
# the column numeric (initialising it with None produced an object column).
roomVentilationMI["q-model-Norm"] = np.asarray(q_model_norm, dtype=float)

roomVentilationMI["flux-Norm"] = roomVentilationMI["flux"] / roomVentilationMI["WS"]

In [None]:
# Room-level model vs. reference flux on the training split, one panel per
# (Ri, steady-state flag, skylight flag) combination.
# Select the training rooms here: the RV_train variable is only created in a
# later cell, so relying on it breaks a fresh Restart & Run All. A separate
# name is used so that cell's RV_train is never overwritten by this one.
rv_train_plot = roomVentilationMI[roomVentilationMI["split"] == "train"]

fig, axs = plt.subplots(2, 5, figsize=(24, 12), dpi=140)
axs = axs.flatten()  # Flatten for easier indexing
Ri_values = sorted(rv_train_plot["Ri"].unique())

# Define combinations of Ri, SS, and slAll (requires three distinct Ri values)
combinations = [
    (Ri_values[2], False, True), 
    (Ri_values[1], False, True), 
    (Ri_values[2], True,  True), 
    (Ri_values[1], True,  True), 
    (Ri_values[0], True,  True), 
    (Ri_values[2], False, False),
    (Ri_values[1], False, False),
    (Ri_values[2], True,  False),
    (Ri_values[1], True,  False),
    (Ri_values[0], True,  False),
]


# Value to normalize by
rho = 1.225

# Create plots for each combination
for i, (ri_val, ss_val, sl_val) in enumerate(combinations):
    # Filter data for this combination
    plotdf = rv_train_plot.copy()
    plotdf = plotdf[np.isclose(plotdf["Ri"], ri_val)]
    plotdf = plotdf[plotdf["SS"] == ss_val]
    plotdf = plotdf[plotdf["slAll"] == sl_val]
    
    # Scatter of room-level model prediction (x) against reference flux (y)
    x_var = "q-model-Norm"
    y_var = "flux-Norm"
    sns.scatterplot(data=plotdf, x=x_var, y=y_var, hue="roomType", style="houseType", alpha=0.6, ax=axs[i])
    
    # Customize the subplot: x label on the bottom row, y label on the left column
    axs[i].set_title(f"Ri={ri_val:.4f}, {'Steady State' if ss_val else 'Non-Steady State'}, {'With Skylights' if sl_val else 'No Skylights'}", fontsize=14)
    axs[i].set_xlabel(x_var if i >= 5 else "", fontsize=12)
    axs[i].set_ylabel(y_var if i % 5 == 0 else "", fontsize=12)
    
    # Keep a single legend on the first subplot
    if i == 0:
        axs[i].legend(title="QOI", loc='upper right')
    else:
        # get_legend() returns None when nothing was drawn with a legend
        legend = axs[i].get_legend()
        if legend is not None:
            legend.remove()  # Remove redundant legends

# Add overall title
fig.suptitle("Ventilation Metrics by Room Type, Richardson Number, Steady State, and Skylights", fontsize=16, y=0.98)
plt.tight_layout(rect=[0, 0, 1, 0.96])  # Make room for suptitle

In [None]:
# Training-split room table with the mass-flux based columns divided by rho.
rho = 1.225
# .copy() gives RV_train its own data: the in-place divisions below then cannot
# raise SettingWithCopyWarning or leak into roomVentilationMI, and re-running
# the cell always starts again from the undivided values.
RV_train = roomVentilationMI[roomVentilationMI["split"] == "train"].copy()
RV_train["mean-mass_flux(S)-Norm"] /= -rho  # note the sign flip on this column
RV_train["mean-mass_flux-Norm"] /= rho
RV_train["q-D-room-Norm"] /= rho

# Model prediction vs. reference flux
plt.figure()
sns.scatterplot(data=RV_train, x="q-model-Norm", y="flux-Norm", hue="AofA", style="roomType", alpha=0.6)

# Model prediction vs. q-D-room-Norm
plt.figure()
sns.scatterplot(data=RV_train, x="q-model-Norm", y="q-D-room-Norm", hue="SS", style="Ri", alpha=0.6)

# Reference flux vs. q-D-room-Norm
plt.figure()
sns.scatterplot(data=RV_train, x="flux-Norm", y="q-D-room-Norm", hue="SS", style="Ri", alpha=0.6)

In [None]:
# Two side-by-side scatters of model prediction vs. q-D-room-Norm:
# left panel slAll == False, right panel slAll == True.
Ri_values = sorted(RV_train["Ri"].unique())

# Value to normalize by
rho = 1.225

fig, axs = plt.subplots(1, 2, figsize=(24, 12), dpi=140)
axs = axs.flatten()  # Flatten for easier indexing
for i, ax in enumerate(axs):
    sl_flag = bool(i)  # panel 0 -> False, panel 1 -> True
    plotdf = RV_train[RV_train["slAll"] == sl_flag].copy()
    sns.scatterplot(
        data=plotdf,
        x="q-model-Norm",
        y="q-D-room-Norm",
        hue="Ri",
        style="SS",
        alpha=0.6,
        ax=ax,
    )
    

## Gaussian Process Regression

In [None]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C, WhiteKernel, DotProduct
from sklearn.metrics import r2_score


In [None]:
def calculate_gpr_likelihood(gpr_model, X=None, y=None, use_kernel_noise=True):
    """
    Average (per-sample) log likelihood for a Gaussian Process Regressor.

    Parameters:
    -----------
    gpr_model : GaussianProcessRegressor
        The fitted GPR model
    X : array-like, shape (n_samples, n_features), optional
        Input features. If None, the training-set value is returned.
    y : array-like, shape (n_samples,), optional
        Target values. If None, the training-set value is returned.
    use_kernel_noise : bool, default=True
        Whether to add a noise variance to the predictive variance. The noise
        is taken from the first kernel component exposing ``noise_level``
        (``kernel_.k2``, ``kernel_.k1``, then ``kernel_`` itself); if there is
        none, a scalar ``gpr_model.alpha`` is used.

    Returns:
    --------
    log_likelihood : float
        With no data: the model's log marginal likelihood divided by the number
        of training points. With data: the mean Gaussian log density of ``y``
        under the predictive mean and standard deviation.
    """
    import numpy as np
    from scipy import stats

    # Case 1: No data provided - return training log marginal likelihood,
    # normalized by the number of training points
    if X is None or y is None:
        n_train = len(gpr_model.y_train_)
        if hasattr(gpr_model, 'log_marginal_likelihood_value_'):
            return gpr_model.log_marginal_likelihood_value_ / n_train
        return gpr_model.log_marginal_likelihood(gpr_model.kernel_.theta) / n_train

    # Case 2: Data provided - calculate likelihood for this dataset
    y_mean, y_std = gpr_model.predict(X, return_std=True)
    # Flatten so a column-shaped y cannot broadcast against a 1-D mean into an
    # (n, n) matrix of log densities.
    y_true = np.ravel(np.asarray(y, dtype=float))
    y_mean = np.ravel(y_mean)
    y_std = np.ravel(y_std)

    # Look up the noise variance by attribute instead of matching the kernel's
    # string representation inside a bare `except`.
    noise_variance = None
    if use_kernel_noise:
        kernel = getattr(gpr_model, 'kernel_', None)
        for component in (getattr(kernel, 'k2', None), getattr(kernel, 'k1', None), kernel):
            level = getattr(component, 'noise_level', None)
            if level is not None:
                noise_variance = level
                break
        else:
            # Fall back to alpha, but only when it is a single value: a
            # per-training-sample alpha array cannot be added to the predictive
            # variance of a differently sized X.
            alpha = getattr(gpr_model, 'alpha', None)
            if alpha is not None and np.size(alpha) == 1:
                noise_variance = float(np.ravel(alpha)[0])

    if noise_variance is not None:
        # NOTE(review): if the predictive std returned by the model already
        # contains the kernel's white-noise term, this addition counts that
        # noise twice — check against the scikit-learn documentation.
        std_dev = np.sqrt(y_std**2 + noise_variance)
    else:
        # Use the predicted standard deviations
        std_dev = y_std

    # Calculate average log likelihood assuming Gaussian noise
    log_likelihood = np.mean(stats.norm.logpdf(y_true, loc=y_mean, scale=std_dev))

    return log_likelihood

In [None]:
# Dictionary to store GP results (one entry per group)
gp_results = {}

for group in x_train_data.keys():
    first_level, second_level = group
    X_train = x_train_data[group]
    y_train = y_train_data[group]
    X_dev = x_dev_data[group]
    y_dev = y_dev_data[group]
    
    # The GP is trained on the residuals of the Lasso model, so y_train / y_dev
    # (and every GP metric below) refer to residuals, not to the raw target.
    lasso_model = lasso_results[group]['model']
    y_train = y_train - lasso_model.predict(X_train)
    y_dev = y_dev - lasso_model.predict(X_dev)
    
    print(f"Training GP for {first_level}-{second_level}...")
    
    # Define kernel: signal variance × RBF + noise term
    kernel = (
        C(1.0, (1e-3, 1e3))   # signal variance
        * RBF(1.0, (1e-2, 1e2))  # length-scale
        # + DotProduct()      # global linear trend
        + WhiteKernel(
            noise_level=1e-2,
            noise_level_bounds=(1e-5, 1e1)
          )
    )
    
    # Instantiate GPR
    gpr = GaussianProcessRegressor(
        kernel=kernel,
        n_restarts_optimizer=10,
        random_state=0,
        normalize_y=False  # since we've already normalized
    )
    
    # Fit on the training set
    gpr.fit(X_train, y_train)
    
    # Evaluate and store INSIDE the loop so that every group gets an entry;
    # evaluating only after the loop records the last group alone, and the
    # model comparison then fails with a KeyError for all other groups.
    y_dev_pred, y_dev_std = gpr.predict(X_dev, return_std=True)
    r2_dev = r2_score(y_dev, y_dev_pred)
    r2_train = gpr.score(X_train, y_train)
    LL_train = calculate_gpr_likelihood(gpr)
    LL_dev = calculate_gpr_likelihood(gpr, X_dev, y_dev)
    # calculate_normalized_rmse returns the pair (rmse, nrmse)
    rmse_train, nrmse_train = calculate_normalized_rmse(y_train, gpr.predict(X_train))
    rmse_dev, nrmse_dev = calculate_normalized_rmse(y_dev, y_dev_pred)
    
    gp_results[group] = {
        'model': gpr,
        'kernel': gpr.kernel_,
        'r2_dev': r2_dev,
        'r2_train': r2_train,
        'LL_dev': LL_dev,
        'LL_train': LL_train,
        'nrmse_train': nrmse_train,
        'nrmse_dev': nrmse_dev,
        'rmse_train': rmse_train,
        'rmse_dev': rmse_dev,
    }
    print(f"  Train R² = {r2_train:.3f}, Dev R² = {r2_dev:.3f}, learned kernel: {gpr.kernel_}")
    


In [None]:
# Validate on dev set
# NOTE: gpr, group, X_train, y_train, X_dev and y_dev are the variables left
# over from the GP training loop, i.e. they describe the LAST group trained.
y_dev_pred, y_dev_std = gpr.predict(X_dev, return_std=True)

r2_dev = r2_score(y_dev, y_dev_pred)
r2_train = gpr.score(X_train, y_train)
LL_train = calculate_gpr_likelihood(gpr)
LL_dev = calculate_gpr_likelihood(gpr, X_dev, y_dev)
# calculate_normalized_rmse returns the pair (rmse, nrmse)
rmse_train, nrmse_train = calculate_normalized_rmse(y_train, gpr.predict(X_train))
rmse_dev, nrmse_dev = calculate_normalized_rmse(y_dev, y_dev_pred)

# Store results. 'nrmse_train' now holds the normalized error (it used to
# receive the raw RMSE, unlike 'nrmse_dev'); raw RMSE is kept under 'rmse_*'.
gp_results[group] = {
    'model': gpr,
    'kernel': gpr.kernel_,
    'r2_dev': r2_dev,
    'r2_train': r2_train,
    'LL_dev': LL_dev,
    'LL_train': LL_train,
    'nrmse_train': nrmse_train,
    'nrmse_dev': nrmse_dev,
    'rmse_train': rmse_train,
    'rmse_dev': rmse_dev,
}

print(f"Train R² = {r2_train:.3f}, Dev R² = {r2_dev:.3f}")
print(f"Log-Likelihood Train: {LL_train :.3f}, Dev: {LL_dev:.3f}")
print(f"RMSE Train: {rmse_train:.3f}, Dev: {rmse_dev:.3f}")
print("Learned kernel:", gpr.kernel_)
print()

In [None]:
def visualize_gpr_fit(gpr_model, X, y, hue=None, style=None, feature_names=None, top_features=10):
    """
    Create a comprehensive visualization of GPR model fit with better feature interpretation.

    Draws a 2x2 diagnostic figure (prediction vs. actual, sorted predictions
    with 95% intervals, residuals, histogram of predictive std). When X has
    more than one feature it also shows a sensitivity-based feature ranking
    and, for DataFrame input, feature/target correlations.

    Parameters:
    -----------
    gpr_model : GaussianProcessRegressor
        The fitted GPR model
    X : DataFrame
        Input features as pandas DataFrame
    y : Series or array
        Target values
    hue : Series, optional
        Values used to colour the scatter points; selected with
        ``hue.loc[X.index]``. Defaults to the predictive standard deviation.
    style : Series, optional
        Values used for marker style; selected with ``style.loc[X.index]``.
    feature_names : list, optional
        Names of features (if not provided, will use X.columns)
    top_features : int, optional
        Number of top features to show in importance plot

    Returns:
    --------
    None. Figures are shown and tables displayed as a side effect.
    """
    import numpy as np
    import matplotlib.pyplot as plt
    import pandas as pd
    from sklearn.metrics import mean_squared_error
    import seaborn as sns
    
    # Flat numpy view of the target (also copes with a single-column frame)
    y_values = np.ravel(y.values if hasattr(y, 'values') else np.array(y))
    
    # Get feature names from DataFrame if not provided
    if feature_names is None and hasattr(X, 'columns'):
        feature_names = X.columns.tolist()
    elif feature_names is None:
        feature_names = [f'Feature {i}' for i in range(X.shape[1])]
    
    # Get predictions and uncertainty
    y_pred, y_std = gpr_model.predict(X, return_std=True)
    # Row labels of X, or None for plain arrays (then a default index is used)
    row_index = getattr(X, 'index', None)
    if hue is None:
        hue = pd.Series(y_std, index=row_index, name='Prediction Uncertainty')
    elif row_index is not None:
        hue = hue.loc[row_index]

    if style is not None and row_index is not None:
        style = style.loc[row_index]
    
    # Calculate metrics
    r2 = gpr_model.score(X, y_values)
    rmse = np.sqrt(mean_squared_error(y_values, y_pred))
    
    # Create figure with subplots
    fig = plt.figure(figsize=(16, 14))
    
    # 1. Prediction vs Actual plot
    ax1 = fig.add_subplot(221)
    sns.scatterplot(x=y_values, y=y_pred, hue=hue, style=style, alpha=0.6, ax=ax1)
    
    # Add perfect prediction line
    min_val = min(min(y_values), min(y_pred))
    max_val = max(max(y_values), max(y_pred))
    ax1.plot([min_val, max_val], [min_val, max_val], 'r--')
    
    ax1.set_xlabel('Actual Values')
    ax1.set_ylabel('Predicted Values')
    ax1.set_title(f'Prediction vs Actual\nR² = {r2:.4f}, RMSE = {rmse:.4f}')
    ax1.grid(True, alpha=0.3)
    
    # 2. Prediction vs Actual with Uncertainty
    ax2 = fig.add_subplot(222)
    # Sort by actual values for clearer visualization
    sort_idx = np.argsort(y_values)
    ax2.errorbar(np.arange(len(y_values)), y_values[sort_idx], yerr=0, fmt='o', label='Actual', alpha=0.6)
    ax2.errorbar(np.arange(len(y_values)), y_pred[sort_idx], yerr=1.96*y_std[sort_idx], 
                fmt='o', label='Predicted with 95% CI', alpha=0.6)
    ax2.set_xlabel('Sample Index (sorted by actual value)')
    ax2.set_ylabel('Value')
    ax2.set_title('GPR Predictions with Uncertainty')
    ax2.legend()
    ax2.grid(True, alpha=0.3)
    
    # 3. Residuals plot
    ax3 = fig.add_subplot(223)
    residuals = y_values - y_pred
    sns.scatterplot(x=y_pred, y=residuals, hue=hue, style=style, alpha=0.6, ax=ax3)
    ax3.axhline(y=0, color='r', linestyle='--')
    ax3.set_xlabel('Predicted Values')
    ax3.set_ylabel('Residuals')
    ax3.set_title('Residuals vs Predicted')
    ax3.grid(True, alpha=0.3)
    
    # 4. Uncertainty distribution
    ax4 = fig.add_subplot(224)
    ax4.hist(y_std, bins=20, alpha=0.6)
    ax4.set_xlabel('Prediction Standard Deviation')
    ax4.set_ylabel('Frequency')
    ax4.set_title('Distribution of Prediction Uncertainty')
    ax4.grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.show()
    
    # Feature importance analysis
    if hasattr(X, 'values'):
        X_values = X.values
    else:
        X_values = X
    
    if X_values.shape[1] > 1:
        print("\n--- Feature Importance Analysis ---")
        
        # Estimate feature importance by varying one feature at a time over its
        # observed range while every other feature is held at its mean.
        # (No figure is opened here: the bar chart below creates its own, and
        # an extra plt.figure() would only leave an empty figure behind.)
        importance = []
        feature_means = np.mean(X_values, axis=0)
        
        for i in range(X_values.shape[1]):
            # Use pandas if available for min/max
            if hasattr(X, 'iloc'):
                feature_min = X.iloc[:, i].min()
                feature_max = X.iloc[:, i].max()
            else:
                feature_min = np.min(X_values[:, i])
                feature_max = np.max(X_values[:, i])
                
            x_grid = np.linspace(feature_min, feature_max, 50)
            X_grid = np.tile(feature_means, (50, 1))
            X_grid[:, i] = x_grid
            
            # Predict across the grid; keep the column labels when X is a
            # DataFrame so the model sees the same feature names as in X.
            if hasattr(X, 'columns'):
                grid_input = pd.DataFrame(X_grid, columns=X.columns)
            else:
                grid_input = X_grid
            y_grid = gpr_model.predict(grid_input)
            
            # Calculate importance as range of predictions
            importance.append(np.max(y_grid) - np.min(y_grid))
        
        # Create DataFrame for importance
        importance_df = pd.DataFrame({
            'Feature': feature_names,
            'Importance': importance
        })
        
        # Sort by importance
        importance_df = importance_df.sort_values('Importance', ascending=False)
        
        # Show top features
        top_importance = importance_df.head(top_features)
        
        # Plot top feature importance
        plt.figure(figsize=(12, 8))
        sns.barplot(x='Importance', y='Feature', data=top_importance)
        plt.title(f'Top {top_features} Feature Importance in GPR Model')
        plt.tight_layout()
        plt.show()
        
        # Display importance table
        print("\nFeature Importance Ranking:")
        display(importance_df)
        
        # Feature correlation with target (DataFrame input only)
        if hasattr(X, 'corrwith'):
            print("\nFeature Correlation with Target:")
            corr_df = pd.DataFrame({
                'Feature': feature_names,
                'Correlation': X.corrwith(pd.Series(y_values, index=X.index)).values
            })
            corr_df['Abs_Correlation'] = np.abs(corr_df['Correlation'])
            corr_df = corr_df.sort_values('Abs_Correlation', ascending=False)
            display(corr_df)
            
            # Plot correlation heatmap for the (at most 15) most important features
            plt.figure(figsize=(12, 10))
            top_features_list = importance_df.head(min(15, len(feature_names)))['Feature'].tolist()
            X_top = X[top_features_list]
            
            # Add target to the correlation matrix
            X_with_y = X_top.copy()
            X_with_y['Target'] = y_values
            
            # Plot correlation heatmap
            sns.heatmap(X_with_y.corr(), annot=True, cmap='coolwarm', vmin=-1, vmax=1)
            plt.title('Correlation Heatmap of Top Features with Target')
            plt.tight_layout()
            plt.show()

# Pick one group and rebuild the data its GP was trained on
group = (True, True)  # or any other group you're interested in
# group = ('all', 'all')  # or any other group you're interested in
X_train, X_dev = x_train_data[group], x_dev_data[group]

# Targets are the residuals left by this group's Lasso model
residual_base = lasso_results[group]['model']
y_train = y_train_data[group] - residual_base.predict(X_train)
y_dev = y_dev_data[group] - residual_base.predict(X_dev)

model = gp_results[group]['model']


In [None]:

# Visualize with better feature interpretation
# Training split: points are coloured and styled by the 'openingType' column of df.
visualize_gpr_fit(model, X_train, y_train, hue=df['openingType'], style=df['openingType'])

# To see only top 5 most important features
# Dev split: hue=None makes visualize_gpr_fit colour points by the predictive
# standard deviation; marker style follows the 'slAll' column of df.
visualize_gpr_fit(model, X_dev, y_dev, hue=None, style=df['slAll'], top_features=5)

## Model Comparison

In [None]:
# Dev-set comparison of every model family, group by group
import pandas as pd

# Metric display name -> result-dict key, direction of "better", and caption
metrics = {
    'R²': {'key': 'r2_dev', 'higher_is_better': True, 'description': 'R² scores on dev set (higher is better)'},
    'Log-Likelihood': {'key': 'LL_dev', 'higher_is_better': True, 'description': 'Log-likelihood scores on dev set (higher is better)'},
    'NRMSE': {'key': 'nrmse_dev', 'higher_is_better': False, 'description': 'Normalized RMSE scores on dev set (lower is better)'}
}

# Model types to compare
models = ['Linear', 'Ridge', 'Lasso', 'GP']
model_results = {'Linear': lr_results, 'Ridge': ridge_results, 'Lasso': lasso_results, 'GP': gp_results}

# One table per metric: rows are groups, columns are model families
results_df = {}
for metric_name, metric_info in metrics.items():
    metric_key = metric_info['key']
    higher_is_better = metric_info['higher_is_better']
    
    # One record per group holding that group's score for every model family
    metric_data = [
        {
            'first_level': group[0],
            'second_level': group[1],
            **{
                model_name: model_result[group][metric_key]
                for model_name, model_result in model_results.items()
            },
        }
        for group in x_train_data.keys()
    ]
    
    metric_df = pd.DataFrame(metric_data).set_index(['first_level', 'second_level'])
    results_df[metric_name] = metric_df
    
    # Per-group table followed by the column-wise mean
    print(f"\n{metric_info['description']}:")
    display(metric_df)
    print(f"\nAverage {metric_name} by model type:")
    print(metric_df.mean())
    
    # Winning model per group: largest score when higher is better, else smallest
    best_model = metric_df.idxmax(axis=1) if higher_is_better else metric_df.idxmin(axis=1)
    
    print(f"\nBest model for each group ({metric_name}):")
    display(pd.DataFrame({f'Best Model ({metric_name})': best_model}))
