In [5]:
# Import libraries
import pandas as pd
import numpy as np
import statsmodels.api as sm

In [6]:
# Load input and output
x_npy = np.load('/lustre/gpu-lustre/code/Input/x_array_4_years_filtered_utc.npy')
y_npy = np.load('/lustre/gpu-lustre/code/Input/y_array_4_years_filtered_utc.npy')
time_npy = np.load('/lustre/gpu-lustre/code/Input/time_array_4_years_filtered_utc.npy')
#edge_index = torch.load("/lustre/code/BiasCorrection/Codes/edge_index.pt")


In [13]:
def run_significance_analysis(X: pd.DataFrame, y: pd.Series) -> pd.DataFrame:
    """
    Performs individual simple linear regression (SLR) for each predictor
    in X against the target y to assess statistical significance using
    the F-test for regression.

    Args:
        X: DataFrame of predictor variables (shape: [n_samples, n_features]).
        y: Series of the target variable (shape: [n_samples,]).

    Returns:
        A DataFrame summarizing the F-test results (F-statistic, P-value, R-squared).
    """
    
    # Initialize an empty list to store results for each predictor
    results_list = []

    # Loop through each predictor column in the input DataFrame X
    for col in X.columns:
        # 1. Define the predictor (X_single) and add a constant for the intercept (required by statsmodels)
        X_single = sm.add_constant(X[col])

        # 2. Fit the Simple Linear Regression (SLR) model
        # The Ordinary Least Squares (OLS) model inherently produces the F-statistic for the overall model,
        # which, in a simple linear regression (one predictor), tests the significance of that single predictor.
        try:
            model = sm.OLS(y, X_single, missing='drop').fit()
        except Exception as e:
            print(f"Error fitting model for {col}: {e}")
            continue

        # 3. Extract the required statistical metrics
        f_statistic = model.fvalue
        p_value = model.f_pvalue
        r_squared = model.rsquared

        # 4. Determine the conclusion based on the common alpha level (alpha=0.05)
        is_significant = 'Significant (p < 0.05)' if p_value < 0.05 else 'Not Significant (p >= 0.05)'

        # 5. Store the results
        results_list.append({
            'Predictor': col,
            'F_Statistic': f_statistic,
            'P_Value': p_value,
            'Conclusion': is_significant
        })

    # Convert the list of results into a final DataFrame
    results_df = pd.DataFrame(results_list)
    
    # Format columns for cleaner display
    results_df['F_Statistic'] = results_df['F_Statistic'].round(3)
    results_df['P_Value'] = results_df['P_Value'].apply(lambda x: f"{x:.4f}" if x >= 0.001 else "< 0.001")
    
    return results_df.sort_values(by='F_Statistic', ascending=False).reset_index(drop=True)


# --- Data Loading and Reshaping Logic ---

# Reshape Predictors (X) from [C, S, F, T] to [C*S*T, F]
# This flattens the sample dimensions (cycle, station, time) into one.
X_flat = np.moveaxis(x_npy, 2, -1).reshape(-1, x_npy.shape[2])
N_samples_total = X_flat.shape[0]

# Reshape Target (Y) from [C, S, T] to [C*S*T]
Y_flat = y_npy.flatten()


# Define Feature Names
# Based on your previous context, we use the following standard names for oceanographic/meteorological inputs:
feature_names = [
    'STOFS_Water_Level',
    'GFS_Wind_V_Comp',
    'GFS_Wind_U_Comp', 
    'Surface_Pressure', 
    'Solar_Angle_Sin', 
    'Lunar_Angle_Cos',
    'Latitute',
    'Longitude'
]

# Convert to DataFrame/Series for pandas/statsmodels
X_Predictors = pd.DataFrame(X_flat, columns=feature_names)
Y_Target = pd.Series(Y_flat, name="Target_Y")


# --- Execution ---
print("--- Statistical Significance Analysis (F-Regression) ---")
print(f"Target Variable: Y (Water Level); N={N_samples_total} samples.")
print(f"Predictor shape after flattening: {X_Predictors.shape}")
print("-" * 60)

significance_report = run_significance_analysis(X_Predictors, Y_Target)

print("\nStatistical Significance Report (Alpha = 0.05):")
print(significance_report.to_string(index=False))


--- Statistical Significance Analysis (F-Regression) ---
Target Variable: Y (Water Level); N=12273582 samples.
Predictor shape after flattening: (12273582, 8)
------------------------------------------------------------

Statistical Significance Report (Alpha = 0.05):
        Predictor  F_Statistic P_Value             Conclusion
STOFS_Water_Level   181283.267 < 0.001 Significant (p < 0.05)
 Surface_Pressure    89693.209 < 0.001 Significant (p < 0.05)
         Latitute    88156.651 < 0.001 Significant (p < 0.05)
  Solar_Angle_Sin    24550.806 < 0.001 Significant (p < 0.05)
        Longitude     6282.480 < 0.001 Significant (p < 0.05)
  GFS_Wind_U_Comp     3124.724 < 0.001 Significant (p < 0.05)
  Lunar_Angle_Cos     1693.380 < 0.001 Significant (p < 0.05)
  GFS_Wind_V_Comp      848.296 < 0.001 Significant (p < 0.05)
