In [1]:
# ===== Standard Library =====
import os
import gc
import warnings

# ===== Data Science Libraries =====
import numpy as np
import pandas as pd
import scipy.stats as stats
from scipy.stats import loguniform, norm, randint, uniform

# ===== Visualization =====
import matplotlib.pyplot as plt
import seaborn as sns

# ===== Scikit-learn =====
# Preprocessing
from sklearn.preprocessing import (
    LabelEncoder,
    PolynomialFeatures,
    PowerTransformer,
    QuantileTransformer,
    StandardScaler,
    RobustScaler
)

# Imputation
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer

# Model selection & evaluation
from sklearn.model_selection import (
    train_test_split,
    KFold,
    cross_val_score,
    GridSearchCV,
    RandomizedSearchCV,
)
from sklearn.metrics import mean_squared_error, r2_score

# Feature engineering
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import mutual_info_regression

# Model
from sklearn.ensemble import RandomForestRegressor

# ===== External ML Libraries =====
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
import shap

# ===== Statistical Tools =====
from statsmodels.stats.outliers_influence import variance_inflation_factor

# ===== Utility =====
import joblib

# -- Warning filters --
# A blanket 'ignore' filter plus two narrower ones. The blanket call is issued
# twice, as in the original configuration section.
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=UserWarning, module='lightgbm')
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore')

# -- File paths --
_COMPETITION_DIR = 'data/regression-rumble-ndc-2025'
TRAIN_PATH = f'{_COMPETITION_DIR}/train.csv'
TEST_PATH = f'{_COMPETITION_DIR}/test.csv'
SUBMISSION_PATH = f'{_COMPETITION_DIR}/sample_submission.csv'

# -- Constants --
SEED = 42                        # seed shared by numpy and the model/search objects
TARGET = 'hydrostatic_pressure'  # Switched target to hydrostatic_pressure
N_SPLITS = 10
N_SPLITS_CV = 5                  # number of folds used by the KFold splitters below
EPSILON = 1e-6
TARGET_R2 = 0.99                 # CV R² threshold that gates the early imputation step

# -- Basic configuration --
for _option, _value in (('display.max_columns', None), ('display.max_rows', 100)):
    pd.set_option(_option, _value)

np.random.seed(SEED)

### Hydrostatic 1

#### LGBM

In [None]:
# --- LightGBM configuration ---
# Keyword arguments shared by every LGBMRegressor built in this notebook.
LGBM_PARAMS_COMMON = dict(
    random_state=SEED,
    n_jobs=-1,
    verbose=-1,  # keep LightGBM quiet while fitting
)

# Candidate values sampled by the randomized hyperparameter search.
# NOTE(review): bagging_freq=0 presumably disables bagging in LightGBM, which
# would make `subsample` inert for those candidates -- confirm.
LGBM_PARAM_GRID = dict(
    n_estimators=[2000, 2500, 3000],
    learning_rate=[0.01, 0.03, 0.05],
    num_leaves=[31, 63, 127],
    max_depth=[7, 8, 9, 10],
    min_child_samples=[5, 20, 50],
    subsample=[0.7, 0.8, 0.9],
    colsample_bytree=[0.7, 0.8, 0.9],
    reg_alpha=[0, 0.1, 0.5, 1.0],
    reg_lambda=[0, 0.1, 0.5, 1.0],
    min_split_gain=[0, 0.1, 0.2],
    bagging_freq=[0, 1, 5],
    feature_fraction_seed=[SEED],  # single value: held fixed across candidates
)

In [None]:
# --- Helper Functions ---
def create_datetime_features(df, column_name):
    """Return a copy of ``df`` with calendar features derived from a datetime column.

    Adds ``month``, ``day``, ``hour``, ``dayofweek``, ``weekofyear`` and
    ``quarter``. The source column itself is kept, not dropped.

    Args:
        df: Input DataFrame; it is never modified.
        column_name: Name of the column to derive features from.

    Returns:
        A new DataFrame. If ``column_name`` is missing or is not a datetime
        dtype, a warning is printed and the copy is returned without new
        columns.
    """
    df_copy = df.copy()
    if column_name not in df_copy.columns:
        print(f"Warning: Datetime column '{column_name}' not found for feature extraction.")
        return df_copy

    # Ensure the column is actually datetime before attempting dt accessor
    if not pd.api.types.is_datetime64_any_dtype(df_copy[column_name]):
        print(f"Warning: Column '{column_name}' is not datetime type. Skipping datetime feature extraction.")
        return df_copy

    timestamps = df_copy[column_name].dt
    df_copy['month'] = timestamps.month
    df_copy['day'] = timestamps.day
    df_copy['hour'] = timestamps.hour
    df_copy['dayofweek'] = timestamps.dayofweek

    # isocalendar() yields a nullable integer column; a plain astype(int)
    # raises as soon as one timestamp is NaT. Fall back to float (NaN for the
    # missing rows) in that case, matching how the other features behave.
    iso_week = timestamps.isocalendar().week
    if iso_week.isna().any():
        df_copy['weekofyear'] = iso_week.astype('float64')
    else:
        df_copy['weekofyear'] = iso_week.astype(int)

    df_copy['quarter'] = timestamps.quarter
    return df_copy

def advanced_feature_engineering(df_in):
    """Return a copy of ``df_in`` extended with engineered columns.

    Every feature is added only when its source columns are present:
    temperature x salinity products per depth level, surface-minus-50m
    differences, surface/50m ratios, powers of ``depth_m``, features derived
    from ``seafloor_pressure``, and per-row mean/std/range across the depth
    profile of temperature and salinity.
    """
    df = df_in.copy()
    epsilon = 1e-6  # added to denominators for safe division

    def has(*names):
        # True when every named column exists in the working frame.
        return all(name in df.columns for name in names)

    def numeric(name):
        # Column coerced to numeric; unparseable entries become NaN.
        return pd.to_numeric(df[name], errors='coerce')

    def is_numeric(name):
        return has(name) and pd.api.types.is_numeric_dtype(df[name])

    depth_suffixes = ['0m', '10m', '20m', '30m', '40m', '50m']

    # Temperature x salinity interaction at each depth level
    for suffix in depth_suffixes:
        temp_col, sal_col = f'water_temperature_{suffix}', f'salinity_{suffix}'
        if has(temp_col, sal_col):
            df[f'temp_sal_interaction_{suffix}'] = numeric(temp_col) * numeric(sal_col)

    # Surface-minus-50m difference for each measured quantity
    for param in ('water_temperature', 'salinity', 'dissolved_oxygen', 'ph', 'turbidity', 'chlorophyll'):
        shallow, deep = f'{param}_0m', f'{param}_50m'
        if has(shallow, deep):
            df[f'{param}_grad_0_50m'] = numeric(shallow) - numeric(deep)

    # Surface / 50m ratios
    for ratio_name, stem in (('temp_ratio_0_50m', 'water_temperature'),
                             ('sal_ratio_0_50m', 'salinity')):
        top, bottom = f'{stem}_0m', f'{stem}_50m'
        if has(top, bottom):
            df[ratio_name] = numeric(top) / (numeric(bottom) + epsilon)

    # Polynomial terms of depth
    depth_is_numeric = is_numeric('depth_m')
    if depth_is_numeric:
        df['depth_m_sq'] = df['depth_m']**2
        df['depth_m_cub'] = df['depth_m']**3

    # seafloor_pressure-derived terms
    if is_numeric('seafloor_pressure'):
        df['seafloor_pressure_sq'] = df['seafloor_pressure']**2
        if depth_is_numeric:
            df['seafloor_pressure_per_depth'] = df['seafloor_pressure'] / (df['depth_m'] + epsilon)

    # Row-wise summary statistics across the depth profile
    for param_name in ('water_temperature', 'salinity'):
        candidates = [f'{param_name}_{suffix}' for suffix in depth_suffixes]
        present = [col for col in candidates if col in df.columns]
        if len(present) <= 1:
            continue
        profile = pd.concat([numeric(col) for col in present], axis=1)
        df[f'{param_name}_mean_profile'] = profile.mean(axis=1)
        df[f'{param_name}_std_profile'] = profile.std(axis=1)
        df[f'{param_name}_range_profile'] = profile.max(axis=1) - profile.min(axis=1)

    return df

def preprocess_data_for_model(data_df,
                              engineered_feature_columns_to_use, # List of feature names from training
                              is_training_data,
                              for_model_A=True): # Flag to distinguish FE paths if needed
    """Build the model input matrix for either training or prediction.

    Args:
        data_df: Raw rows (without the target column); never modified.
        engineered_feature_columns_to_use: Feature names produced by the
            training call. Required when ``is_training_data`` is False and
            ignored otherwise.
        is_training_data: True to discover the feature set, False to align
            ``data_df`` to a previously discovered one.
        for_model_A: Accepted for interface compatibility; currently unused.

    Returns:
        ``(features, feature_names)``. In training mode ``features`` holds all
        numeric columns after feature engineering and ``feature_names`` is
        their ordered list. In prediction mode ``features`` has exactly the
        requested columns in the requested order and ``feature_names`` is
        None.

    Raises:
        ValueError: In prediction mode when no feature list is supplied.
    """
    df_processed = advanced_feature_engineering(data_df.copy())

    if is_training_data:
        # For training, select all numeric columns (original + engineered)
        df_final_features = df_processed.select_dtypes(include=np.number)
        return df_final_features, df_final_features.columns.tolist()

    if engineered_feature_columns_to_use is None:
        raise ValueError(
            "engineered_feature_columns_to_use is required when is_training_data is False."
        )

    # Add any training feature that is absent here as an all-NaN column
    for col in engineered_feature_columns_to_use:
        if col not in df_processed.columns:
            df_processed[col] = np.nan

    # Select and reorder to match the training feature set
    df_final_features = df_processed[engineered_feature_columns_to_use].copy()

    # Coerce rather than drop non-numeric columns: dropping would silently
    # change the column set and misalign the matrix with the fitted model.
    for col in df_final_features.columns:
        if not pd.api.types.is_numeric_dtype(df_final_features[col]):
            df_final_features[col] = pd.to_numeric(df_final_features[col], errors='coerce')

    return df_final_features, None # No new training features list for prediction data

def train_lgbm_model(X_train, y_train, model_name):
    """Tune and fit a LightGBM regressor with a randomized hyperparameter search.

    Samples 90 candidates from ``LGBM_PARAM_GRID``, scores each with
    shuffled K-fold cross-validation on R², and prints the winning
    parameters, its CV score and the ten largest feature importances.

    Returns:
        ``(best_estimator, feature_names, best_cv_r2)``, or
        ``(None, [], -1.0)`` when either input is empty.
    """
    print(f"\n--- Training {model_name} ---")
    nothing_to_fit = X_train.empty or y_train.empty
    if nothing_to_fit:
        print(f"Training data for {model_name} is empty. Skipping training.")
        return None, [], -1.0  # model, features_used_list, best_cv_score

    n_features = X_train.shape[1]
    print(f"Performing RandomizedSearchCV for {model_name} with {n_features} features...")

    # Randomized rather than exhaustive search: the grid is too large to enumerate.
    search = RandomizedSearchCV(
        estimator=lgb.LGBMRegressor(**LGBM_PARAMS_COMMON),
        param_distributions=LGBM_PARAM_GRID,
        n_iter=90,
        cv=KFold(n_splits=N_SPLITS_CV, shuffle=True, random_state=SEED),
        scoring='r2',
        verbose=1,
        n_jobs=-1,
        random_state=SEED,
    ).fit(X_train, y_train)

    tuned_model = search.best_estimator_
    print(f"Best parameters for {model_name}: {search.best_params_}")
    print(f"Best cross-validated R² score for {model_name}: {search.best_score_:.4f}")

    # Feature importance analysis
    if tuned_model is not None:
        ranking = pd.DataFrame(
            {'Feature': X_train.columns, 'Importance': tuned_model.feature_importances_}
        )
        ranking = ranking.sort_values(by='Importance', ascending=False)
        print("\nTop 10 Feature Importances:")
        print(ranking.head(10))

    return tuned_model, X_train.columns.tolist(), search.best_score_

In [3]:
# --- STEP 1: Preprocessing & Setup ---
print("\n--- STEP 1: Preprocessing & Setup ---")
csv_path = 'data/fitur_clean/seafloor1.csv'
df = pd.read_csv(csv_path, parse_dates=['depth_reading_time'])
print(f"Loaded dataset with shape: {df.shape}")

# Remember which rows lacked hydrostatic_pressure at load time, split by
# whether seafloor_pressure is available (Subset A) or also missing (Subset B).
missing_hydro_pressure_mask_initial = df['hydrostatic_pressure'].isnull()
seafloor_missing_mask_initial = df['seafloor_pressure'].isnull()
subset_A_indices = df.loc[missing_hydro_pressure_mask_initial & ~seafloor_missing_mask_initial].index
subset_B_indices = df.loc[missing_hydro_pressure_mask_initial & seafloor_missing_mask_initial].index
print(f"Size of Subset A (initially missing hydrostatic_pressure, seafloor_pressure available): {len(subset_A_indices)}")
print(f"Size of Subset B (initially missing both hydrostatic_pressure and seafloor_pressure): {len(subset_B_indices)}")

# Clean `oxygen_saturation_50m`: swap decimal commas for dots, then parse as numbers
if 'oxygen_saturation_50m' in df.columns:
    print("Cleaning 'oxygen_saturation_50m'...")
    oxygen_as_text = df['oxygen_saturation_50m'].astype(str).str.replace(',', '.', regex=False)
    df['oxygen_saturation_50m'] = pd.to_numeric(oxygen_as_text, errors='coerce')

# Extract datetime features (the original datetime column is kept, not dropped)
df = create_datetime_features(df, 'depth_reading_time')
print(f"Shape after initial cleaning and datetime features: {df.shape}")


--- STEP 1: Preprocessing & Setup ---
Loaded dataset with shape: (21888, 60)
Size of Subset A (initially missing hydrostatic_pressure, seafloor_pressure available): 5278
Size of Subset B (initially missing both hydrostatic_pressure and seafloor_pressure): 1289
Cleaning 'oxygen_saturation_50m'...
Shape after initial cleaning and datetime features: (21888, 60)


In [4]:
# --- STEP 2: Model A – Predict `hydrostatic_pressure` (with `seafloor_pressure`) ---
print("\n--- STEP 2: Model A (with seafloor_pressure) ---")
model_A = None
model_A_features = [] # To store feature names used by Model A
cv_r2_A = -1.0 # Initialize with a value indicating not run or failed
subset_A_early_imputed = False
# Defined up front so the Step 3 cell, which receives both as arguments, can
# still run (and report that there is no training data) when the `if` below
# is skipped; previously that case ended in a NameError.
X_A_train = pd.DataFrame()
y_A_train = pd.Series(dtype=float)

# Training data for Model A: hydrostatic_pressure NOT NULL, seafloor_pressure NOT NULL
train_A_df_mask = df['hydrostatic_pressure'].notnull() & df['seafloor_pressure'].notnull()
train_A_df = df[train_A_df_mask].copy() # Use .copy() to avoid SettingWithCopyWarning later

if not train_A_df.empty:
    y_A_train = train_A_df['hydrostatic_pressure']
    X_A_train_raw = train_A_df.drop(columns=['hydrostatic_pressure'])

    # Preprocess features for Model A training
    X_A_train, model_A_features_discovered = preprocess_data_for_model(
        X_A_train_raw, None, is_training_data=True, for_model_A=True
    )
    model_A_features = model_A_features_discovered # Store for prediction phase

    print(f"Shape of X_A_train for Model A: {X_A_train.shape}, y_A_train: {y_A_train.shape}")
    model_A, _, cv_r2_A = train_lgbm_model(X_A_train, y_A_train, "Model A")

    # Conditional early imputation for Subset A: only when the model exists
    # and its cross-validated R² reaches the TARGET_R2 threshold.
    if model_A is not None and cv_r2_A >= TARGET_R2:
        if len(subset_A_indices) > 0:
            print(f"Model A performance is high (CV R²: {cv_r2_A:.4f} >= {TARGET_R2}). Performing early imputation for Subset A.")
            subset_A_data_raw_predict = df.loc[subset_A_indices].drop(columns=['hydrostatic_pressure'], errors='ignore').copy()

            subset_A_features_predict, _ = preprocess_data_for_model(
                subset_A_data_raw_predict, model_A_features, is_training_data=False, for_model_A=True
            )

            predictions_A_early = model_A.predict(subset_A_features_predict)
            # Writes the predictions into `df` in place.
            df.loc[subset_A_indices, 'hydrostatic_pressure'] = predictions_A_early
            subset_A_early_imputed = True
            print(f"Imputed {len(predictions_A_early)} values in Subset A (early).")
        else:
            print("Subset A is empty, no early imputation by Model A needed.")
    elif model_A is not None: # Model trained but R2 not high enough for early imputation
        print(f"Model A performance (CV R²: {cv_r2_A:.4f}) is below target {TARGET_R2}. Subset A will be imputed later in Step 3 if still NaN.")
    else: # Model A not trained
        print("Model A not trained. Cannot perform early imputation for Subset A.")
else:
    print("No training data available for Model A. Skipping Model A training.")


--- STEP 2: Model A (with seafloor_pressure) ---
Shape of X_A_train for Model A: (15321, 60), y_A_train: (15321,)

--- Training Model A ---
Performing RandomizedSearchCV for Model A with 60 features...
Fitting 5 folds for each of 90 candidates, totalling 450 fits
Best parameters for Model A: {'subsample': 0.7, 'reg_lambda': 0, 'reg_alpha': 0, 'num_leaves': 63, 'n_estimators': 3000, 'min_split_gain': 0, 'min_child_samples': 5, 'max_depth': 10, 'learning_rate': 0.03, 'feature_fraction_seed': 42, 'colsample_bytree': 0.8, 'bagging_freq': 1}
Best cross-validated R² score for Model A: 0.9995

Top 10 Feature Importances:
                                  Feature  Importance
6                       seafloor_pressure        7594
4                 perceived_water_density        5213
42       bottom_current_shear_stress (Pa)        4958
40        sea_surface_height_anomaly (cm)        4832
35           total_alkalinity (µmol kg-1)        4828
48  Brunt_Vaisala_frequency_squared (s-2)        4821

#### CatBoost

In [None]:
# --- Configuration for CatBoost ---
# Keyword arguments shared by every CatBoostRegressor built in this notebook.
CATBOOST_PARAMS_COMMON = dict(
    random_seed=SEED,
    verbose=0,        # keep CatBoost quiet while fitting
    thread_count=-1,
)

# Candidate values sampled by the randomized hyperparameter search.
# NOTE(review): several of these options may only be valid in combination
# (e.g. bagging_temperature vs. subsample depend on the bootstrap type, and
# min_data_in_leaf on the grow policy) -- confirm against the CatBoost docs,
# because the notebook-wide warning filter would hide failed candidate fits.
CATBOOST_PARAM_GRID = dict(
    iterations=[2000, 2500, 3000],                            # number of trees
    learning_rate=[0.01, 0.03, 0.05],
    depth=[7, 8, 9, 10],                                      # tree depth
    l2_leaf_reg=[1, 3, 5, 7],                                 # L2 regularization strength
    border_count=[128, 254],                                  # splits considered for numeric features
    bagging_temperature=[0.7, 1.0, 1.5],
    random_strength=[0.1, 1.0, 3.0],
    min_data_in_leaf=[1, 10, 20],
    leaf_estimation_method=['Newton', 'Gradient'],
    grow_policy=['SymmetricTree', 'Depthwise', 'Lossguide'],
    subsample=[0.66, 0.8, 0.95],
    # Overfitting-detector settings.
    # NOTE(review): presumably these only take effect when an eval set is
    # passed to fit(), which the search below does not do -- confirm.
    od_type=['Iter'],
    od_wait=[100],
)

In [None]:
def train_catboost_model(X_train, y_train, model_name):
    """Tune and fit a CatBoost regressor with a randomized hyperparameter search.

    Draws 30 candidates from ``CATBOOST_PARAM_GRID``, evaluates each with
    shuffled K-fold cross-validation on R², then prints the selected
    parameters, the CV score and the ten largest feature importances.

    Returns:
        ``(best_estimator, feature_names, best_cv_r2)``; when either input is
        empty nothing is fitted and ``(None, [], -1.0)`` is returned.
    """
    print(f"\n--- Training {model_name} ---")
    if X_train.empty or y_train.empty:
        print(f"Training data for {model_name} is empty. Skipping training.")
        return None, [], -1.0  # model, features_used_list, best_cv_score

    base_estimator = cb.CatBoostRegressor(**CATBOOST_PARAMS_COMMON)
    folds = KFold(n_splits=N_SPLITS_CV, shuffle=True, random_state=SEED)
    search_settings = {
        'estimator': base_estimator,
        'param_distributions': CATBOOST_PARAM_GRID,
        'n_iter': 30,
        'cv': folds,
        'scoring': 'r2',
        'verbose': 1,
        'n_jobs': -1,
        'random_state': SEED,
    }

    print(f"Performing RandomizedSearchCV for {model_name} with {X_train.shape[1]} features...")
    tuner = RandomizedSearchCV(**search_settings)
    tuner.fit(X_train, y_train)

    winner, winner_score = tuner.best_estimator_, tuner.best_score_
    print(f"Best parameters for {model_name}: {tuner.best_params_}")
    print(f"Best cross-validated R² score for {model_name}: {winner_score:.4f}")

    # Optional: feature importance analysis
    if winner is not None:
        importance_table = pd.DataFrame({
            'Feature': X_train.columns,
            'Importance': winner.feature_importances_,
        }).sort_values(by='Importance', ascending=False)
        print("\nTop 10 Feature Importances:")
        print(importance_table.head(10))

    feature_names = X_train.columns.tolist()
    return winner, feature_names, winner_score

def _impute_missing_hydrostatic_pressure(df, predict_fn, model_A_features, header_name, model_label):
    """Fill missing hydrostatic_pressure in place where seafloor_pressure is known.

    ``predict_fn`` receives the preprocessed feature matrix and returns one
    prediction per row. ``header_name`` and ``model_label`` only affect the
    printed messages. Returns the number of rows imputed.
    """
    print(f"\n--- Using {header_name} to impute missing hydrostatic_pressure values ---")
    # Rows with missing hydrostatic_pressure but available seafloor_pressure
    to_impute_mask = df['hydrostatic_pressure'].isnull() & df['seafloor_pressure'].notnull()
    to_impute_indices = df[to_impute_mask].index

    if len(to_impute_indices) == 0:
        print("No rows requiring imputation were found.")
        return 0

    # Prepare features for prediction, aligned to the training feature list
    to_impute_data_raw = df.loc[to_impute_indices].drop(columns=['hydrostatic_pressure'], errors='ignore').copy()
    to_impute_features, _ = preprocess_data_for_model(
        to_impute_data_raw, model_A_features, is_training_data=False, for_model_A=True
    )

    predictions = predict_fn(to_impute_features)
    df.loc[to_impute_indices, 'hydrostatic_pressure'] = predictions
    print(f"Imputed {len(predictions)} hydrostatic_pressure values using {model_label} model.")
    return len(predictions)


def compare_models_and_predict(df, X_A_train, y_A_train, model_A, model_A_features,
                              cv_r2_A, subset_A_indices, subset_A_early_imputed):
    """
    Train CatBoost model, compare with LightGBM, and use the best model for imputation

    Args:
        df: Full dataset; missing ``hydrostatic_pressure`` values are filled
            IN PLACE and the same object is returned.
        X_A_train, y_A_train: Training matrix and target used for Model A.
        model_A: Fitted LightGBM model, or None when it was not trained.
        model_A_features: Ordered feature names both models were trained on.
        cv_r2_A: Cross-validated R² of ``model_A``.
        subset_A_indices: Accepted for interface compatibility; unused.
        subset_A_early_imputed: True when imputation already happened, in
            which case no values are written here.

    Returns:
        ``df`` (the same object that was passed in).
    """
    print("\n--- STEP 3: Training CatBoost Model and Model Comparison ---")

    if X_A_train.empty or y_A_train.empty:
        print("No training data available. Skipping model training and imputation.")
        return df

    model_catboost, _, cv_r2_catboost = train_catboost_model(X_A_train, y_A_train, "CatBoost Model")

    # Compare model performances
    print("\n--- Model Performance Comparison ---")
    print(f"LightGBM Model A CV R²: {cv_r2_A:.4f}")
    print(f"CatBoost Model CV R²: {cv_r2_catboost:.4f}")

    # Select the better single model (ties go to LightGBM)
    if cv_r2_catboost > cv_r2_A:
        best_model, best_model_name, best_cv_r2 = model_catboost, "CatBoost", cv_r2_catboost
    else:
        best_model, best_model_name, best_cv_r2 = model_A, "LightGBM", cv_r2_A
    print(f"\nThe better model is {best_model_name} with CV R² score: {best_cv_r2:.4f}.")

    already_imputed_message = (
        "Imputation was already done in the early imputation step. Skipping additional imputation."
    )

    if model_A is not None and model_catboost is not None:
        print("\n--- Creating an ensemble model with weighted average ---")
        # Weights are proportional to the CV scores. R² can be zero or
        # negative, so clip at zero and fall back to equal weights when both
        # scores are non-positive; this avoids a zero division and negative
        # weights. For two positive scores the weights are unchanged.
        lgbm_score = max(cv_r2_A, 0.0)
        catboost_score = max(cv_r2_catboost, 0.0)
        total_score = lgbm_score + catboost_score
        if total_score > 0:
            lgbm_weight = lgbm_score / total_score
            catboost_weight = catboost_score / total_score
        else:
            lgbm_weight = catboost_weight = 0.5
        print(f"Ensemble weights: LightGBM = {lgbm_weight:.4f}, CatBoost = {catboost_weight:.4f}")

        def ensemble_predict(X):
            return (lgbm_weight * model_A.predict(X) +
                    catboost_weight * model_catboost.predict(X))

        if subset_A_early_imputed:
            print(already_imputed_message)
        else:
            _impute_missing_hydrostatic_pressure(
                df, ensemble_predict, model_A_features, "ensemble model", "ensemble"
            )
    elif subset_A_early_imputed:
        print(already_imputed_message)
    elif best_model is not None:
        # No ensemble possible: fall back to the best single model
        _impute_missing_hydrostatic_pressure(
            df, best_model.predict, model_A_features, best_model_name, best_model_name
        )
    else:
        print("No model available for imputation.")

    return df

In [5]:
# Execute Step 3: compare models and perform final imputation.
# The returned frame is the same object as `df`, which is modified in place.
df_with_imputation = compare_models_and_predict(
    df, X_A_train, y_A_train, model_A, model_A_features,
    cv_r2_A, subset_A_indices, subset_A_early_imputed,
)

# Save the imputed data
output_path = 'data/fitur_clean/hydrostatic1.csv'
print(f"\n--- Saving imputed data to {output_path} ---")
df_with_imputation.to_csv(output_path, index=False)
print(f"Data successfully saved to {output_path}")

# Print summary of imputation
n_missing_before = missing_hydro_pressure_mask_initial.sum()
n_missing_after = df_with_imputation['hydrostatic_pressure'].isnull().sum()
print("\n--- Summary ---")
print(f"Total rows in dataset: {len(df_with_imputation)}")
print(f"Rows with NaN in hydrostatic_pressure initially: {n_missing_before}")
print(f"Rows with NaN in hydrostatic_pressure after imputation: {n_missing_after}")
print(f"Total rows imputed: {n_missing_before - n_missing_after}")


--- STEP 3: Training CatBoost Model and Model Comparison ---

--- Training CatBoost Model ---
Performing RandomizedSearchCV for CatBoost Model with 60 features...
Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best parameters for CatBoost Model: {'subsample': 0.66, 'random_strength': 0.1, 'od_wait': 100, 'od_type': 'Iter', 'min_data_in_leaf': 20, 'learning_rate': 0.01, 'leaf_estimation_method': 'Newton', 'l2_leaf_reg': 1, 'iterations': 2000, 'grow_policy': 'SymmetricTree', 'depth': 10, 'border_count': 254, 'bagging_temperature': 1.0}
Best cross-validated R² score for CatBoost Model: 0.9995

Top 10 Feature Importances:
                                   Feature  Importance
59                    seafloor_pressure_sq   48.414099
6                        seafloor_pressure   45.005341
4                  perceived_water_density    1.342056
1                    water_temperature_50m    1.193448
17          sediment_temperature_0_to_10cm    0.724335
28                       the