# In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from tabpfn import TabPFNRegressor
import os
from sklearn.preprocessing import RobustScaler
import joblib

# --- Configuration ---
# All input paths hang off DATA_DIR; all artefacts produced by this script
# (metrics report, scaler, model, predictions) are written under OUTPUT_DIR.
MODEL_NAME = 'TabPFN'
TARGET_COLUMN = 'FEV1_FVC'
DATA_DIR = 'data_TabPFN'

FEATURES_FILE = DATA_DIR + '/features_cols.xlsx'
TRAIN_FILE_PREFIX = DATA_DIR + '/features_train'
VAL_FILE_PREFIX = DATA_DIR + '/features_val'
OUTPUT_DIR = 'rs/' + MODEL_NAME + '_predictions'

# Create the output directory (and any missing parents) up front so later
# writes cannot fail on a missing path; an existing directory is fine.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Start-up banner: one line per item, in a fixed order.
print(
    f"Starting {MODEL_NAME} Regression Model Training and Evaluation ...",
    f"Target Column: {TARGET_COLUMN}",
    f"Output directory: {OUTPUT_DIR}",
    sep="\n",
)



# --- Load Feature Columns ---
# The feature list is read from the 'Feature' column of the Excel sheet at
# FEATURES_FILE. Without it nothing downstream can run, so both failure modes
# abort the script with a non-zero exit status. The bare `exit()` helper is
# avoided: it is injected by the `site` module for interactive use and, called
# without arguments, reports success (status 0) to the calling shell.
try:
    features_df = pd.read_excel(FEATURES_FILE)
    feature_columns = features_df['Feature'].tolist()
except FileNotFoundError as err:
    print(f"Error: {FEATURES_FILE} not found. Please ensure it's in the correct directory ({DATA_DIR}).")
    raise SystemExit(1) from err
except KeyError as err:
    print(f"Error: 'Feature' column not found in {FEATURES_FILE}. Please check the Excel file.")
    raise SystemExit(1) from err
else:
    # Only reached when both the read and the column lookup succeeded.
    print(f"Loaded {len(feature_columns)} feature columns from {FEATURES_FILE}")

# --- Prepare for saving metrics to a file ---
def _log(handle, message):
    """Echo *message* to stdout and append it, newline-terminated, to *handle*."""
    print(message)
    handle.write(f"{message}\n")


metrics_output_filepath = os.path.join(OUTPUT_DIR, 'performance_metrics.txt')
# Explicit UTF-8 so that writing an exception message or path containing
# non-ASCII characters cannot itself fail on platforms with a narrow default
# encoding.
with open(metrics_output_filepath, 'w', encoding='utf-8') as f_metrics:
    # Report header (file only).
    f_metrics.write(f"--- {MODEL_NAME} Regression Model Performance Metrics ---\n")
    f_metrics.write(f"Target Column: {TARGET_COLUMN}\n")
    f_metrics.write(f"Features file: {FEATURES_FILE}\n\n")

    # --- Process the specific fold ---
    _log(f_metrics, "\n--- Processing ---")
    train_file = f"{TRAIN_FILE_PREFIX}.csv"
    val_file = f"{VAL_FILE_PREFIX}.csv"

    try:
        # Load training and validation data
        train_df = pd.read_csv(train_file)
        val_df = pd.read_csv(val_file)
        print(f"Loaded {train_file} (train) and {val_file} (validation)")

        # Requested features that are actually present in each file, in the
        # order given by the feature list.
        train_features = [col for col in feature_columns if col in train_df.columns]
        val_features = [col for col in feature_columns if col in val_df.columns]

        # Targets; a missing TARGET_COLUMN raises KeyError, which is reported
        # by the generic handler below.
        y_train = train_df[TARGET_COLUMN]
        y_val = val_df[TARGET_COLUMN]

        # Handle potential missing features
        if len(train_features) != len(feature_columns):
            _log(f_metrics, f"Warning: Some features from {FEATURES_FILE} not found in {train_file}. Using available features.")
        if len(val_features) != len(feature_columns):
            _log(f_metrics, f"Warning: Some features from {FEATURES_FILE} not found in {val_file}. Using available features.")

        # Align columns: keep only features present in BOTH files.
        # The intersection is built by walking the (de-duplicated) feature
        # list rather than via `list(set & set)`: set iteration order for
        # strings varies between interpreter runs, and the column order is
        # what the scaler and the model are fitted on, so an unordered
        # intersection makes the saved artefacts irreproducible.
        val_feature_set = set(val_features)
        common_features = [
            col for col in dict.fromkeys(train_features) if col in val_feature_set
        ]
        X_train = train_df[common_features]
        X_val = val_df[common_features]

        if X_train.empty or X_val.empty:
            _log(f_metrics, "Error: No common features found or feature data is empty . Skipping.")
        else:
            _log(f_metrics, f"Training data shape: {X_train.shape}, Target shape: {y_train.shape}")
            _log(f_metrics, f"Validation data shape: {X_val.shape}, Target shape: {y_val.shape}")

            # --- Data Normalization with RobustScaler ---
            # Fit on the training split only; the validation split is
            # transformed with the training statistics.
            _log(f_metrics, "Applying RobustScaler for feature normalization...")
            scaler = RobustScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_val_scaled = scaler.transform(X_val)
            _log(f_metrics, "Normalization complete.")

            # --- Save the scaler for this fold ---
            scaler_filename = os.path.join(OUTPUT_DIR, 'robust_scaler.joblib')
            joblib.dump(scaler, scaler_filename)
            _log(f_metrics, f"Scaler saved to {scaler_filename}")

            # --- Train TabPFN Regression Model ---
            # NOTE(review): device is pinned to 'cuda', so this step needs a
            # CUDA-capable GPU - confirm that is intended for all users.
            model = TabPFNRegressor(device='cuda')
            _log(f_metrics, f"Training {MODEL_NAME}...")
            model.fit(X_train_scaled, y_train)
            _log(f_metrics, "Training complete.")

            # --- Save the trained model for this fold ---
            model_filename = os.path.join(OUTPUT_DIR, f'{MODEL_NAME}_model.joblib')
            joblib.dump(model, model_filename)
            _log(f_metrics, f"Model saved to {model_filename}")

            # --- Make Predictions ---
            y_pred = model.predict(X_val_scaled)
            print("Predictions made.")

            # --- Evaluate Model Performance ---
            mae = mean_absolute_error(y_val, y_pred)
            mse = mean_squared_error(y_val, y_pred)
            rmse = np.sqrt(mse)
            r2 = r2_score(y_val, y_pred)

            _log(f_metrics, "Metrics:")
            _log(f_metrics, f"   MAE: {mae:.4f}")
            _log(f_metrics, f"   MSE: {mse:.4f}")
            _log(f_metrics, f"   RMSE: {rmse:.4f}")
            _log(f_metrics, f"   R-squared: {r2:.4f}")

            # --- Save Predicted Values to CSV ---
            # Predictions are only written when a subject identifier is
            # available to key each row.
            if 'SUBJID' in val_df.columns:
                predictions_df = pd.DataFrame({
                    'SUBJID': val_df['SUBJID'],
                    'Ground_Truth': y_val,
                    'Prediction': y_pred
                })
                output_filename = os.path.join(OUTPUT_DIR, 'predictions.csv')
                predictions_df.to_csv(output_filename, index=False)
                _log(f_metrics, f"Predictions saved to {output_filename}")
            else:
                _log(f_metrics, f"Warning: 'SUBJID' column not found in {val_file}. Skipping saving predictions for this fold.")

    except FileNotFoundError as e:
        _log(f_metrics, f"Error: {e}. Please ensure the fold CSV files are present in {DATA_DIR}.")
    except Exception as e:
        # Top-level boundary: record the failure in the report instead of
        # losing the partially written metrics file to a crash.
        _log(f_metrics, f"An unexpected error occurred during processing: {e}")

    _log(f_metrics, "\nScript finished.")
