In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/painmonit-dataset/painmonit_dua.pdf
/kaggle/input/painmonit-dataset/PMED/PMHDB/raw-data/S_49-synchronised-data.csv
/kaggle/input/painmonit-dataset/PMED/PMHDB/raw-data/S_40-synchronised-data.csv
/kaggle/input/painmonit-dataset/PMED/PMHDB/raw-data/S_11-synchronised-data.csv
/kaggle/input/painmonit-dataset/PMED/PMHDB/raw-data/S_41-synchronised-data.csv
/kaggle/input/painmonit-dataset/PMED/PMHDB/raw-data/S_42-synchronised-data.csv
/kaggle/input/painmonit-dataset/PMED/PMHDB/raw-data/S_03-synchronised-data.csv
/kaggle/input/painmonit-dataset/PMED/PMHDB/raw-data/S_12-synchronised-data.csv
/kaggle/input/painmonit-dataset/PMED/PMHDB/raw-data/S_30-synchronised-data.csv
/kaggle/input/painmonit-dataset/PMED/PMHDB/raw-data/S_13-synchronised-data.csv
/kaggle/input/painmonit-dataset/PMED/PMHDB/raw-data/S_25-synchronised-data.csv
/kaggle/input/painmonit-dataset/PMED/PMHDB/raw-data/S_09-synchronised-data.csv
/kaggle/input/painmonit-dataset/PMED/PMHDB/raw-data/S_26-synchronised-data.csv
/k

## Load & Concatenate All Subject CSVs

In [2]:
# Path to your PMHDB dataset folder (update accordingly)
data_path = "/kaggle/input/painmonit-dataset/PMED/PMHDB/raw-data"

all_data = []

for i in range(1, 30):  # 01 to 52
    file_name = f"S_{i:02d}-synchronised-data.csv"
    file_path = os.path.join(data_path, file_name)
    
    # Read CSV with semicolon delimiter
    df = pd.read_csv(file_path, delimiter=";")
    
    # Add subject ID column
    df["Subject_ID"] = i
    
    all_data.append(df)

# Concatenate all subjects
pmhdb_df = pd.concat(all_data, ignore_index=True)

print("Shape of concatenated dataset:", pmhdb_df.shape)
print("Columns:", pmhdb_df.columns.tolist())
pmhdb_df.head()


  df = pd.read_csv(file_path, delimiter=";")


Shape of concatenated dataset: (10442080, 14)
Columns: ['Seconds', 'Bvp', 'Eda_E4', 'Tmp', 'Ibi', 'Hr', 'Resp', 'Eda_RB', 'Ecg', 'Emg', 'Heater [C]', 'COVAS', 'Heater_cleaned', 'Subject_ID']


Unnamed: 0,Seconds,Bvp,Eda_E4,Tmp,Ibi,Hr,Resp,Eda_RB,Ecg,Emg,Heater [C],COVAS,Heater_cleaned,Subject_ID
0,0,1985744,5449181107142857,3518569230769231,7165260688888889,837484398888889,-123291015625,1174850463867185,-5706024169921875,-1373291015625,3201,0,320,1
1,4,1957863,5449066803571428,35186,7166996866666666,8372736066666667,-117034912109375,1173439025878905,-49530029296875,212860107421875,32023999999999994,0,320,1
2,8,1929982,5448952499999999,3518630769230769,7168733044444445,8370628144444446,-12090047200520833,11741638183593734,-505218505859375,19989013671875,32038,0,320,1
3,12,1902101,5448838196428571,35186615384615386,7170469222222222,8368520222222224,-124664306640625,117458343505859,-49163818359375,-42572021484375,3205199999999999,0,320,1
4,16,1811296,5448723892857142,3518692307692307,71722054,83664123,-122222900390625,117366790771484,-476531982421875,-70037841796875,32065999999999995,0,320,1


In [3]:
pmhdb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10442080 entries, 0 to 10442079
Data columns (total 14 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   Seconds         object
 1   Bvp             object
 2   Eda_E4          object
 3   Tmp             object
 4   Ibi             object
 5   Hr              object
 6   Resp            object
 7   Eda_RB          object
 8   Ecg             object
 9   Emg             object
 10  Heater [C]      object
 11  COVAS           object
 12  Heater_cleaned  object
 13  Subject_ID      int64 
dtypes: int64(1), object(13)
memory usage: 1.1+ GB


## Data Cleaning

In [4]:
# =========================
# Cell 2: Robust Data Cleaning
# =========================

import pandas as pd
import numpy as np

# List of features we know should be numeric
numeric_cols = [
    "Seconds", "Bvp", "Eda_E4", "Tmp", "Ibi", "Hr", "Resp",
    "Eda_RB", "Ecg", "Emg", "Heater [C]", "COVAS", "Heater_cleaned"
]

# Convert Subject_ID to string
pmhdb_df["Subject_ID"] = pmhdb_df["Subject_ID"].astype(str)

# Function to safely convert strings to float
def safe_float(x):
    try:
        return float(str(x).replace(',', '.').strip())
    except:
        return np.nan

# Apply conversion
for col in numeric_cols:
    pmhdb_df[col] = pmhdb_df[col].apply(safe_float)

# Fill missing values forward/backward
pmhdb_df[numeric_cols] = pmhdb_df[numeric_cols].fillna(method='ffill').fillna(method='bfill')

# Drop rows where target is still missing
pmhdb_df = pmhdb_df.dropna(subset=["COVAS"])

# Check
print("After cleaning, dataset shape:", pmhdb_df.shape)
print("Column types:")
print(pmhdb_df.dtypes)
print("\nSample:")
print(pmhdb_df.head())


  pmhdb_df[numeric_cols] = pmhdb_df[numeric_cols].fillna(method='ffill').fillna(method='bfill')


After cleaning, dataset shape: (10442080, 14)
Column types:
Seconds           float64
Bvp               float64
Eda_E4            float64
Tmp               float64
Ibi               float64
Hr                float64
Resp              float64
Eda_RB            float64
Ecg               float64
Emg               float64
Heater [C]        float64
COVAS             float64
Heater_cleaned    float64
Subject_ID         object
dtype: object

Sample:
   Seconds       Bvp    Eda_E4        Tmp       Ibi         Hr      Resp  \
0    0.000  19.85744  5.449181  35.185692  0.716526  83.748440 -1.232910   
1    0.004  19.57863  5.449067  35.186000  0.716700  83.727361 -1.170349   
2    0.008  19.29982  5.448952  35.186308  0.716873  83.706281 -1.209005   
3    0.012  19.02101  5.448838  35.186615  0.717047  83.685202 -1.246643   
4    0.016  18.11296  5.448724  35.186923  0.717221  83.664123 -1.222229   

      Eda_RB       Ecg       Emg  Heater [C]  COVAS  Heater_cleaned Subject_ID  
0  11.748505 -0

## LOSO

In [5]:
from sklearn.model_selection import LeaveOneGroupOut

# Features & target
X = pmhdb_df.drop(columns=["COVAS", "Subject_ID"])  # predictors only
y = pmhdb_df["COVAS"]                               # pain score (target)
groups = pmhdb_df["Subject_ID"]                     # subject IDs for LOSO

# LOSO splitter
logo = LeaveOneGroupOut()

print("Total folds (subjects):", logo.get_n_splits(X, y, groups))

# Loop through folds (we’ll plug in model training later)
for fold, (train_idx, test_idx) in enumerate(logo.split(X, y, groups), start=1):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    test_subject = groups.iloc[test_idx].iloc[0]  # which subject is test
    
    print(f"\n--- Fold {fold}/{logo.get_n_splits(X, y, groups)} ---")
    print(f"Train size: {X_train.shape}, Test size: {X_test.shape}, Test subject: {test_subject}")
    
    # Placeholder: later we’ll train XGBoost here
    # model.fit(X_train, y_train)
    # preds = model.predict(X_test)


Total folds (subjects): 29

--- Fold 1/29 ---
Train size: (10079179, 12), Test size: (362901, 12), Test subject: 1

--- Fold 2/29 ---
Train size: (10082963, 12), Test size: (359117, 12), Test subject: 10

--- Fold 3/29 ---
Train size: (10077190, 12), Test size: (364890, 12), Test subject: 11

--- Fold 4/29 ---
Train size: (10071530, 12), Test size: (370550, 12), Test subject: 12

--- Fold 5/29 ---
Train size: (10083516, 12), Test size: (358564, 12), Test subject: 13

--- Fold 6/29 ---
Train size: (10080144, 12), Test size: (361936, 12), Test subject: 14

--- Fold 7/29 ---
Train size: (10086228, 12), Test size: (355852, 12), Test subject: 15

--- Fold 8/29 ---
Train size: (10090703, 12), Test size: (351377, 12), Test subject: 16

--- Fold 9/29 ---
Train size: (10083993, 12), Test size: (358087, 12), Test subject: 17

--- Fold 10/29 ---
Train size: (10080959, 12), Test size: (361121, 12), Test subject: 18

--- Fold 11/29 ---
Train size: (10083425, 12), Test size: (358655, 12), Test subje

In [6]:
# Ensure numeric and drop NaN/inf
pmhdb_df["COVAS"] = pd.to_numeric(pmhdb_df["COVAS"], errors="coerce")
pmhdb_df = pmhdb_df.replace(np.inf, np.nan)
pmhdb_df = pmhdb_df.replace(-np.inf, np.nan)
pmhdb_df = pmhdb_df.dropna(subset=["COVAS"])  # drop rows where target is NaN

print("After cleaning, dataset shape:", pmhdb_df.shape)
print("COVAS min:", pmhdb_df["COVAS"].min(), "max:", pmhdb_df["COVAS"].max())

After cleaning, dataset shape: (10442080, 14)
COVAS min: 0.0 max: 100.00000000000001


In [7]:
# --- Fix column names for XGBoost compatibility ---
pmhdb_df.columns = (
    pmhdb_df.columns
    .str.replace(r"\[|\]|<|>", "", regex=True)  # remove forbidden characters
    .str.replace(" ", "_")                      # replace spaces with underscores
)

print("Cleaned column names:", pmhdb_df.columns.tolist())


Cleaned column names: ['Seconds', 'Bvp', 'Eda_E4', 'Tmp', 'Ibi', 'Hr', 'Resp', 'Eda_RB', 'Ecg', 'Emg', 'Heater_C', 'COVAS', 'Heater_cleaned', 'Subject_ID']


In [8]:
import re

# Sanitize all column names in the master dataframe
pmhdb_df = pmhdb_df.rename(columns=lambda x: re.sub(r"[^A-Za-z0-9_]+", "_", str(x)))

# If you already extracted feature columns, re-pull them
feature_cols = [col for col in pmhdb_df.columns if col != "subject" and col != "target"]

print("Sanitized feature columns:")
print(feature_cols[:20])  # show first 20 just to check


Sanitized feature columns:
['Seconds', 'Bvp', 'Eda_E4', 'Tmp', 'Ibi', 'Hr', 'Resp', 'Eda_RB', 'Ecg', 'Emg', 'Heater_C', 'COVAS', 'Heater_cleaned', 'Subject_ID']


In [9]:
X_train = X_train.rename(columns=lambda x: re.sub(r"[^A-Za-z0-9_]+", "_", str(x)))
X_test  = X_test.rename(columns=lambda x: re.sub(r"[^A-Za-z0-9_]+", "_", str(x)))


## XGBoost Regressor

In [10]:
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
import numpy as np

# hyperparams to try
xgb_params = {
    "n_estimators": 2000,
    "learning_rate": 0.03,
    "max_depth": 4,
    "min_child_weight": 5,
    "subsample": 0.75,
    "colsample_bytree": 0.7,
    "reg_alpha": 0.1,
    "reg_lambda": 3.0,
    "gamma": 0.1,
    "random_state": 42,
    "verbosity": 0,
    # "booster": "dart"   # uncomment to try dropout-like behavior
}

# inside LOSO loop (for each fold)
# X_train, y_train, X_test, y_test already defined
X_tr_sub, X_val_sub, y_tr_sub, y_val_sub = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42
)

model = XGBRegressor(**xgb_params)

# fit with eval_set for early stopping
model.fit(
    X_tr_sub, y_tr_sub,
    eval_set=[(X_val_sub, y_val_sub)],
    early_stopping_rounds=50,
    verbose=50
)

# predict and clip between 0 and 100
preds = model.predict(X_test)
preds = np.clip(preds, 0.0, 100.0)




[0]	validation_0-rmse:19.30065
[50]	validation_0-rmse:15.45597
[100]	validation_0-rmse:14.50666
[150]	validation_0-rmse:14.08591
[200]	validation_0-rmse:13.75990
[250]	validation_0-rmse:13.49684
[300]	validation_0-rmse:13.31126
[350]	validation_0-rmse:13.14015
[400]	validation_0-rmse:12.97873
[450]	validation_0-rmse:12.83784
[500]	validation_0-rmse:12.67972
[550]	validation_0-rmse:12.56288
[600]	validation_0-rmse:12.44404
[650]	validation_0-rmse:12.31015
[700]	validation_0-rmse:12.21863
[750]	validation_0-rmse:12.13009
[800]	validation_0-rmse:12.06202
[850]	validation_0-rmse:11.97672
[900]	validation_0-rmse:11.89482
[950]	validation_0-rmse:11.82091
[1000]	validation_0-rmse:11.74916
[1050]	validation_0-rmse:11.68092
[1100]	validation_0-rmse:11.61785
[1150]	validation_0-rmse:11.55818
[1200]	validation_0-rmse:11.49079
[1250]	validation_0-rmse:11.43415
[1300]	validation_0-rmse:11.37148
[1350]	validation_0-rmse:11.31126
[1400]	validation_0-rmse:11.24704
[1450]	validation_0-rmse:11.18685
[15

In [11]:
import numpy as np
from sklearn.metrics import mean_squared_error

# assuming y_train_all are concatenation of all training y across folds or global train
mean_pred = np.mean(y_train)  # or compute per-fold
rmse_baseline = np.sqrt(mean_squared_error(y_test, np.full_like(y_test, mean_pred)))
print("Baseline (mean) RMSE:", rmse_baseline)


Baseline (mean) RMSE: 25.851085860017545


In [12]:
import joblib

# Save the trained model
joblib.dump(model, "/kaggle/working/xgb_model.pkl")


['/kaggle/working/xgb_model.pkl']

KeyError: "None of [Index(['Wind Speed', 'Ozone', 'Relative Humidity', 'Benzene', 'CO', 'PM2.5',\n       'NOx'],\n      dtype='object')] are in the [columns]"