### Import library

In [1]:

import numpy as np
import pandas as pd
import os
import re
import torch.nn.functional as F
from colorama import Fore, Style
from sklearn.base import clone, BaseEstimator, RegressorMixin
from sklearn.metrics import cohen_kappa_score, accuracy_score, mean_squared_error
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.decomposition import PCA
from sklearn.datasets import make_classification
from scipy.optimize import minimize
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from keras.models import Model
from keras.layers import Input, Dense
from keras.optimizers import Adam
import torch
import torch.nn as nn
import torch.optim as optim

from IPython.display import clear_output
import warnings
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import VotingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
import random
from pytorch_tabnet.tab_model import TabNetRegressor
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.model_selection import train_test_split
from pytorch_tabnet.callbacks import Callback
import os
import torch


### Environment Initialization and Reproducibility Setup
Setting a fixed seed for reproducibility across Python, NumPy, and PyTorch, ensuring consistent results in machine learning experiments.

In [2]:

warnings.filterwarnings('ignore')
pd.options.display.max_columns = None
SEED = 42
n_splits = 5

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True
seed_everything(2024)

In [3]:

train_featuresCols = ['Basic_Demos-Age', 'Basic_Demos-Sex',
                'CGAS-CGAS_Score', 'Physical-BMI',
                'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
                'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
                'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_PU',
                'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
                'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone',
                'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
                'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
                'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
                'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
                'BIA-BIA_TBW', 'SDS-SDS_Total_Raw',
                'SDS-SDS_Total_T',
                'PreInt_EduHx-computerinternet_hoursday', 'sii', 'BMI_Age','Internet_Hours_Age','BMI_Internet_Hours',
                'BFP_BMI', 'FFMI_BFP', 'FMI_BFP', 'LST_TBW', 'BFP_BMR', 'BFP_DEE', 'BMR_Weight', 'DEE_Weight',
                'SMM_Height', 'Muscle_to_Fat', 'Hydration_Status', 'ICW_TBW', 'BMI_PHR']

test_featuresCols = ['Basic_Demos-Age', 'Basic_Demos-Sex',
                'CGAS-CGAS_Score', 'Physical-BMI',
                'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
                'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
                'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_PU',
                'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
                'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone',
                'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
                'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
                'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
                'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
                'BIA-BIA_TBW', 'SDS-SDS_Total_Raw',
                'SDS-SDS_Total_T',
                'PreInt_EduHx-computerinternet_hoursday', 'BMI_Age','Internet_Hours_Age','BMI_Internet_Hours',
                'BFP_BMI', 'FFMI_BFP', 'FMI_BFP', 'LST_TBW', 'BFP_BMR', 'BFP_DEE', 'BMR_Weight', 'DEE_Weight',
                'SMM_Height', 'Muscle_to_Fat', 'Hydration_Status', 'ICW_TBW', 'BMI_PHR']


### Additional Feature Calculation for Time series data
To computes additional features related to nocturnal, weekend, and weekend night activities from sensor data.

- Feature 'enmoXlight' for focusing on the relationship between physical activity and light as an indirect indicator. 'enmo * light' could be used to assess whether physical activity levels (enmo) in certain light conditions (light) are associated with internet usage habits. For example, low light combined with minimal physical activity might indicate prolonged internet usage, such as when someone sits indoors in front of a computer or phone screen.

- Feature 'is_night': Nighttime is outside the range of 8 AM to 9 PM.
- Feature 'is_weekend': Weekend days are Saturday (5.5) and Sunday (6).
- Feature 'is_weekend_night': Combination of weekend and nighttime conditions.


In [4]:
extra_feature_cols = [
    'night_enmo_mean','night_enmo_max','night_light_mean','night_light_max','night_enmoXlight_mean','night_enmoXlight_max',
    'weekend_enmo_mean','weekend_enmo_max','weekend_light_mean','weekend_light_max','weekend_enmoXlight_mean','weekend_enmoXlight_max',
    'weekend_night_enmo_mean','weekend_night_enmo_max','weekend_night_light_mean','weekend_night_light_max','weekend_night_enmoXlight_mean','weekend_night_enmoXlight_max',
]

def time_features(df):
    """
    Tính các đặc trưng hoạt động (ban đêm, cuối tuần, cuối tuần ban đêm) dựa trên:
      - cột 'enmo', 'light', 'time_of_day', 'weekday'
    Trả về danh sách (list) các giá trị đặc trưng.
    """

    df["enmo"] = df["enmo"].rolling(window=10, min_periods=1).mean()
    df["light"] = df["light"].rolling(window=10, min_periods=1).mean()
    df['enmoXlight'] = df['enmo'] * df['light']


    df["time_of_day_hours"] = df["time_of_day"] // (3_600 * 1_000_000_000)

    features = []

    def compute_features(condition_col, threshold, prefix):
        """
        - condition_col: cột boolean (0/1) như is_night, is_weekend, ...
        - threshold: số mẫu tối thiểu để chấp nhận.
        - prefix: tiền tố cho tên cột (night, weekend, weekend_night).
        """

        df[condition_col] = (df[condition_col].diff() == 1).cumsum() * df[condition_col]
        df.loc[df[condition_col] > 0, condition_col] = 1


        group_des = (
            df.groupby(condition_col)[['enmo', 'light', 'enmoXlight']]
              .agg({
                  'enmo': ['mean','max','count'],
                  'light': ['mean','max'],
                  'enmoXlight': ['mean','max']
              })
              .reset_index()
        )

        group_des = group_des[group_des[condition_col] > 0].reset_index(drop=True)
        

        group_des.columns = [
            condition_col,
            f'{prefix}_enmo_mean', f'{prefix}_enmo_max', f'{prefix}_enmo_count',
            f'{prefix}_light_mean', f'{prefix}_light_max',
            f'{prefix}_enmoXlight_mean', f'{prefix}_enmoXlight_max'
        ]
        

        group_des = group_des[group_des[f'{prefix}_enmo_count'] > threshold]
        if len(group_des) == 0:
          
            return [np.nan] * 6  
        

        vals = group_des.drop([condition_col, f'{prefix}_enmo_count'], axis=1).mean(axis=0).values
        return list(vals)


    df['is_night'] = np.where((df['time_of_day_hours'] >= 8) & (df['time_of_day_hours'] < 21), 0, 1)
    features.extend(compute_features('is_night', threshold=500, prefix='night'))

    df['is_weekend'] = np.where(df['weekday'] >= 5.5, 1, 0)
    features.extend(compute_features('is_weekend', threshold=2000, prefix='weekend'))

    df['is_weekend_night'] = np.where(
        (df['weekday'] >= 5.5) & ((df['time_of_day_hours'] < 8) | (df['time_of_day_hours'] >= 21)),
        1, 0
    )
    features.extend(compute_features('is_weekend_night', threshold=200, prefix='weekend_night'))

    return features

In [5]:
def process_file(file_name, parquet_dir):
    """
    Đọc file parquet, trả về (tf_list, desc_list, file_id).
      - tf_list: time_features (list)
      - desc_list: thống kê describe() (list)
      - file_id: ID trích từ tên file
    Nếu lỗi hoặc file không tồn tại -> (None, None, None).
    """
    file_path = os.path.join(parquet_dir, file_name, 'part-0.parquet')
    if not os.path.exists(file_path):
        return None, None, None

    df = pd.read_parquet(file_path)
    if 'step' in df.columns:
        df.drop('step', axis=1, inplace=True)

    tf_list = time_features(df)  

    desc_list = df.describe().values.reshape(-1)

    try:
        file_id = file_name.split('=')[1]
    except:
        file_id = file_name 
    return tf_list, desc_list, file_id


In [6]:
def load_time_series(dirname):
    """
    Duyệt qua các file/folder trong dirname, xử lý song song.
    Mỗi file được đọc, tính:
      - time_features -> tf_list
      - describe() -> desc_list
    Kết quả: Trả về (df_time, df_desc).
    """
    ids = os.listdir(dirname)
    
    with ThreadPoolExecutor(max_workers=8) as executor:
        results = list(
            tqdm(executor.map(lambda fname: process_file(fname, dirname), ids), total=len(ids))
        )


    valid_results = [(tf, ds, i) for tf, ds, i in results if tf is not None and ds is not None and i is not None]
    if len(valid_results) == 0:

        return pd.DataFrame(), pd.DataFrame()


    time_feat_lists, desc_lists, indexes = zip(*valid_results)

    df_time = pd.DataFrame(time_feat_lists, columns=extra_feature_cols)
    df_time['id'] = indexes

   
    desc_len = len(desc_lists[0])
    desc_cols = [f"stat_{i}" for i in range(desc_len)]

    df_desc = pd.DataFrame(desc_lists, columns=desc_cols)
    df_desc['id'] = indexes

    return df_time, df_desc


In [7]:

train = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/train.csv')
test = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/test.csv')
sample = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/sample_submission.csv')

train_ts_time_features, train_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet")
test_ts_time_features, test_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet")

100%|██████████| 996/996 [03:30<00:00,  4.74it/s]
100%|██████████| 2/2 [00:00<00:00,  3.79it/s]


This code snippet adjusts the sii column for specific rows in the dataset based on a predefined list of IDs. The list of IDs was determined manually after filtering the dataset in Excel.
The logic assumes that if the 'PCIAT-PCIAT_Total' value is less than 5 and some PCIAT-PCIAT_* columns have missing values, the response (sii) is invalid and should be set to None.

In [8]:
import pandas as pd

# nếu mà giá trị PCIAT-PCIAT_Total < 5 và một vài PCIAT-PCIAT còn lại bị missing thì sii  = None vì người trả lời trả lời không chính xác 
ids_to_update = ['18fdbccc', '053d7d31', '39dd3538', '68fa4631', '6a98537b', '6b9a25e6', '75311a3f', '926bd07e', 'fc8e4de4']
train.loc[train['id'].isin(ids_to_update), 'sii'] = None

In [9]:
train.dropna(subset=['sii'], inplace=True)

This code uses KNN imputation to fill missing values in specific PCIAT-PCIAT_* columns for rows where sii is not missing. The imputed values are rounded, updated in the dataset, and a new total score, PCIAT-PCIAT_Total_new, is calculated as the sum of these columns.

In [10]:
import pandas as pd
from sklearn.impute import KNNImputer

columns_to_impute = [
    'PCIAT-PCIAT_06', 'PCIAT-PCIAT_01', 'PCIAT-PCIAT_15', 'PCIAT-PCIAT_07',
    'PCIAT-PCIAT_16', 'PCIAT-PCIAT_12', 'PCIAT-PCIAT_09', 'PCIAT-PCIAT_11',
    'PCIAT-PCIAT_19', 'PCIAT-PCIAT_08', 'PCIAT-PCIAT_17', 'PCIAT-PCIAT_14',
    'PCIAT-PCIAT_04', 'PCIAT-PCIAT_20', 'PCIAT-PCIAT_03', 'PCIAT-PCIAT_02',
    'PCIAT-PCIAT_18', 'PCIAT-PCIAT_13', 'PCIAT-PCIAT_10', 'PCIAT-PCIAT_05'
]

df_non_nan_sii = train[train['sii'].notna()]

knn_imputer = KNNImputer(n_neighbors=10)

imputed_data = knn_imputer.fit_transform(df_non_nan_sii[columns_to_impute])

imputed_data = imputed_data.round()

train.loc[train['sii'].notna(), columns_to_impute] = imputed_data

train['PCIAT-PCIAT_Total_new'] = train[columns_to_impute].sum(axis=1)


This code recalculates the sii values based on the PCIAT-PCIAT_Total score and updates the dataset with the new classification logic:

If PCIAT-PCIAT_Total is missing, new_sii is set to NaN.

If the score is ≤ 30, new_sii is 0.

If the score is between 31 and 49, new_sii is 1.

If the score is between 50 and 79, new_sii is 2.

If the score is ≥ 80, new_sii is 3.

In [11]:

def recalculate_sii(row):
    if pd.isna(row['PCIAT-PCIAT_Total']):
        return np.nan
    if row['PCIAT-PCIAT_Total'] <= 30:
        return 0
    elif 31 <= row['PCIAT-PCIAT_Total'] <= 49:
        return 1
    elif 50 <= row['PCIAT-PCIAT_Total'] <= 79:
        return 2
    elif row['PCIAT-PCIAT_Total'] >= 80:
        return 3
    return np.nan

train['new_sii'] = train.apply(recalculate_sii, axis=1)



In [12]:
train['sii'] = train['new_sii']
train.drop(['new_sii'], axis=1, inplace=True)

Using an autoencoder to reduce the dimensionality of time-series data for both training and testing datasets.


### AutoEncoder Class
Encoder: Compresses input data into a lower-dimensional space using a series of linear layers and activation functions (LeakyReLU).

Decoder: Reconstructs the input data from the compressed representation, aiming to minimize the difference (loss) between the original and reconstructed data. The final activation function is a Sigmoid to normalize the output between 0 and 1.

Scaling: Standardizes the data using StandardScaler.

Training: Uses Mean Squared Error (MSE) as the loss function and Adam optimizer.


In [13]:

class AutoEncoder(nn.Module):
    def __init__(self, input_dim, encoding_dim):
        super(AutoEncoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, encoding_dim*3),
            nn.LeakyReLU(),
            nn.Linear(encoding_dim*3, encoding_dim*2+10),
            nn.LeakyReLU(),
            nn.Linear(encoding_dim*2+10, encoding_dim),
            nn.LeakyReLU()
        )

        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim, encoding_dim+15),
            nn.LeakyReLU(),
            nn.Linear(encoding_dim+15, encoding_dim*3),
            nn.LeakyReLU(),
            nn.Linear(encoding_dim*3, input_dim),
            nn.Sigmoid()
        )
    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded


# encoder du lieu
def autoencoder(df, encoding_dim=50, epochs=50, batch_size=32):
    scaler = StandardScaler()
    df_scaled = scaler.fit_transform(df)
    
    data_tensor = torch.FloatTensor(df_scaled)
    
    input_dim = data_tensor.shape[1]
    autoencoder = AutoEncoder(input_dim, encoding_dim)
    
    criterion = nn.MSELoss()
    optimizer = optim.Adam(autoencoder.parameters())
    
    for epoch in range(epochs):
        for i in range(0, len(data_tensor), batch_size):
            batch = data_tensor[i : i + batch_size]
            optimizer.zero_grad()
            reconstructed = autoencoder(batch)
            loss = criterion(reconstructed, batch)
            loss.backward()
            optimizer.step()
            
        if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch + 1}/{epochs}], Loss: {loss.item():.4f}]')

    with torch.no_grad():
        encoded_data = autoencoder.encoder(data_tensor).numpy()
        
    df_encoded = pd.DataFrame(encoded_data, columns=[f'Enc_{i + 1}' for i in range(encoded_data.shape[1])])
    
    return df_encoded

In [14]:
train_ts_id = train_ts['id']
test_ts_id = test_ts['id']
df_train = train_ts.drop('id', axis=1)
df_test = test_ts.drop('id', axis=1)

train_ts_encoded = autoencoder(df_train, encoding_dim=60, epochs=100, batch_size=32)
test_ts_encoded = autoencoder(df_test, encoding_dim=60, epochs=100, batch_size=32)

train_ts_encoded['id'] = train_ts_id
test_ts_encoded['id'] = test_ts_id
merged_train_ts_df = train_ts_time_features.merge(train_ts_encoded, on='id', how='inner')
merged_test_ts_df = test_ts_time_features.merge(test_ts_encoded, on='id', how='inner')
time_series_cols = merged_train_ts_df.columns.tolist()
time_series_cols.remove('id')

Epoch [10/100], Loss: 1.7141]
Epoch [20/100], Loss: 1.5673]
Epoch [30/100], Loss: 1.5514]
Epoch [40/100], Loss: 1.5435]
Epoch [50/100], Loss: 1.5369]
Epoch [60/100], Loss: 1.5363]
Epoch [70/100], Loss: 1.5356]
Epoch [80/100], Loss: 1.5353]
Epoch [90/100], Loss: 1.5278]
Epoch [100/100], Loss: 1.5274]
Epoch [10/100], Loss: 0.9769]
Epoch [20/100], Loss: 0.5649]
Epoch [30/100], Loss: 0.3860]
Epoch [40/100], Loss: 0.3860]
Epoch [50/100], Loss: 0.3860]
Epoch [60/100], Loss: 0.3860]
Epoch [70/100], Loss: 0.3860]
Epoch [80/100], Loss: 0.3860]
Epoch [90/100], Loss: 0.3860]
Epoch [100/100], Loss: 0.3860]


In [15]:
merged_train_ts_df['id']

0      0745c390
1      eaab7a96
2      8ec2cc63
3      b2987a65
4      7b8842c3
         ...   
991    cd68643b
992    f8ff0bc8
993    db23fbe4
994    687c85e7
995    5f099188
Name: id, Length: 996, dtype: object

In [16]:

train = pd.merge(train, merged_train_ts_df, how="left", on='id')
test = pd.merge(test, merged_test_ts_df, how="left", on='id')

## Feature Engineering

Filling missing values in seasonal categorical columns with 'Missing' to handle NaN effectively. It then converts these columns to the category data type for efficient encoding and memory usage.

In [17]:
cat_c = ['Basic_Demos-Enroll_Season', 'CGAS-Season', 'Physical-Season', 'Fitness_Endurance-Season', 
          'FGC-Season', 'BIA-Season', 'PAQ_A-Season', 'PAQ_C-Season', 'SDS-Season', 'PreInt_EduHx-Season']

def update(df):
    for c in cat_c: 
        df[c] = df[c].fillna('Missing')
        df[c] = df[c].astype('category')
    return df
        
train = update(train)
test = update(test)

In [18]:
def create_mapping(column, dataset):
    unique_values = dataset[column].unique()
    return {value: idx for idx, value in enumerate(unique_values)}

for col in cat_c:
    mapping_train = create_mapping(col, train)
    mapping_test = create_mapping(col, test)
    
    train[col] = train[col].replace(mapping_train).astype(int)
    test[col] = test[col].replace(mapping_test).astype(int)

print(f'Train Shape : {train.shape} || Test Shape : {test.shape}')

Train Shape : (2727, 161) || Test Shape : (20, 137)



1. **BMI and Blood Pressure Checks:**
   - Remove rows where `Physical-BMI` is less than or equal to 0.
   - Remove rows where `Physical-Diastolic_BP` or `Physical-Systolic_BP` are less than or equal to 0.
   - Remove rows where `Physical-Diastolic_BP` is greater than 160.

2. **Age-Specific Filtering for Children:**
   - Identify rows where `Basic_Demos-Age` is less than or equal to 12 (children).
   - For these rows, remove entries where `FGC-FGC_CU` or `FGC-FGC_GSND` exceeds 80.

3. **Bioelectrical Impedance Analysis (BIA) Metrics:**
   - Remove rows where `BIA-BIA_BMI` is less than or equal to 0.
   - Remove rows with unusually high values for various BIA measurements, such as:
     - `BIA-BIA_BMC` > 1000
     - `BIA-BIA_BMR` > 40000
     - `BIA-BIA_DEE` > 60000
     - `BIA-BIA_ECW`, `BIA-BIA_FFM`, `BIA-BIA_ICW`, `BIA-BIA_LDM`, `BIA-BIA_LST`, `BIA-BIA_SMM`, `BIA-BIA_TBW` > 2000

In [19]:
def remove_outliers(df):
    
    df = df.drop(df[df['Physical-BMI'] <= 0].index)
    df = df.drop(df[df['Physical-Diastolic_BP'] <= 0].index)
    df = df.drop(df[df['Physical-Systolic_BP'] <= 0].index)
    df = df.drop(df[df['Physical-Diastolic_BP'] > 160].index)

    children = df[df['Basic_Demos-Age'] <= 12]
    df = df.drop(children[children['FGC-FGC_CU'] > 80].index)
    df = df.drop(children[children['FGC-FGC_GSND'] > 80].index)

    df = df.drop(df[df['BIA-BIA_BMI'] <= 0].index)
    df = df.drop(df[df['BIA-BIA_BMC'] > 1000].index)
    df = df.drop(df[df['BIA-BIA_BMR'] > 40000].index)
    df = df.drop(df[df['BIA-BIA_DEE'] > 60000].index)
    df = df.drop(df[df['BIA-BIA_ECW'] > 2000].index)
    df = df.drop(df[df['BIA-BIA_FFM'] > 2000].index)
    df = df.drop(df[df['BIA-BIA_ICW'] > 2000].index)
    df = df.drop(df[df['BIA-BIA_LDM'] > 2000].index)
    df = df.drop(df[df['BIA-BIA_LST'] > 2000].index)
    df = df.drop(df[df['BIA-BIA_SMM'] > 2000].index)
    df = df.drop(df[df['BIA-BIA_TBW'] > 2000].index)
    
    return df

In [20]:
train = remove_outliers(train)

In [21]:
def preprocess_feature(df):
    df['BMI_Age'] = df['Physical-BMI'] * df['Basic_Demos-Age']
    df['Internet_Hours_Age'] = df['PreInt_EduHx-computerinternet_hoursday'] * df['Basic_Demos-Age']
    df['BMI_Internet_Hours'] = df['Physical-BMI'] * df['PreInt_EduHx-computerinternet_hoursday']
    df['BFP_BMI'] = df['BIA-BIA_Fat'] / df['BIA-BIA_BMI']
    df['FFMI_BFP'] = df['BIA-BIA_FFMI'] / df['BIA-BIA_Fat']
    df['FMI_BFP'] = df['BIA-BIA_FMI'] / df['BIA-BIA_Fat']
    df['LST_TBW'] = df['BIA-BIA_LST'] / df['BIA-BIA_TBW']
    df['BFP_BMR'] = df['BIA-BIA_Fat'] * df['BIA-BIA_BMR']
    df['BFP_DEE'] = df['BIA-BIA_Fat'] * df['BIA-BIA_DEE']
    df['BMR_Weight'] = df['BIA-BIA_BMR'] / df['Physical-Weight']
    df['DEE_Weight'] = df['BIA-BIA_DEE'] / df['Physical-Weight']
    df['SMM_Height'] = df['BIA-BIA_SMM'] / df['Physical-Height']
    df['Muscle_to_Fat'] = df['BIA-BIA_SMM'] / df['BIA-BIA_FMI']
    df['Hydration_Status'] = df['BIA-BIA_TBW'] / df['Physical-Weight']
    df['ICW_TBW'] = df['BIA-BIA_ICW'] / df['BIA-BIA_TBW']
    df['BMI_PHR'] = df['Physical-BMI'] * df['Physical-HeartRate']
    
    return df

In [22]:
train = preprocess_feature(train)
train = train.dropna(thresh=10, axis=0)
test = preprocess_feature(test)

In [23]:
train['Basic_Demos-Enroll_Season']

0       0
1       1
2       1
3       2
4       3
       ..
2722    0
2723    1
2724    0
2725    0
2726    3
Name: Basic_Demos-Enroll_Season, Length: 2715, dtype: int64

In [24]:
train_featuresCols += time_series_cols

train = train[train_featuresCols]
# drop cac ban ghi co sii nan
train = train.dropna(subset='sii') 


test_featuresCols += time_series_cols
test = test[test_featuresCols]

if np.any(np.isinf(train)):
    train = train.replace([np.inf, -np.inf], np.nan)

if np.any(np.isinf(test)):
    test = test.replace([np.inf, -np.inf], np.nan)

In [25]:
def process_abnormal(df, rules):
    for column, (min_val, max_val) in rules.items():
        if column in df.columns:
            cond = (df[column] < min_val) | (df[column] > max_val)
            df.loc[cond, column] = np.nan
    return df


rules = {
    'BIA-BIA_Fat': (5, 50),
    'BIA-BIA_FMI': (0.5, float('inf')),
    'BIA-BIA_FFMI': (12, 25),
    'BIA-BIA_ECW': (0, 40),
    'BIA-BIA_DEE': (1200, 3000),
    'BIA-BIA_BMR': (900, 2500),
    'BIA-BIA_ICW': (20, 70),
    'BIA-BIA_SMM': (12, 70),
    'BIA-BIA_TBW': (30, 80),
    'Physical-Weight': (40, float('inf'))
}
train = process_abnormal(train, rules)
test = process_abnormal(test, rules)

In [26]:
def feature_engineering(df):

    abnormal_conditions = {
        'Physical-Diastolic_BP': [(40, 100)],
        'Physical-Systolic_BP': [(80, 160)]
    }

    for col, ranges in abnormal_conditions.items():
        if col in df.columns:
            for lower, upper in ranges:
                df.loc[(df[col] < lower) | (df[col] > upper), col] = np.nan

    if {'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec'}.issubset(df.columns):
        df['Fitness_Endurance_Time'] = (
            df['Fitness_Endurance-Time_Mins'] * 60 + df['Fitness_Endurance-Time_Sec']
        )
    
    if 'Basic_Demos-Age' in df.columns:
        df['Age_Group'] = pd.cut(df['Basic_Demos-Age'], bins=[4, 12, 22], labels=[0, 1]).astype(int)

    if {'PAQ_A-PAQ_A_Total', 'PAQ_C-PAQ_C_Total'}.issubset(df.columns):
        df['PAQ_Total'] = df[['PAQ_A-PAQ_A_Total', 'PAQ_C-PAQ_C_Total']].max(axis=1)
    if 'PreInt_EduHx-computerinternet_hoursday' in df.columns and 'Basic_Demos-Age' in df.columns:
        df['Internet_Hours_Age'] = (
            df['PreInt_EduHx-computerinternet_hoursday'] * df['Basic_Demos-Age']
        )
    
    if 'Physical-BMI' in df.columns and 'Basic_Demos-Age' in df.columns:
        df['BMI_Age'] = df['Physical-BMI'] * df['Basic_Demos-Age']
    
    if 'Physical-Height' in df.columns and 'Basic_Demos-Age' in df.columns:
        df['Physical-Height_Age'] = df['Physical-Height'] * df['Basic_Demos-Age']

    if 'SDS-SDS_Total_T' in df.columns and 'PreInt_EduHx-computerinternet_hoursday' in df.columns:
        df['SDS_InternetHours'] = (
            df['SDS-SDS_Total_T'] * df['PreInt_EduHx-computerinternet_hoursday']
        )
    
    if 'BIA-BIA_BMI' in df.columns and 'SDS-SDS_Total_T' in df.columns:
        df['SDS_BMI'] = df['BIA-BIA_BMI'] * df['SDS-SDS_Total_T']

    if 'CGAS-CGAS_Score' in df.columns and 'SDS-SDS_Total_T' in df.columns:
        df['CGAS_SDS'] = df['CGAS-CGAS_Score'] * df['SDS-SDS_Total_T']
    
    if 'Physical-Systolic_BP' in df.columns and 'Basic_Demos-Age' in df.columns:
        df['Age_Systolic_BP'] = df['Physical-Systolic_BP'] * df['Basic_Demos-Age']

    if 'Physical-Systolic_BP' in df.columns and 'PreInt_EduHx-computerinternet_hoursday' in df.columns:
        df['PreInt_Systolic_BP'] = (
            df['Physical-Systolic_BP'] * df['PreInt_EduHx-computerinternet_hoursday']
        )

    if 'BIA-BIA_Activity_Level_num' in df.columns and 'PAQ_Total' in df.columns:
        df['PAQ_Activity'] = df['BIA-BIA_Activity_Level_num'] * df['PAQ_Total']

    return df


train = feature_engineering(train)
test= feature_engineering(test)


In [27]:
def eval_models(thresholds, y, y_predict_non_rounded):
    y_predic = np.where(y_predict_non_rounded < thresholds[0], 0,
                    np.where(y_predict_non_rounded < thresholds[1], 1,
                             np.where(y_predict_non_rounded < thresholds[2], 2, 3)))
    return -cohen_kappa_score(y, y_predic, weights='quadratic')

def train_models(model_class, test_data):
    X = train.drop(['sii'], axis=1)
    y = train['sii']

    SKF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    
    train_S = []
    test_S = []
    
    oof_non_rounded = np.zeros(len(y), dtype=float) 
    oof_rounded = np.zeros(len(y), dtype=int) 
    test_preds = np.zeros((len(test_data), n_splits))

    for fold, (train_idx, test_idx) in enumerate(tqdm(SKF.split(X, y), desc="Training Folds", total=n_splits)):
        X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

        model = clone(model_class)
        model.fit(X_train, y_train)

        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

        oof_non_rounded[test_idx] = y_val_pred
        y_val_pred_rounded = y_val_pred.round(0).astype(int)
        y_train_pred_rounded = y_train_pred.round(0).astype(int)
        oof_rounded[test_idx] = y_val_pred_rounded

        train_kappa = cohen_kappa_score(y_train, y_train_pred_rounded, weights='quadratic')        
        val_kappa = cohen_kappa_score(y_val, y_val_pred_rounded, weights='quadratic')

        train_S.append(train_kappa)
        test_S.append(val_kappa)
        
        test_preds[:, fold] = model.predict(test_data)

        print(f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")
        clear_output(wait=True)

    print(f"Mean Train QWK --> {np.mean(train_S):.4f}")
    print(f"Mean Validation QWK ---> {np.mean(test_S):.4f}")

    KappaOPtimizer = minimize(eval_models,
                              x0=[0.5, 1.5, 2.5], args=(y, oof_non_rounded), 
                              method='Nelder-Mead')
    assert KappaOPtimizer.success, "Optimization did not converge."
    
    oof_tuned = np.where(oof_non_rounded < (KappaOPtimizer.x)[0], 0,
                    np.where(oof_non_rounded < (KappaOPtimizer.x)[1], 1,
                             np.where(oof_non_rounded < (KappaOPtimizer.x)[2], 2, 3)))
    
    tKappa = cohen_kappa_score(y, oof_tuned, weights='quadratic')

    print(f"----> || Optimized SCORE :: {Fore.CYAN}{Style.BRIGHT} {tKappa:.3f}{Style.RESET_ALL}")

    tpm = test_preds.mean(axis=1)
    tpTuned = np.where(tpm < (KappaOPtimizer.x)[0], 0,
                    np.where(tpm < (KappaOPtimizer.x)[1], 1,
                             np.where(tpm < (KappaOPtimizer.x)[2], 2, 3)))
    
    submission = pd.DataFrame({
        'id': sample['id'],
        'sii': tpTuned
    })

    return submission

In [28]:


class TabNetWrapper(BaseEstimator, RegressorMixin):
    def __init__(self, **kwargs):
        self.model = TabNetRegressor(**kwargs)
        self.kwargs = kwargs
        self.imputer = KNNImputer(n_neighbors=5)
        #self.imputer = SimpleImputer(strategy='median')
        self.best_model_path = 'best_tabnet_model.pt'

    def fit(self, X, y):
        X_imputed = self.imputer.fit_transform(X)

        if hasattr(y, 'values'):
            y = y.values

        X_train, X_valid, y_train, y_valid = train_test_split(
            X_imputed,
            y,
            test_size=0.2,
            random_state=42
        )

        # Train TabNet model
        history = self.model.fit(
            X_train=X_train,
            y_train=y_train.reshape(-1, 1),
            eval_set=[(X_valid, y_valid.reshape(-1, 1))],
            eval_name=['valid'],
            eval_metric=['mse', 'mae', 'rmse'],
            max_epochs=500,
            patience=50,
            batch_size=1024,
            virtual_batch_size=128,
            num_workers=0,
            drop_last=False,
            callbacks=[
                TabNetPretrainedModelCheckpoint(
                    filepath=self.best_model_path,
                    monitor='valid_mse',
                    mode='min',
                    save_best_only=True,
                    verbose=True
                )
            ]
        )

        # Load the best model
        if os.path.exists(self.best_model_path):
            self.model.load_model(self.best_model_path)
            os.remove(self.best_model_path)  # Remove temporary file

        return self


    def predict(self, X):
        X_imputed = self.imputer.transform(X)
        return self.model.predict(X_imputed).flatten()

    def __deepcopy__(self, memo):
        cls = self.__class__
        result = cls.__new__(cls)
        memo[id(self)] = result
        for k, v in self.__dict__.items():
            setattr(result, k, deepcopy(v, memo))
        return result

TabNet_Params = {
    'n_d': 64,
    'n_a': 64,
    'n_steps': 5,
    'gamma': 1.5,
    'n_independent': 2,
    'n_shared': 2,
    'lambda_sparse': 1e-4,
    'optimizer_fn': torch.optim.Adam,
    'optimizer_params': dict(lr=1e-4, weight_decay=1e-5),
    'mask_type': 'entmax',
    'scheduler_params': dict(mode="min", patience=10, min_lr=1e-5, factor=0.5),
    'scheduler_fn': torch.optim.lr_scheduler.ReduceLROnPlateau,
    'verbose': 1,
    'device_name': 'cuda' if torch.cuda.is_available() else 'cpu'
}


class TabNetPretrainedModelCheckpoint(Callback):
    def __init__(self, filepath, monitor='val_loss', mode='min',
                 save_best_only=True, verbose=1):
        super().__init__()
        self.filepath = filepath
        self.monitor = monitor
        self.mode = mode
        self.save_best_only = save_best_only
        self.verbose = verbose
        self.best = float('inf') if mode == 'min' else -float('inf')

    def on_train_begin(self, logs=None):
        self.model = self.trainer

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        current = logs.get(self.monitor)
        if current is None:
            return

        if (self.mode == 'min' and current < self.best) or \
           (self.mode == 'max' and current > self.best):
            if self.verbose:
                print(f'\nEpoch {epoch}: {self.monitor} improved from {self.best:.4f} to {current:.4f}')
            self.best = current
            if self.save_best_only:
                self.model.save_model(self.filepath)

In [29]:
Params = {
    'learning_rate': 0.017862173759615217,
    'max_depth': 6,
    'num_leaves': 125,            
    'min_data_in_leaf': 13,    
    'feature_fraction':0.6522262873105152,     
    'bagging_fraction': 0.7654332788287815,   
    'bagging_freq': 4,
    'lambda_l1': 9.655325791404717,
    'lambda_l2': 0.320887274657755,
          
    'device': 'cpu',
    'min_gain_to_split': 0.5,
    'max_bin': 128


}


XGB_Params = {
    'learning_rate': 0.014712430731211663,
    'max_depth': 4,
    'n_estimators': 150,
    'subsample': 0.6802148193485659,
    'colsample_bytree': 0.7805781852928252,
    'reg_alpha': 0.04701140593625621,
    'reg_lambda': 9.583858559926233,
    'gamma': 0.028582667563424185,
    'max_bin': 256,
    'random_state': SEED,
    'tree_method': 'hist'
}



CatBoost_Params = {
    'learning_rate': 0.03672551139879832,
    'depth': 8,
    'iterations': 344,
    'random_seed': SEED,
    'verbose': 0,
    'l2_leaf_reg': 0.001989164174947358,
    'border_count': 128,
    'random_strength': 0.3829541667285703,
    'task_type': 'CPU',
    'bagging_temperature': 0.28498124222284416
}


In [30]:
import optuna
Light = LGBMRegressor(**Params, random_state=SEED, verbose=-1, n_estimators=300)
XGB_Model = XGBRegressor(**XGB_Params)
CatBoost_Model = CatBoostRegressor(**CatBoost_Params)
tabnet = TabNetWrapper(**TabNet_Params)
voting_model = VotingRegressor(estimators=[
    ('lightgbm', Light),
    ('xgboost', XGB_Model),
    ('catboost', CatBoost_Model),
    ('tabnet', tabnet)
])

In [31]:

Submission1 = train_models(voting_model, test)

Submission1.to_csv('submission.csv', index=False)

Training Folds: 100%|██████████| 5/5 [04:45<00:00, 57.15s/it]

Mean Train QWK --> 0.6187
Mean Validation QWK ---> 0.3774
----> || Optimized SCORE :: [36m[1m 0.463[0m



