In [54]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import VotingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

from tqdm import tqdm
from IPython.display import clear_output
from concurrent.futures import ThreadPoolExecutor

from colorama import Fore, Style
from lightgbm import LGBMRegressor

from sklearn.base import clone
from scipy.optimize import minimize
import matplotlib.pyplot as plt

import re
from colorama import Fore, Style

from tqdm import tqdm
from IPython.display import clear_output
import random
import torch
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
import torch.nn as nn
import torch.optim as optim
from catboost import CatBoostRegressor, CatBoostClassifier
from xgboost import XGBRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.model_selection import *
from sklearn.metrics import *

import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_columns = None

n_splits = 5
SEED = 42

In [55]:
train = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/train.csv')
test = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/test.csv')

In [56]:
train.head()
missing_percentage = train.isnull().mean() * 100

# Lọc danh sách các cột cần loại bỏ
columns_to_drop = missing_percentage[missing_percentage > 70].index
print("columns to drop", columns_to_drop)
train = train.drop(columns=columns_to_drop)
test = test.drop(columns=columns_to_drop)

train = train.dropna(subset='sii')
print(train.shape)

columns to drop Index(['Physical-Waist_Circumference', 'Fitness_Endurance-Max_Stage',
       'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
       'FGC-FGC_GSND', 'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone',
       'PAQ_A-Season', 'PAQ_A-PAQ_A_Total'],
      dtype='object')
(2736, 72)


In [57]:
imputer = KNNImputer(n_neighbors=5)
numeric_cols = train.select_dtypes(include=['float64', 'int64']).columns
train[numeric_cols] = train[numeric_cols].replace([np.inf, -np.inf], np.nan)
train[numeric_cols] = imputer.fit_transform(train[numeric_cols])

cat_cols = train.select_dtypes(include=['object']).columns
cat_imputer = SimpleImputer(strategy='most_frequent')
train[cat_cols] = cat_imputer.fit_transform(train[cat_cols])

season_mapping = {'Spring': 1, 'Summer': 2, 'Fall': 3, 'Winter': 4}
season_cols = [col for col in train.columns if 'Season' in col]
for col in season_cols:
    train[col] = train[col].map(season_mapping)

In [58]:
pciat_columns = [col for col in train.columns if 'PCIAT-PCIAT' in col and col != 'PCIAT-PCIAT_Total']
corr_with_total = train[pciat_columns].corrwith(train['PCIAT-PCIAT_Total'])
print("Mối tương quan với PCIAT-PCIAT_TOTAL:")
print(corr_with_total)

train.drop(columns= pciat_columns, inplace=True)

Mối tương quan với PCIAT-PCIAT_TOTAL:
PCIAT-PCIAT_01    0.728246
PCIAT-PCIAT_02    0.789033
PCIAT-PCIAT_03    0.823509
PCIAT-PCIAT_04    0.625314
PCIAT-PCIAT_05    0.831479
PCIAT-PCIAT_06    0.703399
PCIAT-PCIAT_07    0.492813
PCIAT-PCIAT_08    0.770630
PCIAT-PCIAT_09    0.721021
PCIAT-PCIAT_10    0.761529
PCIAT-PCIAT_11    0.684858
PCIAT-PCIAT_12    0.411792
PCIAT-PCIAT_13    0.772177
PCIAT-PCIAT_14    0.722164
PCIAT-PCIAT_15    0.824310
PCIAT-PCIAT_16    0.769773
PCIAT-PCIAT_17    0.824338
PCIAT-PCIAT_18    0.801978
PCIAT-PCIAT_19    0.737316
PCIAT-PCIAT_20    0.754248
dtype: float64


In [59]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True
seed_everything(2024)

# Load and process files parquet
def process_file(filename, dirname):
    df = pd.read_parquet(os.path.join(dirname, filename, 'part-0.parquet'))
    df.drop('step', axis=1, inplace=True)
    return df.describe().values.reshape(-1), filename.split('=')[1]

# Optimized loading time series
def load_time_series(dirname) -> pd.DataFrame:
    ids = os.listdir(dirname)
    with ThreadPoolExecutor() as executor:
        results = list(tqdm(executor.map(lambda fname: process_file(fname, dirname), ids), total=len(ids)))
    
    stats, indexes = zip(*results)
    df = pd.DataFrame(stats, columns=[f"stat_{i}" for i in range(len(stats[0]))])
    df['id'] = indexes
    return df

# AutoEncoder class
class AutoEncoder(nn.Module):
    def __init__(self, input_dim, encoding_dim):
        super(AutoEncoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, encoding_dim * 3), nn.ReLU(),
            nn.Linear(encoding_dim * 3, encoding_dim * 2), nn.ReLU(),
            nn.Linear(encoding_dim * 2, encoding_dim), nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim, input_dim * 2), nn.ReLU(),
            nn.Linear(input_dim * 2, input_dim * 3), nn.ReLU(),
            nn.Linear(input_dim * 3, input_dim), nn.Sigmoid()
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

# Optimized Autoencoder Training Function
def perform_autoencoder(df, encoding_dim=50, epochs=50, batch_size=32):
    scaler = StandardScaler()
    df_scaled = scaler.fit_transform(df)
    data_tensor = torch.FloatTensor(df_scaled)
    
    input_dim = data_tensor.shape[1]
    autoencoder = AutoEncoder(input_dim, encoding_dim)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(autoencoder.parameters())
    
    for epoch in range(epochs):
        for i in range(0, len(data_tensor), batch_size):
            batch = data_tensor[i:i + batch_size]
            optimizer.zero_grad()
            reconstructed = autoencoder(batch)
            loss = criterion(reconstructed, batch)
            loss.backward()
            optimizer.step()
            
        if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch + 1}/{epochs}], Loss: {loss.item():.4f}')
    
    with torch.no_grad():
        encoded_data = autoencoder.encoder(data_tensor).numpy()
    df_encoded = pd.DataFrame(encoded_data, columns=[f'Enc_{i + 1}' for i in range(encoded_data.shape[1])])
    return df_encoded

In [60]:
train_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet")
test_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet")

100%|██████████| 996/996 [02:23<00:00,  6.95it/s]
100%|██████████| 2/2 [00:00<00:00,  9.22it/s]


In [61]:
df_train = train_ts.drop('id', axis=1)
df_test = test_ts.drop('id', axis=1)

train_ts_encoded = perform_autoencoder(df_train, encoding_dim=60, epochs=100, batch_size=32)
test_ts_encoded = perform_autoencoder(df_test, encoding_dim=60, epochs=100, batch_size=32)

train_ts_encoded["id"] = train_ts["id"]
test_ts_encoded['id'] = test_ts["id"]

train = pd.merge(train, train_ts_encoded, how="left", on='id')
test = pd.merge(test, test_ts_encoded, how="left", on='id')

# train = train.drop('id', axis=1)
# test = test.drop('id', axis=1)

Epoch [10/100], Loss: 1.6378
Epoch [20/100], Loss: 1.5450
Epoch [30/100], Loss: 1.5127
Epoch [40/100], Loss: 1.5011
Epoch [50/100], Loss: 1.5013
Epoch [60/100], Loss: 1.5026
Epoch [70/100], Loss: 1.4933
Epoch [80/100], Loss: 1.4747
Epoch [90/100], Loss: 1.4714
Epoch [100/100], Loss: 1.4555
Epoch [10/100], Loss: 1.0070
Epoch [20/100], Loss: 0.5783
Epoch [30/100], Loss: 0.4271
Epoch [40/100], Loss: 0.4271
Epoch [50/100], Loss: 0.4271
Epoch [60/100], Loss: 0.4271
Epoch [70/100], Loss: 0.4271
Epoch [80/100], Loss: 0.4271
Epoch [90/100], Loss: 0.4271
Epoch [100/100], Loss: 0.4271


In [62]:
imputer = KNNImputer(n_neighbors=5)
numeric_cols = train.select_dtypes(include=['float64', 'int64']).columns
train[numeric_cols] = train[numeric_cols].replace([np.inf, -np.inf], np.nan)
train[numeric_cols] = imputer.fit_transform(train[numeric_cols])

In [63]:
train.head()

Unnamed: 0,id,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Season,FGC-Season,FGC-FGC_CU,FGC-FGC_CU_Zone,FGC-FGC_PU,FGC-FGC_PU_Zone,FGC-FGC_SRL,FGC-FGC_SRL_Zone,FGC-FGC_SRR,FGC-FGC_SRR_Zone,FGC-FGC_TL,FGC-FGC_TL_Zone,BIA-Season,BIA-BIA_Activity_Level_num,BIA-BIA_BMC,BIA-BIA_BMI,BIA-BIA_BMR,BIA-BIA_DEE,BIA-BIA_ECW,BIA-BIA_FFM,BIA-BIA_FFMI,BIA-BIA_FMI,BIA-BIA_Fat,BIA-BIA_Frame_num,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,PAQ_C-Season,PAQ_C-PAQ_C_Total,PCIAT-Season,PCIAT-PCIAT_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,sii,Enc_1,Enc_2,Enc_3,Enc_4,Enc_5,Enc_6,Enc_7,Enc_8,Enc_9,Enc_10,Enc_11,Enc_12,Enc_13,Enc_14,Enc_15,Enc_16,Enc_17,Enc_18,Enc_19,Enc_20,Enc_21,Enc_22,Enc_23,Enc_24,Enc_25,Enc_26,Enc_27,Enc_28,Enc_29,Enc_30,Enc_31,Enc_32,Enc_33,Enc_34,Enc_35,Enc_36,Enc_37,Enc_38,Enc_39,Enc_40,Enc_41,Enc_42,Enc_43,Enc_44,Enc_45,Enc_46,Enc_47,Enc_48,Enc_49,Enc_50,Enc_51,Enc_52,Enc_53,Enc_54,Enc_55,Enc_56,Enc_57,Enc_58,Enc_59,Enc_60
0,00008ff9,3.0,5.0,0.0,4.0,51.0,3.0,16.877316,46.0,50.8,66.8,79.4,109.6,1.0,3.0,0.0,0.0,0.0,0.0,7.0,0.0,6.0,0.0,6.0,1.0,3.0,2.0,2.66855,16.8792,932.498,1492.0,8.25598,41.5862,13.8177,3.06143,9.21377,1.0,24.4349,8.89536,38.9177,19.5413,32.6909,1.0,2.17,3.0,55.0,1.0,42.8,60.0,3.0,3.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,000fd460,2.0,9.0,0.0,1.0,63.4,3.0,14.03559,48.0,46.0,75.0,70.0,122.0,1.0,3.0,3.0,0.0,5.0,0.0,11.0,1.0,11.0,1.0,3.0,0.0,4.0,2.0,2.57949,14.0371,936.656,1498.65,6.01993,42.0291,12.8254,1.21172,3.97085,1.0,21.0352,14.974,39.4497,15.4107,27.0552,3.0,2.34,3.0,0.0,3.0,46.0,64.0,2.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,00105258,2.0,10.0,1.0,3.0,71.0,3.0,16.648696,56.5,75.6,65.0,94.0,117.0,3.0,3.0,20.0,1.0,7.0,1.0,10.0,1.0,10.0,1.0,5.0,0.0,2.0,2.4,3.789656,18.00676,1078.652,1727.572,15.20566,57.15374,14.118,3.888752,15.686258,1.4,27.86406,14.08406,53.3641,25.41684,43.06968,2.0,2.17,3.0,28.0,3.0,38.0,54.0,2.0,2.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,00115b9f,4.0,9.0,0.0,3.0,71.0,2.0,18.292347,56.0,81.6,60.0,97.0,117.0,2.0,2.0,18.0,1.0,5.0,0.0,7.0,0.0,7.0,0.0,7.0,1.0,2.0,3.0,3.84191,18.2943,1131.43,1923.44,15.5925,62.7757,14.074,4.22033,18.8243,2.0,30.4041,16.779,58.9338,26.4798,45.9966,4.0,2.451,2.0,44.0,2.0,31.0,45.0,4.0,0.0,1.0,3.706038,4.952529,0.0,3.216783,0.492864,0.0,2.913569,5.385433,2.458169,3.091718,1.098612,4.530478,0.0,1.50608,1.643995,1.854122,0.779649,0.154253,0.0,0.0,0.561464,4.63403,4.879626,0.0,0.0,2.705188,0.0,0.0,5.459943,3.150194,2.637216,2.85992,0.510033,0.0,0.0,1.764689,3.431169,0.0,0.919617,0.0,1.185839,3.096888,0.118984,0.0,1.553793,4.492571,0.0,4.213635,0.0,0.0,0.590492,1.065462,1.234016,0.0,0.0,3.34918,1.690656,2.483185,5.593308,1.066349
4,001f3379,1.0,13.0,1.0,4.0,50.0,2.0,22.279952,59.5,112.2,60.0,73.0,102.0,1.0,2.0,12.0,0.0,6.0,0.0,10.0,1.0,11.0,1.0,8.0,0.0,2.0,2.0,4.33036,30.1865,1330.97,1996.45,30.2124,84.0285,16.6877,13.4988,67.9715,2.0,32.9141,20.902,79.6982,35.3804,63.1265,1.0,4.11,2.0,34.0,2.0,40.0,56.0,1.0,0.0,1.0,0.0,0.0,1.906668,3.442728,5.017359,0.0,4.50842,2.065328,2.109076,0.0,0.0,2.713881,3.785229,1.712004,6.489125,0.059956,4.035987,8.721813,0.0,3.59797,0.480155,0.0,6.547864,0.0,1.411324,0.0,0.972613,0.0,1.744805,2.084547,2.875771,0.0,2.768205,0.957273,0.0,4.773528,1.777925,0.0,3.333569,1.779179,0.0,1.92079,0.18158,0.0,0.0,3.013146,2.475967,0.697922,0.0,4.014232,1.449124,2.364651,0.0,0.0,0.0,0.0,0.0,0.55268,0.0,3.285048


In [64]:
test

Unnamed: 0,id,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Season,FGC-Season,FGC-FGC_CU,FGC-FGC_CU_Zone,FGC-FGC_PU,FGC-FGC_PU_Zone,FGC-FGC_SRL,FGC-FGC_SRL_Zone,FGC-FGC_SRR,FGC-FGC_SRR_Zone,FGC-FGC_TL,FGC-FGC_TL_Zone,BIA-Season,BIA-BIA_Activity_Level_num,BIA-BIA_BMC,BIA-BIA_BMI,BIA-BIA_BMR,BIA-BIA_DEE,BIA-BIA_ECW,BIA-BIA_FFM,BIA-BIA_FFMI,BIA-BIA_FMI,BIA-BIA_Fat,BIA-BIA_Frame_num,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,PAQ_C-Season,PAQ_C-PAQ_C_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,Enc_1,Enc_2,Enc_3,Enc_4,Enc_5,Enc_6,Enc_7,Enc_8,Enc_9,Enc_10,Enc_11,Enc_12,Enc_13,Enc_14,Enc_15,Enc_16,Enc_17,Enc_18,Enc_19,Enc_20,Enc_21,Enc_22,Enc_23,Enc_24,Enc_25,Enc_26,Enc_27,Enc_28,Enc_29,Enc_30,Enc_31,Enc_32,Enc_33,Enc_34,Enc_35,Enc_36,Enc_37,Enc_38,Enc_39,Enc_40,Enc_41,Enc_42,Enc_43,Enc_44,Enc_45,Enc_46,Enc_47,Enc_48,Enc_49,Enc_50,Enc_51,Enc_52,Enc_53,Enc_54,Enc_55,Enc_56,Enc_57,Enc_58,Enc_59,Enc_60
0,00008ff9,Fall,5,0,Winter,51.0,Fall,16.877316,46.0,50.8,,,,,Fall,0.0,0.0,0.0,0.0,7.0,0.0,6.0,0.0,6.0,1.0,Fall,2.0,2.66855,16.8792,932.498,1492.0,8.25598,41.5862,13.8177,3.06143,9.21377,1.0,24.4349,8.89536,38.9177,19.5413,32.6909,,,,,,Fall,3.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,000fd460,Summer,9,0,,,Fall,14.03559,48.0,46.0,75.0,70.0,122.0,,Fall,3.0,0.0,5.0,0.0,11.0,1.0,11.0,1.0,3.0,0.0,Winter,2.0,2.57949,14.0371,936.656,1498.65,6.01993,42.0291,12.8254,1.21172,3.97085,1.0,21.0352,14.974,39.4497,15.4107,27.0552,Fall,2.34,Fall,46.0,64.0,Summer,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,00105258,Summer,10,1,Fall,71.0,Fall,16.648696,56.5,75.6,65.0,94.0,117.0,Fall,Fall,20.0,1.0,7.0,1.0,10.0,1.0,10.0,1.0,5.0,0.0,,,,,,,,,,,,,,,,,,Summer,2.17,Fall,38.0,54.0,Summer,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,00115b9f,Winter,9,0,Fall,71.0,Summer,18.292347,56.0,81.6,60.0,97.0,117.0,Summer,Summer,18.0,1.0,5.0,0.0,7.0,0.0,7.0,0.0,7.0,1.0,Summer,3.0,3.84191,18.2943,1131.43,1923.44,15.5925,62.7757,14.074,4.22033,18.8243,2.0,30.4041,16.779,58.9338,26.4798,45.9966,Winter,2.451,Summer,31.0,45.0,Winter,0.0,0.0,0.0,15.157281,0.0,0.037228,0.0,2.281092,0.0,7.973123,1.643958,8.318027,14.723692,0.0,0.0,0.0,0.0,0.0,15.254554,11.71503,8.90442,0.0,7.384878,0.0,0.0,9.785435,0.0,6.996854,11.378406,4.382203,0.0,0.0,0.0,0.0,7.070956,0.273735,0.0,0.0,0.0,0.0,12.211447,0.0,0.0,0.0,0.166452,0.0,12.202832,9.163034,9.083336,0.0,8.390906,3.087122,9.608292,0.0,0.120096,0.0,9.299348,6.812585,0.0,0.0,3.360213
4,0016bb22,Spring,18,1,Summer,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5,001f3379,Spring,13,1,Winter,50.0,Summer,22.279952,59.5,112.2,60.0,73.0,102.0,,Summer,12.0,0.0,6.0,0.0,10.0,1.0,11.0,1.0,8.0,0.0,Summer,2.0,4.33036,30.1865,1330.97,1996.45,30.2124,84.0285,16.6877,13.4988,67.9715,2.0,32.9141,20.902,79.6982,35.3804,63.1265,Spring,4.11,Summer,40.0,56.0,Spring,0.0,9.283687,1.391957,0.0,0.0,7.428909,13.35609,4.7154,8.020968,0.0,1.163097,0.0,0.0,5.585678,6.63162,0.0,8.276455,7.526707,0.0,0.0,0.0,7.320195,0.0,0.0,9.337597,0.0,0.0,0.0,4.618068,2.678735,0.0,0.0,0.0,0.0,2.957028,5.513256,0.0,0.0,0.0,0.0,0.0,0.0,10.662449,0.0,0.0,0.0,0.418605,0.0,0.0,0.0,0.0,6.256849,0.0,11.235291,0.0,0.0,4.316035,3.154848,3.933196,8.849051,2.365496
6,0038ba98,Fall,10,0,,,Fall,19.66076,55.0,84.6,123.0,83.0,163.0,,Fall,9.0,1.0,2.0,0.0,11.0,1.0,11.0,1.0,11.0,1.0,Fall,2.0,3.78271,19.6629,1135.86,1817.38,16.3275,63.247,14.7,4.96291,21.353,2.0,30.8936,16.0259,59.4643,26.1957,47.2211,Winter,3.67,Winter,27.0,40.0,Fall,3.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
7,0068a485,Fall,10,1,,,Fall,16.861286,59.25,84.2,71.0,90.0,116.0,,Fall,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,Fall,3.0,4.05726,16.8631,1180.04,1888.06,21.94,67.9527,13.6092,3.25395,16.2474,2.0,28.5367,17.476,63.8954,28.768,50.4767,Fall,1.27,,,,Fall,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
8,0069fbed,Summer,15,0,,,Spring,,,,,,,,Spring,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Summer,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
9,0083e397,Summer,19,1,Summer,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [65]:
imputer = KNNImputer(n_neighbors=5)

numeric_cols = test.select_dtypes(include=['float64', 'int64']).columns
test[numeric_cols] = imputer.fit_transform(test[numeric_cols])
test = pd.merge(test, test_ts, how="left", on='id')

cat_cols = test.select_dtypes(include=['object']).columns
cat_imputer = SimpleImputer(strategy='most_frequent')
test[cat_cols] = cat_imputer.fit_transform(test[cat_cols])

season_mapping = {'Spring': 1, 'Summer': 2, 'Fall': 3, 'Winter': 4}
season_cols = [col for col in test.columns if 'Season' in col]
for col in season_cols:
    test[col] = test[col].map(season_mapping)

common_features = [col for col in train.columns if col in test.columns]
test = test[common_features]
train = train[common_features + ['sii']]

In [66]:
test

Unnamed: 0,id,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Season,FGC-Season,FGC-FGC_CU,FGC-FGC_CU_Zone,FGC-FGC_PU,FGC-FGC_PU_Zone,FGC-FGC_SRL,FGC-FGC_SRL_Zone,FGC-FGC_SRR,FGC-FGC_SRR_Zone,FGC-FGC_TL,FGC-FGC_TL_Zone,BIA-Season,BIA-BIA_Activity_Level_num,BIA-BIA_BMC,BIA-BIA_BMI,BIA-BIA_BMR,BIA-BIA_DEE,BIA-BIA_ECW,BIA-BIA_FFM,BIA-BIA_FFMI,BIA-BIA_FMI,BIA-BIA_Fat,BIA-BIA_Frame_num,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,PAQ_C-Season,PAQ_C-PAQ_C_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,Enc_1,Enc_2,Enc_3,Enc_4,Enc_5,Enc_6,Enc_7,Enc_8,Enc_9,Enc_10,Enc_11,Enc_12,Enc_13,Enc_14,Enc_15,Enc_16,Enc_17,Enc_18,Enc_19,Enc_20,Enc_21,Enc_22,Enc_23,Enc_24,Enc_25,Enc_26,Enc_27,Enc_28,Enc_29,Enc_30,Enc_31,Enc_32,Enc_33,Enc_34,Enc_35,Enc_36,Enc_37,Enc_38,Enc_39,Enc_40,Enc_41,Enc_42,Enc_43,Enc_44,Enc_45,Enc_46,Enc_47,Enc_48,Enc_49,Enc_50,Enc_51,Enc_52,Enc_53,Enc_54,Enc_55,Enc_56,Enc_57,Enc_58,Enc_59,Enc_60
0,00008ff9,3,5.0,0.0,4,51.0,3,16.877316,46.0,50.8,63.8,78.4,109.2,1,3,0.0,0.0,0.0,0.0,7.0,0.0,6.0,0.0,6.0,1.0,3,2.0,2.66855,16.8792,932.498,1492.0,8.25598,41.5862,13.8177,3.06143,9.21377,1.0,24.4349,8.89536,38.9177,19.5413,32.6909,3,2.1,3,40.4,57.0,3,3.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,000fd460,2,9.0,0.0,2,65.6,3,14.03559,48.0,46.0,75.0,70.0,122.0,1,3,3.0,0.0,5.0,0.0,11.0,1.0,11.0,1.0,3.0,0.0,4,2.0,2.57949,14.0371,936.656,1498.65,6.01993,42.0291,12.8254,1.21172,3.97085,1.0,21.0352,14.974,39.4497,15.4107,27.0552,3,2.34,3,46.0,64.0,2,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,00105258,2,10.0,1.0,3,71.0,3,16.648696,56.5,75.6,65.0,94.0,117.0,3,3,20.0,1.0,7.0,1.0,10.0,1.0,10.0,1.0,5.0,0.0,3,2.4,3.179512,16.6695,1036.7388,1661.352,13.390802,52.68942,13.68322,2.9863,11.870584,1.4,25.60582,13.692792,49.50994,22.01768,38.99664,2,2.17,3,38.0,54.0,2,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,00115b9f,4,9.0,0.0,3,71.0,2,18.292347,56.0,81.6,60.0,97.0,117.0,2,2,18.0,1.0,5.0,0.0,7.0,0.0,7.0,0.0,7.0,1.0,2,3.0,3.84191,18.2943,1131.43,1923.44,15.5925,62.7757,14.074,4.22033,18.8243,2.0,30.4041,16.779,58.9338,26.4798,45.9966,4,2.451,2,31.0,45.0,4,0.0,0.0,0.0,15.157281,0.0,0.037228,0.0,2.281092,0.0,7.973123,1.643958,8.318027,14.723692,0.0,0.0,0.0,0.0,0.0,15.254554,11.71503,8.90442,0.0,7.384878,0.0,0.0,9.785435,0.0,6.996854,11.378406,4.382203,0.0,0.0,0.0,0.0,7.070956,0.273735,0.0,0.0,0.0,0.0,12.211447,0.0,0.0,0.0,0.166452,0.0,12.202832,9.163034,9.083336,0.0,8.390906,3.087122,9.608292,0.0,0.120096,0.0,9.299348,6.812585,0.0,0.0,3.360213
4,0016bb22,1,18.0,1.0,2,61.8,3,19.131514,58.3,92.32,63.2,79.6,117.6,1,3,10.2,0.2,6.6,0.2,5.0,0.4,5.9,0.6,9.1,0.6,3,2.8,3.966014,20.3662,1164.5972,2035.05,18.890866,66.30792,14.6964,5.669806,27.05209,1.8,29.88736,17.52968,62.34192,28.3956,48.77826,3,2.324,3,36.0,51.2,3,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5,001f3379,1,13.0,1.0,4,50.0,2,22.279952,59.5,112.2,60.0,73.0,102.0,1,2,12.0,0.0,6.0,0.0,10.0,1.0,11.0,1.0,8.0,0.0,2,2.0,4.33036,30.1865,1330.97,1996.45,30.2124,84.0285,16.6877,13.4988,67.9715,2.0,32.9141,20.902,79.6982,35.3804,63.1265,1,4.11,2,40.0,56.0,1,0.0,9.283687,1.391957,0.0,0.0,7.428909,13.35609,4.7154,8.020968,0.0,1.163097,0.0,0.0,5.585678,6.63162,0.0,8.276455,7.526707,0.0,0.0,0.0,7.320195,0.0,0.0,9.337597,0.0,0.0,0.0,4.618068,2.678735,0.0,0.0,0.0,0.0,2.957028,5.513256,0.0,0.0,0.0,0.0,0.0,0.0,10.662449,0.0,0.0,0.0,0.418605,0.0,0.0,0.0,0.0,6.256849,0.0,11.235291,0.0,0.0,4.316035,3.154848,3.933196,8.849051,2.365496
6,0038ba98,3,10.0,0.0,2,67.8,3,19.66076,55.0,84.6,123.0,83.0,163.0,1,3,9.0,1.0,2.0,0.0,11.0,1.0,11.0,1.0,11.0,1.0,3,2.0,3.78271,19.6629,1135.86,1817.38,16.3275,63.247,14.7,4.96291,21.353,2.0,30.8936,16.0259,59.4643,26.1957,47.2211,4,3.67,4,27.0,40.0,3,3.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
7,0068a485,3,10.0,1.0,2,67.8,3,16.861286,59.25,84.2,71.0,90.0,116.0,1,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,3,3.0,4.05726,16.8631,1180.04,1888.06,21.94,67.9527,13.6092,3.25395,16.2474,2.0,28.5367,17.476,63.8954,28.768,50.4767,3,1.27,3,36.6,52.2,3,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
8,0069fbed,2,15.0,0.0,2,61.8,1,18.482622,58.6,90.32,74.8,80.0,126.8,1,1,13.2,0.4,8.0,0.4,7.0,0.6,7.9,0.8,8.3,0.4,3,2.8,3.966014,20.3662,1164.5972,2035.05,18.890866,66.30792,14.6964,5.669806,27.05209,1.8,29.88736,17.52968,62.34192,28.3956,48.77826,3,2.624,3,36.0,51.2,2,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
9,0083e397,2,19.0,1.0,2,61.8,3,19.131514,58.3,92.32,63.2,79.6,117.6,1,3,10.2,0.2,6.6,0.2,5.0,0.4,5.9,0.6,9.1,0.6,3,2.8,3.966014,20.3662,1164.5972,2035.05,18.890866,66.30792,14.6964,5.669806,27.05209,1.8,29.88736,17.52968,62.34192,28.3956,48.77826,3,2.324,3,36.0,51.2,3,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [67]:
def quadratic_weighted_kappa(y_true, y_pred):
    return cohen_kappa_score(y_true, y_pred, weights='quadratic')

def threshold_Rounder(oof_non_rounded, thresholds):
    return np.where(oof_non_rounded < thresholds[0], 0,
                    np.where(oof_non_rounded < thresholds[1], 1,
                             np.where(oof_non_rounded < thresholds[2], 2, 3)))

def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    rounded_p = threshold_Rounder(oof_non_rounded, thresholds)
    return -quadratic_weighted_kappa(y_true, rounded_p)

In [68]:
train = train.drop('id', axis=1)
def Train_model(model_class, test_data):
    X = train.drop(['sii'], axis=1)
    y = train['sii']

    # Apply K-Fold
    SKF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    
    train_S = []
    test_S = []
    test_ids = test_data['id']
    test_data = test_data.drop('id', axis=1)
    
    oof_non_rounded = np.zeros(len(y), dtype=float) 
    oof_rounded = np.zeros(len(y), dtype=int) 
    test_preds = np.zeros((len(test_data), n_splits))
    
    for fold, (train_idx, test_idx) in enumerate(tqdm(SKF.split(X, y), desc="Training Folds", total=n_splits)):
        # Train model
        X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

        model = clone(model_class)
        model.fit(X_train, y_train)

        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

        # Round to integer values
        oof_non_rounded[test_idx] = y_val_pred
        y_val_pred_rounded = y_val_pred.round(0).astype(int)
        oof_rounded[test_idx] = y_val_pred_rounded

        train_kappa = quadratic_weighted_kappa(y_train, y_train_pred.round(0).astype(int))
        val_kappa = quadratic_weighted_kappa(y_val, y_val_pred_rounded)

        train_S.append(train_kappa)
        test_S.append(val_kappa)

        #Predict with test dataset
        test_preds[:, fold] = model.predict(test_data)
        
        print(f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")
        clear_output(wait=True)

    print(f"Mean Train QWK --> {np.mean(train_S):.4f}")
    print(f"Mean Validation QWK ---> {np.mean(test_S):.4f}")

    # Using optimizer to find the best threshold
    KappaOPtimizer = minimize(evaluate_predictions,
                              x0=[0.5, 1.5, 2.5], args=(y, oof_non_rounded), 
                              method='Nelder-Mead') # Nelder-Mead | # Powell
    assert KappaOPtimizer.success, "Optimization did not converge."

    # Use the threshold retrive from the optimizer to predict again to evaluate
    oof_tuned = threshold_Rounder(oof_non_rounded, KappaOPtimizer.x)
    tKappa = quadratic_weighted_kappa(y, oof_tuned)

    print(f"----> || Optimized QWK SCORE :: {Fore.CYAN}{Style.BRIGHT} {tKappa:.3f}{Style.RESET_ALL}")

    # Use the threshold retrive from the optimizer to predict test
    tpm = test_preds.mean(axis=1)
    tpTuned = threshold_Rounder(tpm, KappaOPtimizer.x)

    # Create submition
    submission = pd.DataFrame({
        'id': test_ids,
        'sii': tpTuned
    })

    return submission


Light_Params = {
    'learning_rate': 0.046,
    'max_depth': 12,
    'num_leaves': 478,
    'min_data_in_leaf': 13,
    'feature_fraction': 0.893,
    'bagging_fraction': 0.784,
    'bagging_freq': 4,
    'lambda_l1': 10,
    'lambda_l2': 0.01
}

XGB_Params = {
    'learning_rate': 0.05,
    'max_depth': 6,
    'n_estimators': 200,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 1,
    'reg_lambda': 5,
    'random_state': SEED
}

CatBoost_Params = {
    'learning_rate': 0.05,
    'depth': 6,
    'iterations': 200,
    'random_seed': SEED,
    'verbose': 0,
    'l2_leaf_reg': 10
}

# Create model instances
Light = LGBMRegressor(**Light_Params, random_state=SEED, verbose=-1, n_estimators=300)
XGB_Model = XGBRegressor(**XGB_Params)
CatBoost_Model = CatBoostRegressor(**CatBoost_Params)

# Combine models using Voting Regressor
voting_model = VotingRegressor(estimators=[
    ('lightgbm', Light),
    ('xgboost', XGB_Model),
    ('catboost', CatBoost_Model)
])

# Train the ensemble model
Submission = Train_model(voting_model, test)

Training Folds: 100%|██████████| 5/5 [00:55<00:00, 11.00s/it]

Mean Train QWK --> 0.8100
Mean Validation QWK ---> 0.4859





----> || Optimized QWK SCORE :: [36m[1m 0.546[0m


In [69]:
Submission

Unnamed: 0,id,sii
0,00008ff9,1
1,000fd460,0
2,00105258,0
3,00115b9f,1
4,0016bb22,1
5,001f3379,1
6,0038ba98,0
7,0068a485,0
8,0069fbed,1
9,0083e397,1


In [70]:
Submission.to_csv('submission.csv', index=False)