In [1]:
# time series count 지울것

# Based
- https://www.kaggle.com/code/honganzhu/cmi-piu-competition?scriptVersionId=201912528 Version44 LB0.492

 If you find this notebook useful, please upvote this and the based one.

In [2]:
import numpy as np
import pandas as pd
import os
import re
from sklearn.base import clone
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import StratifiedKFold
from scipy.optimize import minimize
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import polars as pl
import polars.selectors as cs
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator, FormatStrFormatter, PercentFormatter
import seaborn as sns

from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim

from colorama import Fore, Style
from IPython.display import clear_output
import warnings
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import VotingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
from pytorch_tabnet.tab_model import TabNetRegressor
warnings.filterwarnings('ignore')
pd.options.display.max_columns = None
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', None)

pl.Config.set_tbl_rows(-1)
pl.Config.set_tbl_cols(-1)
pl.Config.set_fmt_str_lengths(10000)

SEED = 42
n_splits = 5

# Feature Engineering

- **Feature Selection**: The dataset contains features related to physical characteristics (e.g., BMI, Height, Weight), behavioral aspects (e.g., internet usage), and fitness data (e.g., endurance time). 
- **Categorical Feature Encoding**: Categorical features are mapped to numerical values using custom mappings for each unique category within the dataset. This ensures compatibility with machine learning algorithms that require numerical input.
- **Time Series Aggregation**: Time series statistics (e.g., mean, standard deviation) from the actigraphy data are computed and merged into the main dataset to create additional features for model training.


In [3]:
def process_file(id_folder, dirname):
    files = os.listdir(os.path.join(path, dirname, id_folder))
    if len(files) > 1:
        df_list = []
        for file in files:
            df = pd.read_parquet(os.path.join(dirname, id_folder, file))
            df_list.append(df)
        df = pd.concat(df_list, ignore_index=True)
    else:
        df = pd.read_parquet(os.path.join(dirname, id_folder, files[0]))
    
    df.drop('step', axis=1, inplace=True)

    describe_df = df.describe(percentiles=[0.01] + [i*0.5/10 + 0.05 for i in range(19)])
    count_mean = describe_df.loc['count', :].mean()
    describe_df.drop('count', axis=0, inplace=True)
    
    return np.append(describe_df.values.reshape(-1), count_mean), id_folder.split('=')[1]

def load_time_series(dirname) -> pd.DataFrame:
    ids = os.listdir(dirname)
    
    with ThreadPoolExecutor() as executor:
        results = list(tqdm(executor.map(lambda fname: process_file(fname, dirname), ids), total=len(ids)))
    
    stats, indexes = zip(*results)
    
    df = pd.DataFrame(stats, columns=[f"stat_{i}" for i in range(len(stats[0]))])
    df['id'] = indexes
    return df


class AutoEncoder(nn.Module):
    def __init__(self, input_dim, encoding_dim):
        super(AutoEncoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, encoding_dim*4),
            nn.ReLU(),
            nn.Linear(encoding_dim*4, encoding_dim*2),
            nn.ReLU(),
            nn.Linear(encoding_dim*2, encoding_dim),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim, encoding_dim*2),
            nn.ReLU(),
            nn.Linear(encoding_dim*2, encoding_dim*4),
            nn.ReLU(),
            nn.Linear(encoding_dim*4, input_dim),
            nn.Sigmoid()
        )
        
    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded


def perform_autoencoder(df, encoding_dim=50, epochs=50, batch_size=32):
    scaler = StandardScaler()
    df_scaled = scaler.fit_transform(df)
    
    data_tensor = torch.FloatTensor(df_scaled)
    
    input_dim = data_tensor.shape[1]
    autoencoder = AutoEncoder(input_dim, encoding_dim)
    
    criterion = nn.MSELoss()
    optimizer = optim.Adam(autoencoder.parameters())
    
    for epoch in range(epochs):
        for i in range(0, len(data_tensor), batch_size):
            batch = data_tensor[i : i + batch_size]
            optimizer.zero_grad()
            reconstructed = autoencoder(batch)
            loss = criterion(reconstructed, batch)
            loss.backward()
            optimizer.step()
            
        if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch + 1}/{epochs}], Loss: {loss.item():.4f}]')
                 
    with torch.no_grad():
        encoded_data = autoencoder.encoder(data_tensor).numpy()
        
    df_encoded = pd.DataFrame(encoded_data, columns=[f'Enc_{i + 1}' for i in range(encoded_data.shape[1])])
    
    return df_encoded

In [4]:
path = "I:/Kaggle/child-mind-institute-problematic-internet-use/"

In [5]:
train = pd.read_csv(path + 'train.csv')
test = pd.read_csv(path + 'test.csv')
sample = pd.read_csv(path + 'sample_submission.csv')

In [6]:
pciat_cols = [col for col in train.columns if 'PCIAT' in col]

In [7]:
train.drop(pciat_cols, axis=1, inplace=True)

In [8]:
season_cols = [col for col in train.columns if 'Season' in col]

In [9]:
train.drop(season_cols, axis=1, inplace=True)
test.drop(season_cols, axis=1, inplace=True)

In [10]:
train_ts = load_time_series(path + "series_train.parquet")
test_ts = load_time_series(path + "series_test.parquet")

100%|████████████████████████████████████████████████████████████████████████████████| 996/996 [00:28<00:00, 35.24it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 16.15it/s]


In [13]:
concat_ts = pd.concat([train_ts, test_ts], ignore_index=True)
df_concat = concat_ts.drop('id', axis=1)

In [14]:
concat_ts_encoded = perform_autoencoder(df_concat, encoding_dim=64, epochs=1000, batch_size=128)

Epoch [10/1000], Loss: 0.7400]
Epoch [20/1000], Loss: 0.6777]
Epoch [30/1000], Loss: 0.6048]
Epoch [40/1000], Loss: 0.5751]
Epoch [50/1000], Loss: 0.5651]
Epoch [60/1000], Loss: 0.5707]
Epoch [70/1000], Loss: 0.5584]
Epoch [80/1000], Loss: 0.5517]
Epoch [90/1000], Loss: 0.5498]
Epoch [100/1000], Loss: 0.5452]
Epoch [110/1000], Loss: 0.5426]
Epoch [120/1000], Loss: 0.5411]
Epoch [130/1000], Loss: 0.5440]
Epoch [140/1000], Loss: 0.5408]
Epoch [150/1000], Loss: 0.5349]
Epoch [160/1000], Loss: 0.5370]
Epoch [170/1000], Loss: 0.5357]
Epoch [180/1000], Loss: 0.5325]
Epoch [190/1000], Loss: 0.5338]
Epoch [200/1000], Loss: 0.5321]
Epoch [210/1000], Loss: 0.5306]
Epoch [220/1000], Loss: 0.5287]
Epoch [230/1000], Loss: 0.5288]
Epoch [240/1000], Loss: 0.5300]
Epoch [250/1000], Loss: 0.5271]
Epoch [260/1000], Loss: 0.5266]
Epoch [270/1000], Loss: 0.5266]
Epoch [280/1000], Loss: 0.5273]
Epoch [290/1000], Loss: 0.5266]
Epoch [300/1000], Loss: 0.5252]
Epoch [310/1000], Loss: 0.5251]
Epoch [320/1000],

In [15]:
for col in concat_ts_encoded.columns:
    if concat_ts_encoded[col].std() == 0:
        concat_ts_encoded.drop(col, axis=1, inplace=True)

In [17]:
time_series_cols = concat_ts_encoded.columns.tolist()
concat_ts_encoded["id"] = concat_ts["id"]
train_ts_encoded = concat_ts_encoded.head(train_ts.shape[0]).reset_index(drop=True)
test_ts_encoded = concat_ts_encoded.tail(train_ts.shape[0]).reset_index(drop=True)
train_merge = pd.merge(train, train_ts_encoded, how="left", on='id').dropna(subset='sii')
test_merge = pd.merge(test, test_ts_encoded, how="left", on='id')
train_merge.drop('id', axis=1, inplace=True)
test_merge.drop('id', axis=1, inplace=True)

In [18]:
data_dict = pd.read_csv(path + 'data_dictionary.csv')
print(data_dict.shape)
data_dict

(81, 6)


Unnamed: 0,Instrument,Field,Description,Type,Values,Value Labels
0,Identifier,id,Participant's ID,str,,
1,Demographics,Basic_Demos-Enroll_Season,Season of enrollment,str,"Spring, Summer, Fall, Winter",
2,Demographics,Basic_Demos-Age,Age of participant,float,,
3,Demographics,Basic_Demos-Sex,Sex of participant,categorical int,01,"0=Male, 1=Female"
4,Children's Global Assessment Scale,CGAS-Season,Season of participation,str,"Spring, Summer, Fall, Winter",
5,Children's Global Assessment Scale,CGAS-CGAS_Score,Children's Global Assessment Scale Score,int,,
6,Physical Measures,Physical-Season,Season of participation,str,"Spring, Summer, Fall, Winter",
7,Physical Measures,Physical-BMI,Body Mass Index (kg/m^2),float,,
8,Physical Measures,Physical-Height,Height (in),float,,
9,Physical Measures,Physical-Weight,Weight (lbs),float,,


In [19]:
train_cols = [col for col in train_merge.columns if col in test_merge.columns]
print(len(train_cols))
train_cols

104


['Basic_Demos-Age',
 'Basic_Demos-Sex',
 'CGAS-CGAS_Score',
 'Physical-BMI',
 'Physical-Height',
 'Physical-Weight',
 'Physical-Waist_Circumference',
 'Physical-Diastolic_BP',
 'Physical-HeartRate',
 'Physical-Systolic_BP',
 'Fitness_Endurance-Max_Stage',
 'Fitness_Endurance-Time_Mins',
 'Fitness_Endurance-Time_Sec',
 'FGC-FGC_CU',
 'FGC-FGC_CU_Zone',
 'FGC-FGC_GSND',
 'FGC-FGC_GSND_Zone',
 'FGC-FGC_GSD',
 'FGC-FGC_GSD_Zone',
 'FGC-FGC_PU',
 'FGC-FGC_PU_Zone',
 'FGC-FGC_SRL',
 'FGC-FGC_SRL_Zone',
 'FGC-FGC_SRR',
 'FGC-FGC_SRR_Zone',
 'FGC-FGC_TL',
 'FGC-FGC_TL_Zone',
 'BIA-BIA_Activity_Level_num',
 'BIA-BIA_BMC',
 'BIA-BIA_BMI',
 'BIA-BIA_BMR',
 'BIA-BIA_DEE',
 'BIA-BIA_ECW',
 'BIA-BIA_FFM',
 'BIA-BIA_FFMI',
 'BIA-BIA_FMI',
 'BIA-BIA_Fat',
 'BIA-BIA_Frame_num',
 'BIA-BIA_ICW',
 'BIA-BIA_LDM',
 'BIA-BIA_LST',
 'BIA-BIA_SMM',
 'BIA-BIA_TBW',
 'PAQ_A-PAQ_A_Total',
 'PAQ_C-PAQ_C_Total',
 'SDS-SDS_Total_Raw',
 'SDS-SDS_Total_T',
 'PreInt_EduHx-computerinternet_hoursday',
 'Enc_1',
 'Enc_2',

In [20]:
list(set(train_merge.columns) - set(train_cols))

['sii']

In [21]:
cat_cols = []
for i in range(data_dict.shape[0]):
    col = data_dict['Field'][i]
    if ((data_dict['Type'][i] == 'str') | ('categori' in data_dict['Type'][i])) & (col in train_cols):
        cat_cols.append(col)

In [22]:
print(len(cat_cols))
cat_cols

11


['Basic_Demos-Sex',
 'FGC-FGC_CU_Zone',
 'FGC-FGC_GSND_Zone',
 'FGC-FGC_GSD_Zone',
 'FGC-FGC_PU_Zone',
 'FGC-FGC_SRL_Zone',
 'FGC-FGC_SRR_Zone',
 'FGC-FGC_TL_Zone',
 'BIA-BIA_Activity_Level_num',
 'BIA-BIA_Frame_num',
 'PreInt_EduHx-computerinternet_hoursday']

In [23]:
ordinal_variables = ['FGC-FGC_CU_Zone', 'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU_Zone',
                     'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR_Zone', 'FGC-FGC_TL_Zone', 'BIA-BIA_Activity_Level_num', 'BIA-BIA_Frame_num',
                     'PreInt_EduHx-computerinternet_hoursday']

In [24]:
cat_cols = list(set(cat_cols) - set(ordinal_variables))
cat_cols = [col for col in cat_cols if 'PCIAT' not in col]
print(len(cat_cols))
cat_cols

1


['Basic_Demos-Sex']

In [25]:
numeric_cols = sorted(list(set(train_cols) - set(cat_cols)))
print(len(numeric_cols))
numeric_cols

103


['BIA-BIA_Activity_Level_num',
 'BIA-BIA_BMC',
 'BIA-BIA_BMI',
 'BIA-BIA_BMR',
 'BIA-BIA_DEE',
 'BIA-BIA_ECW',
 'BIA-BIA_FFM',
 'BIA-BIA_FFMI',
 'BIA-BIA_FMI',
 'BIA-BIA_Fat',
 'BIA-BIA_Frame_num',
 'BIA-BIA_ICW',
 'BIA-BIA_LDM',
 'BIA-BIA_LST',
 'BIA-BIA_SMM',
 'BIA-BIA_TBW',
 'Basic_Demos-Age',
 'CGAS-CGAS_Score',
 'Enc_1',
 'Enc_10',
 'Enc_11',
 'Enc_12',
 'Enc_13',
 'Enc_15',
 'Enc_16',
 'Enc_17',
 'Enc_19',
 'Enc_2',
 'Enc_20',
 'Enc_21',
 'Enc_22',
 'Enc_23',
 'Enc_24',
 'Enc_25',
 'Enc_26',
 'Enc_27',
 'Enc_29',
 'Enc_3',
 'Enc_30',
 'Enc_32',
 'Enc_33',
 'Enc_34',
 'Enc_35',
 'Enc_36',
 'Enc_37',
 'Enc_38',
 'Enc_39',
 'Enc_4',
 'Enc_40',
 'Enc_41',
 'Enc_42',
 'Enc_43',
 'Enc_44',
 'Enc_45',
 'Enc_46',
 'Enc_48',
 'Enc_49',
 'Enc_5',
 'Enc_50',
 'Enc_51',
 'Enc_52',
 'Enc_53',
 'Enc_54',
 'Enc_55',
 'Enc_56',
 'Enc_57',
 'Enc_60',
 'Enc_61',
 'Enc_62',
 'Enc_63',
 'Enc_64',
 'Enc_7',
 'Enc_8',
 'Enc_9',
 'FGC-FGC_CU',
 'FGC-FGC_CU_Zone',
 'FGC-FGC_GSD',
 'FGC-FGC_GSD_Zone',
 '

In [26]:
sorted(list(set(train_merge.columns) - set(test_merge.columns)))

['sii']

In [27]:
'''train_nonan = train_merge.dropna(subset=['sii'], ignore_index=True)
print(train_nonan.shape)
train_nonan.head()'''

"train_nonan = train_merge.dropna(subset=['sii'], ignore_index=True)\nprint(train_nonan.shape)\ntrain_nonan.head()"

In [28]:
'''train_one_hot = train_merge.copy()
test_one_hot = test_merge.copy()'''

'train_one_hot = train_merge.copy()\ntest_one_hot = test_merge.copy()'

In [29]:
cat_cols

['Basic_Demos-Sex']

In [30]:
'''for col in cat_cols:
    train_sr = train_one_hot[col].fillna('Missing').apply(str)
    test_sr = test_one_hot[col].fillna('Missing').apply(str)
    train_one_hot[col] = train_sr
    test_one_hot[col] = test_sr
    concat_sr = pd.concat([train_sr, test_sr])
    unique_list = sorted(concat_sr.unique().tolist())

    for v in unique_list:
        new_col_name = col + '_' + v
        train_cols.append(new_col_name)
        train_one_hot[new_col_name] = pd.Series(train_one_hot[col] == v).astype(int)
        test_one_hot[new_col_name] = pd.Series(test_one_hot[col] == v).astype(int)
    train_cols.remove(col)
train_one_hot.drop(cat_cols, axis=1, inplace=True)
test_one_hot.drop(cat_cols, axis=1, inplace=True)'''

"for col in cat_cols:\n    train_sr = train_one_hot[col].fillna('Missing').apply(str)\n    test_sr = test_one_hot[col].fillna('Missing').apply(str)\n    train_one_hot[col] = train_sr\n    test_one_hot[col] = test_sr\n    concat_sr = pd.concat([train_sr, test_sr])\n    unique_list = sorted(concat_sr.unique().tolist())\n\n    for v in unique_list:\n        new_col_name = col + '_' + v\n        train_cols.append(new_col_name)\n        train_one_hot[new_col_name] = pd.Series(train_one_hot[col] == v).astype(int)\n        test_one_hot[new_col_name] = pd.Series(test_one_hot[col] == v).astype(int)\n    train_cols.remove(col)\ntrain_one_hot.drop(cat_cols, axis=1, inplace=True)\ntest_one_hot.drop(cat_cols, axis=1, inplace=True)"

In [31]:
'''one_hot_concat = pd.concat([train_one_hot[train_cols], test_one_hot[train_cols]], ignore_index=True)
print(one_hot_concat.shape)
one_hot_concat.head()'''

'one_hot_concat = pd.concat([train_one_hot[train_cols], test_one_hot[train_cols]], ignore_index=True)\nprint(one_hot_concat.shape)\none_hot_concat.head()'

In [32]:
#train_one_hot.shape[0] + test_one_hot.shape[0]

In [33]:
#test_one_hot.shape

In [34]:
#len(train_cols)

In [35]:
merged_concat = pd.concat([train_merge[train_cols], test_merge[train_cols]], ignore_index=True)
print(merged_concat.shape)
merged_concat.head()

(2756, 104)


Unnamed: 0,Basic_Demos-Age,Basic_Demos-Sex,CGAS-CGAS_Score,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Max_Stage,Fitness_Endurance-Time_Mins,Fitness_Endurance-Time_Sec,FGC-FGC_CU,FGC-FGC_CU_Zone,FGC-FGC_GSND,FGC-FGC_GSND_Zone,FGC-FGC_GSD,FGC-FGC_GSD_Zone,FGC-FGC_PU,FGC-FGC_PU_Zone,FGC-FGC_SRL,FGC-FGC_SRL_Zone,FGC-FGC_SRR,FGC-FGC_SRR_Zone,FGC-FGC_TL,FGC-FGC_TL_Zone,BIA-BIA_Activity_Level_num,BIA-BIA_BMC,BIA-BIA_BMI,BIA-BIA_BMR,BIA-BIA_DEE,BIA-BIA_ECW,BIA-BIA_FFM,BIA-BIA_FFMI,BIA-BIA_FMI,BIA-BIA_Fat,BIA-BIA_Frame_num,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,PAQ_A-PAQ_A_Total,PAQ_C-PAQ_C_Total,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-computerinternet_hoursday,Enc_1,Enc_2,Enc_3,Enc_4,Enc_5,Enc_7,Enc_8,Enc_9,Enc_10,Enc_11,Enc_12,Enc_13,Enc_15,Enc_16,Enc_17,Enc_19,Enc_20,Enc_21,Enc_22,Enc_23,Enc_24,Enc_25,Enc_26,Enc_27,Enc_29,Enc_30,Enc_32,Enc_33,Enc_34,Enc_35,Enc_36,Enc_37,Enc_38,Enc_39,Enc_40,Enc_41,Enc_42,Enc_43,Enc_44,Enc_45,Enc_46,Enc_48,Enc_49,Enc_50,Enc_51,Enc_52,Enc_53,Enc_54,Enc_55,Enc_56,Enc_57,Enc_60,Enc_61,Enc_62,Enc_63,Enc_64
0,5,0,51.0,16.877316,46.0,50.8,,,,,,,,0.0,0.0,,,,,0.0,0.0,7.0,0.0,6.0,0.0,6.0,1.0,2.0,2.66855,16.8792,932.498,1492.0,8.25598,41.5862,13.8177,3.06143,9.21377,1.0,24.4349,8.89536,38.9177,19.5413,32.6909,,,,,3.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,9,0,,14.03559,48.0,46.0,22.0,75.0,70.0,122.0,,,,3.0,0.0,,,,,5.0,0.0,11.0,1.0,11.0,1.0,3.0,0.0,2.0,2.57949,14.0371,936.656,1498.65,6.01993,42.0291,12.8254,1.21172,3.97085,1.0,21.0352,14.974,39.4497,15.4107,27.0552,,2.34,46.0,64.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,10,1,71.0,16.648696,56.5,75.6,,65.0,94.0,117.0,5.0,7.0,33.0,20.0,1.0,10.2,1.0,14.7,2.0,7.0,1.0,10.0,1.0,10.0,1.0,5.0,0.0,,,,,,,,,,,,,,,,,,2.17,38.0,54.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,9,0,71.0,18.292347,56.0,81.6,,60.0,97.0,117.0,6.0,9.0,37.0,18.0,1.0,,,,,5.0,0.0,7.0,0.0,7.0,0.0,7.0,1.0,3.0,3.84191,18.2943,1131.43,1923.44,15.5925,62.7757,14.074,4.22033,18.8243,2.0,30.4041,16.779,58.9338,26.4798,45.9966,,2.451,31.0,45.0,0.0,3.379076,0.0,0.0,0.060061,7.307746,13.665402,0.0,0.0,2.320182,3.843184,1.774965,0.0,7.298285,0.838345,13.962495,11.627674,13.75059,0.0,0.0,7.561372,7.524602,0.858806,4.355196,5.837994,0.0,4.788587,13.068821,3.021619,14.116825,9.862643,12.08551,4.795244,5.572842,3.443617,0.0,3.339127,0.0,2.081927,0.0,4.676276,9.72712,6.83093,0.0,2.188877,2.159903,0.0,0.0,2.571432,0.328762,0.0,0.0,0.0,10.878162,3.841405,1.636401,7.399321
4,13,1,50.0,22.279952,59.5,112.2,,60.0,73.0,102.0,,,,12.0,0.0,16.5,2.0,17.9,2.0,6.0,0.0,10.0,1.0,11.0,1.0,8.0,0.0,2.0,4.33036,30.1865,1330.97,1996.45,30.2124,84.0285,16.6877,13.4988,67.9715,2.0,32.9141,20.902,79.6982,35.3804,63.1265,,4.11,40.0,56.0,0.0,8.954702,8.786606,0.0,0.0,5.32892,0.0,0.0,0.0,6.211152,3.493192,15.538667,0.0,0.0,7.882574,11.079842,9.05934,0.0,6.784193,8.332929,2.514006,1.849349,16.86327,0.0,0.659014,4.214959,6.257581,3.553791,8.447275,0.0,0.658559,0.0,6.079519,0.0,0.0,11.236004,0.0,8.89573,1.43137,0.0,10.726178,2.633679,6.659511,9.373384,0.0,2.761524,0.0,1.72579,3.44308,0.0,10.124622,15.363891,0.0,0.0,9.835675,5.03399,0.0


In [36]:
imputer = KNNImputer(n_neighbors=5)
imputed_data = imputer.fit_transform(merged_concat[train_cols])

In [37]:
merge_imputed = pd.DataFrame(imputed_data, columns=train_cols)
print(merge_imputed.shape)

(2756, 104)


In [38]:
train_imputed = merge_imputed.head(train_merge.shape[0]).reset_index(drop=True)
test_imputed = merge_imputed.tail(test_merge.shape[0]).reset_index(drop=True)

In [40]:
train_merge[train_cols] = train_imputed
test_merge[train_cols] = test_imputed

In [41]:
def feature_engineering(df):
    df['BMI_Age'] = df['Physical-BMI'] * df['Basic_Demos-Age']
    df['Internet_Hours_Age'] = df['PreInt_EduHx-computerinternet_hoursday'] * df['Basic_Demos-Age']
    df['BMI_Internet_Hours'] = df['Physical-BMI'] * df['PreInt_EduHx-computerinternet_hoursday']
    df['BFP_BMI'] = df['BIA-BIA_Fat'] / df['BIA-BIA_BMI']
    df['FFMI_BFP'] = df['BIA-BIA_FFMI'] / df['BIA-BIA_Fat']
    df['FMI_BFP'] = df['BIA-BIA_FMI'] / df['BIA-BIA_Fat']
    df['LST_TBW'] = df['BIA-BIA_LST'] / df['BIA-BIA_TBW']
    df['BFP_BMR'] = df['BIA-BIA_Fat'] * df['BIA-BIA_BMR']
    df['BFP_DEE'] = df['BIA-BIA_Fat'] * df['BIA-BIA_DEE']
    df['BMR_Weight'] = df['BIA-BIA_BMR'] / df['Physical-Weight']
    df['DEE_Weight'] = df['BIA-BIA_DEE'] / df['Physical-Weight']
    df['SMM_Height'] = df['BIA-BIA_SMM'] / df['Physical-Height']
    df['Muscle_to_Fat'] = df['BIA-BIA_SMM'] / df['BIA-BIA_FMI']
    df['Hydration_Status'] = df['BIA-BIA_TBW'] / df['Physical-Weight']
    df['ICW_TBW'] = df['BIA-BIA_ICW'] / df['BIA-BIA_TBW']
    
    return df

In [42]:
len(train_cols)

104

In [43]:
train_cols += ['BMI_Age', 'Internet_Hours_Age', 'BMI_Internet_Hours', 'BFP_BMI', 'FFMI_BFP', 'FMI_BFP', 'LST_TBW', 'BFP_BMR', 'BFP_DEE',
 'BMR_Weight', 'DEE_Weight', 'SMM_Height', 'Muscle_to_Fat', 'Hydration_Status', 'ICW_TBW']

In [44]:
len(train_cols)

119

In [45]:
train_merge = feature_engineering(train_merge)
train = train.dropna(thresh=10, axis=0)
test_merge = feature_engineering(test_merge)

In [46]:
if np.any(np.isinf(train_merge)):
    train_merge = train_merge.replace([np.inf, -np.inf], np.nan)

In [47]:
if np.any(np.isinf(test_merge)):
    test_merge = test_merge.replace([np.inf, -np.inf], np.nan)

In [48]:
train_final = train_merge[train_cols + ['sii']].copy()
test_final = test_merge[train_cols].copy()

In [49]:
print(train_final.shape)
train_final.head()

(2736, 120)


Unnamed: 0,Basic_Demos-Age,Basic_Demos-Sex,CGAS-CGAS_Score,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Max_Stage,Fitness_Endurance-Time_Mins,Fitness_Endurance-Time_Sec,FGC-FGC_CU,FGC-FGC_CU_Zone,FGC-FGC_GSND,FGC-FGC_GSND_Zone,FGC-FGC_GSD,FGC-FGC_GSD_Zone,FGC-FGC_PU,FGC-FGC_PU_Zone,FGC-FGC_SRL,FGC-FGC_SRL_Zone,FGC-FGC_SRR,FGC-FGC_SRR_Zone,FGC-FGC_TL,FGC-FGC_TL_Zone,BIA-BIA_Activity_Level_num,BIA-BIA_BMC,BIA-BIA_BMI,BIA-BIA_BMR,BIA-BIA_DEE,BIA-BIA_ECW,BIA-BIA_FFM,BIA-BIA_FFMI,BIA-BIA_FMI,BIA-BIA_Fat,BIA-BIA_Frame_num,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,PAQ_A-PAQ_A_Total,PAQ_C-PAQ_C_Total,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-computerinternet_hoursday,Enc_1,Enc_2,Enc_3,Enc_4,Enc_5,Enc_7,Enc_8,Enc_9,Enc_10,Enc_11,Enc_12,Enc_13,Enc_15,Enc_16,Enc_17,Enc_19,Enc_20,Enc_21,Enc_22,Enc_23,Enc_24,Enc_25,Enc_26,Enc_27,Enc_29,Enc_30,Enc_32,Enc_33,Enc_34,Enc_35,Enc_36,Enc_37,Enc_38,Enc_39,Enc_40,Enc_41,Enc_42,Enc_43,Enc_44,Enc_45,Enc_46,Enc_48,Enc_49,Enc_50,Enc_51,Enc_52,Enc_53,Enc_54,Enc_55,Enc_56,Enc_57,Enc_60,Enc_61,Enc_62,Enc_63,Enc_64,BMI_Age,Internet_Hours_Age,BMI_Internet_Hours,BFP_BMI,FFMI_BFP,FMI_BFP,LST_TBW,BFP_BMR,BFP_DEE,BMR_Weight,DEE_Weight,SMM_Height,Muscle_to_Fat,Hydration_Status,ICW_TBW,sii
0,5.0,0.0,51.0,16.877316,46.0,50.8,22.2,63.4,85.0,110.0,3.4,4.0,36.4,0.0,0.0,12.3,1.2,12.16,1.6,0.0,0.0,7.0,0.0,6.0,0.0,6.0,1.0,2.0,2.66855,16.8792,932.498,1492.0,8.25598,41.5862,13.8177,3.06143,9.21377,1.0,24.4349,8.89536,38.9177,19.5413,32.6909,1.866,2.668,37.8,53.6,3.0,1.363611,7.621941,2.123082,3.380753,3.987221,4.00992,0.960785,2.686903,3.556705,6.251574,0.765016,2.086529,4.273782,7.119099,4.522466,2.074293,3.432917,8.236403,4.376191,7.00913,3.359256,3.7514,7.178717,9.687251,6.689728,3.46621,4.77907,4.325388,6.143128,7.917602,2.107428,7.752057,1.942345,3.891191,2.375024,3.577436,3.477573,1.436455,3.350119,5.511547,3.770916,4.609437,5.272477,5.292899,1.521473,0.0,6.240864,2.583221,4.922733,1.684328,0.680318,4.218587,4.929527,12.580836,4.108385,2.261817,84.386578,15.0,50.631947,0.545865,1.499679,0.332267,1.190475,8591.822097,13746.94484,18.35626,29.370079,0.424811,6.383063,0.643522,0.747453,2.0
1,9.0,0.0,59.0,14.03559,48.0,46.0,22.0,75.0,70.0,122.0,5.8,9.0,21.2,3.0,0.0,19.5,1.4,19.86,1.6,5.0,0.0,11.0,1.0,11.0,1.0,3.0,0.0,2.0,2.57949,14.0371,936.656,1498.65,6.01993,42.0291,12.8254,1.21172,3.97085,1.0,21.0352,14.974,39.4497,15.4107,27.0552,1.718,2.34,46.0,64.0,0.0,0.46349,10.318422,7.955656,5.096108,1.791037,1.724214,2.292603,5.517929,6.352954,4.877342,1.895464,0.889664,1.043735,6.563208,4.23988,2.510472,2.012371,6.498184,11.64473,5.576575,3.789739,11.172126,4.098786,3.773451,4.714305,6.857052,3.522767,3.836948,5.85362,2.012731,2.359838,7.865605,0.857085,2.266458,3.039173,2.758347,5.579533,4.171046,4.427868,8.593099,5.972615,6.093087,9.494704,3.674694,3.747165,3.012251,3.248234,3.005837,0.087344,2.733043,4.345109,3.356686,2.867316,13.33632,2.512155,3.287817,126.320313,0.0,0.0,0.282883,3.229888,0.305154,1.458119,3719.320478,5950.914352,20.362087,32.579348,0.321056,12.718037,0.588157,0.777492,0.0
2,10.0,1.0,71.0,16.648696,56.5,75.6,26.2,65.0,94.0,117.0,5.0,7.0,33.0,20.0,1.0,10.2,1.0,14.7,2.0,7.0,1.0,10.0,1.0,10.0,1.0,5.0,0.0,3.0,3.868872,17.08248,1129.666,1965.644,15.83894,62.58756,13.67254,3.409932,15.49244,1.6,29.96452,16.78404,58.71868,28.76124,45.80346,1.778,2.17,38.0,54.0,2.0,1.351631,4.111321,10.816498,5.593917,6.978366,8.063424,2.54204,4.426742,4.852935,3.183453,2.900289,0.0,2.919314,7.103267,10.682555,8.740913,7.936072,3.628475,12.904566,4.178044,6.68688,8.358402,3.527065,5.86138,3.036245,6.569004,12.235125,5.976111,11.884802,6.511728,8.645638,7.243757,3.481177,5.82631,1.631363,1.335651,6.069639,8.375359,3.640172,6.521181,11.181112,4.001003,6.634944,2.45601,1.196101,0.0,3.088828,3.070763,2.217901,3.277424,4.180332,4.63481,6.353503,8.233279,0.956285,6.565434,166.486961,20.0,33.297392,0.90692,0.88253,0.220103,1.28197,17501.282725,30452.621731,14.942672,26.000582,0.509048,8.434549,0.605866,0.654198,0.0
3,9.0,0.0,71.0,18.292347,56.0,81.6,27.2,60.0,97.0,117.0,6.0,9.0,37.0,18.0,1.0,16.52,1.4,18.78,1.8,5.0,0.0,7.0,0.0,7.0,0.0,7.0,1.0,3.0,3.84191,18.2943,1131.43,1923.44,15.5925,62.7757,14.074,4.22033,18.8243,2.0,30.4041,16.779,58.9338,26.4798,45.9966,1.97,2.451,31.0,45.0,0.0,3.379076,0.0,0.0,0.060061,7.307746,13.665402,0.0,0.0,2.320182,3.843184,1.774965,0.0,7.298285,0.838345,13.962495,11.627674,13.75059,0.0,0.0,7.561372,7.524602,0.858806,4.355196,5.837994,0.0,4.788587,13.068821,3.021619,14.116825,9.862643,12.08551,4.795244,5.572842,3.443617,0.0,3.339127,0.0,2.081927,0.0,4.676276,9.72712,6.83093,0.0,2.188877,2.159903,0.0,0.0,2.571432,0.328762,0.0,0.0,0.0,10.878162,3.841405,1.636401,7.399321,164.631122,0.0,0.0,1.028971,0.747651,0.224196,1.281264,21298.377749,36207.411592,13.865564,23.571569,0.472854,6.274343,0.563684,0.661008,1.0
5,10.0,0.0,67.2,19.66076,55.0,84.6,28.4,123.0,83.0,163.0,4.0,6.6,39.2,9.0,1.0,16.8,1.8,17.0,1.6,2.0,0.0,11.0,1.0,11.0,1.0,11.0,1.0,2.0,3.78271,19.6629,1135.86,1817.38,16.3275,63.247,14.7,4.96291,21.353,2.0,30.8936,16.0259,59.4643,26.1957,47.2211,1.992,3.67,27.0,40.0,3.0,0.0,4.854711,6.740585,8.46726,6.529135,4.047139,3.987917,7.011478,8.293274,6.644733,2.482212,1.674758,3.599777,6.617038,6.972386,6.700883,4.93289,11.518289,11.683325,7.854158,3.559725,5.662934,4.863183,7.317832,4.341396,3.941816,9.142334,7.614948,3.888869,7.296078,3.57136,5.866939,7.567775,11.112383,5.351613,3.024767,9.042933,6.270827,4.255499,7.40044,3.719724,6.036229,6.458134,3.541324,1.813698,0.0,11.782809,2.14519,3.504258,3.652587,1.245142,10.607136,9.650975,6.673582,6.705151,1.510366,196.607603,30.0,58.982281,1.085954,0.688428,0.232422,1.259274,24254.01858,38806.51514,13.426241,21.482033,0.476285,5.278294,0.558169,0.654233,1.0


In [51]:
print(test_final.shape)
test_final.head()

(20, 119)


Unnamed: 0,Basic_Demos-Age,Basic_Demos-Sex,CGAS-CGAS_Score,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Max_Stage,Fitness_Endurance-Time_Mins,Fitness_Endurance-Time_Sec,FGC-FGC_CU,FGC-FGC_CU_Zone,FGC-FGC_GSND,FGC-FGC_GSND_Zone,FGC-FGC_GSD,FGC-FGC_GSD_Zone,FGC-FGC_PU,FGC-FGC_PU_Zone,FGC-FGC_SRL,FGC-FGC_SRL_Zone,FGC-FGC_SRR,FGC-FGC_SRR_Zone,FGC-FGC_TL,FGC-FGC_TL_Zone,BIA-BIA_Activity_Level_num,BIA-BIA_BMC,BIA-BIA_BMI,BIA-BIA_BMR,BIA-BIA_DEE,BIA-BIA_ECW,BIA-BIA_FFM,BIA-BIA_FFMI,BIA-BIA_FMI,BIA-BIA_Fat,BIA-BIA_Frame_num,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,PAQ_A-PAQ_A_Total,PAQ_C-PAQ_C_Total,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-computerinternet_hoursday,Enc_1,Enc_2,Enc_3,Enc_4,Enc_5,Enc_7,Enc_8,Enc_9,Enc_10,Enc_11,Enc_12,Enc_13,Enc_15,Enc_16,Enc_17,Enc_19,Enc_20,Enc_21,Enc_22,Enc_23,Enc_24,Enc_25,Enc_26,Enc_27,Enc_29,Enc_30,Enc_32,Enc_33,Enc_34,Enc_35,Enc_36,Enc_37,Enc_38,Enc_39,Enc_40,Enc_41,Enc_42,Enc_43,Enc_44,Enc_45,Enc_46,Enc_48,Enc_49,Enc_50,Enc_51,Enc_52,Enc_53,Enc_54,Enc_55,Enc_56,Enc_57,Enc_60,Enc_61,Enc_62,Enc_63,Enc_64,BMI_Age,Internet_Hours_Age,BMI_Internet_Hours,BFP_BMI,FFMI_BFP,FMI_BFP,LST_TBW,BFP_BMR,BFP_DEE,BMR_Weight,DEE_Weight,SMM_Height,Muscle_to_Fat,Hydration_Status,ICW_TBW
0,5.0,0.0,51.0,16.877316,46.0,50.8,22.2,63.4,85.0,110.0,3.4,4.0,36.4,0.0,0.0,12.3,1.2,12.16,1.6,0.0,0.0,7.0,0.0,6.0,0.0,6.0,1.0,2.0,2.66855,16.8792,932.498,1492.0,8.25598,41.5862,13.8177,3.06143,9.21377,1.0,24.4349,8.89536,38.9177,19.5413,32.6909,1.866,2.668,37.8,53.6,3.0,1.363611,7.621941,2.123082,3.380753,3.987221,4.00992,0.960785,2.686903,3.556705,6.251574,0.765016,2.086529,4.273782,7.119099,4.522466,2.074293,3.432917,8.236403,4.376191,7.00913,3.359256,3.7514,7.178717,9.687251,6.689728,3.46621,4.77907,4.325388,6.143128,7.917602,2.107428,7.752057,1.942345,3.891191,2.375024,3.577436,3.477573,1.436455,3.350119,5.511547,3.770916,4.609437,5.272477,5.292899,1.521473,0.0,6.240864,2.583221,4.922733,1.684328,0.680318,4.218587,4.929527,12.580836,4.108385,2.261817,84.386578,15.0,50.631947,0.545865,1.499679,0.332267,1.190475,8591.822097,13746.94484,18.35626,29.370079,0.424811,6.383063,0.643522,0.747453
1,9.0,0.0,59.0,14.03559,48.0,46.0,22.0,75.0,70.0,122.0,5.8,9.0,21.2,3.0,0.0,19.5,1.4,19.86,1.6,5.0,0.0,11.0,1.0,11.0,1.0,3.0,0.0,2.0,2.57949,14.0371,936.656,1498.65,6.01993,42.0291,12.8254,1.21172,3.97085,1.0,21.0352,14.974,39.4497,15.4107,27.0552,1.718,2.34,46.0,64.0,0.0,0.46349,10.318422,7.955656,5.096108,1.791037,1.724214,2.292603,5.517929,6.352954,4.877342,1.895464,0.889664,1.043735,6.563208,4.23988,2.510472,2.012371,6.498184,11.64473,5.576575,3.789739,11.172126,4.098786,3.773451,4.714305,6.857052,3.522767,3.836948,5.85362,2.012731,2.359838,7.865605,0.857085,2.266458,3.039173,2.758347,5.579533,4.171046,4.427868,8.593099,5.972615,6.093087,9.494704,3.674694,3.747165,3.012251,3.248234,3.005837,0.087344,2.733043,4.345109,3.356686,2.867316,13.33632,2.512155,3.287817,126.320313,0.0,0.0,0.282883,3.229888,0.305154,1.458119,3719.320478,5950.914352,20.362087,32.579348,0.321056,12.718037,0.588157,0.777492
2,10.0,1.0,71.0,16.648696,56.5,75.6,26.2,65.0,94.0,117.0,5.0,7.0,33.0,20.0,1.0,10.2,1.0,14.7,2.0,7.0,1.0,10.0,1.0,10.0,1.0,5.0,0.0,3.0,3.868872,17.08248,1129.666,1965.644,15.83894,62.58756,13.67254,3.409932,15.49244,1.6,29.96452,16.78404,58.71868,28.76124,45.80346,1.778,2.17,38.0,54.0,2.0,1.351631,4.111321,10.816498,5.593917,6.978366,8.063424,2.54204,4.426742,4.852935,3.183453,2.900289,0.0,2.919314,7.103267,10.682555,8.740913,7.936072,3.628475,12.904566,4.178044,6.68688,8.358402,3.527065,5.86138,3.036245,6.569004,12.235125,5.976111,11.884802,6.511728,8.645638,7.243757,3.481177,5.82631,1.631363,1.335651,6.069639,8.375359,3.640172,6.521181,11.181112,4.001003,6.634944,2.45601,1.196101,0.0,3.088828,3.070763,2.217901,3.277424,4.180332,4.63481,6.353503,8.233279,0.956285,6.565434,166.486961,20.0,33.297392,0.90692,0.88253,0.220103,1.28197,17501.282725,30452.621731,14.942672,26.000582,0.509048,8.434549,0.605866,0.654198
3,9.0,0.0,71.0,18.292347,56.0,81.6,27.2,60.0,97.0,117.0,6.0,9.0,37.0,18.0,1.0,16.52,1.4,18.78,1.8,5.0,0.0,7.0,0.0,7.0,0.0,7.0,1.0,3.0,3.84191,18.2943,1131.43,1923.44,15.5925,62.7757,14.074,4.22033,18.8243,2.0,30.4041,16.779,58.9338,26.4798,45.9966,1.97,2.451,31.0,45.0,0.0,3.379076,0.0,0.0,0.060061,7.307746,13.665402,0.0,0.0,2.320182,3.843184,1.774965,0.0,7.298285,0.838345,13.962495,11.627674,13.75059,0.0,0.0,7.561372,7.524602,0.858806,4.355196,5.837994,0.0,4.788587,13.068821,3.021619,14.116825,9.862643,12.08551,4.795244,5.572842,3.443617,0.0,3.339127,0.0,2.081927,0.0,4.676276,9.72712,6.83093,0.0,2.188877,2.159903,0.0,0.0,2.571432,0.328762,0.0,0.0,0.0,10.878162,3.841405,1.636401,7.399321,164.631122,0.0,0.0,1.028971,0.747651,0.224196,1.281264,21298.377749,36207.411592,13.865564,23.571569,0.472854,6.274343,0.563684,0.661008
4,18.0,1.0,66.8,22.353443,63.5,127.04,32.0,73.6,74.4,127.4,3.8,5.4,18.2,12.8,0.4,27.06,1.8,28.32,1.8,2.4,0.2,5.8,0.2,5.4,0.2,8.8,0.6,2.4,4.444068,26.26986,1402.728,2157.106,30.59268,91.6721,15.91662,10.353276,59.0479,2.4,34.83748,26.24202,87.22818,44.34808,65.43016,1.04,2.942,42.2,58.8,2.6,1.529686,1.515348,11.910147,5.008523,5.770485,4.630889,2.989651,7.929092,11.465461,3.350022,4.883521,0.0,2.955138,12.576322,5.238375,11.751881,3.557933,6.993788,13.96418,7.696182,6.812133,9.474536,3.343701,2.625884,6.105223,6.906148,11.441968,4.770343,4.522361,4.688436,9.968357,5.886752,5.026941,8.316228,4.440661,4.244267,13.601195,11.29675,1.871167,9.64227,2.174128,5.640476,6.688809,2.720477,0.538887,3.903632,10.054635,0.754742,6.788886,3.788735,3.969575,13.851828,6.371822,5.26831,2.458733,2.332177,402.361977,46.8,58.118952,2.247743,0.269554,0.175337,1.333149,82828.142671,127372.579377,11.041625,16.979739,0.698395,4.283483,0.515036,0.532438


In [52]:
def quadratic_weighted_kappa(y_true, y_pred):
    return cohen_kappa_score(y_true, y_pred, weights='quadratic')

def threshold_Rounder(oof_non_rounded, thresholds):
    return np.where(oof_non_rounded < thresholds[0], 0,
                    np.where(oof_non_rounded < thresholds[1], 1,
                             np.where(oof_non_rounded < thresholds[2], 2, 3)))

def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    rounded_p = threshold_Rounder(oof_non_rounded, thresholds)
    return -quadratic_weighted_kappa(y_true, rounded_p)

# Model Training and Evaluation

- **Model Types**: Various models are used, including:
  - **LightGBM**: A gradient-boosting framework known for its speed and efficiency with large datasets.
  - **XGBoost**: Another powerful gradient-boosting model used for structured data.
  - **CatBoost**: Optimized for categorical features without the need for extensive preprocessing.
  - **Voting Regressor**: An ensemble model that combines the predictions of LightGBM, XGBoost, and CatBoost for better accuracy.
- **Cross-Validation**: Stratified K-Folds cross-validation is employed to split the data into training and validation sets, ensuring balanced class distribution in each fold.
- **Quadratic Weighted Kappa (QWK)**: The performance of the models is evaluated using QWK, which measures the agreement between predicted and actual values, taking into account the ordinal nature of the target variable.
- **Threshold Optimization**: The `minimize` function from `scipy.optimize` is used to fine-tune decision thresholds that map continuous predictions to discrete categories (None, Mild, Moderate, Severe).


In [53]:
def TrainML(model_class, train_data, test_data):
    X = train_data.drop(['sii'], axis=1)
    y = train_data['sii']

    SKF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    
    train_S = []
    test_S = []
    
    oof_non_rounded = np.zeros(len(y), dtype=float) 
    oof_rounded = np.zeros(len(y), dtype=int) 
    test_preds = np.zeros((len(test_data), n_splits))

    for fold, (train_idx, test_idx) in enumerate(tqdm(SKF.split(X, y), desc="Training Folds", total=n_splits)):
        X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

        model = clone(model_class)
        model.fit(X_train, y_train)

        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

        oof_non_rounded[test_idx] = y_val_pred
        y_val_pred_rounded = y_val_pred.round(0).astype(int)
        oof_rounded[test_idx] = y_val_pred_rounded

        train_kappa = quadratic_weighted_kappa(y_train, y_train_pred.round(0).astype(int))
        val_kappa = quadratic_weighted_kappa(y_val, y_val_pred_rounded)

        train_S.append(train_kappa)
        test_S.append(val_kappa)
        
        test_preds[:, fold] = model.predict(test_data)
        
        print(f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")
        clear_output(wait=True)

    print(f"Mean Train QWK --> {np.mean(train_S):.4f}")
    print(f"Mean Validation QWK ---> {np.mean(test_S):.4f}")

    KappaOPtimizer = minimize(evaluate_predictions,
                              x0=[0.5, 1.5, 2.5], args=(y, oof_non_rounded), 
                              method='Nelder-Mead')
    assert KappaOPtimizer.success, "Optimization did not converge."
    
    oof_tuned = threshold_Rounder(oof_non_rounded, KappaOPtimizer.x)
    tKappa = quadratic_weighted_kappa(y, oof_tuned)

    print(f"----> || Optimized QWK SCORE :: {Fore.CYAN}{Style.BRIGHT} {tKappa:.3f}{Style.RESET_ALL}")

    tpm = test_preds.mean(axis=1)
    tpTuned = threshold_Rounder(tpm, KappaOPtimizer.x)
    
    submission = pd.DataFrame({
        'id': sample['id'],
        'sii': tpTuned
    })

    return submission


# Hyperparameter Tuning

- **LightGBM Parameters**: Hyperparameters such as `learning_rate`, `max_depth`, `num_leaves`, and `feature_fraction` are tuned to improve the performance of the LightGBM model. These parameters control the complexity of the model and its ability to generalize to new data.
- **XGBoost and CatBoost Parameters**: Similar tuning is applied for XGBoost and CatBoost, adjusting parameters such as `n_estimators`, `max_depth`, `learning_rate`, `subsample`, and `regularization` terms (`reg_alpha`, `reg_lambda`). These help in controlling overfitting and ensuring the model's robustness.

In [54]:
# Model parameters for LightGBM
Params = {
    'learning_rate': 0.046,
    'max_depth': 12,
    'num_leaves': 478,
    'min_data_in_leaf': 13,
    'feature_fraction': 0.893,
    'bagging_fraction': 0.784,
    'bagging_freq': 4,
    'lambda_l1': 10,  # Increased from 6.59
    'lambda_l2': 0.01,  # Increased from 2.68e-06
    'device': 'gpu'

}


# XGBoost parameters
XGB_Params = {
    'learning_rate': 0.05,
    'max_depth': 6,
    'n_estimators': 200,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 1,  # Increased from 0.1
    'reg_lambda': 5,  # Increased from 1
    'random_state': SEED,
    'tree_method': 'gpu_hist',

}


CatBoost_Params = {
    'learning_rate': 0.05,
    'depth': 6,
    'iterations': 200,
    'random_seed': SEED,
    'verbose': 0,
    'l2_leaf_reg': 10,  # Increase this value
    'task_type': 'GPU'

}

In [55]:
# New: TabNet

from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.model_selection import train_test_split
from pytorch_tabnet.callbacks import Callback

class TabNetWrapper(BaseEstimator, RegressorMixin):
    def __init__(self, **kwargs):
        self.model = TabNetRegressor(**kwargs)
        self.kwargs = kwargs
        self.imputer = SimpleImputer(strategy='median')
        self.best_model_path = 'best_tabnet_model.pt'
        
    def fit(self, X, y):
        # Handle missing values
        X_imputed = self.imputer.fit_transform(X)
        
        if hasattr(y, 'values'):
            y = y.values
            
        # Create internal validation set
        X_train, X_valid, y_train, y_valid = train_test_split(
            X_imputed, 
            y, 
            test_size=0.2,
            random_state=42
        )
        
        # Train TabNet model
        history = self.model.fit(
            X_train=X_train,
            y_train=y_train.reshape(-1, 1),
            eval_set=[(X_valid, y_valid.reshape(-1, 1))],
            eval_name=['valid'],
            eval_metric=['mse'],
            max_epochs=500,
            patience=50,
            batch_size=1024,
            virtual_batch_size=128,
            num_workers=0,
            drop_last=False,
            callbacks=[
                TabNetPretrainedModelCheckpoint(
                    filepath=self.best_model_path,
                    monitor='valid_mse',
                    mode='min',
                    save_best_only=True,
                    verbose=True
                )
            ]
        )
        
        # Load the best model
        if os.path.exists(self.best_model_path):
            self.model.load_model(self.best_model_path)
            os.remove(self.best_model_path)  # Remove temporary file
        
        return self
    
    def predict(self, X):
        X_imputed = self.imputer.transform(X)
        return self.model.predict(X_imputed).flatten()
    
    def __deepcopy__(self, memo):
        # Add deepcopy support for scikit-learn
        cls = self.__class__
        result = cls.__new__(cls)
        memo[id(self)] = result
        for k, v in self.__dict__.items():
            setattr(result, k, deepcopy(v, memo))
        return result

# TabNet hyperparameters
TabNet_Params = {
    'n_d': 64,              # Width of the decision prediction layer
    'n_a': 64,              # Width of the attention embedding for each step
    'n_steps': 5,           # Number of steps in the architecture
    'gamma': 1.5,           # Coefficient for feature selection regularization
    'n_independent': 2,     # Number of independent GLU layer in each GLU block
    'n_shared': 2,          # Number of shared GLU layer in each GLU block
    'lambda_sparse': 1e-4,  # Sparsity regularization
    'optimizer_fn': torch.optim.Adam,
    'optimizer_params': dict(lr=2e-2, weight_decay=1e-5),
    'mask_type': 'entmax',
    'scheduler_params': dict(mode="min", patience=10, min_lr=1e-5, factor=0.5),
    'scheduler_fn': torch.optim.lr_scheduler.ReduceLROnPlateau,
    'verbose': 1,
    'device_name': 'cuda' if torch.cuda.is_available() else 'cpu'
}

class TabNetPretrainedModelCheckpoint(Callback):
    def __init__(self, filepath, monitor='val_loss', mode='min', 
                 save_best_only=True, verbose=1):
        super().__init__()  # Initialize parent class
        self.filepath = filepath
        self.monitor = monitor
        self.mode = mode
        self.save_best_only = save_best_only
        self.verbose = verbose
        self.best = float('inf') if mode == 'min' else -float('inf')
        
    def on_train_begin(self, logs=None):
        self.model = self.trainer  # Use trainer itself as model
        
    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        current = logs.get(self.monitor)
        if current is None:
            return
        
        # Check if current metric is better than best
        if (self.mode == 'min' and current < self.best) or \
           (self.mode == 'max' and current > self.best):
            if self.verbose:
                print(f'\nEpoch {epoch}: {self.monitor} improved from {self.best:.4f} to {current:.4f}')
            self.best = current
            if self.save_best_only:
                self.model.save_model(self.filepath)  # Save the entire model

# Ensemble Learning and Submission Preparation

- **Ensemble Learning**: The model uses a **Voting Regressor**, which combines the predictions from LightGBM, XGBoost, and CatBoost. This approach is beneficial as it leverages the strengths of multiple models, reducing overfitting and improving overall model performance.
- **Out-of-Fold (OOF) Predictions**: During cross-validation, out-of-fold predictions are generated for the training set, which helps in model evaluation without data leakage.
- **Kappa Optimizer**: The Kappa Optimizer ensures that the predicted values are as close to the actual values as possible by adjusting the thresholds used to convert raw model outputs into class labels.
- **Test Set Predictions**: After the model is trained and thresholds are optimized, the test dataset is processed, and predictions are generated using the ensemble model. These predictions are converted into the appropriate format for submission.
- **Submission File Creation**: The predictions are saved in a CSV file following the required format for submission (e.g., for a Kaggle competition), which includes columns like `id` and `sii` (Severity Impairment Index).

# Final Results and Performance Metrics

- **Train and Validation Scores**: After training across multiple folds, the mean Quadratic Weighted Kappa (QWK) score is calculated for both the training and validation datasets, providing an indicator of model performance. 
- **Optimized QWK Score**: The final optimized QWK score after threshold tuning is displayed, showcasing the model's ability to predict the severity levels effectively.
- **Test Predictions**: The test set predictions are evaluated, and a breakdown of the predicted severity levels (None, Mild, Moderate, Severe) is shown, along with their respective counts.

In [56]:
# Create model instances
Light = LGBMRegressor(**Params, random_state=SEED, verbose=-1, n_estimators=300)
XGB_Model = XGBRegressor(**XGB_Params)
CatBoost_Model = CatBoostRegressor(**CatBoost_Params)
TabNet_Model = TabNetWrapper(**TabNet_Params) # New

In [57]:
voting_model = VotingRegressor(estimators=[
    ('lightgbm', Light),
    ('xgboost', XGB_Model),
    ('catboost', CatBoost_Model),
    ('tabnet', TabNet_Model)
])

Submission1 = TrainML(voting_model, train_final, test_final)

Submission1

Training Folds: 100%|████████████████████████████████████████████████████████████████████| 5/5 [01:32<00:00, 18.60s/it]

Mean Train QWK --> 0.5790
Mean Validation QWK ---> 0.0054
----> || Optimized QWK SCORE :: [36m[1m 0.014[0m





Unnamed: 0,id,sii
0,00008ff9,1
1,000fd460,0
2,00105258,0
3,00115b9f,1
4,0016bb22,1
5,001f3379,1
6,0038ba98,1
7,0068a485,1
8,0069fbed,1
9,0083e397,1


In [None]:
train = pd.read_csv(path + '/train.csv')
test = pd.read_csv(path + '/test.csv')
sample = pd.read_csv(path + '/sample_submission.csv')
        
train_ts = load_time_series(path + "/series_train.parquet")
test_ts = load_time_series(path + "/series_test.parquet")

time_series_cols = train_ts.columns.tolist()
time_series_cols.remove("id")

train = pd.merge(train, train_ts, how="left", on='id')
test = pd.merge(test, test_ts, how="left", on='id')

train = train.drop('id', axis=1)
test = test.drop('id', axis=1)   

featuresCols = ['Basic_Demos-Enroll_Season', 'Basic_Demos-Age', 'Basic_Demos-Sex',
                'CGAS-Season', 'CGAS-CGAS_Score', 'Physical-Season', 'Physical-BMI',
                'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
                'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
                'Fitness_Endurance-Season', 'Fitness_Endurance-Max_Stage',
                'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
                'FGC-Season', 'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
                'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU',
                'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
                'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone', 'BIA-Season',
                'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
                'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
                'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
                'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
                'BIA-BIA_TBW', 'PAQ_A-Season', 'PAQ_A-PAQ_A_Total', 'PAQ_C-Season',
                'PAQ_C-PAQ_C_Total', 'SDS-Season', 'SDS-SDS_Total_Raw',
                'SDS-SDS_Total_T', 'PreInt_EduHx-Season',
                'PreInt_EduHx-computerinternet_hoursday', 'sii']

featuresCols += time_series_cols

train = train[featuresCols]
train = train.dropna(subset='sii')

cat_c = ['Basic_Demos-Enroll_Season', 'CGAS-Season', 'Physical-Season', 
          'Fitness_Endurance-Season', 'FGC-Season', 'BIA-Season', 
          'PAQ_A-Season', 'PAQ_C-Season', 'SDS-Season', 'PreInt_EduHx-Season']

def update(df):
    global cat_c
    for c in cat_c: 
        df[c] = df[c].fillna('Missing')
        df[c] = df[c].astype('category')
    return df
        
train = update(train)
test = update(test)

def create_mapping(column, dataset):
    unique_values = dataset[column].unique()
    return {value: idx for idx, value in enumerate(unique_values)}

for col in cat_c:
    mapping = create_mapping(col, train)
    mappingTe = create_mapping(col, test)
    
    train[col] = train[col].replace(mapping).astype(int)
    test[col] = test[col].replace(mappingTe).astype(int)

def quadratic_weighted_kappa(y_true, y_pred):
    return cohen_kappa_score(y_true, y_pred, weights='quadratic')

def threshold_Rounder(oof_non_rounded, thresholds):
    return np.where(oof_non_rounded < thresholds[0], 0,
                    np.where(oof_non_rounded < thresholds[1], 1,
                             np.where(oof_non_rounded < thresholds[2], 2, 3)))

def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    rounded_p = threshold_Rounder(oof_non_rounded, thresholds)
    return -quadratic_weighted_kappa(y_true, rounded_p)

def TrainML(model_class, test_data):
    X = train.drop(['sii'], axis=1)
    y = train['sii']

    SKF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    
    train_S = []
    test_S = []
    
    oof_non_rounded = np.zeros(len(y), dtype=float) 
    oof_rounded = np.zeros(len(y), dtype=int) 
    test_preds = np.zeros((len(test_data), n_splits))

    for fold, (train_idx, test_idx) in enumerate(tqdm(SKF.split(X, y), desc="Training Folds", total=n_splits)):
        X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

        model = clone(model_class)
        model.fit(X_train, y_train)

        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

        oof_non_rounded[test_idx] = y_val_pred
        y_val_pred_rounded = y_val_pred.round(0).astype(int)
        oof_rounded[test_idx] = y_val_pred_rounded

        train_kappa = quadratic_weighted_kappa(y_train, y_train_pred.round(0).astype(int))
        val_kappa = quadratic_weighted_kappa(y_val, y_val_pred_rounded)

        train_S.append(train_kappa)
        test_S.append(val_kappa)
        
        test_preds[:, fold] = model.predict(test_data)
        
        print(f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")
        clear_output(wait=True)

    print(f"Mean Train QWK --> {np.mean(train_S):.4f}")
    print(f"Mean Validation QWK ---> {np.mean(test_S):.4f}")

    KappaOPtimizer = minimize(evaluate_predictions,
                              x0=[0.5, 1.5, 2.5], args=(y, oof_non_rounded), 
                              method='Nelder-Mead')
    assert KappaOPtimizer.success, "Optimization did not converge."
    
    oof_tuned = threshold_Rounder(oof_non_rounded, KappaOPtimizer.x)
    tKappa = quadratic_weighted_kappa(y, oof_tuned)

    print(f"----> || Optimized QWK SCORE :: {Fore.CYAN}{Style.BRIGHT} {tKappa:.3f}{Style.RESET_ALL}")

    tpm = test_preds.mean(axis=1)
    tpTuned = threshold_Rounder(tpm, KappaOPtimizer.x)
    
    submission = pd.DataFrame({
        'id': sample['id'],
        'sii': tpTuned
    })

    return submission

# Model parameters for LightGBM
Params = {
    'learning_rate': 0.046,
    'max_depth': 12,
    'num_leaves': 478,
    'min_data_in_leaf': 13,
    'feature_fraction': 0.893,
    'bagging_fraction': 0.784,
    'bagging_freq': 4,
    'lambda_l1': 10,  # Increased from 6.59
    'lambda_l2': 0.01  # Increased from 2.68e-06
}


# XGBoost parameters
XGB_Params = {
    'learning_rate': 0.05,
    'max_depth': 6,
    'n_estimators': 200,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 1,  # Increased from 0.1
    'reg_lambda': 5,  # Increased from 1
    'random_state': SEED
}


CatBoost_Params = {
    'learning_rate': 0.05,
    'depth': 6,
    'iterations': 200,
    'random_seed': SEED,
    'cat_features': cat_c,
    'verbose': 0,
    'l2_leaf_reg': 10  # Increase this value
}

# Create model instances
Light = LGBMRegressor(**Params, random_state=SEED, verbose=-1, n_estimators=300)
XGB_Model = XGBRegressor(**XGB_Params)
CatBoost_Model = CatBoostRegressor(**CatBoost_Params)
TabNet_Model = TabNetWrapper(**TabNet_Params)  # New:TAbNet

# Combine models using Voting Regressor
voting_model = VotingRegressor(estimators=[
    ('lightgbm', Light),
    ('xgboost', XGB_Model),
    ('catboost', CatBoost_Model),
    ('tabnet', TabNet_Model)  # New:TabNet
])

# Train the ensemble model
Submission2 = TrainML(voting_model, test)

# Save submission
#Submission2.to_csv('submission.csv', index=False)
Submission2

In [None]:
train = pd.read_csv(path + '/train.csv')
test = pd.read_csv(path + '/test.csv')
sample = pd.read_csv(path + '/sample_submission.csv')

featuresCols = ['Basic_Demos-Enroll_Season', 'Basic_Demos-Age', 'Basic_Demos-Sex',
                'CGAS-Season', 'CGAS-CGAS_Score', 'Physical-Season', 'Physical-BMI',
                'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
                'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
                'Fitness_Endurance-Season', 'Fitness_Endurance-Max_Stage',
                'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
                'FGC-Season', 'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
                'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU',
                'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
                'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone', 'BIA-Season',
                'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
                'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
                'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
                'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
                'BIA-BIA_TBW', 'PAQ_A-Season', 'PAQ_A-PAQ_A_Total', 'PAQ_C-Season',
                'PAQ_C-PAQ_C_Total', 'SDS-Season', 'SDS-SDS_Total_Raw',
                'SDS-SDS_Total_T', 'PreInt_EduHx-Season',
                'PreInt_EduHx-computerinternet_hoursday', 'sii']

cat_c = ['Basic_Demos-Enroll_Season', 'CGAS-Season', 'Physical-Season', 
          'Fitness_Endurance-Season', 'FGC-Season', 'BIA-Season', 
          'PAQ_A-Season', 'PAQ_C-Season', 'SDS-Season', 'PreInt_EduHx-Season']

train_ts = load_time_series(path + "/series_train.parquet")
test_ts = load_time_series(path + "/series_test.parquet")

time_series_cols = train_ts.columns.tolist()
time_series_cols.remove("id")

train = pd.merge(train, train_ts, how="left", on='id')
test = pd.merge(test, test_ts, how="left", on='id')

train = train.drop('id', axis=1)
test = test.drop('id', axis=1)

featuresCols += time_series_cols

train = train[featuresCols]
train = train.dropna(subset='sii')

def update(df):
    global cat_c
    for c in cat_c: 
        df[c] = df[c].fillna('Missing')
        df[c] = df[c].astype('category')
    return df

train = update(train)
test = update(test)

def create_mapping(column, dataset):
    unique_values = dataset[column].unique()
    return {value: idx for idx, value in enumerate(unique_values)}

for col in cat_c:
    mapping = create_mapping(col, train)
    mappingTe = create_mapping(col, test)
    
    train[col] = train[col].replace(mapping).astype(int)
    test[col] = test[col].replace(mappingTe).astype(int)

def quadratic_weighted_kappa(y_true, y_pred):
    return cohen_kappa_score(y_true, y_pred, weights='quadratic')

def threshold_Rounder(oof_non_rounded, thresholds):
    return np.where(oof_non_rounded < thresholds[0], 0,
                    np.where(oof_non_rounded < thresholds[1], 1,
                             np.where(oof_non_rounded < thresholds[2], 2, 3)))

def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    rounded_p = threshold_Rounder(oof_non_rounded, thresholds)
    return -quadratic_weighted_kappa(y_true, rounded_p)

def TrainML(model_class, test_data):
    X = train.drop(['sii'], axis=1)
    y = train['sii']

    SKF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    
    train_S = []
    test_S = []
    
    oof_non_rounded = np.zeros(len(y), dtype=float) 
    oof_rounded = np.zeros(len(y), dtype=int) 
    test_preds = np.zeros((len(test_data), n_splits))

    for fold, (train_idx, test_idx) in enumerate(tqdm(SKF.split(X, y), desc="Training Folds", total=n_splits)):
        X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

        model = clone(model_class)
        model.fit(X_train, y_train)

        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

        oof_non_rounded[test_idx] = y_val_pred
        y_val_pred_rounded = y_val_pred.round(0).astype(int)
        oof_rounded[test_idx] = y_val_pred_rounded

        train_kappa = quadratic_weighted_kappa(y_train, y_train_pred.round(0).astype(int))
        val_kappa = quadratic_weighted_kappa(y_val, y_val_pred_rounded)

        train_S.append(train_kappa)
        test_S.append(val_kappa)
        
        test_preds[:, fold] = model.predict(test_data)
        
        print(f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")
        clear_output(wait=True)

    print(f"Mean Train QWK --> {np.mean(train_S):.4f}")
    print(f"Mean Validation QWK ---> {np.mean(test_S):.4f}")

    KappaOPtimizer = minimize(evaluate_predictions,
                              x0=[0.5, 1.5, 2.5], args=(y, oof_non_rounded), 
                              method='Nelder-Mead')
    assert KappaOPtimizer.success, "Optimization did not converge."
    
    oof_tuned = threshold_Rounder(oof_non_rounded, KappaOPtimizer.x)
    tKappa = quadratic_weighted_kappa(y, oof_tuned)

    print(f"----> || Optimized QWK SCORE :: {Fore.CYAN}{Style.BRIGHT} {tKappa:.3f}{Style.RESET_ALL}")

    tpm = test_preds.mean(axis=1)
    tp_rounded = threshold_Rounder(tpm, KappaOPtimizer.x)

    return tp_rounded

imputer = SimpleImputer(strategy='median')

ensemble = VotingRegressor(estimators=[
    ('lgb', Pipeline(steps=[('imputer', imputer), ('regressor', LGBMRegressor(random_state=SEED))])),
    ('xgb', Pipeline(steps=[('imputer', imputer), ('regressor', XGBRegressor(random_state=SEED))])),
    ('cat', Pipeline(steps=[('imputer', imputer), ('regressor', CatBoostRegressor(random_state=SEED, silent=True))])),
    ('rf', Pipeline(steps=[('imputer', imputer), ('regressor', RandomForestRegressor(random_state=SEED))])),
    ('gb', Pipeline(steps=[('imputer', imputer), ('regressor', GradientBoostingRegressor(random_state=SEED))])),
    ('tabnet', Pipeline(steps=[('imputer', imputer), ('regressor', TabNetWrapper(**TabNet_Params))]))  # New:TabNet
])

Submission3 = TrainML(ensemble, test)

Submission3 = TrainML(ensemble, test)
Submission3 = pd.DataFrame({
    'id': sample['id'],
    'sii': Submission3
})

Submission3

In [None]:
sub1 = Submission1
sub2 = Submission2
sub3 = Submission3

sub1 = sub1.sort_values(by='id').reset_index(drop=True)
sub2 = sub2.sort_values(by='id').reset_index(drop=True)
sub3 = sub3.sort_values(by='id').reset_index(drop=True)

combined = pd.DataFrame({
    'id': sub1['id'],
    'sii_1': sub1['sii'],
    'sii_2': sub2['sii'],
    'sii_3': sub3['sii']
})

def majority_vote(row):
    return row.mode()[0]

combined['final_sii'] = combined[['sii_1', 'sii_2', 'sii_3']].apply(majority_vote, axis=1)

final_submission = combined[['id', 'final_sii']].rename(columns={'final_sii': 'sii'})

final_submission.to_csv('submission.csv', index=False)

print("Majority voting completed and saved to 'Final_Submission.csv'")

In [None]:
final_submission