In [1]:
!pip -q install /kaggle/input/pytorchtabnet/pytorch_tabnet-4.1.0-py3-none-any.whl

In [2]:
from pytorch_tabnet.tab_model import TabNetRegressor
import torch

In [3]:
import numpy as np
import pandas as pd
import os
import re
from sklearn.base import clone
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import StratifiedKFold
from scipy.optimize import minimize
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import polars as pl
import polars.selectors as cs
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator, FormatStrFormatter, PercentFormatter
import seaborn as sns

from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from keras.models import Model
from keras.layers import Input, Dense
from keras.optimizers import Adam
import torch
import torch.nn as nn
import torch.optim as optim

from colorama import Fore, Style
from IPython.display import clear_output
import warnings
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import VotingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
warnings.filterwarnings('ignore')
pd.options.display.max_columns = None

In [4]:
import random
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True
seed_everything(2024)

In [7]:
target_labels = ['None', 'Mild', 'Moderate', 'Severe']

In [8]:
season_dtype = pl.Enum(['Spring', 'Summer', 'Fall', 'Winter'])

train = (
    pl.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/train.csv')
    .with_columns(pl.col('^.*Season$').cast(season_dtype))
)

test = (
    pl.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/test.csv')
    .with_columns(pl.col('^.*Season$').cast(season_dtype))
)

train
test

id,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Season,Fitness_Endurance-Max_Stage,Fitness_Endurance-Time_Mins,Fitness_Endurance-Time_Sec,FGC-Season,FGC-FGC_CU,FGC-FGC_CU_Zone,FGC-FGC_GSND,FGC-FGC_GSND_Zone,FGC-FGC_GSD,FGC-FGC_GSD_Zone,FGC-FGC_PU,FGC-FGC_PU_Zone,FGC-FGC_SRL,FGC-FGC_SRL_Zone,FGC-FGC_SRR,FGC-FGC_SRR_Zone,FGC-FGC_TL,FGC-FGC_TL_Zone,BIA-Season,BIA-BIA_Activity_Level_num,BIA-BIA_BMC,BIA-BIA_BMI,BIA-BIA_BMR,BIA-BIA_DEE,BIA-BIA_ECW,BIA-BIA_FFM,BIA-BIA_FFMI,BIA-BIA_FMI,BIA-BIA_Fat,BIA-BIA_Frame_num,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,PAQ_A-Season,PAQ_A-PAQ_A_Total,PAQ_C-Season,PAQ_C-PAQ_C_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday
str,enum,i64,i64,enum,i64,enum,f64,f64,f64,f64,i64,i64,i64,enum,i64,i64,i64,enum,i64,i64,f64,i64,f64,i64,i64,i64,f64,i64,f64,i64,f64,i64,enum,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,f64,f64,f64,f64,f64,enum,f64,enum,f64,enum,i64,i64,enum,i64
"""00008ff9""","""Fall""",5,0,"""Winter""",51,"""Fall""",16.877316,46.0,50.8,,,,,,,,,"""Fall""",0,0,,,,,0,0,7.0,0,6.0,0,6.0,1,"""Fall""",2,2.66855,16.8792,932.498,1492.0,8.25598,41.5862,13.8177,3.06143,9.21377,1,24.4349,8.89536,38.9177,19.5413,32.6909,,,,,,,,"""Fall""",3
"""000fd460""","""Summer""",9,0,,,"""Fall""",14.03559,48.0,46.0,22.0,75,70,122,,,,,"""Fall""",3,0,,,,,5,0,11.0,1,11.0,1,3.0,0,"""Winter""",2,2.57949,14.0371,936.656,1498.65,6.01993,42.0291,12.8254,1.21172,3.97085,1,21.0352,14.974,39.4497,15.4107,27.0552,,,"""Fall""",2.34,"""Fall""",46,64,"""Summer""",0
"""00105258""","""Summer""",10,1,"""Fall""",71,"""Fall""",16.648696,56.5,75.6,,65,94,117,"""Fall""",5,7,33,"""Fall""",20,1,10.2,1,14.7,2,7,1,10.0,1,10.0,1,5.0,0,,,,,,,,,,,,,,,,,,,,"""Summer""",2.17,"""Fall""",38,54,"""Summer""",2
"""00115b9f""","""Winter""",9,0,"""Fall""",71,"""Summer""",18.292347,56.0,81.6,,60,97,117,"""Summer""",6,9,37,"""Summer""",18,1,,,,,5,0,7.0,0,7.0,0,7.0,1,"""Summer""",3,3.84191,18.2943,1131.43,1923.44,15.5925,62.7757,14.074,4.22033,18.8243,2,30.4041,16.779,58.9338,26.4798,45.9966,,,"""Winter""",2.451,"""Summer""",31,45,"""Winter""",0
"""0016bb22""","""Spring""",18,1,"""Summer""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""Summer""",1.04,,,,,,,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""00c0cd71""","""Winter""",7,0,"""Summer""",51,"""Spring""",29.315775,54.0,121.6,,80,75,99,"""Spring""",4,5,32,"""Spring""",6,1,,,,,0,0,12.0,1,15.0,1,12.0,1,,,,,,,,,,,,,,,,,,,,,,"""Spring""",35,50,"""Winter""",2
"""00d56d4b""","""Spring""",5,1,"""Summer""",80,"""Spring""",17.284504,44.0,47.6,,61,76,109,"""Spring""",,,,"""Spring""",0,0,,,,,0,0,10.5,1,10.0,1,7.0,1,,,,,,,,,,,,,,,,,,,,,,"""Spring""",37,53,"""Spring""",0
"""00d9913d""","""Fall""",10,1,,,"""Fall""",19.893157,55.0,85.6,30.0,,81,,,,,,"""Fall""",5,0,,,,,0,0,0.0,0,0.0,0,9.0,1,,,,,,,,,,,,,,,,,,,,,,,,,"""Fall""",1
"""00e6167c""","""Winter""",6,0,"""Spring""",60,"""Winter""",30.094649,37.5,60.2,24.0,61,91,95,,,,,"""Winter""",6,1,,,,,0,0,4.0,0,4.0,0,7.0,1,"""Winter""",2,2.75035,17.2738,1003.07,1504.61,15.1456,49.1034,14.0898,3.18407,11.0966,1,23.6182,10.3396,46.3531,19.8886,38.7638,,,,,"""Winter""",39,55,"""Winter""",3


# Actigraphy (time series)

In [9]:
actigraphy = pl.read_parquet('/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet/id=0417c91e/part-0.parquet')
actigraphy

step,X,Y,Z,enmo,anglez,non-wear_flag,light,battery_voltage,time_of_day,weekday,quarter,relative_date_PCIAT
u32,f32,f32,f32,f32,f32,f32,f32,f32,i64,i8,i8,f32
0,0.014375,-0.020112,-0.995358,0.00106,-88.445251,0.0,41.0,4195.0,44100000000000,2,2,5.0
1,0.014167,-0.023278,-0.996164,0.000289,-88.3722,0.0,41.0,4194.833496,44105000000000,2,2,5.0
2,0.014036,-0.022964,-0.99632,0.000301,-88.356422,0.0,41.5,4194.666504,44110000000000,2,2,5.0
3,0.013593,-0.022048,-0.996762,0.002278,-88.575943,0.0,37.5,4194.5,44115000000000,2,2,5.0
4,-0.061772,-0.065317,-0.973063,0.092321,-88.391273,0.0,55.666668,4199.0,44780000000000,2,2,5.0
…,…,…,…,…,…,…,…,…,…,…,…,…
287174,-0.407433,0.091612,-0.377763,0.039733,-43.319416,0.0,7.0,3695.0,32875000000000,1,3,53.0
287175,-0.703572,0.016187,0.15956,0.03598,14.12139,0.0,7.0,3695.0,32880000000000,1,3,53.0
287176,-0.209607,-0.4697,0.636573,0.097799,44.998573,0.0,7.0,3695.0,32885000000000,1,3,53.0
287177,-0.390378,0.284386,0.147654,0.057826,7.726313,0.0,7.0,3695.0,32890000000000,1,3,53.0


In [5]:
SEED = 42
n_splits = 5

# Feature Engineering

- **Feature Selection**: The dataset contains features related to physical characteristics (e.g., BMI, Height, Weight), behavioral aspects (e.g., internet usage), and fitness data (e.g., endurance time). 
- **Categorical Feature Encoding**: Categorical features are mapped to numerical values using custom mappings for each unique category within the dataset. This ensures compatibility with machine learning algorithms that require numerical input.
- **Time Series Aggregation**: Time series statistics (e.g., mean, standard deviation) from the actigraphy data are computed and merged into the main dataset to create additional features for model training.


In [6]:
def process_file(filename, dirname):
    df = pd.read_parquet(os.path.join(dirname, filename, 'part-0.parquet'))
    df.drop('step', axis=1, inplace=True)
    return df.describe().values.reshape(-1), filename.split('=')[1]

def load_time_series(dirname) -> pd.DataFrame:
    ids = os.listdir(dirname)
    
    with ThreadPoolExecutor() as executor:
        results = list(tqdm(executor.map(lambda fname: process_file(fname, dirname), ids), total=len(ids)))
    
    stats, indexes = zip(*results)
    
    df = pd.DataFrame(stats, columns=[f"stat_{i}" for i in range(len(stats[0]))])
    df['id'] = indexes
    return df


class AutoEncoder(nn.Module):
    def __init__(self, input_dim, encoding_dim):
        super(AutoEncoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, encoding_dim*3),
            nn.ReLU(),
            nn.Linear(encoding_dim*3, encoding_dim*2),
            nn.ReLU(),
            nn.Linear(encoding_dim*2, encoding_dim),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim, input_dim*2),
            nn.ReLU(),
            nn.Linear(input_dim*2, input_dim*3),
            nn.ReLU(),
            nn.Linear(input_dim*3, input_dim),
            nn.Sigmoid()
        )
        
    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded


def perform_autoencoder(df, encoding_dim=50, epochs=50, batch_size=32):
    scaler = StandardScaler()
    df_scaled = scaler.fit_transform(df)
    
    data_tensor = torch.FloatTensor(df_scaled)
    
    input_dim = data_tensor.shape[1]
    autoencoder = AutoEncoder(input_dim, encoding_dim)
    
    criterion = nn.MSELoss()
    optimizer = optim.Adam(autoencoder.parameters())
    
    for epoch in range(epochs):
        for i in range(0, len(data_tensor), batch_size):
            batch = data_tensor[i : i + batch_size]
            optimizer.zero_grad()
            reconstructed = autoencoder(batch)
            loss = criterion(reconstructed, batch)
            loss.backward()
            optimizer.step()
            
        if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch + 1}/{epochs}], Loss: {loss.item():.4f}]')
                 
    with torch.no_grad():
        encoded_data = autoencoder.encoder(data_tensor).numpy()
        
    df_encoded = pd.DataFrame(encoded_data, columns=[f'Enc_{i + 1}' for i in range(encoded_data.shape[1])])
    
    return df_encoded

def feature_engineering(df):
    season_cols = [col for col in df.columns if 'Season' in col]
    df = df.drop(season_cols, axis=1) 
    df['BMI_Age'] = df['Physical-BMI'] * df['Basic_Demos-Age']
    df['Internet_Hours_Age'] = df['PreInt_EduHx-computerinternet_hoursday'] * df['Basic_Demos-Age']
    df['BMI_Internet_Hours'] = df['Physical-BMI'] * df['PreInt_EduHx-computerinternet_hoursday']
    df['BFP_BMI'] = df['BIA-BIA_Fat'] / df['BIA-BIA_BMI']
    df['FFMI_BFP'] = df['BIA-BIA_FFMI'] / df['BIA-BIA_Fat']
    df['FMI_BFP'] = df['BIA-BIA_FMI'] / df['BIA-BIA_Fat']
    df['LST_TBW'] = df['BIA-BIA_LST'] / df['BIA-BIA_TBW']
    df['BFP_BMR'] = df['BIA-BIA_Fat'] * df['BIA-BIA_BMR']
    df['BFP_DEE'] = df['BIA-BIA_Fat'] * df['BIA-BIA_DEE']
    df['BMR_Weight'] = df['BIA-BIA_BMR'] / df['Physical-Weight']
    df['DEE_Weight'] = df['BIA-BIA_DEE'] / df['Physical-Weight']
    df['SMM_Height'] = df['BIA-BIA_SMM'] / df['Physical-Height']
    df['Muscle_to_Fat'] = df['BIA-BIA_SMM'] / df['BIA-BIA_FMI']
    df['Hydration_Status'] = df['BIA-BIA_TBW'] / df['Physical-Weight']
    df['ICW_TBW'] = df['BIA-BIA_ICW'] / df['BIA-BIA_TBW']
    
    return df

In [12]:
train = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/train.csv')
test = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/test.csv')
sample = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/sample_submission.csv')

train_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet")
test_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet")

df_train = train_ts.drop('id', axis=1)
df_test = test_ts.drop('id', axis=1)

train_ts_encoded = perform_autoencoder(df_train, encoding_dim=60, epochs=100, batch_size=32)
test_ts_encoded = perform_autoencoder(df_test, encoding_dim=60, epochs=100, batch_size=32)

time_series_cols = train_ts_encoded.columns.tolist()
train_ts_encoded["id"]=train_ts["id"]
test_ts_encoded['id']=test_ts["id"]

train = pd.merge(train, train_ts_encoded, how="left", on='id')
test = pd.merge(test, test_ts_encoded, how="left", on='id')

imputer = KNNImputer(n_neighbors=5)
numeric_cols = train.select_dtypes(include=['float64', 'int64']).columns
imputed_data = imputer.fit_transform(train[numeric_cols])
train_imputed = pd.DataFrame(imputed_data, columns=numeric_cols)
train_imputed['sii'] = train_imputed['sii'].round().astype(int)
for col in train.columns:
    if col not in numeric_cols:
        train_imputed[col] = train[col]
        
train = train_imputed

train = feature_engineering(train)
train = train.dropna(thresh=10, axis=0)
test = feature_engineering(test)

train = train.drop('id', axis=1)
test  = test .drop('id', axis=1)   


featuresCols = ['Basic_Demos-Age', 'Basic_Demos-Sex',
                'CGAS-CGAS_Score', 'Physical-BMI',
                'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
                'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
                'Fitness_Endurance-Max_Stage',
                'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
                'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
                'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU',
                'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
                'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone',
                'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
                'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
                'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
                'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
                'BIA-BIA_TBW', 'PAQ_A-PAQ_A_Total',
                'PAQ_C-PAQ_C_Total', 'SDS-SDS_Total_Raw',
                'SDS-SDS_Total_T',
                'PreInt_EduHx-computerinternet_hoursday', 'sii', 'BMI_Age','Internet_Hours_Age','BMI_Internet_Hours',
                'BFP_BMI', 'FFMI_BFP', 'FMI_BFP', 'LST_TBW', 'BFP_BMR', 'BFP_DEE', 'BMR_Weight', 'DEE_Weight',
                'SMM_Height', 'Muscle_to_Fat', 'Hydration_Status', 'ICW_TBW']

featuresCols += time_series_cols

train = train[featuresCols]
train = train.dropna(subset='sii')

featuresCols = ['Basic_Demos-Age', 'Basic_Demos-Sex',
                'CGAS-CGAS_Score', 'Physical-BMI',
                'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
                'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
                'Fitness_Endurance-Max_Stage',
                'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
                'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
                'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU',
                'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
                'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone',
                'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
                'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
                'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
                'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
                'BIA-BIA_TBW', 'PAQ_A-PAQ_A_Total',
                'PAQ_C-PAQ_C_Total', 'SDS-SDS_Total_Raw',
                'SDS-SDS_Total_T',
                'PreInt_EduHx-computerinternet_hoursday', 'BMI_Age','Internet_Hours_Age','BMI_Internet_Hours',
                'BFP_BMI', 'FFMI_BFP', 'FMI_BFP', 'LST_TBW', 'BFP_BMR', 'BFP_DEE', 'BMR_Weight', 'DEE_Weight',
                'SMM_Height', 'Muscle_to_Fat', 'Hydration_Status', 'ICW_TBW']

featuresCols += time_series_cols
test = test[featuresCols]

100%|██████████| 996/996 [01:15<00:00, 13.27it/s]
100%|██████████| 2/2 [00:00<00:00, 10.38it/s]


Epoch [10/100], Loss: 1.6710]
Epoch [20/100], Loss: 1.5469]
Epoch [30/100], Loss: 1.5154]
Epoch [40/100], Loss: 1.4932]
Epoch [50/100], Loss: 1.4964]
Epoch [60/100], Loss: 1.4920]
Epoch [70/100], Loss: 1.4309]
Epoch [80/100], Loss: 1.4185]
Epoch [90/100], Loss: 1.3667]
Epoch [100/100], Loss: 1.3620]
Epoch [10/100], Loss: 1.0070]
Epoch [20/100], Loss: 0.5783]
Epoch [30/100], Loss: 0.4271]
Epoch [40/100], Loss: 0.4271]
Epoch [50/100], Loss: 0.4271]
Epoch [60/100], Loss: 0.4271]
Epoch [70/100], Loss: 0.4271]
Epoch [80/100], Loss: 0.4271]
Epoch [90/100], Loss: 0.4271]
Epoch [100/100], Loss: 0.4271]


In [13]:
# train_copy_1 = train.copy()
# test_copy_1 = test.copy()

In [14]:
# total_nan = train.isna().sum().sum()
# total_nan

In [15]:
# train_copy = train.copy()
# train.head()
train.shape

(3960, 124)

In [16]:
# train_copy_after = data_clean(train_copy)
# total_nan = train_copy_after.isna().sum().sum()
# total_nan

In [17]:
# train_copy_after2 = infzero_to_nan(train_copy_after)
# total_nan = train_copy_after2.isna().sum().sum()
# total_nan

In [18]:

def data_clean(df):
    
    df.loc[df["CGAS-CGAS_Score"] >= 200, "CGAS-CGAS_Score"] = np.nan
    df.loc[(df["Physical-Weight"] >= 300) | (df["Physical-Weight"] <= 20), "Physical-Weight"] = np.nan
    df.loc[(df["Physical-BMI"] >= 50) | (df["Physical-BMI"] <= 5), "Physical-Weight"] = np.nan
    df.loc[(df["Physical-Diastolic_BP"] >= 150) | (df["Physical-Diastolic_BP"] <= 25), "Physical-Diastolic_BP"] = np.nan
    df.loc[(df["Physical-HeartRate"] >= 120) | (df["Physical-HeartRate"] <= 40), "Physical-HeartRate"] = np.nan
    df.loc[df["Physical-Systolic_BP"] <= 40, "Physical-Systolic_BP"] = np.nan
    df.loc[(df["Fitness_Endurance-Max_Stage"] >= 20) | (df["Fitness_Endurance-Max_Stage"] == 0), "Fitness_Endurance-Max_Stage"] = np.nan
    df["Fitness_Endurance-Time"] = df["Fitness_Endurance-Time_Mins"]*60 + df["Fitness_Endurance-Time_Sec"]
    df.drop(['Fitness_Endurance-Time_Mins'], axis=1)
    df.drop(['Fitness_Endurance-Time_Sec'], axis=1)
    df.loc[df["Fitness_Endurance-Time"] <= 0, "Fitness_Endurance-Time"] = np.nan
    # あってるかよく確認 聞き手の方が大きくなるように値を入れ替えている
    df.loc[df['FGC-FGC_GSND'] > df['FGC-FGC_GSD'], ['FGC-FGC_GSD', 'FGC-FGC_GSND']] = df.loc[df['FGC-FGC_GSND'] > df['FGC-FGC_GSD'], ['FGC-FGC_GSND', 'FGC-FGC_GSD']].values
    df.loc[(df["FGC-FGC_GSND"] >= 100) | (df["FGC-FGC_GSND"] <= 0), "FGC-FGC_GSND"] = np.nan
    df.loc[(df["FGC-FGC_GSD"] >= 100) | (df["FGC-FGC_GSD"] <= 0), "FGC-FGC_GSD"] = np.nan 
    df.loc[df["FGC-FGC_SRL"] == 0, "FGC-FGC_SRL"] = np.nan
    df.loc[df["FGC-FGC_SRR"] == 0, "FGC-FGC_SRR"] = np.nan
    df.loc[df["FGC-FGC_TL"] == 0, "FGC-FGC_TL"] = np.nan
    df.loc[(df["BIA-BIA_BMC"] >= 50) | (df["BIA-BIA_BMC"] <= 0), "BIA-BIA_BMC"] = np.nan
    df.loc[df["BIA-BIA_BMR"] >= 6000, "BIA-BIA_BMR"] = np.nan
    df.loc[df["BIA-BIA_DEE"] >= 12000, "BIA-BIA_DEE"] = np.nan
    df.loc[df["BIA-BIA_ECW"] >= 300, "BIA-BIA_ECW"] = np.nan
    df.loc[df["BIA-BIA_FFM"] >= 500, "BIA-BIA_FFM"] = np.nan
    df.loc[(df["BIA-BIA_FFMI"] >= 120) | (df["BIA-BIA_FFMI"] <= 40), "BIA-BIA_FFMI"] = np.nan
    df.loc[df["BIA-BIA_FMI"] <= -20, "BIA-BIA_FMI"] = np.nan
    df.loc[df["BIA-BIA_Fat"] <= 3, "BIA-BIA_Fat"] = np.nan
    df.loc[df["BIA-BIA_ICW"] >= 200, "BIA-BIA_ICW"] = np.nan
    df.loc[df["BIA-BIA_LDM"] >= 200, "BIA-BIA_LDM"] = np.nan
    df.loc[df["BIA-BIA_LDM"] >= df["Physical-Weight"], "BIA-BIA_LDM"] = np.nan
    df.loc[df["BIA-BIA_LST"] >= 300, "BIA-BIA_LST"] = np.nan
    df.loc[df["BIA-BIA_LST"] >= 300, "BIA-BIA_LST"] = np.nan
    df.loc[df["BIA-BIA_SMM"] >= 300, "BIA-BIA_SMM"] = np.nan
    df.loc[df["BIA-BIA_TBW"] >= 300, "BIA-BIA_TBW"] = np.nan

    return df

def infzero_to_nan(df):
    if np.any(np.isinf(df)):
        df = df.replace([np.inf, -np.inf], np.nan)
    
    col_miura = ['Basic_Demos-Age',
                'CGAS-CGAS_Score', 'Physical-BMI',
                'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
                'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
                'Fitness_Endurance-Max_Stage', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
                'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
                'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat',
                'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
                'BIA-BIA_TBW', 'PAQ_A-PAQ_A_Total',
                'PAQ_C-PAQ_C_Total', 'SDS-SDS_Total_Raw',
                'SDS-SDS_Total_T','BMI_Age','Internet_Hours_Age'
                ]
    df[col_miura] = df[col_miura].where(df[col_miura] > 0, np.nan)

    return df

def infzero_to_nan_noncat(df):

    # 数値型の列だけを選択
    numeric_cols = df.select_dtypes(include=[np.number])
    
    # 数値型の列でinfをnanに置き換え
    if np.any(np.isinf(numeric_cols)):
        df[numeric_cols.columns] = numeric_cols.replace([np.inf, -np.inf], np.nan)
    
    col_miura = ['Basic_Demos-Age',
                'CGAS-CGAS_Score', 'Physical-BMI',
                'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
                'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
                'Fitness_Endurance-Max_Stage', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
                'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
                'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat',
                'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
                'BIA-BIA_TBW', 'PAQ_A-PAQ_A_Total',
                'PAQ_C-PAQ_C_Total', 'SDS-SDS_Total_Raw',
                'SDS-SDS_Total_T'
                ]
    df[col_miura] = df[col_miura].where(df[col_miura] > 0, np.nan)

    return df

In [19]:
train = data_clean(train)
train = infzero_to_nan(train)
test = data_clean(test)
test = infzero_to_nan(test)

In [20]:
# train_copy_2 = train.copy()
# test_copy_2 = test.copy()

In [21]:
# 0:2178
# 1:1311
# 2:437
# 3:34

# 本来は
# 0:1594
# 1:730
# 2:378
# 3:34


In [22]:
# count = (train['sii'] == 1).sum()
# print(f"'sii'列が1のデータ数: {count}")

In [23]:
# train_test = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/train.csv')

In [24]:
# count = (train_test['sii'] == 3).sum()
# print(f"'sii'列が1のデータ数: {count}")

In [25]:
# if np.any(np.isinf(train)):
#     train = train.replace([np.inf, -np.inf], np.nan)

def quadratic_weighted_kappa(y_true, y_pred):
    return cohen_kappa_score(y_true, y_pred, weights='quadratic')

def threshold_Rounder(oof_non_rounded, thresholds):
    return np.where(oof_non_rounded < thresholds[0], 0,
                    np.where(oof_non_rounded < thresholds[1], 1,
                             np.where(oof_non_rounded < thresholds[2], 2, 3)))

def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    rounded_p = threshold_Rounder(oof_non_rounded, thresholds)
    return -quadratic_weighted_kappa(y_true, rounded_p)

# Model Training and Evaluation

- **Model Types**: Various models are used, including:
  - **LightGBM**: A gradient-boosting framework known for its speed and efficiency with large datasets.
  - **XGBoost**: Another powerful gradient-boosting model used for structured data.
  - **CatBoost**: Optimized for categorical features without the need for extensive preprocessing.
  - **Voting Regressor**: An ensemble model that combines the predictions of LightGBM, XGBoost, and CatBoost for better accuracy.
- **Cross-Validation**: Stratified K-Folds cross-validation is employed to split the data into training and validation sets, ensuring balanced class distribution in each fold.
- **Quadratic Weighted Kappa (QWK)**: The performance of the models is evaluated using QWK, which measures the agreement between predicted and actual values, taking into account the ordinal nature of the target variable.
- **Threshold Optimization**: The `minimize` function from `scipy.optimize` is used to fine-tune decision thresholds that map continuous predictions to discrete categories (None, Mild, Moderate, Severe).


In [26]:
from sklearn.utils.class_weight import compute_sample_weight
def TrainML(model_class, test_data):
    X = train.drop(['sii'], axis=1)
    y = train['sii']

    SKF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    
    train_S = []
    test_S = []
    
    oof_non_rounded = np.zeros(len(y), dtype=float) 
    oof_rounded = np.zeros(len(y), dtype=int) 
    test_preds = np.zeros((len(test_data), n_splits))

    for fold, (train_idx, test_idx) in enumerate(tqdm(SKF.split(X, y), desc="Training Folds", total=n_splits)):
        X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]


        # data_sum=1586+722+377+34
        # weight_0 = data_sum/1586
        # weight_1 = data_sum/722
        # weight_2 = data_sum/377
        # weight_3 = data_sum/34
        # # クラス重みに基づいてサンプル重みを計算
        
        # # class_weights = {0: weight_0, 1: weight_1, 2: weight_2, 3: weight_3}
        # weights = {0: weight_0, 1: weight_1, 2: weight_2, 3: weight_3}
        # # weights = compute_sample_weight(class_weight=class_weights, y=y_train)

        model = clone(model_class)

        # CatBoost の場合に sample_weight を渡す
        # if isinstance(model, CatBoostRegressor):
         #   model.fit(X_train, y_train, sample_weight=weights)
       # elif isinstance(model, TabNetRegressor):
     #       model.fit(X_train, y_train, sample_weight=weights)
            # model.fit(X_train.values, y_train.values, weights=weights)
     #   else:
        #    model.fit(X_train, y_train)

        # model = clone(model_class)
        model.fit(X_train, y_train)
        
        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

        oof_non_rounded[test_idx] = y_val_pred
        y_val_pred_rounded = y_val_pred.round(0).astype(int)
        oof_rounded[test_idx] = y_val_pred_rounded

        train_kappa = quadratic_weighted_kappa(y_train, y_train_pred.round(0).astype(int))
        val_kappa = quadratic_weighted_kappa(y_val, y_val_pred_rounded)

        train_S.append(train_kappa)
        test_S.append(val_kappa)
        
        test_preds[:, fold] = model.predict(test_data)
        
        print(f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")
        clear_output(wait=True)

    print(f"Mean Train QWK --> {np.mean(train_S):.4f}")
    print(f"Mean Validation QWK ---> {np.mean(test_S):.4f}")

    KappaOPtimizer = minimize(evaluate_predictions,
                              x0=[0.5, 1.5, 2.5], args=(y, oof_non_rounded), 
                              method='Nelder-Mead')
    assert KappaOPtimizer.success, "Optimization did not converge."
    
    oof_tuned = threshold_Rounder(oof_non_rounded, KappaOPtimizer.x)
    tKappa = quadratic_weighted_kappa(y, oof_tuned)

    print(f"----> || Optimized QWK SCORE :: {Fore.CYAN}{Style.BRIGHT} {tKappa:.3f}{Style.RESET_ALL}")

    tpm = test_preds.mean(axis=1)
    tpTuned = threshold_Rounder(tpm, KappaOPtimizer.x)
    
    submission = pd.DataFrame({
        'id': sample['id'],
        'sii': tpTuned
    })

    return submission


# Hyperparameter Tuning

- **LightGBM Parameters**: Hyperparameters such as `learning_rate`, `max_depth`, `num_leaves`, and `feature_fraction` are tuned to improve the performance of the LightGBM model. These parameters control the complexity of the model and its ability to generalize to new data.
- **XGBoost and CatBoost Parameters**: Similar tuning is applied for XGBoost and CatBoost, adjusting parameters such as `n_estimators`, `max_depth`, `learning_rate`, `subsample`, and `regularization` terms (`reg_alpha`, `reg_lambda`). These help in controlling overfitting and ensuring the model's robustness.

In [27]:
# Model parameters for LightGBM
Params = {
    'learning_rate': 0.046,
    'max_depth': 12,
    'num_leaves': 478,
    'min_data_in_leaf': 13,
    'feature_fraction': 0.893,
    'bagging_fraction': 0.784,
    'bagging_freq': 4,
    'lambda_l1': 10,  # Increased from 6.59
    'lambda_l2': 0.01,  # Increased from 2.68e-06
    'device': 'cpu'

}


# XGBoost parameters
XGB_Params = {
    'learning_rate': 0.05,
    'max_depth': 6,
    'n_estimators': 200,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 1,  # Increased from 0.1
    'reg_lambda': 5,  # Increased from 1
    'random_state': SEED,
    'tree_method': 'gpu_hist',

}


CatBoost_Params = {
    'learning_rate': 0.05,
    'depth': 6,
    'iterations': 200,
    'random_seed': SEED,
    'verbose': 0,
    'l2_leaf_reg': 10,  # Increase this value
    'task_type': 'GPU'

}

In [28]:
# New: TabNet

from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from pytorch_tabnet.callbacks import Callback
import os
import torch
from pytorch_tabnet.callbacks import Callback

class TabNetWrapper(BaseEstimator, RegressorMixin):
    def __init__(self, **kwargs):
        self.model = TabNetRegressor(**kwargs)
        self.kwargs = kwargs
        self.imputer = SimpleImputer(strategy='median')
        self.best_model_path = 'best_tabnet_model.pt'
        
    def fit(self, X, y):
        # Handle missing values
        X_imputed = self.imputer.fit_transform(X)
        
        if hasattr(y, 'values'):
            y = y.values
            
        # Create internal validation set
        X_train, X_valid, y_train, y_valid = train_test_split(
            X_imputed, 
            y, 
            test_size=0.2,
            random_state=42
        )
        
        # Train TabNet model
        history = self.model.fit(
            X_train=X_train,
            y_train=y_train.reshape(-1, 1),
            eval_set=[(X_valid, y_valid.reshape(-1, 1))],
            eval_name=['valid'],
            eval_metric=['mse'],
            max_epochs=200,
            patience=20,
            batch_size=1024,
            virtual_batch_size=128,
            num_workers=0,
            drop_last=False,
            callbacks=[
                TabNetPretrainedModelCheckpoint(
                    filepath=self.best_model_path,
                    monitor='valid_mse',
                    mode='min',
                    save_best_only=True,
                    verbose=True
                )
            ]
        )
        
        # Load the best model
        if os.path.exists(self.best_model_path):
            self.model.load_model(self.best_model_path)
            os.remove(self.best_model_path)  # Remove temporary file
        
        return self
    
    def predict(self, X):
        X_imputed = self.imputer.transform(X)
        return self.model.predict(X_imputed).flatten()
    
    def __deepcopy__(self, memo):
        # Add deepcopy support for scikit-learn
        cls = self.__class__
        result = cls.__new__(cls)
        memo[id(self)] = result
        for k, v in self.__dict__.items():
            setattr(result, k, deepcopy(v, memo))
        return result

# TabNet hyperparameters
TabNet_Params = {
    'n_d': 64,              # Width of the decision prediction layer
    'n_a': 64,              # Width of the attention embedding for each step
    'n_steps': 5,           # Number of steps in the architecture
    'gamma': 1.5,           # Coefficient for feature selection regularization
    'n_independent': 2,     # Number of independent GLU layer in each GLU block
    'n_shared': 2,          # Number of shared GLU layer in each GLU block
    'lambda_sparse': 1e-4,  # Sparsity regularization
    'optimizer_fn': torch.optim.Adam,
    'optimizer_params': dict(lr=2e-2, weight_decay=1e-5),
    'mask_type': 'entmax',
    'scheduler_params': dict(mode="min", patience=10, min_lr=1e-5, factor=0.5),
    'scheduler_fn': torch.optim.lr_scheduler.ReduceLROnPlateau,
    'verbose': 1,
    'device_name': 'cuda' if torch.cuda.is_available() else 'cpu'
}

class TabNetPretrainedModelCheckpoint(Callback):
    def __init__(self, filepath, monitor='val_loss', mode='min', 
                 save_best_only=True, verbose=1):
        super().__init__()  # Initialize parent class
        self.filepath = filepath
        self.monitor = monitor
        self.mode = mode
        self.save_best_only = save_best_only
        self.verbose = verbose
        self.best = float('inf') if mode == 'min' else -float('inf')
        
    def on_train_begin(self, logs=None):
        self.model = self.trainer  # Use trainer itself as model
        
    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        current = logs.get(self.monitor)
        if current is None:
            return
        
        # Check if current metric is better than best
        if (self.mode == 'min' and current < self.best) or \
           (self.mode == 'max' and current > self.best):
            if self.verbose:
                print(f'\nEpoch {epoch}: {self.monitor} improved from {self.best:.4f} to {current:.4f}')
            self.best = current
            if self.save_best_only:
                self.model.save_model(self.filepath)  # Save the entire model

# Ensemble Learning and Submission Preparation

- **Ensemble Learning**: The model uses a **Voting Regressor**, which combines the predictions from LightGBM, XGBoost, and CatBoost. This approach is beneficial as it leverages the strengths of multiple models, reducing overfitting and improving overall model performance.
- **Out-of-Fold (OOF) Predictions**: During cross-validation, out-of-fold predictions are generated for the training set, which helps in model evaluation without data leakage.
- **Kappa Optimizer**: The Kappa Optimizer ensures that the predicted values are as close to the actual values as possible by adjusting the thresholds used to convert raw model outputs into class labels.
- **Test Set Predictions**: After the model is trained and thresholds are optimized, the test dataset is processed, and predictions are generated using the ensemble model. These predictions are converted into the appropriate format for submission.
- **Submission File Creation**: The predictions are saved in a CSV file following the required format for submission (e.g., for a Kaggle competition), which includes columns like `id` and `sii` (Severity Impairment Index).

# Final Results and Performance Metrics

- **Train and Validation Scores**: After training across multiple folds, the mean Quadratic Weighted Kappa (QWK) score is calculated for both the training and validation datasets, providing an indicator of model performance. 
- **Optimized QWK Score**: The final optimized QWK score after threshold tuning is displayed, showcasing the model's ability to predict the severity levels effectively.
- **Test Predictions**: The test set predictions are evaluated, and a breakdown of the predicted severity levels (None, Mild, Moderate, Severe) is shown, along with their respective counts.

In [29]:
# data_sum=1586+722+377+34
# weight_0 = data_sum/1586
# weight_1 = data_sum/722
# weight_2 = data_sum/377
# weight_3 = data_sum/34


In [30]:
# Create model instances
Light = LGBMRegressor(**Params, random_state=SEED, verbose=-1, n_estimators=300)
XGB_Model = XGBRegressor(**XGB_Params)
CatBoost_Model = CatBoostRegressor(**CatBoost_Params)
TabNet_Model = TabNetWrapper(**TabNet_Params) # New
#Light = LGBMRegressor(**Params, random_state=SEED, verbose=-1, n_estimators=300,class_weight={0: weight_0, 1: weight_1, 2: weight_2, 3: weight_3})
#XGB_Model = XGBRegressor(**XGB_Params,class_weight={0: weight_0, 1: weight_1, 2: weight_2, 3: weight_3})
# TabNet_Model = TabNetWrapper(**TabNet_Params,class_weight={0: 1.0, 1: 2.0, 2: 1.5, 3: 0.5}) # New
# CatBoost_Model = CatBoostRegressor(**CatBoost_Params,class_weight={0: 1.0, 1: 2.0, 2: 1.5, 3: 0.5})


---
# **》》》Model1.Train**
---

In [31]:
voting_model = VotingRegressor(estimators=[
    ('lightgbm', Light),
    ('xgboost', XGB_Model),
    ('catboost', CatBoost_Model),
    ('tabnet', TabNet_Model)
],weights=[4.0,4.0,5.0,4.0])

Submission1 = TrainML(voting_model, test)

Submission1

Training Folds: 100%|██████████| 5/5 [01:45<00:00, 21.16s/it]

Mean Train QWK --> 0.7300
Mean Validation QWK ---> 0.4769
----> || Optimized QWK SCORE :: [36m[1m 0.534[0m





Unnamed: 0,id,sii
0,00008ff9,1
1,000fd460,0
2,00105258,1
3,00115b9f,0
4,0016bb22,0
5,001f3379,1
6,0038ba98,1
7,0068a485,0
8,0069fbed,1
9,0083e397,0


```
],weights=[5.0,4.0,4.0,4.0])
Mean Train QWK --> 0.7424
Mean Validation QWK ---> 0.4735
----> || Optimized QWK SCORE ::  0.533

```

---
# **》》》Model2**
---

In [32]:
# train = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/train.csv')

In [33]:
# train.head()

In [34]:
train = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/train.csv')
test = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/test.csv')
sample = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/sample_submission.csv')

def process_file(filename, dirname):
    df = pd.read_parquet(os.path.join(dirname, filename, 'part-0.parquet'))
    df.drop('step', axis=1, inplace=True)
    return df.describe().values.reshape(-1), filename.split('=')[1]

def load_time_series(dirname) -> pd.DataFrame:
    ids = os.listdir(dirname)
    
    with ThreadPoolExecutor() as executor:
        results = list(tqdm(executor.map(lambda fname: process_file(fname, dirname), ids), total=len(ids)))
    
    stats, indexes = zip(*results)
    
    df = pd.DataFrame(stats, columns=[f"stat_{i}" for i in range(len(stats[0]))])
    df['id'] = indexes
    return df
        
train_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet")
test_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet")

time_series_cols = train_ts.columns.tolist()
time_series_cols.remove("id")

train = pd.merge(train, train_ts, how="left", on='id')
test = pd.merge(test, test_ts, how="left", on='id')

train = train.drop('id', axis=1)
test = test.drop('id', axis=1)   

featuresCols = ['Basic_Demos-Enroll_Season', 'Basic_Demos-Age', 'Basic_Demos-Sex',
                'CGAS-Season', 'CGAS-CGAS_Score', 'Physical-Season', 'Physical-BMI',
                'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
                'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
                'Fitness_Endurance-Season', 'Fitness_Endurance-Max_Stage',
                'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
                'FGC-Season', 'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
                'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU',
                'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
                'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone', 'BIA-Season',
                'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
                'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
                'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
                'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
                'BIA-BIA_TBW', 'PAQ_A-Season', 'PAQ_A-PAQ_A_Total', 'PAQ_C-Season',
                'PAQ_C-PAQ_C_Total', 'SDS-Season', 'SDS-SDS_Total_Raw',
                'SDS-SDS_Total_T', 'PreInt_EduHx-Season',
                'PreInt_EduHx-computerinternet_hoursday', 'sii']

featuresCols += time_series_cols

train = train[featuresCols]
train = train.dropna(subset='sii')

cat_c = ['Basic_Demos-Enroll_Season', 'CGAS-Season', 'Physical-Season', 
          'Fitness_Endurance-Season', 'FGC-Season', 'BIA-Season', 
          'PAQ_A-Season', 'PAQ_C-Season', 'SDS-Season', 'PreInt_EduHx-Season']

def update(df):
    global cat_c
    for c in cat_c: 
        df[c] = df[c].fillna('Missing')
        df[c] = df[c].astype('category')
    return df
        
train = update(train)
test = update(test)

def create_mapping(column, dataset):
    unique_values = dataset[column].unique()
    return {value: idx for idx, value in enumerate(unique_values)}

for col in cat_c:
    mapping = create_mapping(col, train)
    mappingTe = create_mapping(col, test)
    
    train[col] = train[col].replace(mapping).astype(int)
    test[col] = test[col].replace(mappingTe).astype(int)

def quadratic_weighted_kappa(y_true, y_pred):
    return cohen_kappa_score(y_true, y_pred, weights='quadratic')

def threshold_Rounder(oof_non_rounded, thresholds):
    return np.where(oof_non_rounded < thresholds[0], 0,
                    np.where(oof_non_rounded < thresholds[1], 1,
                             np.where(oof_non_rounded < thresholds[2], 2, 3)))

def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    rounded_p = threshold_Rounder(oof_non_rounded, thresholds)
    return -quadratic_weighted_kappa(y_true, rounded_p)

def TrainML(model_class, test_data):
    X = train.drop(['sii'], axis=1)
    y = train['sii']

    SKF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    
    train_S = []
    test_S = []
    
    oof_non_rounded = np.zeros(len(y), dtype=float) 
    oof_rounded = np.zeros(len(y), dtype=int) 
    test_preds = np.zeros((len(test_data), n_splits))

    for fold, (train_idx, test_idx) in enumerate(tqdm(SKF.split(X, y), desc="Training Folds", total=n_splits)):
        X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

        model = clone(model_class)
        model.fit(X_train, y_train)

        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

        oof_non_rounded[test_idx] = y_val_pred
        y_val_pred_rounded = y_val_pred.round(0).astype(int)
        oof_rounded[test_idx] = y_val_pred_rounded

        train_kappa = quadratic_weighted_kappa(y_train, y_train_pred.round(0).astype(int))
        val_kappa = quadratic_weighted_kappa(y_val, y_val_pred_rounded)

        train_S.append(train_kappa)
        test_S.append(val_kappa)
        
        test_preds[:, fold] = model.predict(test_data)
        
        print(f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")
        clear_output(wait=True)

    print(f"Mean Train QWK --> {np.mean(train_S):.4f}")
    print(f"Mean Validation QWK ---> {np.mean(test_S):.4f}")

    KappaOPtimizer = minimize(evaluate_predictions,
                              x0=[0.5, 1.5, 2.5], args=(y, oof_non_rounded), 
                              method='Nelder-Mead')
    assert KappaOPtimizer.success, "Optimization did not converge."
    
    oof_tuned = threshold_Rounder(oof_non_rounded, KappaOPtimizer.x)
    tKappa = quadratic_weighted_kappa(y, oof_tuned)

    print(f"----> || Optimized QWK SCORE :: {Fore.CYAN}{Style.BRIGHT} {tKappa:.3f}{Style.RESET_ALL}")

    tpm = test_preds.mean(axis=1)
    tpTuned = threshold_Rounder(tpm, KappaOPtimizer.x)
    
    submission = pd.DataFrame({
        'id': sample['id'],
        'sii': tpTuned
    })

    return submission

# Model parameters for LightGBM
Params = {
    'learning_rate': 0.046,
    'max_depth': 12,
    'num_leaves': 478,
    'min_data_in_leaf': 13,
    'feature_fraction': 0.893,
    'bagging_fraction': 0.784,
    'bagging_freq': 4,
    'lambda_l1': 10,  # Increased from 6.59
    'lambda_l2': 0.01  # Increased from 2.68e-06
}


# XGBoost parameters
XGB_Params = {
    'learning_rate': 0.05,
    'max_depth': 6,
    'n_estimators': 200,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 1,  # Increased from 0.1
    'reg_lambda': 5,  # Increased from 1
    'random_state': SEED
}


CatBoost_Params = {
    'learning_rate': 0.05,
    'depth': 6,
    'iterations': 200,
    'random_seed': SEED,
    'cat_features': cat_c,
    'verbose': 0,
    'l2_leaf_reg': 10  # Increase this value
}

# Create model instances
Light = LGBMRegressor(**Params, random_state=SEED, verbose=-1, n_estimators=300)
XGB_Model = XGBRegressor(**XGB_Params)
CatBoost_Model = CatBoostRegressor(**CatBoost_Params)

# Combine models using Voting Regressor
voting_model = VotingRegressor(estimators=[
    ('lightgbm', Light),
    ('xgboost', XGB_Model),
    ('catboost', CatBoost_Model)
])

# Train the ensemble model
Submission2 = TrainML(voting_model, test)

# Save submission
#Submission2.to_csv('submission.csv', index=False)
Submission2

Training Folds: 100%|██████████| 5/5 [00:50<00:00, 10.17s/it]

Mean Train QWK --> 0.7595
Mean Validation QWK ---> 0.3926





----> || Optimized QWK SCORE :: [36m[1m 0.457[0m


Unnamed: 0,id,sii
0,00008ff9,1
1,000fd460,0
2,00105258,0
3,00115b9f,0
4,0016bb22,0
5,001f3379,1
6,0038ba98,0
7,0068a485,0
8,0069fbed,1
9,0083e397,0


In [35]:
# train = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/train.csv')
# test = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/test.csv')
# sample = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/sample_submission.csv')

# def process_file(filename, dirname):
#     df = pd.read_parquet(os.path.join(dirname, filename, 'part-0.parquet'))
#     df.drop('step', axis=1, inplace=True)
#     return df.describe().values.reshape(-1), filename.split('=')[1]

# def load_time_series(dirname) -> pd.DataFrame:
#     ids = os.listdir(dirname)
    
#     with ThreadPoolExecutor() as executor:
#         results = list(tqdm(executor.map(lambda fname: process_file(fname, dirname), ids), total=len(ids)))
    
#     stats, indexes = zip(*results)
    
#     df = pd.DataFrame(stats, columns=[f"stat_{i}" for i in range(len(stats[0]))])
#     df['id'] = indexes
#     return df
        
# train_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet")
# test_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet")

# time_series_cols = train_ts.columns.tolist()
# time_series_cols.remove("id")

# train = pd.merge(train, train_ts, how="left", on='id')
# test = pd.merge(test, test_ts, how="left", on='id')

# train = train.drop('id', axis=1)
# test = test.drop('id', axis=1)   

# featuresCols = ['Basic_Demos-Enroll_Season', 'Basic_Demos-Age', 'Basic_Demos-Sex',
#                 'CGAS-Season', 'CGAS-CGAS_Score', 'Physical-Season', 'Physical-BMI',
#                 'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
#                 'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
#                 'Fitness_Endurance-Season', 'Fitness_Endurance-Max_Stage',
#                 'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
#                 'FGC-Season', 'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
#                 'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU',
#                 'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
#                 'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone', 'BIA-Season',
#                 'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
#                 'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
#                 'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
#                 'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
#                 'BIA-BIA_TBW', 'PAQ_A-Season', 'PAQ_A-PAQ_A_Total', 'PAQ_C-Season',
#                 'PAQ_C-PAQ_C_Total', 'SDS-Season', 'SDS-SDS_Total_Raw',
#                 'SDS-SDS_Total_T', 'PreInt_EduHx-Season',
#                 'PreInt_EduHx-computerinternet_hoursday', 'sii']

# featuresCols += time_series_cols

# train = train[featuresCols]
# train = train.dropna(subset='sii')

# cat_c = ['Basic_Demos-Enroll_Season', 'CGAS-Season', 'Physical-Season', 
#           'Fitness_Endurance-Season', 'FGC-Season', 'BIA-Season', 
#           'PAQ_A-Season', 'PAQ_C-Season', 'SDS-Season', 'PreInt_EduHx-Season']

# # train = train_copy_2
# # test = test_copy_2

# def update(df):
#     global cat_c
#     for c in cat_c: 
#         df[c] = df[c].fillna('Missing')
#         df[c] = df[c].astype('category')
#     return df
        
# train = update(train)
# test = update(test)




# # 追加分
# # train = data_clean(train)
# # train = infzero_to_nan_noncat(train)
# # test = data_clean(test)
# # test = infzero_to_nan_noncat(test)
# featuresCols = ['Basic_Demos-Age', 'Basic_Demos-Sex',
#                 'CGAS-CGAS_Score', 'Physical-BMI',
#                 'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
#                 'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
#                 'Fitness_Endurance-Max_Stage',
#                 'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
#                 'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
#                 'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU',
#                 'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
#                 'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone',
#                 'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
#                 'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
#                 'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
#                 'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
#                 'BIA-BIA_TBW', 'PAQ_A-PAQ_A_Total',
#                 'PAQ_C-PAQ_C_Total', 'SDS-SDS_Total_Raw',
#                 'SDS-SDS_Total_T',
#                 'PreInt_EduHx-computerinternet_hoursday', 'sii', 'BMI_Age','Internet_Hours_Age','BMI_Internet_Hours',
#                 'BFP_BMI', 'FFMI_BFP', 'FMI_BFP', 'LST_TBW', 'BFP_BMR', 'BFP_DEE', 'BMR_Weight', 'DEE_Weight',
#                 'SMM_Height', 'Muscle_to_Fat', 'Hydration_Status', 'ICW_TBW']


# def create_mapping(column, dataset):
#     unique_values = dataset[column].unique()
#     return {value: idx for idx, value in enumerate(unique_values)}

# for col in cat_c:
#     mapping = create_mapping(col, train)
#     mappingTe = create_mapping(col, test)
    
#     train[col] = train[col].replace(mapping).astype(int)
#     test[col] = test[col].replace(mappingTe).astype(int)

# def quadratic_weighted_kappa(y_true, y_pred):
#     return cohen_kappa_score(y_true, y_pred, weights='quadratic')

# def threshold_Rounder(oof_non_rounded, thresholds):
#     return np.where(oof_non_rounded < thresholds[0], 0,
#                     np.where(oof_non_rounded < thresholds[1], 1,
#                              np.where(oof_non_rounded < thresholds[2], 2, 3)))

# def evaluate_predictions(thresholds, y_true, oof_non_rounded):
#     rounded_p = threshold_Rounder(oof_non_rounded, thresholds)
#     return -quadratic_weighted_kappa(y_true, rounded_p)

# def TrainML(model_class, test_data):
#     X = train.drop(['sii'], axis=1)
#     y = train['sii']

#     SKF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    
#     train_S = []
#     test_S = []
    
#     oof_non_rounded = np.zeros(len(y), dtype=float) 
#     oof_rounded = np.zeros(len(y), dtype=int) 
#     test_preds = np.zeros((len(test_data), n_splits))

#     for fold, (train_idx, test_idx) in enumerate(tqdm(SKF.split(X, y), desc="Training Folds", total=n_splits)):
#         X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
#         y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

#         model = clone(model_class)

#        #  data_sum=1586+722+377+34
#        #  weight_0 = data_sum/1586
#        #  weight_1 = data_sum/722
#        #  weight_2 = data_sum/377
#        #  weight_3 = data_sum/34
#        #  # weight_0 = 1
#        #  # weight_1 = 2
#        #  # weight_2 = 10
#        #  # weight_3 = 50
#        #  # クラス重みに基づいてサンプル重みを計算
#        #  # class_weights = {0: weight_0, 1: weight_1, 2: weight_2, 3: weight_3}
#        #  weights = {0: weight_0, 1: weight_1, 2: weight_2, 3: weight_3}

#        # # if isinstance(model, CatBoostRegressor):
#        #     # model.fit(X_train, y_train, sample_weight=weights)

#        # # else:
#        #  #    model.fit(X_train, y_train)
#         model.fit(X_train, y_train)
            
#         y_train_pred = model.predict(X_train)
#         y_val_pred = model.predict(X_val)

#         oof_non_rounded[test_idx] = y_val_pred
#         y_val_pred_rounded = y_val_pred.round(0).astype(int)
#         oof_rounded[test_idx] = y_val_pred_rounded

#         train_kappa = quadratic_weighted_kappa(y_train, y_train_pred.round(0).astype(int))
#         val_kappa = quadratic_weighted_kappa(y_val, y_val_pred_rounded)

#         train_S.append(train_kappa)
#         test_S.append(val_kappa)
        
#         test_preds[:, fold] = model.predict(test_data)
        
#         print(f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")
#         clear_output(wait=True)

#     print(f"Mean Train QWK --> {np.mean(train_S):.4f}")
#     print(f"Mean Validation QWK ---> {np.mean(test_S):.4f}")

#     KappaOPtimizer = minimize(evaluate_predictions,
#                               x0=[0.5, 1.5, 2.5], args=(y, oof_non_rounded), 
#                               method='Nelder-Mead')
#     assert KappaOPtimizer.success, "Optimization did not converge."
    
#     oof_tuned = threshold_Rounder(oof_non_rounded, KappaOPtimizer.x)
#     tKappa = quadratic_weighted_kappa(y, oof_tuned)

#     print(f"----> || Optimized QWK SCORE :: {Fore.CYAN}{Style.BRIGHT} {tKappa:.3f}{Style.RESET_ALL}")

#     tpm = test_preds.mean(axis=1)
#     tpTuned = threshold_Rounder(tpm, KappaOPtimizer.x)
    
#     submission = pd.DataFrame({
#         'id': sample['id'],
#         'sii': tpTuned
#     })

#     return submission

# # Model parameters for LightGBM
# Params = {
#     'learning_rate': 0.046,
#     'max_depth': 12,
#     'num_leaves': 478,
#     'min_data_in_leaf': 13,
#     'feature_fraction': 0.893,
#     'bagging_fraction': 0.784,
#     'bagging_freq': 4,
#     'lambda_l1': 10,  # Increased from 6.59
#     'lambda_l2': 0.01  # Increased from 2.68e-06
# }


# # XGBoost parameters
# XGB_Params = {
#     'learning_rate': 0.05,
#     'max_depth': 6,
#     'n_estimators': 200,
#     'subsample': 0.8,
#     'colsample_bytree': 0.8,
#     'reg_alpha': 1,  # Increased from 0.1
#     'reg_lambda': 5,  # Increased from 1
#     'random_state': SEED
# }


# CatBoost_Params = {
#     'learning_rate': 0.05,
#     'depth': 6,
#     'iterations': 200,
#     'random_seed': SEED,
#     # 'cat_features': cat_c,
#     'verbose': 0,
#     'l2_leaf_reg': 10  # Increase this value
# }

# # Create model instances
# Light = LGBMRegressor(**Params, random_state=SEED, verbose=-1, n_estimators=300)
# XGB_Model = XGBRegressor(**XGB_Params)
# CatBoost_Model = CatBoostRegressor(**CatBoost_Params)

# #CatBoost_Model = CatBoostRegressor(**CatBoost_Params)
# # TabNet_Model = TabNetWrapper(**TabNet_Params) # New
# #Light = LGBMRegressor(**Params, random_state=SEED, verbose=-1, n_estimators=300,class_weight={0: weight_0, 1: weight_1, 2: weight_2, 3: weight_3})
# #XGB_Model = XGBRegressor(**XGB_Params,class_weight={0: weight_0, 1: weight_1, 2: weight_2, 3: weight_3})


# # Combine models using Voting Regressor
# voting_model = VotingRegressor(estimators=[
#     ('lightgbm', Light),
#     ('xgboost', XGB_Model),
#     ('catboost', CatBoost_Model)
# ])

# # Train the ensemble model
# Submission2 = TrainML(voting_model, test)

# # Save submission
# #Submission2.to_csv('submission.csv', index=False)
# Submission2

In [36]:
# train = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/train.csv')
# test = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/test.csv')
# sample = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/sample_submission.csv')

# featuresCols = ['Basic_Demos-Enroll_Season', 'Basic_Demos-Age', 'Basic_Demos-Sex',
#                 'CGAS-Season', 'CGAS-CGAS_Score', 'Physical-Season', 'Physical-BMI',
#                 'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
#                 'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
#                 'Fitness_Endurance-Season', 'Fitness_Endurance-Max_Stage',
#                 'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
#                 'FGC-Season', 'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
#                 'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU',
#                 'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
#                 'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone', 'BIA-Season',
#                 'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
#                 'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
#                 'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
#                 'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
#                 'BIA-BIA_TBW', 'PAQ_A-Season', 'PAQ_A-PAQ_A_Total', 'PAQ_C-Season',
#                 'PAQ_C-PAQ_C_Total', 'SDS-Season', 'SDS-SDS_Total_Raw',
#                 'SDS-SDS_Total_T', 'PreInt_EduHx-Season',
#                 'PreInt_EduHx-computerinternet_hoursday', 'sii']

# cat_c = ['Basic_Demos-Enroll_Season', 'CGAS-Season', 'Physical-Season', 
#           'Fitness_Endurance-Season', 'FGC-Season', 'BIA-Season', 
#           'PAQ_A-Season', 'PAQ_C-Season', 'SDS-Season', 'PreInt_EduHx-Season']

# train_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet")
# test_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet")

# time_series_cols = train_ts.columns.tolist()
# time_series_cols.remove("id")

# train = pd.merge(train, train_ts, how="left", on='id')
# test = pd.merge(test, test_ts, how="left", on='id')

# train = train.drop('id', axis=1)
# test = test.drop('id', axis=1)

# featuresCols += time_series_cols

# train = train[featuresCols]
# train = train.dropna(subset='sii')

# def update(df):
#     global cat_c
#     for c in cat_c: 
#         df[c] = df[c].fillna('Missing')
#         df[c] = df[c].astype('category')
#     return df

# train = update(train)
# test = update(test)



# # 追加分
# # train = data_clean(train)
# # train = infzero_to_nan_noncat(train)
# # test = data_clean(test)
# # test = infzero_to_nan_noncat(test)




# def create_mapping(column, dataset):
#     unique_values = dataset[column].unique()
#     return {value: idx for idx, value in enumerate(unique_values)}

# for col in cat_c:
#     mapping = create_mapping(col, train)
#     mappingTe = create_mapping(col, test)
    
#     train[col] = train[col].replace(mapping).astype(int)
#     test[col] = test[col].replace(mappingTe).astype(int)

# def quadratic_weighted_kappa(y_true, y_pred):
#     return cohen_kappa_score(y_true, y_pred, weights='quadratic')

# def threshold_Rounder(oof_non_rounded, thresholds):
#     return np.where(oof_non_rounded < thresholds[0], 0,
#                     np.where(oof_non_rounded < thresholds[1], 1,
#                              np.where(oof_non_rounded < thresholds[2], 2, 3)))

# def evaluate_predictions(thresholds, y_true, oof_non_rounded):
#     rounded_p = threshold_Rounder(oof_non_rounded, thresholds)
#     return -quadratic_weighted_kappa(y_true, rounded_p)

# def TrainML(model_class, test_data):
#     X = train.drop(['sii'], axis=1)
#     y = train['sii']

#     SKF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    
#     train_S = []
#     test_S = []
    
#     oof_non_rounded = np.zeros(len(y), dtype=float) 
#     oof_rounded = np.zeros(len(y), dtype=int) 
#     test_preds = np.zeros((len(test_data), n_splits))

#     for fold, (train_idx, test_idx) in enumerate(tqdm(SKF.split(X, y), desc="Training Folds", total=n_splits)):
#         X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
#         y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

#         model = clone(model_class)


#         # data_sum=1586+722+377+34
#         # weight_0 = data_sum/1586
#         # weight_1 = data_sum/722
#         # weight_2 = data_sum/377
#         # weight_3 = data_sum/34
        
#         # weights = {0: weight_0, 1: weight_1, 2: weight_2, 3: weight_3}
        
#         # # Pipeline内のモデルに sample_weight を渡して fit
#         # model.fit(X_train, y_train,
#         #           lgb__regressor__sample_weight=weights,
#         #           xgb__regressor__sample_weight=weights,
#         #           cat__regressor__sample_weight=weights,
#         #           rf__regressor__sample_weight=weights,
#         #           gb__regressor__sample_weight=weights)

        
#         model.fit(X_train, y_train)

#         y_train_pred = model.predict(X_train)
#         y_val_pred = model.predict(X_val)

#         oof_non_rounded[test_idx] = y_val_pred
#         y_val_pred_rounded = y_val_pred.round(0).astype(int)
#         oof_rounded[test_idx] = y_val_pred_rounded

#         train_kappa = quadratic_weighted_kappa(y_train, y_train_pred.round(0).astype(int))
#         val_kappa = quadratic_weighted_kappa(y_val, y_val_pred_rounded)

#         train_S.append(train_kappa)
#         test_S.append(val_kappa)
        
#         test_preds[:, fold] = model.predict(test_data)
        
#         print(f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")
#         clear_output(wait=True)

#     print(f"Mean Train QWK --> {np.mean(train_S):.4f}")
#     print(f"Mean Validation QWK ---> {np.mean(test_S):.4f}")

#     KappaOPtimizer = minimize(evaluate_predictions,
#                               x0=[0.5, 1.5, 2.5], args=(y, oof_non_rounded), 
#                               method='Nelder-Mead')
#     assert KappaOPtimizer.success, "Optimization did not converge."
    
#     oof_tuned = threshold_Rounder(oof_non_rounded, KappaOPtimizer.x)
#     tKappa = quadratic_weighted_kappa(y, oof_tuned)

#     print(f"----> || Optimized QWK SCORE :: {Fore.CYAN}{Style.BRIGHT} {tKappa:.3f}{Style.RESET_ALL}")

#     tpm = test_preds.mean(axis=1)
#     tp_rounded = threshold_Rounder(tpm, KappaOPtimizer.x)

#     return tp_rounded

# imputer = SimpleImputer(strategy='median')

# ensemble = VotingRegressor(estimators=[
#     ('lgb', Pipeline(steps=[('imputer', imputer), ('regressor', LGBMRegressor(random_state=SEED))])),
#     ('xgb', Pipeline(steps=[('imputer', imputer), ('regressor', XGBRegressor(random_state=SEED))])),
#     ('cat', Pipeline(steps=[('imputer', imputer), ('regressor', CatBoostRegressor(random_state=SEED, silent=True))])),
#     ('rf', Pipeline(steps=[('imputer', imputer), ('regressor', RandomForestRegressor(random_state=SEED))])),
#     ('gb', Pipeline(steps=[('imputer', imputer), ('regressor', GradientBoostingRegressor(random_state=SEED))]))
# ])

# Submission3 = TrainML(ensemble, test)
# Submission3 = pd.DataFrame({
#     'id': sample['id'],
#     'sii': Submission3
# })

# Submission3

Training Folds: 100%|██████████| 5/5 [01:56<00:00, 23.35s/it]

Mean Train QWK --> 0.9175
Mean Validation QWK ---> 0.3803





----> || Optimized QWK SCORE :: [36m[1m 0.450[0m


Unnamed: 0,id,sii
0,00008ff9,2
1,000fd460,0
2,00105258,0
3,00115b9f,0
4,0016bb22,1
5,001f3379,1
6,0038ba98,0
7,0068a485,0
8,0069fbed,2
9,0083e397,0


In [7]:
!pip install -q /kaggle/input/tabpfn-019-whl/tabpfn-0.1.9-py3-none-any.whl

In [8]:
!mkdir /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff
!cp /kaggle/input/tabpfn-019-whl/prior_diff_real_checkpoint_n_0_epoch_100.cpkt /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff/

In [17]:
# tabpfnでsubmission3を作成する

import numpy as np
import pandas as pd
from tabpfn import TabPFNClassifier
from sklearn.model_selection import train_test_split

train = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/train.csv')
test = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/test.csv')
sample = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/sample_submission.csv')

import numpy as np
from sklearn.base import BaseEstimator, RegressorMixin
from tabpfn import TabPFNClassifier

class TabPFNRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, device='cpu'):
        self.device = device
        self.model = TabPFNClassifier(device=self.device)

    def fit(self, X, y):
        self.model.fit(X, y)
        return self

    def predict(self, X):
        # 確率を取得
        print(X.shape)
        probabilities = self.model.predict_proba(X)
        # 各クラスのラベルを [0, 1, 2, 3] と仮定して加重平均を計算
        class_labels = np.arange(probabilities.shape[1])
        return np.dot(probabilities, class_labels)

def process_file(filename, dirname):
    df = pd.read_parquet(os.path.join(dirname, filename, 'part-0.parquet'))
    df.drop('step', axis=1, inplace=True)
    return df.describe().values.reshape(-1), filename.split('=')[1]

def load_time_series(dirname) -> pd.DataFrame:
    ids = os.listdir(dirname)
    
    with ThreadPoolExecutor() as executor:
        results = list(tqdm(executor.map(lambda fname: process_file(fname, dirname), ids), total=len(ids)))
    
    stats, indexes = zip(*results)
    
    df = pd.DataFrame(stats, columns=[f"stat_{i}" for i in range(len(stats[0]))])
    df['id'] = indexes
    return df

# featuresCols = ['Basic_Demos-Enroll_Season', 'Basic_Demos-Age', 'Basic_Demos-Sex',
#                 'CGAS-Season', 'CGAS-CGAS_Score', 'Physical-Season', 'Physical-BMI',
#                 'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
#                 'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
#                 'Fitness_Endurance-Season', 'Fitness_Endurance-Max_Stage',
#                 'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
#                 'FGC-Season', 'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
#                 'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU',
#                 'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
#                 'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone', 'BIA-Season',
#                 'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
#                 'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
#                 'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
#                 'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
#                 'BIA-BIA_TBW', 'PAQ_A-Season', 'PAQ_A-PAQ_A_Total', 'PAQ_C-Season',
#                 'PAQ_C-PAQ_C_Total', 'SDS-Season', 'SDS-SDS_Total_Raw',
#                 'SDS-SDS_Total_T', 'PreInt_EduHx-Season',
#                 'PreInt_EduHx-computerinternet_hoursday', 'sii']

featuresCols = ['Basic_Demos-Age', 'Basic_Demos-Sex',
                'CGAS-CGAS_Score', 'Physical-BMI',
                'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
                'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
                'Fitness_Endurance-Max_Stage',
                'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
                'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
                'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU',
                'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
                'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone',
                'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
                'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
                'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
                'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
                'BIA-BIA_TBW', 'PAQ_A-PAQ_A_Total',
                'PAQ_C-PAQ_C_Total', 'SDS-SDS_Total_Raw',
                'SDS-SDS_Total_T','PreInt_EduHx-computerinternet_hoursday','sii',]

featuresCols_test = ['Basic_Demos-Age', 'Basic_Demos-Sex',
                'CGAS-CGAS_Score', 'Physical-BMI',
                'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
                'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
                'Fitness_Endurance-Max_Stage',
                'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
                'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
                'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU',
                'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
                'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone',
                'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
                'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
                'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
                'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
                'BIA-BIA_TBW', 'PAQ_A-PAQ_A_Total',
                'PAQ_C-PAQ_C_Total', 'SDS-SDS_Total_Raw',
                'SDS-SDS_Total_T','PreInt_EduHx-computerinternet_hoursday']

# cat_c = ['Basic_Demos-Enroll_Season', 'CGAS-Season', 'Physical-Season', 
#           'Fitness_Endurance-Season', 'FGC-Season', 'BIA-Season', 
#           'PAQ_A-Season', 'PAQ_C-Season', 'SDS-Season', 'PreInt_EduHx-Season']

train_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet")
test_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet")

df_train = train_ts.drop('id', axis=1)
df_test = test_ts.drop('id', axis=1)

train_ts_encoded = perform_autoencoder(df_train, encoding_dim=30, epochs=100, batch_size=32)
test_ts_encoded = perform_autoencoder(df_test, encoding_dim=30, epochs=100, batch_size=32)

time_series_cols = train_ts_encoded.columns.tolist()
train_ts_encoded["id"]=train_ts["id"]
test_ts_encoded['id']=test_ts["id"]

train = pd.merge(train, train_ts_encoded, how="left", on='id')
test = pd.merge(test, test_ts_encoded, how="left", on='id')

# imputer = KNNImputer(n_neighbors=5)
# numeric_cols = train.select_dtypes(include=['float64', 'int64']).columns
# imputed_data = imputer.fit_transform(train[numeric_cols])
# train_imputed = pd.DataFrame(imputed_data, columns=numeric_cols)
# train_imputed['sii'] = train_imputed['sii'].round().astype(int)
# for col in train.columns:
#     if col not in numeric_cols:
#         train_imputed[col] = train[col]
        
# train = train_imputed

# train = feature_engineering(train)
# train = train.dropna(thresh=10, axis=0)
# test = feature_engineering(test)

train = train.drop('id', axis=1)
test  = test .drop('id', axis=1)   



# time_series_cols = train_ts.columns.tolist()
# time_series_cols.remove("id")

# train = pd.merge(train, train_ts, how="left", on='id')
# test = pd.merge(test, test_ts, how="left", on='id')

# train = train.drop('id', axis=1)
# test = test.drop('id', axis=1)

featuresCols += time_series_cols
featuresCols_test += time_series_cols

train = train[featuresCols]
test = test[featuresCols_test]
train = train.dropna(subset='sii')


100%|██████████| 996/996 [01:19<00:00, 12.52it/s]
100%|██████████| 2/2 [00:00<00:00, 13.07it/s]
  return lib.map_infer(values, mapper, convert=convert)
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  return lib.map_infer(values, mapper, convert=convert)
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Epoch [10/100], Loss: 1.6076]
Epoch [20/100], Loss: 1.5499]
Epoch [30/100], Loss: 1.5218]
Epoch [40/100], Loss: 1.5105]
Epoch [50/100], Loss: 1.5136]
Epoch [60/100], Loss: 1.5074]
Epoch [70/100], Loss: 1.5078]
Epoch [80/100], Loss: 1.4996]
Epoch [90/100], Loss: 1.4957]
Epoch [100/100], Loss: 1.4761]
Epoch [10/100], Loss: 1.0307]
Epoch [20/100], Loss: 0.7398]
Epoch [30/100], Loss: 0.4382]
Epoch [40/100], Loss: 0.4271]
Epoch [50/100], Loss: 0.4271]
Epoch [60/100], Loss: 0.4271]
Epoch [70/100], Loss: 0.4271]
Epoch [80/100], Loss: 0.4271]
Epoch [90/100], Loss: 0.4271]


  return lib.map_infer(values, mapper, convert=convert)
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  return lib.map_infer(values, mapper, convert=convert)
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Epoch [100/100], Loss: 0.4271]


In [21]:
test_sub = test.copy()

In [22]:
test.head()

Unnamed: 0,Basic_Demos-Age,Basic_Demos-Sex,CGAS-CGAS_Score,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Max_Stage,Fitness_Endurance-Time_Mins,Fitness_Endurance-Time_Sec,FGC-FGC_CU,FGC-FGC_CU_Zone,FGC-FGC_GSND,FGC-FGC_GSND_Zone,FGC-FGC_GSD,FGC-FGC_GSD_Zone,FGC-FGC_PU,FGC-FGC_PU_Zone,FGC-FGC_SRL,FGC-FGC_SRL_Zone,FGC-FGC_SRR,FGC-FGC_SRR_Zone,FGC-FGC_TL,FGC-FGC_TL_Zone,BIA-BIA_Activity_Level_num,BIA-BIA_BMC,BIA-BIA_BMI,BIA-BIA_BMR,BIA-BIA_DEE,BIA-BIA_ECW,BIA-BIA_FFM,BIA-BIA_FFMI,BIA-BIA_FMI,BIA-BIA_Fat,BIA-BIA_Frame_num,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,PAQ_A-PAQ_A_Total,PAQ_C-PAQ_C_Total,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-computerinternet_hoursday,Enc_1,Enc_2,Enc_3,Enc_4,Enc_5,Enc_6,Enc_7,Enc_8,Enc_9,Enc_10,Enc_11,Enc_12,Enc_13,Enc_14,Enc_15,Enc_16,Enc_17,Enc_18,Enc_19,Enc_20,Enc_21,Enc_22,Enc_23,Enc_24,Enc_25,Enc_26,Enc_27,Enc_28,Enc_29,Enc_30
0,5,0,51.0,16.877316,46.0,50.8,,,,,,,,0.0,0.0,,,,,0.0,0.0,7.0,0.0,6.0,0.0,6.0,1.0,2.0,2.66855,16.8792,932.498,1492.0,8.25598,41.5862,13.8177,3.06143,9.21377,1.0,24.4349,8.89536,38.9177,19.5413,32.6909,,,,,3.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,9,0,,14.03559,48.0,46.0,22.0,75.0,70.0,122.0,,,,3.0,0.0,,,,,5.0,0.0,11.0,1.0,11.0,1.0,3.0,0.0,2.0,2.57949,14.0371,936.656,1498.65,6.01993,42.0291,12.8254,1.21172,3.97085,1.0,21.0352,14.974,39.4497,15.4107,27.0552,,2.34,46.0,64.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,10,1,71.0,16.648696,56.5,75.6,,65.0,94.0,117.0,5.0,7.0,33.0,20.0,1.0,10.2,1.0,14.7,2.0,7.0,1.0,10.0,1.0,10.0,1.0,5.0,0.0,,,,,,,,,,,,,,,,,,2.17,38.0,54.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,9,0,71.0,18.292347,56.0,81.6,,60.0,97.0,117.0,6.0,9.0,37.0,18.0,1.0,,,,,5.0,0.0,7.0,0.0,7.0,0.0,7.0,1.0,3.0,3.84191,18.2943,1131.43,1923.44,15.5925,62.7757,14.074,4.22033,18.8243,2.0,30.4041,16.779,58.9338,26.4798,45.9966,,2.451,31.0,45.0,0.0,0.0,1.377484,0.0,0.0,0.0,2.573518,0.0,0.0,0.0,0.848512,1.132202,7.846786,3.140399,1.03644,0.0,2.631907,0.067091,0.0,0.0,0.0,0.0,0.0,0.0,7.839452,0.0,0.0,0.0,0.0,5.113706,6.303276
4,18,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.04,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [24]:

# データの成型が上記で終わり
# データセット2つ作成する
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)


import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit

# データセット全体
X = train.drop(columns=['sii'])
y = train['sii']

# StratifiedShuffleSplitの初期化
sss = StratifiedShuffleSplit(n_splits=1, train_size=1024, random_state=42)

# データセット1を作成
for train_idx, _ in sss.split(X, y):
    X_1 = X.iloc[train_idx]
    y_1 = y.iloc[train_idx]

# データセット2を作成（データセット1を除いた残りからサンプリング）
# remaining_X = X.drop(index=train_idx)
# remaining_y = y.drop(index=train_idx)
remaining_X = X.drop(train_idx)
remaining_y = y.drop(train_idx)

sss = StratifiedShuffleSplit(n_splits=1, train_size=1024, random_state=43)
for train_idx, _ in sss.split(remaining_X, remaining_y):
    X_2 = remaining_X.iloc[train_idx]
    y_2 = remaining_y.iloc[train_idx]

# データセットの確認
print(f"Dataset 1 size: {len(X_1)}")
print(f"Dataset 2 size: {len(X_2)}")

# クラスごとの分布確認import numpy as np
import pandas as pd
from tabpfn import TabPFNClassifier
from sklearn.model_selection import train_test_split

print("Dataset 1 class distribution:\n", y_1.value_counts(normalize=True))
print("Dataset 2 class distribution:\n", y_2.value_counts(normalize=True))


Dataset 1 size: 1024
Dataset 2 size: 1024
Dataset 1 class distribution:
 sii
0.0    0.583008
1.0    0.266602
2.0    0.137695
3.0    0.012695
Name: proportion, dtype: float64
Dataset 2 class distribution:
 sii
0.0    0.582031
1.0    0.266602
2.0    0.138672
3.0    0.012695
Name: proportion, dtype: float64


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [13]:

class TabPFNRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, device='cpu'):
        self.device = device
        self.model = TabPFNClassifier(device=self.device)

    def fit(self, X, y):
        self.model.fit(X, y)
        return self

    def predict(self, X):
        # 確率を取得
        print(X.shape)
        probabilities = self.model.predict_proba(X)
        # 各クラスのラベルを [0, 1, 2, 3] と仮定して加重平均を計算
        class_labels = np.arange(probabilities.shape[1])
        return np.dot(probabilities, class_labels)
        
# train = X_1
# test = y_1

# train.shape

(1024, 78)

In [25]:
# 関数の類と実行

warnings.simplefilter('ignore')

# train = X_1
# test = y_1
test = test
        


def create_mapping(column, dataset):
    unique_values = dataset[column].unique()
    return {value: idx for idx, value in enumerate(unique_values)}


def quadratic_weighted_kappa(y_true, y_pred):
    return cohen_kappa_score(y_true, y_pred, weights='quadratic')

def threshold_Rounder(oof_non_rounded, thresholds):
    return np.where(oof_non_rounded < thresholds[0], 0,
                    np.where(oof_non_rounded < thresholds[1], 1,
                             np.where(oof_non_rounded < thresholds[2], 2, 3)))

def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    rounded_p = threshold_Rounder(oof_non_rounded, thresholds)
    return -quadratic_weighted_kappa(y_true, rounded_p)

def TrainML(model_class, test_data):
    # X = train.drop(['sii'], axis=1)
    # y = train['sii']
    X = X_1
    y = y_1

    SKF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    
    train_S = []
    test_S = []
    
    oof_non_rounded = np.zeros(len(y), dtype=float) 
    oof_rounded = np.zeros(len(y), dtype=int) 
    test_preds = np.zeros((len(test_data), n_splits))

    for fold, (train_idx, test_idx) in enumerate(tqdm(SKF.split(X, y), desc="Training Folds", total=n_splits)):
        X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

        model = clone(model_class)
        model.fit(X_train, y_train)

        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

        oof_non_rounded[test_idx] = y_val_pred
        y_val_pred_rounded = y_val_pred.round(0).astype(int)
        oof_rounded[test_idx] = y_val_pred_rounded

        train_kappa = quadratic_weighted_kappa(y_train, y_train_pred.round(0).astype(int))
        val_kappa = quadratic_weighted_kappa(y_val, y_val_pred_rounded)

        train_S.append(train_kappa)
        test_S.append(val_kappa)
        
        test_preds[:, fold] = model.predict(test_data)
        
        print(f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")
        clear_output(wait=True)

    print(f"Mean Train QWK --> {np.mean(train_S):.4f}")
    print(f"Mean Validation QWK ---> {np.mean(test_S):.4f}")

    KappaOPtimizer = minimize(evaluate_predictions,
                              x0=[0.5, 1.5, 2.5], args=(y, oof_non_rounded), 
                              method='Nelder-Mead')
    assert KappaOPtimizer.success, "Optimization did not converge."
    
    oof_tuned = threshold_Rounder(oof_non_rounded, KappaOPtimizer.x)
    tKappa = quadratic_weighted_kappa(y, oof_tuned)

    print(f"----> || Optimized QWK SCORE :: {Fore.CYAN}{Style.BRIGHT} {tKappa:.3f}{Style.RESET_ALL}")

    tpm = test_preds.mean(axis=1)
    tpTuned = threshold_Rounder(tpm, KappaOPtimizer.x)
    
    submission = pd.DataFrame({
        'id': sample['id'],
        'sii': tpTuned
    })
    print(submission)

    return submission


In [16]:
test

632     2.0
2096    0.0
1439    0.0
2447    1.0
1513    1.0
       ... 
1790    0.0
1807    1.0
697     0.0
498     0.0
750     2.0
Name: sii, Length: 1024, dtype: float64

In [26]:
# Create model instances
# Combine models using Voting Regressor
TabNet_Model = TabPFNRegressor(device='cpu')
voting_model = VotingRegressor(estimators=[
('tabpfn', TabNet_Model)
])

# Train the ensemble model
Submission3 = TrainML(voting_model, test)

# Save submission
#Submission2.to_csv('submission.csv', index=False)
Submission3

Training Folds: 100%|██████████| 5/5 [00:30<00:00,  6.16s/it]

Mean Train QWK --> 0.6667
Mean Validation QWK ---> 0.3696
----> || Optimized QWK SCORE :: [36m[1m 0.387[0m
          id  sii
0   00008ff9    1
1   000fd460    0
2   00105258    0
3   00115b9f    0
4   0016bb22    0
5   001f3379    0
6   0038ba98    0
7   0068a485    0
8   0069fbed    1
9   0083e397    0
10  0087dd65    0
11  00abe655    0
12  00ae59c9    1
13  00af6387    1
14  00bd4359    1
15  00c0cd71    1
16  00d56d4b    0
17  00d9913d    0
18  00e6167c    0
19  00ebc35d    1



  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not i

Unnamed: 0,id,sii
0,00008ff9,1
1,000fd460,0
2,00105258,0
3,00115b9f,0
4,0016bb22,0
5,001f3379,0
6,0038ba98,0
7,0068a485,0
8,0069fbed,1
9,0083e397,0


In [None]:
import numpy as np
import pandas as pd
from tabpfn import TabPFNClassifier
from sklearn.model_selection import train_test_split

train = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/train.csv')
test = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/test.csv')
sample = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/sample_submission.csv')

# featuresCols = ['Basic_Demos-Enroll_Season', 'Basic_Demos-Age', 'Basic_Demos-Sex',
#                 'CGAS-Season', 'CGAS-CGAS_Score', 'Physical-Season', 'Physical-BMI',
#                 'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
#                 'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
#                 'Fitness_Endurance-Season', 'Fitness_Endurance-Max_Stage',
#                 'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
#                 'FGC-Season', 'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
#                 'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU',
#                 'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
#                 'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone', 'BIA-Season',
#                 'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
#                 'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
#                 'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
#                 'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
#                 'BIA-BIA_TBW', 'PAQ_A-Season', 'PAQ_A-PAQ_A_Total', 'PAQ_C-Season',
#                 'PAQ_C-PAQ_C_Total', 'SDS-Season', 'SDS-SDS_Total_Raw',
#                 'SDS-SDS_Total_T', 'PreInt_EduHx-Season',
#                 'PreInt_EduHx-computerinternet_hoursday', 'sii']

featuresCols = ['Basic_Demos-Age', 'Basic_Demos-Sex',
                'CGAS-CGAS_Score', 'Physical-BMI',
                'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
                'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
                'Fitness_Endurance-Max_Stage',
                'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
                'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
                'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU',
                'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
                'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone',
                'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
                'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
                'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
                'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
                'BIA-BIA_TBW', 'PAQ_A-PAQ_A_Total',
                'PAQ_C-PAQ_C_Total', 'SDS-SDS_Total_Raw',
                'SDS-SDS_Total_T','PreInt_EduHx-computerinternet_hoursday','sii',]

featuresCols_test = ['Basic_Demos-Age', 'Basic_Demos-Sex',
                'CGAS-CGAS_Score', 'Physical-BMI',
                'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
                'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
                'Fitness_Endurance-Max_Stage',
                'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
                'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
                'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU',
                'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
                'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone',
                'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
                'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
                'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
                'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
                'BIA-BIA_TBW', 'PAQ_A-PAQ_A_Total',
                'PAQ_C-PAQ_C_Total', 'SDS-SDS_Total_Raw',
                'SDS-SDS_Total_T','PreInt_EduHx-computerinternet_hoursday']

# cat_c = ['Basic_Demos-Enroll_Season', 'CGAS-Season', 'Physical-Season', 
#           'Fitness_Endurance-Season', 'FGC-Season', 'BIA-Season', 
#           'PAQ_A-Season', 'PAQ_C-Season', 'SDS-Season', 'PreInt_EduHx-Season']

train_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet")
test_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet")

df_train = train_ts.drop('id', axis=1)
df_test = test_ts.drop('id', axis=1)

train_ts_encoded = perform_autoencoder(df_train, encoding_dim=30, epochs=100, batch_size=32)
test_ts_encoded = perform_autoencoder(df_test, encoding_dim=30, epochs=100, batch_size=32)

time_series_cols = train_ts_encoded.columns.tolist()
train_ts_encoded["id"]=train_ts["id"]
test_ts_encoded['id']=test_ts["id"]

train = pd.merge(train, train_ts_encoded, how="left", on='id')
test = pd.merge(test, test_ts_encoded, how="left", on='id')

# imputer = KNNImputer(n_neighbors=5)
# numeric_cols = train.select_dtypes(include=['float64', 'int64']).columns
# imputed_data = imputer.fit_transform(train[numeric_cols])
# train_imputed = pd.DataFrame(imputed_data, columns=numeric_cols)
# train_imputed['sii'] = train_imputed['sii'].round().astype(int)
# for col in train.columns:
#     if col not in numeric_cols:
#         train_imputed[col] = train[col]
        
# train = train_imputed

# train = feature_engineering(train)
# train = train.dropna(thresh=10, axis=0)
# test = feature_engineering(test)

train = train.drop('id', axis=1)
test  = test .drop('id', axis=1)   







# time_series_cols = train_ts.columns.tolist()
# time_series_cols.remove("id")

# train = pd.merge(train, train_ts, how="left", on='id')
# test = pd.merge(test, test_ts, how="left", on='id')

# train = train.drop('id', axis=1)
# test = test.drop('id', axis=1)

featuresCols += time_series_cols
featuresCols_test += time_series_cols

train = train[featuresCols]
test = test[featuresCols_test]
train = train.dropna(subset='sii')

# def update(df):
#     global cat_c
#     for c in cat_c: 
#         df[c] = df[c].fillna('Missing')
#         df[c] = df[c].astype('category')
#     return df

# train = update(train)
# test = update(test)

# featuresCols = ['Basic_Demos-Age', 'Basic_Demos-Sex',
#                 'CGAS-CGAS_Score', 'Physical-BMI',
#                 'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
#                 'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
#                 'Fitness_Endurance-Max_Stage',
#                 'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
#                 'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
#                 'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU',
#                 'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
#                 'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone',
#                 'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
#                 'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
#                 'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
#                 'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
#                 'BIA-BIA_TBW', 'PAQ_A-PAQ_A_Total',
#                 'PAQ_C-PAQ_C_Total', 'SDS-SDS_Total_Raw',
#                 'SDS-SDS_Total_T']

# # 特徴量とターゲットに分ける
# X = train.drop(columns=['sii'])
# y = train['sii']

# # データ分割
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # TabPFNClassifierの初期化と学習
# clf = TabPFNClassifier(device='cpu')  # 必要に応じてGPU ('cuda') を使用
# clf.fit(X_train, y_train)

# # 予測
# predictions = clf.predict(X_test)
# print("Predictions:", predictions)

In [None]:
train_copy = train.copy()
train_copy = train_copy.reset_index(drop=True)

In [None]:
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit

# データセット全体
X = train_copy.drop(columns=['sii'])
y = train_copy['sii']

# StratifiedShuffleSplitの初期化
sss = StratifiedShuffleSplit(n_splits=1, train_size=1024, random_state=42)

# データセット1を作成
for train_idx, _ in sss.split(X, y):
    X_1 = X.iloc[train_idx]
    y_1 = y.iloc[train_idx]

# データセット2を作成（データセット1を除いた残りからサンプリング）
# remaining_X = X.drop(index=train_idx)
# remaining_y = y.drop(index=train_idx)
remaining_X = X.drop(train_idx)
remaining_y = y.drop(train_idx)

sss = StratifiedShuffleSplit(n_splits=1, train_size=1024, random_state=43)
for train_idx, _ in sss.split(remaining_X, remaining_y):
    X_2 = remaining_X.iloc[train_idx]
    y_2 = remaining_y.iloc[train_idx]

# データセットの確認
print(f"Dataset 1 size: {len(X_1)}")
print(f"Dataset 2 size: {len(X_2)}")

# クラスごとの分布確認
print("Dataset 1 class distribution:\n", y_1.value_counts(normalize=True))
print("Dataset 2 class distribution:\n", y_2.value_counts(normalize=True))


In [None]:
# 特徴量とターゲットに分ける
# X = train.drop(columns=['sii'])
# y = train['sii']

# X=X.iloc[0:1000,:]
# y=y.iloc[0:1000]

# データ分割
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TabPFNClassifierの初期化と学習
clf = TabPFNClassifier(device='cpu')  # 必要に応じてGPU ('cuda') を使用
clf.fit(X_1, y_1)

# # 予測
# predictions = clf.predict(test)
# proba = clf.predict_proba(test)

# print("Predictions:", predictions)
# 各クラスの推論スコアを取得
proba = clf.predict_proba(test)
print("Prediction Probabilities:\n", proba)

# 各データに対する最も高いスコアを持つクラスを取得
predictions = clf.predict(test)
print("Predicted Classes:\n", predictions)

# 二回目
clf = TabPFNClassifier(device='cpu')  # 必要に応じてGPU ('cuda') を使用
clf.fit(X_2, y_2)

# # 予測
# predictions = clf.predict(test)
# proba = clf.predict_proba(test)

# print("Predictions:", predictions)
# 各クラスの推論スコアを取得
proba2 = clf.predict_proba(test)
print("Prediction Probabilities:\n", proba2)

# 各データに対する最も高いスコアを持つクラスを取得
predictions2 = clf.predict(test)
print("Predicted Classes:\n", predictions2)

# アンサンブル (確率の平均)
ensemble_proba = (proba + proba2) / 2

# 最終予測クラス
final_predictions = np.argmax(ensemble_proba, axis=1)

Submission4 = pd.DataFrame({
    'id': sample['id'],
    'sii': final_predictions
})

Submission4


In [None]:
Submission4


In [None]:
sub1 = Submission1
sub2 = Submission2
sub3 = Submission3
sub4 = Submission4
# sub1 = Submission1
# sub2 = Submission1
# sub3 = Submission1

sub1 = sub1.sort_values(by='id').reset_index(drop=True)
sub2 = sub2.sort_values(by='id').reset_index(drop=True)
sub3 = sub3.sort_values(by='id').reset_index(drop=True)
sub4 = sub4.sort_values(by='id').reset_index(drop=True)

combined = pd.DataFrame({
    'id': sub1['id'],
    'sii_1': sub1['sii'],
    'sii_2': sub2['sii'],
    'sii_3': sub3['sii'],
    'sii_4': sub4['sii']
})

def majority_vote(row):
    return row.mode()[0]

combined['final_sii'] = combined[['sii_1', 'sii_2', 'sii_3', 'sii_4']].apply(majority_vote, axis=1)

final_submission = combined[['id', 'final_sii']].rename(columns={'final_sii': 'sii'})

final_submission.to_csv('submission.csv', index=False)

print("Majority voting completed and saved to 'Final_Submission.csv'")

In [None]:
# final_submission