In [3]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
# Import
import sys
sys.path.append("/kaggle/src")

from pathlib import Path

import polars as pl
from sklearn.preprocessing import LabelEncoder

from common.constant import TRAIN_CSV_PATH, TEST_CSV_PATH, OUTPUT_DIR
from run.lgb001 import run_kfold

# Data
trn_df = pl.read_csv(TRAIN_CSV_PATH)
tst_df = pl.read_csv(TEST_CSV_PATH)

base_cols = [col for col in trn_df.columns if col in tst_df.columns]
df = pl.concat([trn_df[base_cols], tst_df])

# Feature engineering
features = pl.concat(
    [
        df,
    ],
    how="horizontal",
)

trn_id = trn_df.drop_nulls(subset=["sii"])["id"].to_list()
tst_id = tst_df["id"].to_list()
features = pl.concat([
    features.slice(0, len(trn_df)).filter(pl.col("id").is_in(trn_id)),
    features.slice(len(trn_df)).filter(pl.col("id").is_in(tst_id))
])

# Preprocessing
cat_cols = features.select(pl.col(pl.Utf8)).columns
le = LabelEncoder()
for col in cat_cols:
    encoded = le.fit_transform(features[col].to_numpy())
    features = features.with_columns(pl.Series(encoded).alias(col))

# Model parameters
model_params = {
    "objective": "regression",
    "metric": "rmse",
    "boosting": "gbdt",
    "num_leaves": 64,
    "bagging_fraction": 0.8,
    "bagging_freq": 1,
    "feature_fraction": 0.8,
    "learning_rate": 0.1,
    "seed": 1,
    "num_threads": 4,
}

In [9]:
features = features.drop("id").to_pandas()
trn_targets = trn_df.filter(pl.col("id").is_in(trn_id))["sii"].to_pandas()

In [10]:
features

Unnamed: 0,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,...,BIA-BIA_TBW,PAQ_A-Season,PAQ_A-PAQ_A_Total,PAQ_C-Season,PAQ_C-PAQ_C_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday
0,0,5,0,3,51.0,0,16.877316,46.0,50.8,,...,32.6909,4,,4,,4,,,0,3.0
1,2,9,0,4,,0,14.035590,48.0,46.0,22.0,...,27.0552,4,,0,2.340,0,46.0,64.0,2,0.0
2,2,10,1,0,71.0,0,16.648696,56.5,75.6,,...,,4,,2,2.170,0,38.0,54.0,2,2.0
3,3,9,0,0,71.0,2,18.292347,56.0,81.6,,...,45.9966,4,,3,2.451,2,31.0,45.0,3,0.0
4,1,13,1,3,50.0,2,22.279952,59.5,112.2,,...,63.1265,4,,1,4.110,2,40.0,56.0,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2751,3,7,0,2,51.0,1,29.315775,54.0,121.6,,...,,4,,4,,1,35.0,50.0,3,2.0
2752,1,5,1,2,80.0,1,17.284504,44.0,47.6,,...,,4,,4,,1,37.0,53.0,1,0.0
2753,0,10,1,4,,0,19.893157,55.0,85.6,30.0,...,,4,,4,,4,,,0,1.0
2754,3,6,0,1,60.0,3,30.094649,37.5,60.2,24.0,...,38.7638,4,,4,,3,39.0,55.0,3,3.0


In [24]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from common.constant import SPLIT_RANDOM_SEED

In [18]:
trn_features = features.iloc[:len(trn_targets)]
tst_features = features.iloc[len(trn_targets):]

In [26]:
fold = np.zeros(len(trn_features))
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SPLIT_RANDOM_SEED)
for i, (trn_idx, val_idx) in enumerate(skf.split(trn_features, trn_targets)):
    fold[val_idx] = i + 1
fold

array([2., 3., 2., ..., 5., 2., 2.])

In [30]:
trn_df.filter(pl.col("id").is_in(trn_id)).with_columns(pl.Series(fold, dtype=pl.Int64).alias("fold"))

id,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Season,Fitness_Endurance-Max_Stage,Fitness_Endurance-Time_Mins,Fitness_Endurance-Time_Sec,FGC-Season,FGC-FGC_CU,FGC-FGC_CU_Zone,FGC-FGC_GSND,FGC-FGC_GSND_Zone,FGC-FGC_GSD,FGC-FGC_GSD_Zone,FGC-FGC_PU,FGC-FGC_PU_Zone,FGC-FGC_SRL,FGC-FGC_SRL_Zone,FGC-FGC_SRR,FGC-FGC_SRR_Zone,FGC-FGC_TL,FGC-FGC_TL_Zone,BIA-Season,BIA-BIA_Activity_Level_num,BIA-BIA_BMC,BIA-BIA_BMI,…,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,PAQ_A-Season,PAQ_A-PAQ_A_Total,PAQ_C-Season,PAQ_C-PAQ_C_Total,PCIAT-Season,PCIAT-PCIAT_01,PCIAT-PCIAT_02,PCIAT-PCIAT_03,PCIAT-PCIAT_04,PCIAT-PCIAT_05,PCIAT-PCIAT_06,PCIAT-PCIAT_07,PCIAT-PCIAT_08,PCIAT-PCIAT_09,PCIAT-PCIAT_10,PCIAT-PCIAT_11,PCIAT-PCIAT_12,PCIAT-PCIAT_13,PCIAT-PCIAT_14,PCIAT-PCIAT_15,PCIAT-PCIAT_16,PCIAT-PCIAT_17,PCIAT-PCIAT_18,PCIAT-PCIAT_19,PCIAT-PCIAT_20,PCIAT-PCIAT_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,sii,fold
str,str,i64,i64,str,i64,str,f64,f64,f64,f64,i64,i64,i64,str,i64,i64,i64,str,i64,i64,f64,i64,f64,i64,i64,i64,f64,i64,f64,i64,f64,i64,str,i64,f64,f64,…,f64,f64,f64,f64,str,f64,str,f64,str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,str,i64,i64,str,i64,i64,i64
"""00008ff9""","""Fall""",5,0,"""Winter""",51,"""Fall""",16.877316,46.0,50.8,,,,,,,,,"""Fall""",0,0,,,,,0,0,7.0,0,6.0,0,6.0,1,"""Fall""",2,2.66855,16.8792,…,8.89536,38.9177,19.5413,32.6909,,,,,"""Fall""",5,4,4,0,4,0,0,4,0,0,4,0,4,4,4,4,4,4,2,4,55,,,,"""Fall""",3,2,2
"""000fd460""","""Summer""",9,0,,,"""Fall""",14.03559,48.0,46.0,22.0,75,70,122,,,,,"""Fall""",3,0,,,,,5,0,11.0,1,11.0,1,3.0,0,"""Winter""",2,2.57949,14.0371,…,14.974,39.4497,15.4107,27.0552,,,"""Fall""",2.34,"""Fall""",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"""Fall""",46,64,"""Summer""",0,0,3
"""00105258""","""Summer""",10,1,"""Fall""",71,"""Fall""",16.648696,56.5,75.6,,65,94,117,"""Fall""",5,7,33,"""Fall""",20,1,10.2,1,14.7,2,7,1,10.0,1,10.0,1,5.0,0,,,,,…,,,,,,,"""Summer""",2.17,"""Fall""",5,2,2,1,2,1,1,2,1,1,1,0,1,1,1,0,2,2,1,1,28,"""Fall""",38,54,"""Summer""",2,0,2
"""00115b9f""","""Winter""",9,0,"""Fall""",71,"""Summer""",18.292347,56.0,81.6,,60,97,117,"""Summer""",6,9,37,"""Summer""",18,1,,,,,5,0,7.0,0,7.0,0,7.0,1,"""Summer""",3,3.84191,18.2943,…,16.779,58.9338,26.4798,45.9966,,,"""Winter""",2.451,"""Summer""",4,2,4,0,5,1,0,3,2,2,3,0,3,0,0,3,4,3,4,1,44,"""Summer""",31,45,"""Winter""",0,1,1
"""001f3379""","""Spring""",13,1,"""Winter""",50,"""Summer""",22.279952,59.5,112.2,,60,73,102,,,,,"""Summer""",12,0,16.5,2,17.9,2,6,0,10.0,1,11.0,1,8.0,0,"""Summer""",2,4.33036,30.1865,…,20.902,79.6982,35.3804,63.1265,,,"""Spring""",4.11,"""Summer""",3,3,3,0,2,1,0,2,2,1,0,1,3,3,2,1,3,1,2,1,34,"""Summer""",40,56,"""Spring""",0,1,1
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""ff6c2bb8""","""Fall""",8,0,,,"""Fall""",17.13981,52.5,67.2,25.0,60,65,112,,,,,"""Fall""",0,0,,,,,0,0,8.0,1,10.0,1,12.0,1,"""Fall""",3,3.20303,17.1417,…,15.815,49.3301,20.2645,36.7181,,,"""Fall""",3.44,"""Fall""",3,3,3,0,0,0,0,3,0,0,0,0,2,0,0,3,0,2,2,1,22,"""Fall""",41,58,"""Fall""",2,0,1
"""ff759544""","""Summer""",7,1,,,"""Summer""",13.927006,48.5,46.6,23.0,65,75,105,,,,,"""Summer""",0,0,,,,,0,0,9.0,0,8.5,0,4.5,0,"""Fall""",1,2.3668,13.6457,…,15.14,42.8185,18.0937,30.0453,,,,,"""Summer""",1,3,3,0,3,0,0,0,0,0,3,0,5,1,0,5,3,3,3,0,33,"""Summer""",48,67,"""Summer""",0,1,1
"""ff8a2de4""","""Fall""",13,0,"""Spring""",60,"""Fall""",16.36246,59.5,82.4,,71,70,104,,,,,"""Fall""",16,0,18.0,1,19.9,2,10,1,8.0,1,9.0,1,12.0,1,"""Fall""",3,4.52277,16.3642,…,17.9797,66.2889,29.779,52.832,,,"""Winter""",3.26,"""Winter""",3,3,3,2,3,2,2,2,2,1,2,0,2,0,1,0,2,1,1,0,32,"""Winter""",35,50,"""Fall""",1,1,5
"""ffcd4dbd""","""Fall""",11,0,"""Spring""",68,"""Winter""",21.4415,60.0,109.8,,79,99,116,,,,,"""Winter""",15,1,18.5,2,15.8,2,0,0,10.0,1,10.0,1,14.0,1,"""Winter""",2,4.41305,21.4438,…,21.3403,71.3903,28.7792,54.463,,,"""Winter""",2.729,"""Winter""",5,5,3,0,5,1,0,2,0,2,1,0,1,3,0,0,1,1,0,1,31,"""Winter""",56,77,"""Fall""",0,1,2
