<div
      style="
        text-align: center;
        font-weight: bold;
        font-size: 32px;
        font-family: Arial, Helvetica, sans-serif;
        color: white;
        background-color: rgb(84, 84, 84);
        padding-top: 20px;
        padding-bottom: 20px;
        border-radius: 20px;
      "
    >
      CMI | Problematic Internet Use
    </div>

# Library

In [1]:
import numpy as np
import pandas as pd
import os
import optuna
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
from scipy.optimize import minimize
from IPython.display import clear_output
import concurrent.futures
import warnings

import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn
import torch.optim as optim

from sklearn.base import clone
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import StratifiedKFold

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import VotingRegressor


N_FOLD = 5
SEED = 42

optuna.logging.set_verbosity(optuna.logging.WARNING)
warnings.filterwarnings('ignore')

# Process data

In [2]:
TRAIN_PARQUET_PATH = "/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet"
TEST_PARQUET_PATH = "/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet"
TRAIN_CSV_PATH = "/kaggle/input/child-mind-institute-problematic-internet-use/train.csv"
TEST_CSV_PATH = "/kaggle/input/child-mind-institute-problematic-internet-use/test.csv"
SAMPLE_PATH = "/kaggle/input/child-mind-institute-problematic-internet-use/sample_submission.csv"
DICTIONARY_PATH = "/kaggle/input/child-mind-institute-problematic-internet-use/data_dictionary.csv"

## Time series data

In [3]:
def process_time_series(file_path: str) -> list:
    df = pd.read_parquet(file_path)
    df = df.drop("step", axis=1)
    return df.describe().values.flatten().tolist()

In [4]:
def load_time_series(dir: str) -> pd.DataFrame:
    all_parquet_folders = os.listdir(dir)
    all_ids = [file.split("=")[-1] for file in all_parquet_folders]
    all_parquet_files = [
        os.path.join(dir, folder_name, "part-0.parquet")
        for folder_name in all_parquet_folders
    ]

    with ThreadPoolExecutor(max_workers=4) as excuter:
        results = list(
            tqdm(
                excuter.map(process_time_series, all_parquet_files), total=len(all_ids)
            )
        )

    df = pd.DataFrame(results)
    df.columns = [f"Stat_{i}" for i in range(len(results[0]))]
    df["id"] = all_ids

    return df

In [5]:
class AutoEncoder(nn.Module):
    def __init__(self, input_dim, encoding_dim):
        super(AutoEncoder, self).__init__()
        
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, encoding_dim*3),
            nn.ReLU(),
            nn.Linear(encoding_dim*3, encoding_dim*2),
            nn.ReLU(),
            nn.Linear(encoding_dim*2, encoding_dim),
            nn.ReLU()
        )
        
        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim, input_dim*2),
            nn.ReLU(),
            nn.Linear(input_dim*2, input_dim*3),
            nn.ReLU(),
            nn.Linear(input_dim*3, input_dim),
            nn.Sigmoid()
        )
        
    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

In [6]:
def encode_time_series(
    df: pd.DataFrame, encoding_dim: int, batch_size: int, epochs: int
) -> pd.DataFrame:
    scaler = StandardScaler()
    df = scaler.fit_transform(df)
    data = torch.FloatTensor(df)
    input_dim = data.shape[1]
    autoencoder = AutoEncoder(input_dim, encoding_dim)

    criterion = nn.MSELoss()
    optimizer = optim.Adam(autoencoder.parameters())

    for epoch in range(epochs):
        for i in range(0, len(data), batch_size):
            batch = data[i : i + batch_size]
            optimizer.zero_grad()
            reconstructed = autoencoder(batch)
            loss = criterion(reconstructed, batch)
            loss.backward()
            optimizer.step()

        if (epoch + 1) % 10 == 0:
            print(f"Epoch [{epoch + 1}/{epochs}], Loss: {loss.item():.4f}]")

    with torch.no_grad():
        encoded_data = autoencoder.encoder(data).numpy()

    df_encoded = pd.DataFrame(
        encoded_data, columns=[f"Enc_{i + 1}" for i in range(encoded_data.shape[1])]
    )

    return df_encoded

## CSV data

In [7]:
def map_seasons(df: pd.DataFrame) -> pd.DataFrame:
    # 10 columns (excluding PCIAT_Season)
    season_cols = [
        "Basic_Demos-Enroll_Season",
        "CGAS-Season",
        "Physical-Season",
        "Fitness_Endurance-Season",
        "FGC-Season",
        "BIA-Season",
        "PAQ_A-Season",
        "PAQ_C-Season",
        "SDS-Season",
        "PreInt_EduHx-Season",
    ]

    df = df.drop(columns=[col for col in df.columns if "PCIAT" in col])

    mapping = {"Summer": 0, "Winter": 1, "Spring": 2, "Fall": 3, "Missing": 4}

    for col in season_cols:
        df[col] = df[col].fillna("Missing")
        df[col] = df[col].map(mapping).astype(int)

    return df

# Feature Engineering

In [8]:
def feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
    # season_cols = [col for col in df.columns if 'Season' in col]
    # df = df.drop(season_cols, axis=1) 
    # df['BMI_Age'] = df['Physical-BMI'] * df['Basic_Demos-Age']
    # df['Internet_Hours_Age'] = df['PreInt_EduHx-computerinternet_hoursday'] * df['Basic_Demos-Age']
    # df['BMI_Internet_Hours'] = df['Physical-BMI'] * df['PreInt_EduHx-computerinternet_hoursday']
    # df['BFP_BMI'] = df['BIA-BIA_Fat'] / df['BIA-BIA_BMI']
    # df['FFMI_BFP'] = df['BIA-BIA_FFMI'] / df['BIA-BIA_Fat']
    # df['FMI_BFP'] = df['BIA-BIA_FMI'] / df['BIA-BIA_Fat']
    # df['LST_TBW'] = df['BIA-BIA_LST'] / df['BIA-BIA_TBW']
    # df['BFP_BMR'] = df['BIA-BIA_Fat'] * df['BIA-BIA_BMR']
    # df['BFP_DEE'] = df['BIA-BIA_Fat'] * df['BIA-BIA_DEE']
    # df['BMR_Weight'] = df['BIA-BIA_BMR'] / df['Physical-Weight']
    # df['DEE_Weight'] = df['BIA-BIA_DEE'] / df['Physical-Weight']
    # df['SMM_Height'] = df['BIA-BIA_SMM'] / df['Physical-Height']
    # df['Muscle_to_Fat'] = df['BIA-BIA_SMM'] / df['BIA-BIA_FMI']
    # df['Hydration_Status'] = df['BIA-BIA_TBW'] / df['Physical-Weight']
    # df['ICW_TBW'] = df['BIA-BIA_ICW'] / df['BIA-BIA_TBW']
    # df['BMI_PHR'] = df['Physical-BMI'] * df['Physical-HeartRate']
    
    return df

# Train model

In [9]:
def compute_qwk(y_pred: np.ndarray, y_true: np.ndarray) -> float:
    return cohen_kappa_score(y_pred, y_true,weights="quadratic")

In [10]:
def round_prediction(y_pred: np.ndarray, thresholds: list) -> np.ndarray:

    return np.where(
        y_pred < thresholds[0],
        0,
        np.where(y_pred < thresholds[1], 1, np.where(y_pred < thresholds[2], 2, 3)),
    )

In [11]:
def evaluate_predictions(thresholds, y_true, y_pred):
    rounded_pred = round_prediction(y_pred, thresholds)
    return -compute_qwk(y_true, rounded_pred)

In [12]:
def train_model(
    train_set: pd.DataFrame, test_set: pd.DataFrame, model, test_id
) -> pd.DataFrame:
    x = train_set.drop("sii", axis=1)
    y = train_set["sii"]

    n_samples = len(y)

    SKF = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

    train_kappa_scores = []
    val_kappa_scores = []

    non_rounded_train_pred = np.zeros(n_samples, dtype=float)
    non_rounded_test_pred = np.zeros((len(test_set), N_FOLD), dtype=float)

    for fold, (train_index, val_index) in enumerate(SKF.split(x, y)):
        train_data, val_data = x.iloc[train_index], x.iloc[val_index]
        train_label, val_label = y.iloc[train_index], y.iloc[val_index]

        clone_model = clone(model)
        clone_model.fit(train_data, train_label)

        train_pred = clone_model.predict(train_data)
        val_pred = clone_model.predict(val_data)

        train_kappa_scores.append(
            compute_qwk(train_pred.round(0).astype(int), train_label)
        )

        val_kappa_scores.append(
            compute_qwk(val_pred.round(0).astype(int), val_label)
        )

        non_rounded_train_pred[val_index] = val_pred
        non_rounded_test_pred[:, fold] = clone_model.predict(test_set)

    kappa_optimizer = minimize(
        evaluate_predictions,
        x0=[0.5, 1.5, 2.5],
        args=(y, non_rounded_train_pred),
        method="Nelder-Mead",
    )

    optimized_thresholds = kappa_optimizer.x
    rounded_train_pred = round_prediction(non_rounded_train_pred, optimized_thresholds)
    optimized_train_kappa_score = compute_qwk(rounded_train_pred, y)

    print("Optimized train kappa score:", optimized_train_kappa_score)

    rounded_test_pred = round_prediction(
        non_rounded_test_pred.mean(axis=1), optimized_thresholds
    )

    submisison = pd.DataFrame({"id": test_id, "sii": rounded_test_pred})

    return (submisison, optimized_train_kappa_score)

# Submission

## Load data

In [13]:
# Load and encode time series data
train_ts = load_time_series(TRAIN_PARQUET_PATH)
test_ts = load_time_series(TEST_PARQUET_PATH)
encoded_train_ts = encode_time_series(
    df=train_ts.drop("id", axis=1), encoding_dim=60, batch_size=32, epochs=150
)
encoded_test_ts = encode_time_series(
    df=test_ts.drop("id", axis=1), encoding_dim=60, batch_size=32, epochs=150
)
encoded_train_ts["id"] = train_ts["id"]
encoded_test_ts["id"] = test_ts["id"]

# Load and process csv data
train_csv = pd.read_csv(TRAIN_CSV_PATH)
test_csv = pd.read_csv(TEST_CSV_PATH)
mapped_train_csv = map_seasons(train_csv.drop("id", axis=1))
mapped_test_csv = map_seasons(test_csv.drop("id", axis=1))

knn_imputer = KNNImputer(n_neighbors=5)
mask_train = mapped_train_csv.notna()
mask_test = mapped_test_csv.notna()

imputed_train_csv = pd.DataFrame(
    knn_imputer.fit_transform(mapped_train_csv),
    columns=mapped_train_csv.columns
)

imputed_test_csv = pd.DataFrame(
    knn_imputer.fit_transform(mapped_test_csv),
    columns=mapped_test_csv.columns
)

imputed_train_csv[mask_train] = mapped_train_csv[mask_train]
imputed_test_csv[mask_test] = mapped_test_csv[mask_test]

imputed_train_csv["sii"] = imputed_train_csv["sii"].round(0).astype(int)

imputed_train_csv["id"] = train_csv["id"]
imputed_test_csv["id"] = test_csv["id"]

# Merge csv and time series data
train = pd.merge(imputed_train_csv, encoded_train_ts, how="left", on="id")
test = pd.merge(imputed_test_csv, encoded_test_ts, how="left", on="id")

train_id = train["id"]
test_id = test["id"]

train = train.drop("id", axis=1)
test = test.drop("id", axis=1)

# Do feature engineering
train = feature_engineering(train)
test = feature_engineering(test)

if np.any(np.isinf(train)):
    train = train.replace([np.inf, -np.inf], np.nan)

# train = train.iloc[:10]

100%|██████████| 996/996 [01:16<00:00, 13.04it/s]
100%|██████████| 2/2 [00:00<00:00, 10.57it/s]


Epoch [10/150], Loss: 1.6039]
Epoch [20/150], Loss: 1.5405]
Epoch [30/150], Loss: 1.4856]
Epoch [40/150], Loss: 1.4620]
Epoch [50/150], Loss: 1.4509]
Epoch [60/150], Loss: 1.4502]
Epoch [70/150], Loss: 1.4728]
Epoch [80/150], Loss: 1.4621]
Epoch [90/150], Loss: 1.3947]
Epoch [100/150], Loss: 1.3611]
Epoch [110/150], Loss: 1.3599]
Epoch [120/150], Loss: 1.3612]
Epoch [130/150], Loss: 1.3637]
Epoch [140/150], Loss: 1.3445]
Epoch [150/150], Loss: 1.3371]
Epoch [10/150], Loss: 1.0062]
Epoch [20/150], Loss: 0.7385]
Epoch [30/150], Loss: 0.4348]
Epoch [40/150], Loss: 0.4271]
Epoch [50/150], Loss: 0.4271]
Epoch [60/150], Loss: 0.4271]
Epoch [70/150], Loss: 0.4271]
Epoch [80/150], Loss: 0.4271]
Epoch [90/150], Loss: 0.4271]
Epoch [100/150], Loss: 0.4271]
Epoch [110/150], Loss: 0.4271]
Epoch [120/150], Loss: 0.4271]
Epoch [130/150], Loss: 0.4271]
Epoch [140/150], Loss: 0.4271]
Epoch [150/150], Loss: 0.4271]


## Find optimal parameters for models

In [14]:
# def objective_xgb(trial):
#     params = {
#         'booster': trial.suggest_categorical('booster', ['gbtree', 'dart']),
#         'max_depth': trial.suggest_int('max_depth', 3, 10),
#         'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
#         'n_estimators': trial.suggest_int('n_estimators', 50, 500),
#         'subsample': trial.suggest_float('subsample', 0.5, 1.0),
#         'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
#         'gamma': trial.suggest_float('gamma', 0, 5),
#         'lambda': trial.suggest_float('lambda', 1e-3, 10, log=True),
#     }
    
#     model = XGBRegressor(**params, verbosity=0, device="cuda")
#     _, metric = train_model(train, test, model, test_id)
    
#     return metric

In [15]:
# def objective_catb(trial):
#     params = {
#         'depth': trial.suggest_int('depth', 3, 10),
#         'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
#         'iterations': trial.suggest_int('iterations', 50, 500),
#         'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10),
#     }
    
#     model = CatBoostRegressor(**params, verbose=0, task_type='GPU')
#     _, metric = train_model(train, test, model, test_id)
    
#     return metric

In [16]:
# def objective_lgbm(trial):
#     params = {
#         'num_leaves': trial.suggest_int('num_leaves', 20, 300),
#         'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
#         'n_estimators': trial.suggest_int('n_estimators', 50, 500),
#         'min_child_samples': trial.suggest_int('min_child_samples', 5, 50),
#         'subsample': trial.suggest_float('subsample', 0.5, 1.0),
#         'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
#         'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 10.0),
#         'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),
#     }
    
#     model = LGBMRegressor(**params, verbose=-1, device="GPU")
#     _, metric = train_model(train, test, model, test_id)
    
#     return metric

In [17]:
# study_xgb = optuna.create_study(direction='maximize')
# study_catb = optuna.create_study(direction='maximize')
# study_lgbm = optuna.create_study(direction='maximize')

# def optimize_study(study, objective, n_trials=50):
#     study.optimize(objective, n_trials=n_trials)

# with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
#     futures = [
#         executor.submit(optimize_study, study_xgb, objective_xgb),
#         executor.submit(optimize_study, study_catb, objective_catb),
#         executor.submit(optimize_study, study_lgbm, objective_lgbm)
#     ]
#     concurrent.futures.wait(futures)

# BEST_PARAMS_XGB = study_xgb.best_params
# BEST_PARAMS_CATB = study_catb.best_params
# BEST_PARAMS_LGBM = study_lgbm.best_params

# clear_output(wait=True)

In [18]:
# print(BEST_PARAMS_XGB)
# print(BEST_PARAMS_CATB)
# print(BEST_PARAMS_LGBM)

XGB: {'booster': 'gbtree', 'max_depth': 5, 'learning_rate': 0.032806306822661946, 'n_estimators': 124, 'subsample': 0.7612200524957443, 'colsample_bytree': 0.7879462813404031, 'gamma': 2.27693382985497, 'lambda': 0.0037960352854142705}

CATB: {'depth': 8, 'learning_rate': 0.02999839537419576, 'iterations': 492, 'l2_leaf_reg': 2.3005068330449827}

LGBM: {'num_leaves': 290, 'learning_rate': 0.022351630623035076, 'n_estimators': 110, 'min_child_samples': 36, 'subsample': 0.64715318375847, 'colsample_bytree': 0.5964597906674066, 'reg_alpha': 1.5700746777921748, 'reg_lambda': 2.718790747208782}

## Train

In [20]:
BEST_PARAMS_XGB = {
    "booster": "gbtree",
    "max_depth": 5,
    "learning_rate": 0.032806306822661946,
    "n_estimators": 124,
    "subsample": 0.7612200524957443,
    "colsample_bytree": 0.7879462813404031,
    "gamma": 2.27693382985497,
    "lambda": 0.0037960352854142705,
    "verbosity": 0,
}

BEST_PARAMS_CATB = {
    "depth": 8,
    "learning_rate": 0.02999839537419576,
    "iterations": 492,
    "l2_leaf_reg": 2.3005068330449827,
    "verbose": 0,
}

BEST_PARAMS_LGBM = {
    "num_leaves": 290,
    "learning_rate": 0.022351630623035076,
    "n_estimators": 110,
    "min_child_samples": 36,
    "subsample": 0.64715318375847,
    "colsample_bytree": 0.5964597906674066,
    "reg_alpha": 1.5700746777921748,
    "reg_lambda": 2.718790747208782,
    "verbosity": -1,
}

In [21]:
# def objective_ensemble(trial):
#     weights = {
#         "weights": [
#             trial.suggest_float("weight_xgb", 0.0, 10.0),
#             trial.suggest_float("weight_catb", 0.0, 10.0),
#             trial.suggest_float("weight_lgbm", 0.0, 10.0),
#         ]
#     }
    
#     model = VotingRegressor(estimators=[
#         ("xgb", XGBRegressor(**BEST_PARAMS_XGB, device="cuda")),
#         ("catb", CatBoostRegressor(**BEST_PARAMS_CATB, task_type="GPU")),
#         ("lgbm", LGBMRegressor(**BEST_PARAMS_LGBM, device="GPU")),
#     ], **weights)
    
#     _, metric = train_model(train, test, model, test_id)
    
#     return metric


# study_ensemble = optuna.create_study(direction='maximize')

# def optimize_study(study, objective, n_trials=50):
#     study.optimize(objective, n_trials=n_trials)

# with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
#     futures = [
#         executor.submit(optimize_study, study_ensemble, objective_ensemble)
#     ]
#     concurrent.futures.wait(futures)

# BEST_WEIGHTS = study_ensemble.best_params

# clear_output(wait=True)

In [22]:
# print(BEST_WEIGHTS)

{'weight_xgb': 0.29143842381072055, 'weight_catb': 0.7223070202458827, 'weight_lgbm': 0.0038376523038045}

In [23]:
ensemble_model = VotingRegressor(estimators=[
    ("xgb", XGBRegressor(**BEST_PARAMS_XGB)),
    ("catb", CatBoostRegressor(**BEST_PARAMS_CATB)),
    ("lgbm", LGBMRegressor(**BEST_PARAMS_LGBM)),
], weights=[0.29143842381072055, 0.7223070202458827, 0.0038376523038045])

In [24]:
submission, _ = train_model(train, test, ensemble_model, test_id)
submission.to_csv("submission.csv", index=False)

clear_output(wait=True)

Optimized train kappa score: 0.48975690655504844
