In [None]:
# %pip install pandas
# %pip install numpy
# %pip install scikit-learn
# %pip install tqdm

In [None]:
import numpy as np
import pandas as pd
import os
import re
from sklearn.base import clone, BaseEstimator, RegressorMixin
from sklearn.metrics import cohen_kappa_score, accuracy_score, mean_squared_error
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.decomposition import PCA
from sklearn.datasets import make_classification
from scipy.optimize import minimize
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from keras.models import Model
from keras.layers import Input, Dense
from keras.optimizers import Adam
import torch
import torch.nn as nn
import torch.optim as optim
# from pytorch_tabnet.tab_model import TabNetRegressor
# from pytorch_tabnet.callbacks import Callback

from colorama import Fore, Style
from IPython.display import clear_output
import warnings
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import VotingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
# Silence every warning category for the rest of the notebook.
warnings.filterwarnings('ignore')
# Show all columns whenever a DataFrame is displayed.
pd.options.display.max_columns = None

# Random seed passed to the CV splitter and to the regressors of the ensemble.
SEED = 42
# Number of cross-validation folds used by TrainML.
n_splits = 10

In [None]:
# CSV describing the tabular fields; read into `data_dictionary` below.
DATA_DICTIONARY_PATH = (
    "/kaggle/input/child-mind-institute-problematic-internet-use/data_dictionary.csv"
)

# Directory of per-participant series folders for the training split; each
# entry is expected to contain a "part-0.parquet" file (see load_parquet_files).
PARQUET_TRAIN_PATH = (
    "/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet"
)

# Same layout as PARQUET_TRAIN_PATH, for the test split.
PARQUET_TEST_PATH = (
    "/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet"
)

# Tabular train / test data, joined to the series features on "id".
CSV_TRAIN_PATH = "/kaggle/input/child-mind-institute-problematic-internet-use/train.csv"
CSV_TEST_PATH = "/kaggle/input/child-mind-institute-problematic-internet-use/test.csv"

# Data Preprocessing


In [None]:
# Field metadata; its "Type" and "Field" columns are used further down to
# pick out the string-typed (season) columns.
data_dictionary = pd.read_csv(
    DATA_DICTIONARY_PATH
)

In [None]:
# Sample submission file; its "id" column is used when the final submission frame is built.
sample = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/sample_submission.csv')

In [None]:
def process_parquet_file(path: str) -> list:
    """Summarise one parquet file as a flat list of descriptive statistics.

    The "step" column is discarded, ``DataFrame.describe()`` is computed on
    the remaining columns and the resulting table is flattened row by row.

    Args:
        path: Location of the parquet file. The identifier is taken from the
            text after the last "=" in the path, up to the next "/".

    Returns:
        The flattened statistics followed by the identifier as last element.
    """
    series = pd.read_parquet(path).drop("step", axis=1)

    # e.g. ".../id=abc/part-0.parquet" -> "abc"
    participant_id = path.split("=")[-1].split("/")[0]

    stats = series.describe().values.flatten().tolist()
    return stats + [participant_id]

In [None]:
def load_parquet_files(dir: str) -> list:
    """Summarise every per-participant parquet file found under ``dir``.

    Each entry of ``dir`` is treated as a folder holding a ``part-0.parquet``
    file; the files are summarised by ``process_parquet_file`` on a pool of
    four worker threads while a progress bar is shown.

    Args:
        dir: Directory whose entries are the per-participant folders.

    Returns:
        A list with one row (itself a list) per entry of ``dir``, ordered by
        sorted entry name. The list is empty when ``dir`` has no entries.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order (and anything fitted on it downstream) reproducible between runs.
    entry_names = sorted(os.listdir(dir))
    parquet_paths = [
        os.path.join(dir, name, "part-0.parquet") for name in entry_names
    ]

    with ThreadPoolExecutor(max_workers=4) as executor:
        # executor.map yields results in input order, so rows stay aligned
        # with parquet_paths even though the work runs concurrently.
        rows = list(
            tqdm(
                executor.map(process_parquet_file, parquet_paths),
                total=len(parquet_paths),
            )
        )

    return rows

In [None]:
def create_df_parquet(dir: str) -> pd.DataFrame:
    """Build a DataFrame of series statistics with one row per participant.

    Args:
        dir: Directory passed straight to ``load_parquet_files``.

    Returns:
        A frame whose columns are ``Stat_0 .. Stat_{k-1}`` followed by ``id``,
        where ``k`` is the number of statistics in the first loaded row.
    """
    rows = load_parquet_files(dir)

    # Every row is laid out as [statistic, ..., statistic, id]: number the
    # statistic columns and name the trailing one "id".
    n_fields = len(rows[0])
    column_names = [f"Stat_{i}" for i in range(n_fields - 1)] + ["id"]

    return pd.DataFrame(data=rows, columns=column_names)

In [None]:
class AutoEncoder(nn.Module):
    """Fully connected autoencoder with a three-layer encoder and decoder.

    The encoder narrows ``input_dim`` features through hidden widths of
    ``3 * encoding_dim`` and ``2 * encoding_dim`` down to ``encoding_dim``
    (ReLU after every layer). The decoder widens through ``2 * input_dim``
    and ``3 * input_dim`` (ReLU) and projects back to ``input_dim`` through a
    Sigmoid.

    Args:
        input_dim: Number of input (and reconstructed) features.
        encoding_dim: Size of the encoded representation.
    """

    def __init__(self, input_dim, encoding_dim):
        super().__init__()

        enc_wide, enc_mid = encoding_dim * 3, encoding_dim * 2
        dec_mid, dec_wide = input_dim * 2, input_dim * 3

        encoder_layers = [
            nn.Linear(input_dim, enc_wide),
            nn.ReLU(),
            nn.Linear(enc_wide, enc_mid),
            nn.ReLU(),
            nn.Linear(enc_mid, encoding_dim),
            nn.ReLU(),
        ]
        self.encoder = nn.Sequential(*encoder_layers)

        # NOTE(review): the final Sigmoid bounds reconstructions to (0, 1);
        # confirm the training inputs are scaled to that range.
        decoder_layers = [
            nn.Linear(encoding_dim, dec_mid),
            nn.ReLU(),
            nn.Linear(dec_mid, dec_wide),
            nn.ReLU(),
            nn.Linear(dec_wide, input_dim),
            nn.Sigmoid(),
        ]
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Return the reconstruction of ``x`` (encode, then decode)."""
        return self.decoder(self.encoder(x))

In [None]:
def perform_autoencoder(df, encoding_dim=50, epochs=50, batch_size=32):
    """Train an AutoEncoder on ``df`` and return the encoded rows.

    The columns of ``df`` are standardised with a freshly fitted
    ``StandardScaler``; a new ``AutoEncoder`` is then trained to reconstruct
    the standardised rows (MSE loss, Adam with default settings). Batches are
    consecutive slices of the rows, visited in the same order every epoch.

    Args:
        df: Numeric table with one sample per row.
        encoding_dim: Width of the encoded representation.
        epochs: Number of full passes over the rows.
        batch_size: Number of rows per optimisation step.

    Returns:
        A DataFrame with columns ``Enc_1 .. Enc_{encoding_dim}`` holding the
        encoder output for every row of ``df``, in row order, with a default
        positional index.
    """
    scaler = StandardScaler()
    df_scaled = scaler.fit_transform(df)

    data_tensor = torch.FloatTensor(df_scaled)

    input_dim = data_tensor.shape[1]
    autoencoder = AutoEncoder(input_dim, encoding_dim)

    criterion = nn.MSELoss()
    optimizer = optim.Adam(autoencoder.parameters())

    for epoch in range(epochs):
        for start in range(0, len(data_tensor), batch_size):
            batch = data_tensor[start : start + batch_size]
            optimizer.zero_grad()
            reconstructed = autoencoder(batch)
            loss = criterion(reconstructed, batch)
            loss.backward()
            optimizer.step()

        if epoch % 10 == 0:
            # Report progress against the requested number of epochs rather
            # than a hard-coded total.
            print(epoch, f"/{epochs}")

    # Encoding is inference only, so no gradients are needed.
    with torch.no_grad():
        encoded_data = autoencoder.encoder(data_tensor).numpy()

    df_encoded = pd.DataFrame(
        encoded_data,
        columns=[f'Enc_{i + 1}' for i in range(encoded_data.shape[1])],
    )

    return df_encoded

In [None]:
# One row of flattened describe() statistics per participant, for each split.
train_parquet = create_df_parquet(PARQUET_TRAIN_PATH)
test_parquet = create_df_parquet(PARQUET_TEST_PATH)

In [None]:
# Encode train and test with ONE autoencoder. Every perform_autoencoder call
# fits its own scaler and a freshly initialised network, so two separate calls
# would produce Enc_* columns that mean different things in train and in test,
# and a model trained on one could not be applied to the other.
n_train_rows = len(train_parquet)
combined_stats = pd.concat(
    [train_parquet.drop(columns="id"), test_parquet.drop(columns="id")],
    ignore_index=True,
)
combined_encoded = perform_autoencoder(combined_stats, encoding_dim=60, epochs=100, batch_size=32)

# Split back by position. The fresh 0..n-1 indexes keep the index-aligned "id"
# assignment that follows correct for both frames.
train_parquet_encoded = combined_encoded.iloc[:n_train_rows].reset_index(drop=True)
test_parquet_encoded = combined_encoded.iloc[n_train_rows:].reset_index(drop=True)

In [None]:
# Re-attach the ids that were dropped before encoding. The assignment aligns on
# the index, which is the default positional one on both sides.
train_parquet_encoded["id"] = train_parquet["id"]
test_parquet_encoded["id"] = test_parquet["id"]

In [None]:
# Tabular data for each split.
train_csv = pd.read_csv(CSV_TRAIN_PATH)
test_csv = pd.read_csv(CSV_TEST_PATH)

In [None]:
# Left joins keep every CSV row; rows whose id has no encoded series get NaN
# in the Enc_* columns.
train = pd.merge(train_csv, train_parquet_encoded, how="left", on="id")
test = pd.merge(test_csv, test_parquet_encoded, how="left", on="id")

In [None]:
# Keep the ids aside before the column is dropped from the feature frames.
test_id = test["id"]
train_id = train["id"]

In [None]:
# "id" is an identifier, not a feature: remove it from both frames.
train = train.drop(columns="id")
test = test.drop(columns="id")

In [None]:
# Keep every non-PCIAT column, move the target "sii" to the end, and discard
# the rows that have no target value.
non_pciat_columns = [col for col in train.columns if "PCIAT" not in col]
ordered_columns = [col for col in non_pciat_columns if col != "sii"] + ["sii"]
train = train[ordered_columns].dropna(subset="sii")

In [None]:
# Fields whose declared type mentions "str".
is_string_field = data_dictionary["Type"].str.contains("str")
string_fields = data_dictionary.loc[is_string_field, "Field"].tolist()

# Leave out the PCIAT fields, then skip the first remaining entry --
# presumably the "id" field; TODO confirm against data_dictionary.csv.
season_columns = [field for field in string_fields if "PCIAT" not in field][1:]

In [None]:
# Integer code for each season label; "Missing" is the label that
# fill_missing_seasons substitutes for NaN before mapping.
mapping_season = {"Summer": 0, "Winter": 1, "Spring": 2, "Fall": 3, "Missing": 4}

In [None]:
def fill_missing_seasons(df: pd.DataFrame) -> pd.DataFrame:
    """Integer-encode the season columns of ``df`` in place.

    For every column listed in ``season_columns``, missing entries are
    labelled "Missing" and all labels are then replaced by their code from
    ``mapping_season``.

    Args:
        df: Frame containing every column named in ``season_columns``.

    Returns:
        The same ``df`` object, with the season columns converted to int.
    """
    for season_col in season_columns:
        labelled = df[season_col].fillna("Missing")
        df[season_col] = labelled.map(mapping_season).astype(int)

    return df

In [None]:
# Encode the season columns of both splits with the same label -> code mapping.
train = fill_missing_seasons(train)
test = fill_missing_seasons(test)

In [None]:
# train = train.dropna(thresh=10, axis=0)

In [None]:
# Turn +/-inf into NaN so the imputer that follows can fill them; scikit-learn
# imputers accept NaN but raise on infinite values. Both splits are cleaned,
# because test goes through the same imputer as train. `replace` is a no-op
# when no infinity is present, so no guard is needed.
train = train.replace([np.inf, -np.inf], np.nan)
test = test.replace([np.inf, -np.inf], np.nan)

In [None]:
# Impute the feature columns only. The target "sii" is kept out of the imputer
# so it cannot influence the neighbour search, and so the fitted imputer sees
# the same columns in train and in test.
feature_columns = [col for col in train.columns if col != "sii"]
sii_values = train["sii"].to_numpy()

imputer = KNNImputer(n_neighbors=7, keep_empty_features=True)

imputed_train = imputer.fit_transform(train[feature_columns])
# transform, not fit_transform: test rows are filled from the neighbours learned
# on train instead of from a second imputer fitted on the test set itself.
# Selecting by feature_columns also enforces the training column order.
imputed_test = imputer.transform(test[feature_columns])

train = pd.DataFrame(imputed_train, columns=feature_columns)
# Positional assignment: the frame above has a fresh default index.
train["sii"] = sii_values
test = pd.DataFrame(imputed_test, columns=feature_columns)
# weights="distance"

# Train model


In [None]:
def quadratic_weighted_kappa(y_true, y_pred):
    """Return Cohen's kappa between ``y_true`` and ``y_pred`` with quadratic weights."""
    kappa = cohen_kappa_score(y_true, y_pred, weights='quadratic')
    return kappa

def threshold_Rounder(oof_non_rounded, thresholds):
    """Convert continuous predictions into the labels 0-3 using three cut points.

    A value gets label 0 if it is below ``thresholds[0]``; otherwise 1 if it
    is below ``thresholds[1]``; otherwise 2 if it is below ``thresholds[2]``;
    otherwise 3. The cut points are tested in that order and the first match
    wins, so they do not have to be sorted.

    Args:
        oof_non_rounded: Array of continuous predictions.
        thresholds: Sequence holding at least three cut points.

    Returns:
        Integer array of labels with the shape of ``oof_non_rounded``.
    """
    below_cut = [oof_non_rounded < thresholds[k] for k in range(3)]
    # np.select takes the choice of the first condition that holds and falls
    # back to `default` when none does.
    return np.select(below_cut, [0, 1, 2], default=3)

def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    """Objective for threshold tuning: the negated quadratic weighted kappa.

    The continuous predictions are cut into labels with ``thresholds`` and
    scored against ``y_true``; the sign is flipped so that a minimiser
    maximises the kappa.
    """
    labels = threshold_Rounder(oof_non_rounded, thresholds)
    score = quadratic_weighted_kappa(y_true, labels)
    return -score

def TrainML(model_class, test_data):
    """Cross-validate a regressor on ``train`` and predict labels for ``test_data``.

    The global ``train`` frame is split into ``n_splits`` stratified folds
    (shuffled with ``SEED``). For each fold a clone of ``model_class`` is
    fitted, scored with the quadratic weighted kappa on rounded predictions,
    and used to predict ``test_data``. The out-of-fold predictions are then
    used to tune three rounding thresholds with Nelder-Mead, and those
    thresholds are applied to the fold-averaged test predictions.

    Args:
        model_class: Unfitted estimator; it is cloned for every fold.
        test_data: Feature frame to predict, with the columns of ``train``
            minus ``sii``.

    Returns:
        Array of tuned integer labels (0-3), one per row of ``test_data``.

    Raises:
        AssertionError: If the threshold optimisation reports failure.
    """
    X = train.drop(columns=['sii'])
    y = train['sii']

    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)

    # Per-fold kappa scores on the fitting part and on the held-out part.
    train_S, test_S = [], []

    n_rows = len(y)
    oof_non_rounded = np.zeros(n_rows, dtype=float)
    oof_rounded = np.zeros(n_rows, dtype=int)
    test_preds = np.zeros((len(test_data), n_splits))

    fold_iterator = tqdm(splitter.split(X, y), desc="Training Folds", total=n_splits)
    for fold, (fit_idx, val_idx) in enumerate(fold_iterator):
        X_fit, y_fit = X.iloc[fit_idx], y.iloc[fit_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        model = clone(model_class)
        model.fit(X_fit, y_fit)

        fit_pred = model.predict(X_fit)
        val_pred = model.predict(X_val)
        val_pred_rounded = val_pred.round(0).astype(int)

        # Store the held-out predictions at their original row positions.
        oof_non_rounded[val_idx] = val_pred
        oof_rounded[val_idx] = val_pred_rounded

        train_kappa = quadratic_weighted_kappa(y_fit, fit_pred.round(0).astype(int))
        val_kappa = quadratic_weighted_kappa(y_val, val_pred_rounded)
        train_S.append(train_kappa)
        test_S.append(val_kappa)

        # One column of test predictions per fold; averaged after the loop.
        test_preds[:, fold] = model.predict(test_data)

        print(f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")
        clear_output(wait=True)

    print(f"Mean Train QWK --> {np.mean(train_S):.4f}")
    print(f"Mean Validation QWK ---> {np.mean(test_S):.4f}")

    # Tune the three rounding cut points on the out-of-fold predictions,
    # starting from plain rounding (0.5 / 1.5 / 2.5).
    KappaOPtimizer = minimize(
        evaluate_predictions,
        x0=[0.5, 1.5, 2.5],
        args=(y, oof_non_rounded),
        method='Nelder-Mead',
    )
    assert KappaOPtimizer.success, "Optimization did not converge."

    tuned_thresholds = KappaOPtimizer.x
    tKappa = quadratic_weighted_kappa(y, threshold_Rounder(oof_non_rounded, tuned_thresholds))

    print(f"----> || Optimized QWK SCORE :: {Fore.CYAN}{Style.BRIGHT} {tKappa:.3f}{Style.RESET_ALL}")

    # Average the fold models' test predictions, then cut with the tuned thresholds.
    mean_test_pred = test_preds.mean(axis=1)
    return threshold_Rounder(mean_test_pred, tuned_thresholds)

# Median imputer used as the first step of every pipeline below.
imputer = SimpleImputer(strategy='median')

# Base regressors of the ensemble, all seeded with SEED.
base_regressors = [
    ('lgb', LGBMRegressor(random_state=SEED)),
    ('xgb', XGBRegressor(random_state=SEED)),
    ('cat', CatBoostRegressor(random_state=SEED, silent=True)),
    ('rf', RandomForestRegressor(random_state=SEED)),
    ('gb', GradientBoostingRegressor(random_state=SEED)),
]

# Wrap each regressor in an "impute, then regress" pipeline and combine them
# in a VotingRegressor.
ensemble = VotingRegressor(estimators=[
    (name, Pipeline(steps=[('imputer', imputer), ('regressor', regressor)]))
    for name, regressor in base_regressors
])

submission = TrainML(ensemble, test)

In [None]:
# x = df_train.drop(columns=["sii"], axis=1)
# y = df_train["sii"]

# model = RandomForestRegressor(
#     n_estimators=200,
#     random_state=42
# )

# submission = TrainML(model, test)

# model.fit(x, y)

In [None]:
# y_pred = model.predict(df_test)
# rounded_pred = np.round(y_pred).astype(int)

# Submission


In [None]:
# submission = pd.DataFrame({"id": test_id, "sii": rounded_pred})

In [None]:
# Pair every prediction with the id of the test row it was computed from.
# `test_id` was taken from the merged test frame before its "id" column was
# dropped, so it is in the same row order as the predictions; the ids of the
# sample file are not guaranteed to be.
assert len(test_id) == len(submission), "one prediction is expected per test id"
submission = pd.DataFrame({
    'id': test_id.to_numpy(),
    'sii': submission
})

In [None]:
# Write the submission file to the working directory, without the index column.
submission.to_csv("submission.csv", index=False)