In [1]:
# %pip install pandas
# %pip install numpy
# %pip install scikit-learn
# %pip install tqdm

In [2]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [3]:
# Tabular competition files (read with pd.read_csv).
DATA_DICTIONARY_PATH = "/kaggle/input/child-mind-institute-problematic-internet-use/data_dictionary.csv"
CSV_TRAIN_PATH = "/kaggle/input/child-mind-institute-problematic-internet-use/train.csv"
CSV_TEST_PATH = "/kaggle/input/child-mind-institute-problematic-internet-use/test.csv"

# Directories of per-participant time-series parquet files; each entry inside
# is expected to hold a ``part-0.parquet`` file.
PARQUET_TRAIN_PATH = "/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet"
PARQUET_TEST_PATH = "/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet"

# Data Preprocessing


In [4]:
# The data dictionary supplies the "Field" and "Type" columns that later cells
# use to decide which features get one-hot encoded.
df_data_dictionary = pd.read_csv(DATA_DICTIONARY_PATH)

In [5]:
def process_parquet_file(path: str) -> list:
    """Summarise one time-series parquet file as a flat list of statistics.

    The file is read, its ``step`` column is discarded, and the
    ``DataFrame.describe()`` table of the remaining columns is flattened
    row by row into a plain list. The id taken from the ``id=<value>``
    directory component of ``path`` is appended as the final element.

    Parameters
    ----------
    path : str
        Path of the form ``.../id=<value>/part-0.parquet``. Both ``/`` and
        ``\\`` separators are accepted.

    Returns
    -------
    list
        The flattened describe() values followed by the id string.
    """
    df = pd.read_parquet(path).drop(columns="step")
    stats = df.describe().values.flatten().tolist()

    # Normalise separators before extracting the id. Splitting on "\\" alone
    # leaves "<id>/part-0.parquet" on POSIX paths, which never matches the
    # ``id`` column of the CSV files in the later merge.
    participant_id = path.replace("\\", "/").split("=")[-1].split("/")[0]
    stats.append(participant_id)

    return stats

In [6]:
def load_parquet_files(dir: str) -> list:
    """Run ``process_parquet_file`` on every entry of a parquet directory.

    Each entry of ``dir`` is expected to contain a ``part-0.parquet`` file.
    The files are processed on a pool of four worker threads with a tqdm
    progress bar.

    Parameters
    ----------
    dir : str
        Directory whose entries each hold a ``part-0.parquet`` file.

    Returns
    -------
    list
        One result of ``process_parquet_file`` per entry, in sorted entry
        order. This is a plain list, not a DataFrame.
    """
    # os.listdir order is arbitrary; sorting makes the row order reproducible.
    entry_names = sorted(os.listdir(dir))
    parquet_paths = [
        os.path.join(dir, entry, "part-0.parquet") for entry in entry_names
    ]

    with ThreadPoolExecutor(max_workers=4) as executor:
        # executor.map yields results in input order, so the output lines up
        # with parquet_paths.
        results = list(
            tqdm(
                executor.map(process_parquet_file, parquet_paths),
                total=len(parquet_paths),
            )
        )

    return results

In [7]:
def create_df_parquet(dir: str) -> pd.DataFrame:
    """Build a DataFrame of per-file statistics for a parquet directory.

    Every row returned by ``load_parquet_files(dir)`` becomes one DataFrame
    row. Columns are named ``Stat_0``, ``Stat_1``, ... and the last column
    is named ``id``.
    """
    rows = load_parquet_files(dir)

    # The row width is taken from the first row; the final slot is the id.
    width = len(rows[0])
    column_names = [f"Stat_{position}" for position in range(width - 1)] + ["id"]

    return pd.DataFrame(data=rows, columns=column_names)

In [8]:
# Summarise the train and test time-series directories, train first.
df_train_parquet, df_test_parquet = (
    create_df_parquet(PARQUET_TRAIN_PATH),
    create_df_parquet(PARQUET_TEST_PATH),
)

100%|██████████| 996/996 [01:11<00:00, 13.87it/s]
100%|██████████| 2/2 [00:00<00:00,  8.71it/s]


In [9]:
# Load the tabular train and test files.
df_train_csv, df_test_csv = pd.read_csv(CSV_TRAIN_PATH), pd.read_csv(CSV_TEST_PATH)

In [10]:
# Left-join the time-series statistics onto the tabular rows by id, so rows
# without a time-series file are kept (their Stat_* columns become NaN).
df_train = df_train_csv.merge(df_train_parquet, on="id", how="left")
df_test = df_test_csv.merge(df_test_parquet, on="id", how="left")

In [11]:
# Keep the test ids aside: the "id" column is dropped from df_test in the next
# cell, and the ids are needed again when the submission frame is built.
test_id = df_test["id"]

In [12]:
# The id is an identifier, not a feature, so remove it from both frames.
df_train = df_train.drop(columns="id")
df_test = df_test.drop(columns="id")

In [13]:
# Keep every column whose name does not contain "PCIAT", move "sii" to the
# last position, and discard rows where "sii" is missing.
feature_names = [
    name for name in df_train.columns if "PCIAT" not in name and name != "sii"
]
df_train = df_train[feature_names + ["sii"]].dropna(subset="sii")

In [14]:
def _fields_with_type(keyword: str) -> list:
    """Return non-PCIAT ``Field`` names whose ``Type`` contains ``keyword``.

    The match on ``Type`` is case-insensitive.
    """
    type_matches = df_data_dictionary["Type"].str.contains(keyword, case=False)
    matching_fields = df_data_dictionary.loc[type_matches, "Field"].tolist()
    return [field for field in matching_fields if "PCIAT" not in field]


categorical_columns = _fields_with_type("categorical")

# The first "str" field is skipped — presumably the id field; verify against
# the data dictionary.
season_columns = _fields_with_type("str")[1:]

In [15]:
def fill_missing_seasons(df: pd.DataFrame) -> pd.DataFrame:
    """Replace missing values in the season columns with ``"Missing"``.

    The columns listed in the module-level ``season_columns`` are overwritten
    on ``df`` itself, and that same frame is returned.
    """
    for season_col in season_columns:
        filled = df[season_col].fillna("Missing")
        df[season_col] = filled

    return df

In [16]:
# Apply the same season fill to both frames, train first.
df_train, df_test = fill_missing_seasons(df_train), fill_missing_seasons(df_test)

In [17]:
# Columns to one-hot encode: season columns first, then categorical ones.
columns_to_encode = [*season_columns, *categorical_columns]

In [18]:
def onehot_encoding(df: pd.DataFrame) -> pd.DataFrame:
    """One-hot encode every column named in ``columns_to_encode``.

    Columns are handled one at a time: the source column is removed and its
    integer indicator columns, prefixed with the source column name, are
    appended on the right. The input frame is not modified; a new frame is
    returned.
    """
    encoded = df
    for column in columns_to_encode:
        indicators = pd.get_dummies(encoded[column], prefix=column).astype(int)
        remaining = encoded.drop(columns=column)
        encoded = pd.concat([remaining, indicators], axis=1)

    return encoded

In [19]:
# Encode both frames with the same column list, train first.
df_train, df_test = onehot_encoding(df_train), onehot_encoding(df_test)

In [20]:
# Give df_test exactly the training feature columns, in training order.
# Columns present only in train are added as all-zero columns, columns present
# only in test are dropped, and "sii" is left out.
missing_columns = set(df_train.columns) - set(df_test.columns)
zero_fill = {column: 0 for column in missing_columns}
feature_order = [column for column in df_train.columns if column != "sii"]

df_test = df_test.assign(**zero_fill)[feature_order]

In [21]:
# Sanity check: df_test should have one column fewer than df_train (no "sii").
print(df_train.shape, df_test.shape, sep="\n")

(2736, 213)
(20, 212)


In [22]:
# Impute missing feature values with a KNN imputer fitted on the TRAINING
# features only.
# - The test frame is transformed with the training fit, not refitted on its
#   own rows, so train and test are imputed from the same reference rows.
# - "sii" is kept out of the imputer so it does not influence neighbour
#   distances; it has no missing values after the earlier dropna.
# - keep_empty_features=True keeps columns that are entirely missing, so the
#   column layout is unchanged.
imputer = KNNImputer(n_neighbors=10, weights="distance", keep_empty_features=True)

feature_columns = df_train.columns.drop("sii")
# Take the target as a plain array: df_train's index has gaps after dropna,
# and the rebuilt frame below gets a fresh RangeIndex.
train_target = df_train["sii"].to_numpy()

imputed_train = imputer.fit_transform(df_train[feature_columns])
imputed_test = imputer.transform(df_test[feature_columns])

df_train = pd.DataFrame(imputed_train, columns=feature_columns)
df_train["sii"] = train_target  # re-attached as the last column, as before
df_test = pd.DataFrame(imputed_test, columns=feature_columns)

# Train model


In [23]:
# Random forest regressor; the fixed random_state keeps runs repeatable.
model = RandomForestRegressor(
    n_estimators=200,
    max_depth=20,
    min_samples_split=5,
    min_samples_leaf=2,
    bootstrap=True,
    random_state=42,
)

# Target is "sii"; every other column of df_train is a feature.
y = df_train["sii"]
x = df_train.drop(columns="sii")

In [24]:
# Repeated hold-out validation. Each call to fit() rebuilds the forest from
# scratch, so the loop is only useful for measuring performance: the held-out
# score is recorded for every split.
validation_scores = []
for i in range(10):
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.1, random_state=i, shuffle=True
    )
    model.fit(x_train, y_train)
    # RandomForestRegressor.score returns R^2 on the held-out rows.
    validation_scores.append(model.score(x_test, y_test))

print(
    f"Hold-out R^2 over {len(validation_scores)} splits: "
    f"mean={np.mean(validation_scores):.4f}, std={np.std(validation_scores):.4f}"
)

# Final model: fit once on all labelled rows so the predictions do not depend
# on whichever split happened to be fitted last.
model.fit(x, y)

In [25]:
# Predict continuous values for the test rows, then round to the nearest
# integer.
y_pred = model.predict(df_test)
rounded_pred = y_pred.round().astype(int)

# Submission


In [26]:
# Pair each saved test id with its rounded prediction.
submission = pd.DataFrame({"id": test_id}).assign(sii=rounded_pred)

In [27]:
# Write the submission file to the working directory without the index column.
submission.to_csv("submission.csv", index=False)