In [1]:
# !pip install pandas

# !pip install numpy

# !pip install scikit-learn

# !pip install tqdm

In [2]:
import pandas as pd

import numpy as np

import os

from concurrent.futures import ThreadPoolExecutor

from tqdm import tqdm



from sklearn.ensemble import RandomForestRegressor

# Preprocessing data


In [3]:
# Load the competition's data dictionary CSV.
data_dictionary_path = (
    "/kaggle/input/child-mind-institute-problematic-internet-use/data_dictionary.csv"
)
df_data_dictionary = pd.read_csv(data_dictionary_path)

## Load time series data


In [4]:
def process_parquet_file(path: str) -> list:
    """
    Summarise the time series stored in the parquet file of one ID.

    Steps:
        - Read the parquet file.
        - Drop column "step".
        - Compute `DataFrame.describe()` for the remaining columns and flatten
        the resulting table row by row into a plain list. These values are used
        as additional features along with the ones in train.csv.
        - Extract the ID from the name of the directory holding the file
        (e.g. ".../id=abc/part-0.parquet" -> "abc") and append it to the list.

    Parameters:
        path (str): path to the parquet file.

    Returns:
        A list of summary statistics with the ID as its last element.

    Raises:
        KeyError: if the file has no "step" column.
    """
    df = pd.read_parquet(path)

    # drop() returns a new frame, so the frame that was read is never mutated.
    stats = df.drop(columns="step").describe()
    res = stats.values.flatten().tolist()

    # Use os.path so the parent directory is found regardless of the path
    # separator, then keep what follows the last "=" in its name.
    parent_dir = os.path.basename(os.path.dirname(path))
    series_id = parent_dir.split("=")[-1]
    res.append(series_id)

    return res

In [5]:
def load_parquet_files(dir: str) -> list:
    """
    Load all parquet files and process each one using 4 worker threads.

    Parameters:
        dir (str): path to the directory which contains one sub-directory per
        ID, each holding a "part-0.parquet" file.

    Returns:
        A list with one entry per sub-directory, each entry being the list
        returned by process_parquet_file. Entries follow the sorted order of
        the sub-directory names.
    """
    # os.listdir returns entries in arbitrary order; sort so that repeated
    # runs produce the rows in the same order.
    all_ids = sorted(os.listdir(dir))
    all_parquet_files = [
        os.path.join(dir, id_dir, "part-0.parquet") for id_dir in all_ids
    ]

    with ThreadPoolExecutor(max_workers=4) as executor:
        # Executor.map yields results in the order of its inputs, so the
        # output stays aligned with all_parquet_files.
        results = list(
            tqdm(
                executor.map(process_parquet_file, all_parquet_files),
                total=len(all_parquet_files),
            )
        )

    return results

In [6]:
def create_df_parquet(dir: str) -> pd.DataFrame:
    """
    Create a DataFrame using the results of the processed parquet files.

    Parameters:
        dir (str): path to the directory which contains parquet files.

    Returns:
        A DataFrame with one row per ID: columns "Stat_0", "Stat_1", ... hold
        the flattened time series statistics and the last column, "id", holds
        the ID. If the directory yields no results, an empty DataFrame with
        only the "id" column is returned.
    """
    time_series_data = load_parquet_files(dir)

    # Nothing to tabulate: return a frame that still has the "id" key so it
    # can be merged on, instead of failing on time_series_data[0].
    if not time_series_data:
        return pd.DataFrame(columns=["id"])

    # Every row is [stat, stat, ..., id]; the width is taken from the first row.
    n_stats = len(time_series_data[0]) - 1
    column_names = ["Stat_" + str(i) for i in range(n_stats)] + ["id"]

    return pd.DataFrame(data=time_series_data, columns=column_names)

In [7]:
# Build the time series statistics tables for the train and test directories.
train_series_dir = (
    "/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet"
)
test_series_dir = (
    "/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet"
)

df_train_parquet = create_df_parquet(train_series_dir)
df_test_parquet = create_df_parquet(test_series_dir)

100%|██████████| 996/996 [01:15<00:00, 13.27it/s]
100%|██████████| 2/2 [00:00<00:00, 10.87it/s]


## Load csv data


In [8]:
# Read the tabular train and test files.
train_csv_path = "/kaggle/input/child-mind-institute-problematic-internet-use/train.csv"
test_csv_path = "/kaggle/input/child-mind-institute-problematic-internet-use/test.csv"

df_train_csv = pd.read_csv(train_csv_path)
df_test_csv = pd.read_csv(test_csv_path)

## Merge time series and csv data


In [9]:
# Left joins keep every CSV row; IDs without a parquet file get NaN statistics.
df_train = df_train_csv.merge(df_train_parquet, how="left", on="id")
df_test = df_test_csv.merge(df_test_parquet, how="left", on="id")

In [10]:
# Keep the test IDs aside; they are needed again when building the submission.
test_id = df_test.loc[:, "id"]

In [11]:
# The ID is not a feature: remove it from both frames.
df_train = df_train.drop(columns="id")
df_test = df_test.drop(columns="id")

## Drop PCIAT columns


In [12]:
# Drop every PCIAT column, move the target "sii" to the last position and keep
# only the rows whose target is known. The frame is built in one chain and
# copied, rather than calling dropna(inplace=True) on a column-sliced frame,
# so later column assignments do not raise SettingWithCopyWarning.
feature_columns = [
    col for col in df_train.columns if "PCIAT" not in col and col != "sii"
]
df_train = df_train[feature_columns + ["sii"]].dropna(subset=["sii"]).copy()

## Mapping string data to numeric


In [13]:
# Names of the season columns that get encoded as integers.
season_columns = [
    "Basic_Demos-Enroll_Season",
    "CGAS-Season",
    "Physical-Season",
    "Fitness_Endurance-Season",
    "FGC-Season",
    "BIA-Season",
    "PAQ_A-Season",
    "PAQ_C-Season",
    "SDS-Season",
    "PreInt_EduHx-Season",
]

# Each label is encoded by its position in this list; "Missing" is the
# placeholder used for absent values.
season_labels = ["Summer", "Winter", "Spring", "Fall", "Missing"]
season_mapping = {label: code for code, label in enumerate(season_labels)}

In [14]:
def season_to_numeric(df: pd.DataFrame) -> pd.DataFrame:
    """
    Fill in missing data of season-related columns and convert them to
    integer category codes.

    The input frame is left untouched; the conversion is applied to a copy.

    Parameters:
        df (pandas.DataFrame): a DataFrame containing every column listed in
        `season_columns`.

    Returns:
        A copy of `df` in which each season column holds the integer code
        from `season_mapping` (missing values get the "Missing" code).

    Raises:
        ValueError: if a season column contains a value that is not a key of
        `season_mapping`.
    """
    # Work on a copy so the caller's frame (possibly a slice of another
    # frame) is never written to.
    df = df.copy()

    for col in season_columns:
        codes = df[col].fillna("Missing").map(season_mapping)

        # map() turns unknown labels into NaN; report them by name instead of
        # letting astype(int) fail on the NaN.
        unmapped = codes.isna()
        if unmapped.any():
            unknown = sorted(set(df.loc[unmapped, col].astype(str)))
            raise ValueError(f"Unmapped values in column {col!r}: {unknown}")

        df[col] = codes.astype(int)

    return df

In [15]:
# Encode the season columns of both frames, train first and then test.
df_train, df_test = map(season_to_numeric, (df_train, df_test))

## Fill in missing cells using mean


In [16]:
# Impute both frames with the column means of the training data, so the same
# value fills a given column in train and test. Filling the test frame with
# its own means would leave any column that is entirely NaN there unfilled.
# Labels in train_means that df_test does not have (e.g. "sii") are ignored.
train_means = df_train.mean()

df_train = df_train.fillna(train_means)
df_test = df_test.fillna(train_means)

# Train model


In [17]:
# Split the training frame into the target and the feature matrix.
target_column = "sii"

y_train = df_train[target_column]
x_train = df_train.drop(columns=target_column)

In [18]:
# Random forest with a fixed seed so repeated runs build the same trees.
rf_params = {"n_estimators": 100, "random_state": 42}
model = RandomForestRegressor(**rf_params)

model.fit(x_train, y_train)

In [19]:
# Select the test columns in exactly the order used for fitting, so that
# prediction does not depend on how df_test happened to be assembled.
y_pred = model.predict(df_test[x_train.columns])

# Round the continuous regression output to the nearest integer.
rounded_pred = np.round(y_pred).astype(int)

In [20]:
# Pair each test ID with its rounded prediction.
submission = pd.DataFrame({"id": test_id}).assign(sii=rounded_pred)

In [21]:
# Write the submission file without the DataFrame index column.
submission.to_csv(path_or_buf="submission.csv", index=False)