In [1]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

In [2]:
train_df = pd.read_csv("child-mind-institute-problematic-internet-use/train.csv")
test_df = pd.read_csv("child-mind-institute-problematic-internet-use/test.csv")

In [3]:
def process_file(filename, dirname):
    df = pd.read_parquet(os.path.join(dirname, filename, 'part-0.parquet'))
    df.drop('step', axis=1, inplace=True)
    return df.describe().values.reshape(-1), filename.split('=')[1]

In [4]:
def load_time_series(dirname) -> pd.DataFrame:
    ids = os.listdir(dirname)
    
    with ThreadPoolExecutor() as executor:
        results = list(tqdm(executor.map(lambda fname: process_file(fname, dirname), ids), total=len(ids)))
    
    stats, indexes = zip(*results)
    
    df = pd.DataFrame(stats, columns=[f"Stat_{i}" for i in range(len(stats[0]))])
    df['id'] = indexes
    
    return df

In [5]:
train_ts = load_time_series("child-mind-institute-problematic-internet-use/series_train.parquet")
test_ts = load_time_series("child-mind-institute-problematic-internet-use/series_test.parquet")

100%|██████████| 996/996 [01:39<00:00, 10.03it/s]
100%|██████████| 2/2 [00:00<00:00,  6.41it/s]


In [6]:
time_series_cols = train_ts.columns.tolist()

In [7]:
time_series_cols.remove("id")

In [None]:
train_df = pd.merge(train_df, train_ts, how="left", on='id')
test_df = pd.merge(test_df, test_ts, how="left", on='id')

In [8]:
train_df = train_df.drop('id', axis=1)
test_df = test_df.drop('id', axis=1)

In [9]:
TARGET_COLS = [
    "PCIAT-Season",
    "PCIAT-PCIAT_01",
    "PCIAT-PCIAT_02",
    "PCIAT-PCIAT_03",
    "PCIAT-PCIAT_04",
    "PCIAT-PCIAT_05",
    "PCIAT-PCIAT_06",
    "PCIAT-PCIAT_07",
    "PCIAT-PCIAT_08",
    "PCIAT-PCIAT_09",
    "PCIAT-PCIAT_10",
    "PCIAT-PCIAT_11",
    "PCIAT-PCIAT_12",
    "PCIAT-PCIAT_13",
    "PCIAT-PCIAT_14",
    "PCIAT-PCIAT_15",
    "PCIAT-PCIAT_16",    
    "PCIAT-PCIAT_17",
    "PCIAT-PCIAT_18",
    "PCIAT-PCIAT_19",
    "PCIAT-PCIAT_20",
    "PCIAT-PCIAT_Total"
]

In [10]:
train_df = train_df.drop(TARGET_COLS,axis=1)

In [11]:
train_df = train_df.dropna(subset=['sii'])

In [12]:
SEASON_COLS = [
    "Basic_Demos-Enroll_Season", 
    "CGAS-Season", 
    "Physical-Season", 
    "Fitness_Endurance-Season", 
    "FGC-Season", 
    "BIA-Season", 
    "PAQ_A-Season", 
    "PAQ_C-Season", 
    "SDS-Season",
    "PreInt_EduHx-Season"]

In [13]:
def update(df):
    for c in SEASON_COLS: 
        df[c] = df[c].fillna('Missing')
        df[c] = df[c].astype('category')
    return df
train_df = update(train_df)
test_df = update(test_df)

In [14]:
season_mapping = {'Spring': 0, 'Summer': 1, 'Fall': 2, 'Winter': 3, 'Missing': 4}

In [15]:
for col in SEASON_COLS:
    train_df[col] = train_df[col].map(season_mapping)
    test_df[col] = test_df[col].map(season_mapping)

In [16]:
train_df

Unnamed: 0,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,...,PAQ_A-Season,PAQ_A-PAQ_A_Total,PAQ_C-Season,PAQ_C-PAQ_C_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,sii
0,2,5,0,3,51.0,2,16.877316,46.0,50.8,,...,4,,4,,4,,,2,3.0,2.0
1,1,9,0,4,,2,14.035590,48.0,46.0,22.0,...,4,,2,2.340,2,46.0,64.0,1,0.0,0.0
2,1,10,1,2,71.0,2,16.648696,56.5,75.6,,...,4,,1,2.170,2,38.0,54.0,1,2.0,0.0
3,3,9,0,2,71.0,1,18.292347,56.0,81.6,,...,4,,3,2.451,1,31.0,45.0,3,0.0,1.0
5,0,13,1,3,50.0,1,22.279952,59.5,112.2,,...,4,,0,4.110,1,40.0,56.0,0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3953,2,8,0,4,,2,17.139810,52.5,67.2,25.0,...,4,,2,3.440,2,41.0,58.0,2,2.0,0.0
3954,1,7,1,4,,1,13.927006,48.5,46.6,23.0,...,4,,4,,1,48.0,67.0,1,0.0,1.0
3955,2,13,0,0,60.0,2,16.362460,59.5,82.4,,...,4,,3,3.260,3,35.0,50.0,2,1.0,1.0
3957,2,11,0,0,68.0,3,21.441500,60.0,109.8,,...,4,,3,2.729,3,56.0,77.0,2,0.0,1.0


In [18]:
imputer = SimpleImputer(strategy='median')
train_df = pd.DataFrame(imputer.fit_transform(train_df), columns=train_df.columns)
test_df = pd.DataFrame(imputer.fit_transform(test_df), columns=test_df.columns)

In [19]:
train_df

Unnamed: 0,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,...,PAQ_A-Season,PAQ_A-PAQ_A_Total,PAQ_C-Season,PAQ_C-PAQ_C_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,sii
0,2.0,5.0,0.0,3.0,51.0,2.0,16.877316,46.0,50.8,26.0,...,4.0,2.08,4.0,2.550,4.0,39.0,55.0,2.0,3.0,2.0
1,1.0,9.0,0.0,4.0,65.0,2.0,14.035590,48.0,46.0,22.0,...,4.0,2.08,2.0,2.340,2.0,46.0,64.0,1.0,0.0,0.0
2,1.0,10.0,1.0,2.0,71.0,2.0,16.648696,56.5,75.6,26.0,...,4.0,2.08,1.0,2.170,2.0,38.0,54.0,1.0,2.0,0.0
3,3.0,9.0,0.0,2.0,71.0,1.0,18.292347,56.0,81.6,26.0,...,4.0,2.08,3.0,2.451,1.0,31.0,45.0,3.0,0.0,1.0
4,0.0,13.0,1.0,3.0,50.0,1.0,22.279952,59.5,112.2,26.0,...,4.0,2.08,0.0,4.110,1.0,40.0,56.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2731,2.0,8.0,0.0,4.0,65.0,2.0,17.139810,52.5,67.2,25.0,...,4.0,2.08,2.0,3.440,2.0,41.0,58.0,2.0,2.0,0.0
2732,1.0,7.0,1.0,4.0,65.0,1.0,13.927006,48.5,46.6,23.0,...,4.0,2.08,4.0,2.550,1.0,48.0,67.0,1.0,0.0,1.0
2733,2.0,13.0,0.0,0.0,60.0,2.0,16.362460,59.5,82.4,26.0,...,4.0,2.08,3.0,3.260,3.0,35.0,50.0,2.0,1.0,1.0
2734,2.0,11.0,0.0,0.0,68.0,3.0,21.441500,60.0,109.8,26.0,...,4.0,2.08,3.0,2.729,3.0,56.0,77.0,2.0,0.0,1.0


In [20]:
train_df = pd.merge(train_df, train_ts, how="left", on='id')
test_df = pd.merge(test_df, test_ts, how="left", on='id')

KeyError: 'id'

In [25]:
print(f'Train Shape : {train_df.shape} || Test Shape : {test_df.shape}')

Train Shape : (2736, 155) || Test Shape : (20, 154)
