# Import

In [4]:
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.metrics import cohen_kappa_score, classification_report, make_scorer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder, Normalizer, MinMaxScaler, RobustScaler, MaxAbsScaler
from sklearn.model_selection import train_test_split, cross_validate, StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from catboost import CatBoostClassifier
import seaborn as sns
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import os

# Display up to 500 rows when a frame or series is rendered.
pd.options.display.max_rows = 500

# Seed value; also applied to NumPy's legacy global RNG.
random_state = 42
np.random.seed(random_state)

# Competition data directory (Kaggle input path); for a local copy use Path('./data').
data_filepath = Path('/kaggle/input/child-mind-institute-problematic-internet-use')

# Scorer built on Cohen's kappa with quadratic weights; higher scores are better.
KAPPA_SCORER = make_scorer(
    cohen_kappa_score,
    weights='quadratic',
    greater_is_better=True,
)

# Data

In [41]:
!du -hs $data_filepath/*
train_df = pd.read_csv(data_filepath / 'train.csv')
test_df = pd.read_csv(data_filepath / 'test.csv')
train_df.shape, test_df.shape

12K	/kaggle/input/child-mind-institute-problematic-internet-use/data_dictionary.csv
4.0K	/kaggle/input/child-mind-institute-problematic-internet-use/sample_submission.csv
8.0M	/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet
6.3G	/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet
8.0K	/kaggle/input/child-mind-institute-problematic-internet-use/test.csv
924K	/kaggle/input/child-mind-institute-problematic-internet-use/train.csv


((3960, 82), (20, 59))

In [51]:
# Every `id=<value>` entry under series_train.parquet is treated as one partition.
parquet_partitions = list((data_filepath / 'series_train.parquet').glob('id=*'))

# Parse the id from the entry's own name (not the full path string) so that an '='
# anywhere in a parent directory cannot change which piece is picked up.
partition_ids = {partition.name.split('=', 1)[1] for partition in parquet_partitions}

# (number of partitions found, number of partition ids that also occur in train_df.id)
len(parquet_partitions), len(partition_ids & set(train_df.id))

(996, 996)

In [52]:
%%time
# Load a single partition (the first entry of `parquet_partitions`) to inspect its schema.
parquet_filepath = parquet_partitions[0]
# Echo which partition was picked; `=` prints the variable name, `!s` uses str() on the path.
print(f'{parquet_filepath=!s}')
train_ts = pd.read_parquet(parquet_filepath)
# Summarise column names, non-null counts and dtypes of the loaded frame.
train_ts.info()

parquet_filepath=/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet/id=0745c390
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50458 entries, 0 to 50457
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   step                 50458 non-null  uint32 
 1   X                    50458 non-null  float32
 2   Y                    50458 non-null  float32
 3   Z                    50458 non-null  float32
 4   enmo                 50458 non-null  float32
 5   anglez               50458 non-null  float32
 6   non-wear_flag        50458 non-null  float32
 7   light                50458 non-null  float32
 8   battery_voltage      50458 non-null  float32
 9   time_of_day          50458 non-null  int64  
 10  weekday              50458 non-null  int8   
 11  quarter              50458 non-null  int8   
 12  relative_date_PCIAT  50458 non-null  float32
dtypes: float32(9), int64(1), i

In [62]:
# Display only the step counter and the time-related columns of the loaded series.
train_ts.loc[:, ['step', 'time_of_day', 'weekday', 'relative_date_PCIAT']]

Unnamed: 0,step,time_of_day,weekday,relative_date_PCIAT
0,0,52200000000000,6,15.0
1,1,52205000000000,6,15.0
2,2,54385000000000,6,15.0
3,3,54390000000000,6,15.0
4,4,54395000000000,6,15.0
...,...,...,...,...
50453,50453,33425000000000,6,57.0
50454,50454,33430000000000,6,57.0
50455,50455,33435000000000,6,57.0
50456,50456,33440000000000,6,57.0


In [65]:
import datetime
def parse_time_of_day(nanoseconds):
    """Format a nanosecond offset as a ``HH:MM:SS.nnnnnnnnn`` clock string.

    Args:
        nanoseconds: Offset in nanoseconds. Any value accepted by ``int()``
            works, including NumPy integer scalars (which
            ``datetime.timedelta`` rejects); non-integral values are
            truncated to whole nanoseconds.

    Returns:
        str: Zero-padded ``HH:MM:SS`` followed by a 9-digit nanosecond
        fraction. Offsets of a day or more wrap around, so only the
        time-within-day part is shown.
    """
    # Normalise to a plain Python int so NumPy scalars and floats format consistently.
    nanoseconds = int(nanoseconds)
    total_seconds, nanoseconds_remainder = divmod(nanoseconds, 10**9)
    # Keep only the position within a 24h day (floor modulo, so negatives wrap too).
    seconds_in_day = total_seconds % 86_400
    hours, remainder = divmod(seconds_in_day, 3600)
    minutes, seconds = divmod(remainder, 60)
    return f"{hours:02}:{minutes:02}:{seconds:02}.{nanoseconds_remainder:09}"
# Render every `time_of_day` value of the loaded series as a clock string.
train_ts['time_of_day'].apply(parse_time_of_day)

0        14:30:00.000000000
1        14:30:05.000000000
2        15:06:25.000000000
3        15:06:30.000000000
4        15:06:35.000000000
                ...        
50453    09:17:05.000000000
50454    09:17:10.000000000
50455    09:17:15.000000000
50456    09:17:20.000000000
50457    09:17:25.000000000
Name: time_of_day, Length: 50458, dtype: object