# Import

In [77]:
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.metrics import cohen_kappa_score, classification_report, make_scorer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder, Normalizer, MinMaxScaler, RobustScaler, MaxAbsScaler
from sklearn.model_selection import train_test_split, cross_validate, StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from catboost import CatBoostClassifier
import seaborn as sns
import datetime
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import os

pd.set_option('display.max_rows', 500)
random_state = 42
np.random.seed(random_state)
# data_filepath = Path('./data')
data_filepath = Path('/kaggle/input/child-mind-institute-problematic-internet-use')
KAPPA_SCORER = make_scorer(
    cohen_kappa_score,
    greater_is_better=True,
    weights='quadratic',
)

# Data

In [41]:
!du -hs $data_filepath/*
train_df = pd.read_csv(data_filepath / 'train.csv')
test_df = pd.read_csv(data_filepath / 'test.csv')
train_df.shape, test_df.shape

12K	/kaggle/input/child-mind-institute-problematic-internet-use/data_dictionary.csv
4.0K	/kaggle/input/child-mind-institute-problematic-internet-use/sample_submission.csv
8.0M	/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet
6.3G	/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet
8.0K	/kaggle/input/child-mind-institute-problematic-internet-use/test.csv
924K	/kaggle/input/child-mind-institute-problematic-internet-use/train.csv


((3960, 82), (20, 59))

In [51]:
parquet_partitions = list((data_filepath / 'series_train.parquet').glob('id=*'))
len(parquet_partitions), len(set(str(c).split('=')[1] for c in parquet_partitions) & set(train_df.id))

(996, 996)

In [52]:
%%time
parquet_filepath = parquet_partitions[0]
print(f'{parquet_filepath=!s}')
train_ts = pd.read_parquet(parquet_filepath)
train_ts.info()

parquet_filepath=/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet/id=0745c390
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50458 entries, 0 to 50457
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   step                 50458 non-null  uint32 
 1   X                    50458 non-null  float32
 2   Y                    50458 non-null  float32
 3   Z                    50458 non-null  float32
 4   enmo                 50458 non-null  float32
 5   anglez               50458 non-null  float32
 6   non-wear_flag        50458 non-null  float32
 7   light                50458 non-null  float32
 8   battery_voltage      50458 non-null  float32
 9   time_of_day          50458 non-null  int64  
 10  weekday              50458 non-null  int8   
 11  quarter              50458 non-null  int8   
 12  relative_date_PCIAT  50458 non-null  float32
dtypes: float32(9), int64(1), i

In [116]:
train_ts.columns

Index(['step', 'X', 'Y', 'Z', 'enmo', 'anglez', 'non-wear_flag', 'light',
       'battery_voltage', 'time_of_day', 'weekday', 'quarter',
       'relative_date_PCIAT', 'day_hour', 'week', 'month'],
      dtype='object')

In [117]:
train_ts.light

0         8.250000
1        15.666667
2        13.583333
3        11.500000
4         1.661765
           ...    
50453     9.625000
50454     9.625000
50455     9.625000
50456     9.625000
50457     9.625000
Name: light, Length: 50458, dtype: float32

In [94]:
def parse_time_of_day(nanoseconds):
    seconds = nanoseconds // 10**9
    nanoseconds_remainder = nanoseconds % 10**9
    time_of_day = datetime.timedelta(seconds=seconds)
    hours, remainder = divmod(time_of_day.seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
#     formatted_time = f"{hours:02}:{minutes:02}:{seconds:02}.{nanoseconds_remainder:09}"
    return hours, minutes, seconds, nanoseconds_remainder

train_ts['day_hour'] = train_ts.time_of_day.apply(lambda x: parse_time_of_day(x)[0])
train_ts['week'] = train_ts['relative_date_PCIAT'] // 7 + 1
train_ts['month'] = train_ts['relative_date_PCIAT'] // 30 + 1

In [120]:
def convert_ts_to_row(ts):
    agg_cols = ['X', 'Y', 'Z', 'enmo', 'anglez', 'light', 'battery_voltage']
    onerow = ts.agg(['min', 'max', 'std', 'mean', 'median']).unstack().to_frame().T
    onerow.columns = ['_'.join(c) for c in onerow.columns]
    return onerow

convert_ts_to_row(train_ts)

Unnamed: 0,step_min,step_max,step_std,step_mean,step_median,X_min,X_max,X_std,X_mean,X_median,...,week_min,week_max,week_std,week_mean,week_median,month_min,month_max,month_std,month_mean,month_median
0,0.0,50457.0,14566.114278,25228.5,25228.5,-1.812031,1.850391,0.633126,-0.054638,0.015846,...,3.0,9.0,1.57732,4.951425,5.0,1.0,2.0,0.496938,1.444706,1.0


In [129]:
# Количество периодов без движения
# Count of periods without activity
total_inactivity_periods = train_ts[train_ts.enmo == 0].enmo.count()
total_inactivity_periods

1195

In [143]:
# Среднее количество часов активности в день
# Avg hours of activity per day
cnt_of_active_hours = train_ts[(train_ts['non-wear_flag'] == 0) & (train_ts.enmo > 0)].drop_duplicates(['relative_date_PCIAT', 'day_hour']).day_hour.count()
cnt_of_days = len(train_ts['relative_date_PCIAT'].unique())
avg_active_hours_per_day = cnt_of_active_hours / cnt_of_days
avg_active_hours_per_day

6.162162162162162

In [146]:
# weekend flag
weekend_flag = train_ts.weekday.isin([6,7]).astype(int)
weekend_flag

0        1
1        1
2        1
3        1
4        1
        ..
50453    1
50454    1
50455    1
50456    1
50457    1
Name: weekday, Length: 50458, dtype: int64