# Import

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.metrics import cohen_kappa_score, classification_report, make_scorer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder, Normalizer, MinMaxScaler, RobustScaler, MaxAbsScaler
from sklearn.model_selection import train_test_split, cross_validate, StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from catboost import CatBoostClassifier
import seaborn as sns
import datetime
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import os

pd.set_option('display.max_rows', 500)
random_state = 42
np.random.seed(random_state)
# data_filepath = Path('./data')
data_filepath = Path('/kaggle/input/child-mind-institute-problematic-internet-use')
KAPPA_SCORER = make_scorer(
    cohen_kappa_score,
    greater_is_better=True,
    weights='quadratic',
)

# Data

In [2]:
!du -hs $data_filepath/*
train_df = pd.read_csv(data_filepath / 'train.csv')
test_df = pd.read_csv(data_filepath / 'test.csv')
train_df.shape, test_df.shape

12K	/kaggle/input/child-mind-institute-problematic-internet-use/data_dictionary.csv
4.0K	/kaggle/input/child-mind-institute-problematic-internet-use/sample_submission.csv
8.0M	/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet
6.3G	/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet
8.0K	/kaggle/input/child-mind-institute-problematic-internet-use/test.csv
924K	/kaggle/input/child-mind-institute-problematic-internet-use/train.csv


((3960, 82), (20, 59))

In [3]:
parquet_partitions = list((data_filepath / 'series_train.parquet').glob('id=*'))
len(parquet_partitions), len(set(str(c).split('=')[1] for c in parquet_partitions) & set(train_df.id))

(996, 996)

In [4]:
%%time
def parse_time_of_day(nanoseconds):
    seconds = nanoseconds // 10**9
    nanoseconds_remainder = nanoseconds % 10**9
    time_of_day = datetime.timedelta(seconds=seconds)
    hours, remainder = divmod(time_of_day.seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
#     formatted_time = f"{hours:02}:{minutes:02}:{seconds:02}.{nanoseconds_remainder:09}"
    return hours, minutes, seconds, nanoseconds_remainder

def convert_ts_to_row(ts):
    agg_cols = ['X', 'Y', 'Z', 'enmo', 'anglez', 'light', 'battery_voltage']
    onerow = ts.agg(['min', 'max', 'std', 'mean', 'median']).unstack().to_frame().T
    onerow.columns = ['_'.join(c) for c in onerow.columns]
    return onerow

def handle_parquet_partition(partition_filepath):
    user_id = str(partition_filepath).split('=')[-1]
    ts_df = pd.read_parquet(partition_filepath)
    ts_onerow = convert_ts_to_row(ts_df.copy())
    ts_onerow['id'] = user_id
    
    ts_df['day_hour'] = ts_df.time_of_day.apply(lambda x: parse_time_of_day(x)[0])
    ts_df['week'] = ts_df['relative_date_PCIAT'] // 7 + 1
    ts_df['month'] = ts_df['relative_date_PCIAT'] // 30 + 1
    
    # Количество периодов без движения
    # Count of periods without activity
    ts_onerow['total_inactivity_periods'] = ts_df[ts_df.enmo == 0].enmo.count()
    
    # Среднее количество часов активности в день
    # Avg hours of activity per day
    cnt_of_active_hours = ts_df[(ts_df['non-wear_flag'] == 0) & (ts_df.enmo > 0)].drop_duplicates(['relative_date_PCIAT', 'day_hour']).day_hour.count()
    cnt_of_days = len(ts_df['relative_date_PCIAT'].unique())
    ts_onerow['avg_active_hours_per_day'] = (cnt_of_active_hours or 1) / (cnt_of_days or 1)
    
    # weekend flag
    ts_onerow['weekend_flag'] = ts_df.weekday.isin([6,7]).astype(int)
    
    # Hour of pick activity
    # Пиковое время активности для клиента
    peak_hour_of_activity = ts_df[(ts_df['non-wear_flag'] == 0) & (ts_df.enmo > 0)].groupby([ts_df['day_hour']]).enmo.count()
    ts_onerow['peak_hour_of_activity'] = peak_hour_of_activity.sort_values(ascending=False).index[0]
    
    activity_mask = (ts_df['non-wear_flag'] == 0) & (ts_df.enmo > 0)

    # Relation of activity in weekday and weekend
    # Соотношение активности будни/выходные
    weekday_activity = ts_df[activity_mask & (~ts_df.weekday.isin([6,7]))].drop_duplicates().enmo.count()
    weekend_activity = ts_df[activity_mask & (ts_df.weekday.isin([6,7]))].drop_duplicates().enmo.count()
    ts_onerow['weekday_weekend_activity_ratio'] = (weekday_activity or 1) / (weekend_activity or 1)
    
    # Activity in differentn parts of day
    # Активность в разные части дня
    night_mask = (ts_df.day_hour > 0) & (ts_df.day_hour <= 6)
    morning_mask = (ts_df.day_hour > 6) & (ts_df.day_hour <= 12)
    day_mask = (ts_df.day_hour > 12) & (ts_df.day_hour <= 18)
    evening_mask = (ts_df.day_hour > 18) & ((ts_df.day_hour <= 23) | (ts_df.day_hour == 0))
    
    ts_onerow['night_activity'] = ts_df[activity_mask & night_mask].drop_duplicates().enmo.count()
    ts_onerow['morning_activity'] = ts_df[activity_mask & morning_mask].drop_duplicates().enmo.count()
    ts_onerow['day_activity'] = ts_df[activity_mask & day_mask].drop_duplicates().enmo.count()
    ts_onerow['evening_activity'] = ts_df[activity_mask & evening_mask].drop_duplicates().enmo.count()
    
    # Total time of device wear
    # Общее время ношения устройства
    ts_onerow['cnt_of_device_wearing_hours'] = ts_df[(ts_df['non-wear_flag'] == 0)].drop_duplicates(['relative_date_PCIAT', 'day_hour']).day_hour.count()

    # Relation of wearing / no wearing device
    # Соотношение ношения/не ношения устройства
    cnt_of_device_wearing_hours = ts_df[(ts_df['non-wear_flag'] == 0)].drop_duplicates(['relative_date_PCIAT', 'day_hour']).day_hour.count()
    cnt_of_device_no_wearing_hours = ts_df[(ts_df['non-wear_flag'] == 1)].drop_duplicates(['relative_date_PCIAT', 'day_hour']).day_hour.count()
    ts_onerow['relation_of_wearnig_no_wearing'] = (cnt_of_device_wearing_hours or 1) / (cnt_of_device_no_wearing_hours or 1)
    
    # Before / after PCIAT test activitys
    # Активность до и после теста PCIAT
    activity_mask = (ts_df['non-wear_flag'] == 0) & (ts_df.enmo > 0)
    pciat_period_mask = ts_df['relative_date_PCIAT'] >= 0
    pciat_period_activity = ts_df[activity_mask & pciat_period_mask].drop_duplicates().enmo.count()
    pre_pciat_period_activity = ts_df[activity_mask & (~pciat_period_mask)].drop_duplicates().enmo.count()
    ts_onerow['pciat_period_activity'] = pciat_period_activity
    ts_onerow['pre_pciat_period_activity'] = pre_pciat_period_activity
    
    # Relation before / after PCIAT test activitys
    # Соотношение активность до и после теста PCIAT
    ts_onerow['relation_pre_pciat_period_activity'] = (pciat_period_activity or 1) / (pre_pciat_period_activity or 1)
    return ts_onerow

with ThreadPoolExecutor(max_workers=os.cpu_count() * 2) as executor:
    results = list(tqdm(executor.map(lambda fpath: handle_parquet_partition(fpath), parquet_partitions), total=len(parquet_partitions)))
ts_train = pd.concat(results); del results
ts_train

100%|██████████| 996/996 [14:45<00:00,  1.13it/s]


CPU times: user 25min 10s, sys: 1min 14s, total: 26min 24s
Wall time: 14min 45s


Unnamed: 0,step_min,step_max,step_std,step_mean,step_median,X_min,X_max,X_std,X_mean,X_median,...,weekday_weekend_activity_ratio,night_activity,morning_activity,day_activity,evening_activity,cnt_of_device_wearing_hours,relation_of_wearnig_no_wearing,pciat_period_activity,pre_pciat_period_activity,relation_pre_pciat_period_activity
0,0.0,50457.0,14566.114278,25228.5,25228.5,-1.812031,1.850391,0.633126,-0.054638,0.015846,...,2.211408,2774,12428,19762,13853,230,230.000000,49263,0,49263.0
0,0.0,340583.0,98318.276378,170291.5,170291.5,-1.807955,1.928769,0.507897,0.113277,0.094074,...,2.998863,63927,73024,79628,60441,457,17.576923,288450,0,288450.0
0,0.0,40002.0,11548.015746,20001.0,20001.0,-1.903281,1.021510,0.454021,-0.499738,-0.644505,...,1918.550000,7653,10960,14441,5231,112,112.000000,38391,0,38391.0
0,0.0,223914.0,64638.837100,111957.0,111957.0,-1.684624,5.908000,0.586100,0.007430,0.022344,...,1.717158,19105,80538,83177,35178,527,527.000000,220424,0,220424.0
0,0.0,15419.0,4451.514911,7709.5,7709.5,-1.675859,3.231563,0.509845,0.086653,0.053034,...,1.382613,1442,1704,5582,5743,122,122.000000,14553,0,14553.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,0.0,394127.0,113775.097785,197063.5,197063.5,-1.407426,1.491908,0.591072,-0.067798,-0.092813,...,2.825436,85054,94607,97704,75336,548,109.600000,365727,0,365727.0
0,0.0,1194.0,345.111093,597.0,597.0,-1.064844,1.353594,0.269882,0.097154,0.120130,...,57.000000,10,795,340,15,42,42.000000,1160,0,1160.0
0,0.0,393239.0,113518.754266,196619.5,196619.5,-1.508058,0.999923,0.478085,-0.147508,-0.112749,...,2.985063,48898,64654,65096,46201,445,3.156028,232381,0,232381.0
0,0.0,40084.0,11571.687107,20042.0,20042.0,-1.073320,1.004674,0.502446,-0.441574,-0.599089,...,4.476457,8400,8624,12199,8665,57,57.000000,39195,0,39195.0


In [5]:
%%time
ts_train.to_parquet('ts_train.parquet')
!du -hs 'ts_train.parquet'

456K	ts_train.parquet
CPU times: user 26.5 ms, sys: 27.5 ms, total: 54 ms
Wall time: 1.08 s


In [6]:
ts_train = pd.read_parquet('ts_train.parquet')
ts_train.shape

(996, 80)

In [10]:
%%time
train_df = pd.read_csv(data_filepath / 'train.csv')
print(train_df.shape, ts_train.shape)
train_df = pd.merge(train_df, ts_train, on='id', how='inner')
train_df.shape

(3960, 82) (996, 80)
CPU times: user 43.4 ms, sys: 1.82 ms, total: 45.3 ms
Wall time: 49.6 ms


(996, 161)

In [14]:
def feature_engineering(df):
    # season_cols = [col for col in df.columns if 'Season' in col]
    # df = df.drop(season_cols, axis=1)
    pciat_cols = [c for c in df.columns if c.startswith('PCIAT')]
    df = df.drop(columns=pciat_cols, errors='ignore')
    df['BMI_Age'] = df['Physical-BMI'] * df['Basic_Demos-Age']
    df['Internet_Hours_Age'] = df['PreInt_EduHx-computerinternet_hoursday'] * df['Basic_Demos-Age']
    df['BMI_Internet_Hours'] = df['Physical-BMI'] * df['PreInt_EduHx-computerinternet_hoursday']
    df['BFP_BMI'] = df['BIA-BIA_Fat'] / df['BIA-BIA_BMI']
    df['FFMI_BFP'] = df['BIA-BIA_FFMI'] / df['BIA-BIA_Fat']
    df['FMI_BFP'] = df['BIA-BIA_FMI'] / df['BIA-BIA_Fat']
    df['LST_TBW'] = df['BIA-BIA_LST'] / df['BIA-BIA_TBW']
    df['BFP_BMR'] = df['BIA-BIA_Fat'] * df['BIA-BIA_BMR']
    df['BFP_DEE'] = df['BIA-BIA_Fat'] * df['BIA-BIA_DEE']
    df['BMR_Weight'] = df['BIA-BIA_BMR'] / df['Physical-Weight']
    df['DEE_Weight'] = df['BIA-BIA_DEE'] / df['Physical-Weight']
    df['SMM_Height'] = df['BIA-BIA_SMM'] / df['Physical-Height']
    df['Muscle_to_Fat'] = df['BIA-BIA_SMM'] / df['BIA-BIA_FMI']
    df['Hydration_Status'] = df['BIA-BIA_TBW'] / df['Physical-Weight']
    df['ICW_TBW'] = df['BIA-BIA_ICW'] / df['BIA-BIA_TBW']

    df['Age_Weight'] = df['Basic_Demos-Age'] * df['Physical-Weight']
    df['Sex_BMI'] = df['Basic_Demos-Sex'] * df['Physical-BMI']
    df['Sex_HeartRate'] = df['Basic_Demos-Sex'] * df['Physical-HeartRate']
    df['Age_WaistCirc'] = df['Basic_Demos-Age'] * df['Physical-Waist_Circumference']
    df['BMI_FitnessMaxStage'] = df['Physical-BMI'] * df['Fitness_Endurance-Max_Stage']
    df['Weight_GripStrengthDominant'] = df['Physical-Weight'] * df['FGC-FGC_GSD']
    df['Weight_GripStrengthNonDominant'] = df['Physical-Weight'] * df['FGC-FGC_GSND']
    df['HeartRate_FitnessTime'] = df['Physical-HeartRate'] * (df['Fitness_Endurance-Time_Mins'] + df['Fitness_Endurance-Time_Sec'])
    df['Age_PushUp'] = df['Basic_Demos-Age'] * df['FGC-FGC_PU']
    df['FFMI_Age'] = df['BIA-BIA_FFMI'] * df['Basic_Demos-Age']
    df['InternetUse_SleepDisturbance'] = df['PreInt_EduHx-computerinternet_hoursday'] * df['SDS-SDS_Total_Raw']
    df['CGAS_BMI'] = df['CGAS-CGAS_Score'] * df['Physical-BMI']
    df['CGAS_FitnessMaxStage'] = df['CGAS-CGAS_Score'] * df['Fitness_Endurance-Max_Stage']
    return df

train_df = feature_engineering(train_df)

In [15]:
train_df = train_df.replace([np.inf, -np.inf], np.nan)

In [16]:
%%time
train_df_imputed = train_df.drop(columns='id').copy()
cat_cols = train_df_imputed.select_dtypes(include='object').columns.tolist()
num_cols = train_df_imputed.select_dtypes(include='number').columns.tolist()
print(f'{len(cat_cols)=} {len(num_cols)=}')
impute_cols = list(set(num_cols + ['sii']))
knn_imputer = KNNImputer(n_neighbors=5)
train_df_imputed.loc[:,impute_cols] = knn_imputer.fit_transform(train_df_imputed[impute_cols])
train_df_imputed.loc[:,cat_cols] = train_df_imputed[cat_cols].fillna('missing')
train_df_imputed['sii'] = train_df_imputed.sii.round().astype(int)
train_df_imputed.isna().sum().sum()

len(cat_cols)=10 len(num_cols)=156
CPU times: user 706 ms, sys: 240 ms, total: 946 ms
Wall time: 504 ms


0

In [17]:
train_df.sii.value_counts(dropna=False)

sii
0.0    583
1.0    266
2.0    137
3.0     10
Name: count, dtype: int64

In [20]:
train_df_imputed.sii.value_counts(dropna=False)

sii
0    583
1    266
2    137
3     10
Name: count, dtype: int64

In [21]:
cols_to_drop = ['sii', 'id']
X, y = train_df_imputed.drop(columns=cols_to_drop, errors='ignore'), train_df_imputed.sii
cat_cols = X.select_dtypes(include='object').columns.tolist()
num_cols = X.select_dtypes(include='number').columns.tolist()
X.shape, y.shape

((996, 165), (996,))

In [22]:
X.isna().sum().sum()

0

In [23]:
numeric_transormer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    # ('imputer', SimpleImputer(strategy='constant', fill_value=-999)),
])
category_transormer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OrdinalEncoder()),
])
cols_transformer = ColumnTransformer(transformers=[
    ('numeric', numeric_transormer, num_cols),
    ('category', category_transormer, cat_cols),
])
preproc_df_pipe = Pipeline(steps=[
    ('cols_transformer', cols_transformer),
    # ('scaler', StandardScaler()), # MinMaxScaler, StandardScaler, RobustScaler, MaxAbsScaler
    ('scaler', MinMaxScaler()),
    # ('scaler', RobustScaler()),
    # ('scaler', MaxAbsScaler()),
])
preproc_df_pipe

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, stratify=y, random_state=random_state)
X_train = preproc_df_pipe.fit_transform(X_train)
X_test = preproc_df_pipe.transform(X_test)

In [25]:
%%time
clf = CatBoostClassifier()
clf.fit(
    X_train, y_train,
    eval_set=(X_test, y_test),
    logging_level='Silent', 
)
preds = clf.predict(X_test)
print(cohen_kappa_score(y_test, preds, weights='quadratic'))
print(classification_report(y_test, preds))

0.17846976410307192
              precision    recall  f1-score   support

           0       0.63      0.87      0.73       175
           1       0.28      0.19      0.23        80
           2       0.50      0.05      0.09        41
           3       0.00      0.00      0.00         3

    accuracy                           0.57       299
   macro avg       0.35      0.28      0.26       299
weighted avg       0.51      0.57      0.50       299

CPU times: user 2min 45s, sys: 1.97 s, total: 2min 47s
Wall time: 43.4 s


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [26]:
%%time
_clf = CatBoostClassifier(verbose=0)
cv = StratifiedShuffleSplit(n_splits=5, test_size=.3, random_state=random_state)
_pipeline = Pipeline([('transformer', preproc_df_pipe), ('estimator', _clf)])
scoring = {
    'f1': 'f1_macro',
    'cohen_kappa_score': KAPPA_SCORER, 
}
scores = cross_validate(_pipeline, X, y, cv=cv, scoring=scoring)
for metric, folds_score in scores.items():
    if not metric.startswith('test_'): continue
    print(metric, folds_score.mean().round(4), folds_score.std().round(4))

test_f1 0.313 0.0124
test_cohen_kappa_score 0.2852 0.0437
CPU times: user 13min 47s, sys: 11.1 s, total: 13min 58s
Wall time: 3min 39s


In [28]:
%%time
test_parquet_partitions = list((data_filepath / 'series_test.parquet').glob('id=*'))
with ThreadPoolExecutor(max_workers=os.cpu_count() * 2) as executor:
    results = list(tqdm(executor.map(lambda fpath: handle_parquet_partition(fpath), test_parquet_partitions), total=len(test_parquet_partitions)))
ts_test = pd.concat(results); del results
ts_test

100%|██████████| 2/2 [00:01<00:00,  1.53it/s]

CPU times: user 1.45 s, sys: 73.1 ms, total: 1.53 s
Wall time: 1.32 s





Unnamed: 0,step_min,step_max,step_std,step_mean,step_median,X_min,X_max,X_std,X_mean,X_median,...,weekday_weekend_activity_ratio,night_activity,morning_activity,day_activity,evening_activity,cnt_of_device_wearing_hours,relation_of_wearnig_no_wearing,pciat_period_activity,pre_pciat_period_activity,relation_pre_pciat_period_activity
0,0.0,43329.0,12508.437919,21664.5,21664.5,-1.746094,1.507865,0.453665,-0.316384,-0.366849,...,2.118516,4090,11725,16346,8462,268,268.0,40759,0,40759.0
0,0.0,396395.0,114429.81299,198197.5,198197.5,-1.038711,1.034351,0.351545,-0.004272,-0.020622,...,2.870638,27828,30012,30269,28449,223,0.586842,120133,0,120133.0


In [61]:
test_df = pd.read_csv(data_filepath / 'test.csv')
test_df = pd.merge(test_df, ts_test, on='id', how='inner')
test_df = feature_engineering(test_df)
test_df['sii'] = 2
test_df.loc[:,impute_cols] = knn_imputer.transform(test_df[impute_cols])
test_df.drop(columns='sii', inplace=True)
# test_df.loc[:,num_cols] = KNNImputer(n_neighbors=5).fit_transform(test_df[num_cols])
test_df.loc[:,cat_cols] = test_df[cat_cols].fillna('missing')
_id = test_df.id
test_df = pd.DataFrame(preproc_df_pipe.transform(test_df), columns=test_df[X.columns].columns)
test_df['id'] = _id; del _id
test_df['sii'] = clf.predict(test_df[X.columns])
test_df.sii.value_counts()

sii
0    1
1    1
Name: count, dtype: int64

In [62]:
test_df[['id', 'sii']].to_csv('submission.csv', index=False)
!du -hs 'submission.csv'

4.0K	submission.csv
