In [None]:
1. Распарсить timeseries

# Import

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.metrics import cohen_kappa_score, classification_report, make_scorer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder, Normalizer, MinMaxScaler, RobustScaler, MaxAbsScaler
from sklearn.model_selection import train_test_split, cross_validate, StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from catboost import CatBoostClassifier
import seaborn as sns

pd.set_option('display.max_rows', 500)
random_state = 42
np.random.seed(random_state)
data_filepath = Path('./data')
# data_filepath = Path('/kaggle/input/child-mind-institute-problematic-internet-use')
KAPPA_SCORER = make_scorer(
    cohen_kappa_score, 
    greater_is_better=True, 
    weights='quadratic',
)

# Data

In [2]:
!du -hs $data_filepath/*
train_df = pd.read_csv(data_filepath / 'train.csv')
test_df = pd.read_csv(data_filepath / 'test.csv')
train_df.shape, test_df.shape

6.2G	data/child-mind-institute-problematic-internet-use.zip
 12K	data/data_dictionary.csv
4.0K	data/sample_submission.csv
7.9M	data/series_test.parquet
6.3G	data/series_train.parquet
8.0K	data/test.csv
924K	data/train.csv


((3960, 82), (20, 59))

# Feature egnineering

In [3]:
def feature_engineering(df):
    # season_cols = [col for col in df.columns if 'Season' in col]
    # df = df.drop(season_cols, axis=1)
    pciat_cols = [c for c in df.columns if c.startswith('PCIAT')]
    df = df.drop(columns=pciat_cols, errors='ignore')
    df['BMI_Age'] = df['Physical-BMI'] * df['Basic_Demos-Age']
    df['Internet_Hours_Age'] = df['PreInt_EduHx-computerinternet_hoursday'] * df['Basic_Demos-Age']
    df['BMI_Internet_Hours'] = df['Physical-BMI'] * df['PreInt_EduHx-computerinternet_hoursday']
    df['BFP_BMI'] = df['BIA-BIA_Fat'] / df['BIA-BIA_BMI']
    df['FFMI_BFP'] = df['BIA-BIA_FFMI'] / df['BIA-BIA_Fat']
    df['FMI_BFP'] = df['BIA-BIA_FMI'] / df['BIA-BIA_Fat']
    df['LST_TBW'] = df['BIA-BIA_LST'] / df['BIA-BIA_TBW']
    df['BFP_BMR'] = df['BIA-BIA_Fat'] * df['BIA-BIA_BMR']
    df['BFP_DEE'] = df['BIA-BIA_Fat'] * df['BIA-BIA_DEE']
    df['BMR_Weight'] = df['BIA-BIA_BMR'] / df['Physical-Weight']
    df['DEE_Weight'] = df['BIA-BIA_DEE'] / df['Physical-Weight']
    df['SMM_Height'] = df['BIA-BIA_SMM'] / df['Physical-Height']
    df['Muscle_to_Fat'] = df['BIA-BIA_SMM'] / df['BIA-BIA_FMI']
    df['Hydration_Status'] = df['BIA-BIA_TBW'] / df['Physical-Weight']
    df['ICW_TBW'] = df['BIA-BIA_ICW'] / df['BIA-BIA_TBW']

    df['Age_Weight'] = df['Basic_Demos-Age'] * df['Physical-Weight']
    df['Sex_BMI'] = df['Basic_Demos-Sex'] * df['Physical-BMI']
    df['Sex_HeartRate'] = df['Basic_Demos-Sex'] * df['Physical-HeartRate']
    df['Age_WaistCirc'] = df['Basic_Demos-Age'] * df['Physical-Waist_Circumference']
    df['BMI_FitnessMaxStage'] = df['Physical-BMI'] * df['Fitness_Endurance-Max_Stage']
    df['Weight_GripStrengthDominant'] = df['Physical-Weight'] * df['FGC-FGC_GSD']
    df['Weight_GripStrengthNonDominant'] = df['Physical-Weight'] * df['FGC-FGC_GSND']
    df['HeartRate_FitnessTime'] = df['Physical-HeartRate'] * (df['Fitness_Endurance-Time_Mins'] + df['Fitness_Endurance-Time_Sec'])
    df['Age_PushUp'] = df['Basic_Demos-Age'] * df['FGC-FGC_PU']
    df['FFMI_Age'] = df['BIA-BIA_FFMI'] * df['Basic_Demos-Age']
    df['InternetUse_SleepDisturbance'] = df['PreInt_EduHx-computerinternet_hoursday'] * df['SDS-SDS_Total_Raw']
    df['CGAS_BMI'] = df['CGAS-CGAS_Score'] * df['Physical-BMI']
    df['CGAS_FitnessMaxStage'] = df['CGAS-CGAS_Score'] * df['Fitness_Endurance-Max_Stage']
    return df

train_df = feature_engineering(train_df)

In [4]:
train_df = train_df.replace([np.inf, -np.inf], np.nan)

# Fill missing features and target

In [5]:
train_df

Unnamed: 0,id,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,...,Age_WaistCirc,BMI_FitnessMaxStage,Weight_GripStrengthDominant,Weight_GripStrengthNonDominant,HeartRate_FitnessTime,Age_PushUp,FFMI_Age,InternetUse_SleepDisturbance,CGAS_BMI,CGAS_FitnessMaxStage
0,00008ff9,Fall,5,0,Winter,51.0,Fall,16.877316,46.0,50.8,...,,,,,,0.0,69.0885,,860.743100,
1,000fd460,Summer,9,0,,,Fall,14.035590,48.0,46.0,...,198.0,,,,,45.0,115.4286,0.0,,
2,00105258,Summer,10,1,Fall,71.0,Fall,16.648696,56.5,75.6,...,,83.243480,1111.32,771.12,3760.0,70.0,,76.0,1182.057420,355.0
3,00115b9f,Winter,9,0,Fall,71.0,Summer,18.292347,56.0,81.6,...,,109.754082,,,4462.0,45.0,126.6660,0.0,1298.756633,426.0
4,0016bb22,Spring,18,1,Summer,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3955,ff8a2de4,Fall,13,0,Spring,60.0,Fall,16.362460,59.5,82.4,...,,,1639.76,1483.20,,130.0,182.8177,35.0,981.747617,
3956,ffa9794a,Winter,10,0,,,Spring,18.764678,53.5,76.4,...,270.0,,,,,40.0,,,,
3957,ffcd4dbd,Fall,11,0,Spring,68.0,Winter,21.441500,60.0,109.8,...,,,1734.84,2031.30,,0.0,162.8473,0.0,1458.022000,
3958,ffed1dd5,Spring,13,0,Spring,70.0,Winter,12.235895,70.7,87.0,...,,,,,,,169.8892,33.0,856.512667,


In [6]:
%%time
train_df_imputed = train_df.drop(columns='id').copy()
cat_cols = train_df_imputed.select_dtypes(include='object').columns.tolist()
num_cols = train_df_imputed.select_dtypes(include='number').columns.tolist()
print(f'{len(cat_cols)=} {len(num_cols)=}')
impute_cols = list(set(num_cols + ['sii']))
knn_imputer = KNNImputer(n_neighbors=5)
train_df_imputed.loc[:,impute_cols] = knn_imputer.fit_transform(train_df_imputed[impute_cols])
train_df_imputed.loc[:,cat_cols] = train_df_imputed[cat_cols].fillna('missing')
train_df_imputed['sii'] = train_df_imputed.sii.round().astype(int)
train_df_imputed.isna().sum().sum()

len(cat_cols)=10 len(num_cols)=77
CPU times: user 5.58 s, sys: 2.31 s, total: 7.89 s
Wall time: 4.62 s


0

In [7]:
train_df.sii.value_counts(dropna=False)

sii
0.0    1594
NaN    1224
1.0     730
2.0     378
3.0      34
Name: count, dtype: int64

In [8]:
train_df_imputed.sii.value_counts(dropna=False)

sii
0    2209
1    1275
2     442
3      34
Name: count, dtype: int64

In [9]:
cols_to_drop = ['sii', 'id']
X, y = train_df_imputed.drop(columns=cols_to_drop, errors='ignore'), train_df_imputed.sii
cat_cols = X.select_dtypes(include='object').columns.tolist()
num_cols = X.select_dtypes(include='number').columns.tolist()
X.shape, y.shape

((3960, 86), (3960,))

In [10]:
X.isna().sum().sum()

0

# Catboost

In [25]:
numeric_transormer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    # ('imputer', SimpleImputer(strategy='constant', fill_value=-999)),
])
category_transormer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OrdinalEncoder()),
])
cols_transformer = ColumnTransformer(transformers=[
    ('numeric', numeric_transormer, num_cols),
    ('category', category_transormer, cat_cols),
])
preproc_df_pipe = Pipeline(steps=[
    ('cols_transformer', cols_transformer),
    # ('scaler', StandardScaler()), # MinMaxScaler, StandardScaler, RobustScaler, MaxAbsScaler
    ('scaler', MinMaxScaler()),
    # ('scaler', RobustScaler()),
    # ('scaler', MaxAbsScaler()),
])
preproc_df_pipe

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, stratify=y, random_state=random_state)
X_train = preproc_df_pipe.fit_transform(X_train)
X_test = preproc_df_pipe.transform(X_test)

In [27]:
%%time
clf = CatBoostClassifier()
clf.fit(
    X_train, y_train,
    eval_set=(X_test, y_test),
    logging_level='Silent', 
)
preds = clf.predict(X_test)
print(cohen_kappa_score(y_test, preds, weights='quadratic'))
print(classification_report(y_test, preds))

0.4186241610738254
              precision    recall  f1-score   support

           0       0.72      0.88      0.79       663
           1       0.59      0.53      0.56       382
           2       0.44      0.14      0.22       133
           3       0.00      0.00      0.00        10

    accuracy                           0.68      1188
   macro avg       0.44      0.39      0.39      1188
weighted avg       0.64      0.68      0.65      1188

CPU times: user 33.8 s, sys: 2.34 s, total: 36.1 s
Wall time: 5.34 s


# CV

In [28]:
%%time
_clf = CatBoostClassifier(verbose=0)
cv = StratifiedShuffleSplit(n_splits=5, test_size=.3, random_state=random_state)
_pipeline = Pipeline([('transformer', preproc_df_pipe), ('estimator', _clf)])
scoring = {
    'f1': 'f1_macro',
    'cohen_kappa_score': KAPPA_SCORER, 
}
scores = cross_validate(_pipeline, X, y, cv=cv, scoring=scoring)
for metric, folds_score in scores.items():
    if not metric.startswith('test_'): continue
    print(metric, folds_score.mean().round(4), folds_score.std().round(4))

test_f1 0.4065 0.0088
test_cohen_kappa_score 0.4523 0.0203
CPU times: user 2min 48s, sys: 11.2 s, total: 2min 59s
Wall time: 26.1 s


# Save

In [15]:
test_df = pd.read_csv(data_filepath / 'test.csv')
test_df = feature_engineering(test_df)
test_df.loc[:,num_cols] = KNNImputer(n_neighbors=5).fit_transform(test_df[num_cols])
test_df.loc[:,cat_cols] = test_df[cat_cols].fillna('missing')
_id = test_df.id
test_df = pd.DataFrame(preproc_df_pipe.transform(test_df), columns=test_df[X.columns].columns)
test_df['id'] = _id; del _id
test_df['sii'] = clf.predict(test_df[X.columns])
test_df.sii.value_counts()

sii
0    11
1     9
Name: count, dtype: int64

In [16]:
test_df[['id', 'sii']].to_csv('submission.csv', index=False)
!du -hs 'submission.csv'

4.0K	submission.csv


In [None]:
# StandScaler - Score: 0.303
# MinMaxScaler - Score: 0.312