In [2]:
# 1. Добавить временные ряды
# 2. Grid Search с разными энкодерами и скейлерами
# 3. Optuna
# 4. Блендирнг или стейкинг
# 5. Tabnet
# 6. Feature engineering

# Import

In [3]:
# !pip install pyarrow ipywidgets


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [62]:
import pandas as pd
from sklearn.metrics import cohen_kappa_score, classification_report, make_scorer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_validate, StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from catboost import CatBoostClassifier
pd.set_option('display.max_rows', 500)
random_state = 42
data_filepath = './data/'

# Data

In [5]:
!du -hs $data_filepath*

6.2G	./data/child-mind-institute-problematic-internet-use.zip
 12K	./data/data_dictionary.csv
4.0K	./data/sample_submission.csv
7.9M	./data/series_test.parquet
6.3G	./data/series_train.parquet
8.0K	./data/test.csv
924K	./data/train.csv


In [6]:
train_df = pd.read_csv(data_filepath + 'train.csv')
test_df = pd.read_csv(data_filepath + 'test.csv')
train_df.shape, test_df.shape

((3960, 82), (20, 59))

In [7]:
target_source_cols = [c for c in train_df.columns if c.startswith('PCIAT')]
train_df.drop(columns=target_source_cols, inplace=True)
train_df.shape

(3960, 60)

In [8]:
train_df.sii.value_counts(dropna=False)

sii
0.0    1594
NaN    1224
1.0     730
2.0     378
3.0      34
Name: count, dtype: int64

In [9]:
train_df = train_df.dropna(subset='sii')
train_df.shape

(2736, 60)

# Cat num cols split 

In [39]:
cols_to_drop = ['sii', 'id']
X, y = train_df.drop(columns=cols_to_drop), train_df.sii
X.shape, y.shape

((2736, 58), (2736,))

In [40]:
cat_cols = X.select_dtypes(include='object').columns.tolist()
num_cols = X.select_dtypes(include='number').columns.tolist()

# Pipeline

In [49]:
numeric_transormer = Pipeline(steps=[
    # ('imputer', SimpleImputer(strategy='median')),
    ('imputer', SimpleImputer(strategy='constant', fill_value=-999)),
])  
category_transormer = Pipeline(steps=[
    # ('imputer', SimpleImputer(strategy='most_frequent')),
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OrdinalEncoder()),
])
cols_transformer = ColumnTransformer(transformers=[
    ('numeric', numeric_transormer, num_cols),
    ('category', category_transormer, cat_cols),
])
preproc_df_pipe = Pipeline(steps=[
    ('cols_transformer', cols_transformer),
    ('scaler', StandardScaler()),
])
preproc_df_pipe

# Train 

In [88]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, stratify=y, random_state=random_state)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2188, 58), (548, 58), (2188,), (548,))

In [89]:
X_train = preproc_df_pipe.fit_transform(X_train)
X_test = preproc_df_pipe.transform(X_test)
X_train.shape, X_test.shape

((2188, 58), (548, 58))

In [90]:
%%time
clf = CatBoostClassifier()
clf.fit(
    X_train, y_train,
    eval_set=(X_test, y_test),
    logging_level='Verbose', 
)

Learning rate set to 0.10968
0:	learn: 1.3032797	test: 1.3043475	best: 1.3043475 (0)	total: 6.25ms	remaining: 6.24s
1:	learn: 1.2397225	test: 1.2428041	best: 1.2428041 (1)	total: 10.4ms	remaining: 5.17s
2:	learn: 1.1867615	test: 1.1890074	best: 1.1890074 (2)	total: 14.3ms	remaining: 4.75s
3:	learn: 1.1417180	test: 1.1456159	best: 1.1456159 (3)	total: 17.6ms	remaining: 4.38s
4:	learn: 1.1055714	test: 1.1103992	best: 1.1103992 (4)	total: 20.8ms	remaining: 4.15s
5:	learn: 1.0780245	test: 1.0822822	best: 1.0822822 (5)	total: 24.7ms	remaining: 4.09s
6:	learn: 1.0504643	test: 1.0563172	best: 1.0563172 (6)	total: 27.8ms	remaining: 3.95s
7:	learn: 1.0265240	test: 1.0345169	best: 1.0345169 (7)	total: 31.1ms	remaining: 3.85s
8:	learn: 1.0066226	test: 1.0154191	best: 1.0154191 (8)	total: 34.1ms	remaining: 3.76s
9:	learn: 0.9896852	test: 0.9998913	best: 0.9998913 (9)	total: 37.2ms	remaining: 3.68s
10:	learn: 0.9731568	test: 0.9860574	best: 0.9860574 (10)	total: 40.5ms	remaining: 3.64s
11:	learn: 0

<catboost.core.CatBoostClassifier at 0x16a560bc0>

In [91]:
preds = clf.predict(X_test)
print(cohen_kappa_score(y_test, preds))
print(classification_report(y_test, preds))

0.24206229312278016
              precision    recall  f1-score   support

         0.0       0.68      0.91      0.78       319
         1.0       0.46      0.29      0.35       146
         2.0       0.32      0.13      0.19        76
         3.0       0.00      0.00      0.00         7

    accuracy                           0.62       548
   macro avg       0.37      0.33      0.33       548
weighted avg       0.56      0.62      0.57       548



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [92]:
fi = clf.get_feature_importance(prettified=True)
fi['Feature name'] = fi['Feature Id'].apply(lambda i: X.columns[int(i)])
fi

Unnamed: 0,Feature Id,Importances,Feature name
0,47,9.531135,BIA-BIA_SMM
1,46,5.797507,BIA-BIA_LST
2,45,5.354437,BIA-BIA_LDM
3,0,5.316825,Basic_Demos-Enroll_Season
4,1,3.607892,Basic_Demos-Age
5,5,3.503148,Physical-Season
6,8,3.180329,Physical-Weight
7,9,3.15901,Physical-Waist_Circumference
8,4,2.881777,CGAS-CGAS_Score
9,49,2.754726,PAQ_A-Season


# Cross validation

In [83]:
%%time
clf = CatBoostClassifier(verbose=0)
cv = StratifiedShuffleSplit(n_splits=5, test_size=.3, random_state=random_state)
_pipeline = Pipeline([('transformer', preproc_df_pipe), ('estimator', clf)])
scoring = {
    'f1': 'f1_macro',
    'cohen_kappa_score': make_scorer(cohen_kappa_score), 
}
scores = cross_validate(_pipeline, X, y, cv=cv, scoring=scoring)
for metric, folds_score in scores.items():
    if not metric.startswith('test_'): continue
    print(metric, folds_score.mean().round(4), folds_score.std().round(4))

test_f1 0.3114 0.0043
test_cohen_kappa_score 0.1738 0.0149
CPU times: user 1min 26s, sys: 10.8 s, total: 1min 37s
Wall time: 16 s


# Save

In [93]:
test_df = pd.read_csv(data_filepath + 'test.csv')
test_df['sii'] = clf.predict(preproc_df_pipe.transform(test_df[X.columns]))
test_df.sii.value_counts()

sii
0.0    19
1.0     1
Name: count, dtype: int64

In [None]:
test_df[['id', 'sii']].to_csv('submission.csv', index=False)
!du -hs 'submission.csv'