# Import

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.metrics import cohen_kappa_score, classification_report, make_scorer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder, Normalizer
from sklearn.model_selection import train_test_split, cross_validate, StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from catboost import CatBoostClassifier
import seaborn as sns

pd.set_option('display.max_rows', 500)
random_state = 42
np.random.seed(random_state)
data_filepath = Path('./data')
# data_filepath = Path('/kaggle/input/child-mind-institute-problematic-internet-use')
KAPPA_SCORER = make_scorer(
    cohen_kappa_score, 
    greater_is_better=True, 
    weights='quadratic',
)

# Data

In [2]:
!du -hs $data_filepath/*
train_df = pd.read_csv(data_filepath / 'train.csv')
test_df = pd.read_csv(data_filepath / 'test.csv')
train_df.shape, test_df.shape

6.2G	data/child-mind-institute-problematic-internet-use.zip
 12K	data/data_dictionary.csv
4.0K	data/sample_submission.csv
7.9M	data/series_test.parquet
6.3G	data/series_train.parquet
8.0K	data/test.csv
924K	data/train.csv


((3960, 82), (20, 59))

In [3]:
cols_to_drop = ['sii', 'id']
X, y = train_df.drop(columns=cols_to_drop), train_df.sii
cat_cols = X.select_dtypes(include='object').columns.tolist()
num_cols = X.select_dtypes(include='number').columns.tolist()
X.shape, y.shape

((3960, 80), (3960,))

# Fill missing features and target

In [4]:
# numeric_transormer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='median')),
#     # ('imputer', SimpleImputer(strategy='constant', fill_value=-999)),
# ])  
# category_transormer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='most_frequent')),
#     # ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
#     ('encoder', OrdinalEncoder()),
# ])
# cols_transformer = ColumnTransformer(transformers=[
#     ('numeric', numeric_transormer, num_cols),
#     ('category', category_transormer, cat_cols),
# ])
# preproc_df_pipe = Pipeline(steps=[
#     ('cols_transformer', cols_transformer),
#     # ('scaler', StandardScaler()),
# ])
# X = pd.DataFrame(preproc_df_pipe.fit_transform(X), columns=X.columns)
# X.isna().sum().sum()

In [5]:
train_df_imputed = train_df.drop(columns='id').copy()
impute_cols = num_cols + ['sii']
train_df_imputed.loc[:,impute_cols] = KNNImputer(n_neighbors=5).fit_transform(train_df_imputed[impute_cols])
train_df_imputed.loc[:,cat_cols] = train_df_imputed[cat_cols].fillna('missing')
train_df_imputed['sii'] = train_df_imputed.sii.round().astype(int)
train_df_imputed.isna().sum().sum()

0

In [6]:
y.value_counts(dropna=False)

sii
0.0    1594
NaN    1224
1.0     730
2.0     378
3.0      34
Name: count, dtype: int64

In [7]:
train_df_imputed.sii.value_counts(dropna=False)

sii
0    2101
1    1394
2     431
3      34
Name: count, dtype: int64

In [8]:
cols_to_drop = ['sii', 'id']
X, y = train_df_imputed.drop(columns=cols_to_drop, errors='ignore'), train_df_imputed.sii
cat_cols = X.select_dtypes(include='object').columns.tolist()
num_cols = X.select_dtypes(include='number').columns.tolist()
X.shape, y.shape

((3960, 80), (3960,))

In [9]:
X.isna().sum().sum()

0

# Feature egnineering

# Catboost

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, stratify=y, random_state=random_state)
X_train = preproc_df_pipe.fit_transform(X_train)
X_test = preproc_df_pipe.transform(X_test)

ValueError: Input y contains NaN.

# CV

# Save