In [1]:
import pandas as pd
from sklearn.metrics import cohen_kappa_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
pd.set_option('display.max_rows', 500)

In [2]:
!du -hs data/*

6.2G	data/child-mind-institute-problematic-internet-use.zip
 12K	data/data_dictionary.csv
4.0K	data/sample_submission.csv
7.9M	data/series_test.parquet
6.3G	data/series_train.parquet
8.0K	data/test.csv
924K	data/train.csv


In [3]:
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')
test_id = test_df.id
train_df.drop(columns='id', inplace=True)
base_features = test_df.drop(columns='id').columns
train_df.shape, test_df.shape

((3960, 81), (20, 59))

In [4]:
train_df.sii.value_counts(sort=True, dropna=False)

sii
0.0    1594
NaN    1224
1.0     730
2.0     378
3.0      34
Name: count, dtype: int64

In [5]:
train_df = train_df.dropna(subset='sii')
train_df.shape

(2736, 81)

In [6]:
na_ratio = train_df[base_features].isna().mean().sort_values(ascending=False)
na_ratio

PAQ_A-PAQ_A_Total                         0.867325
PAQ_A-Season                              0.867325
Physical-Waist_Circumference              0.823465
Fitness_Endurance-Time_Sec                0.733918
Fitness_Endurance-Time_Mins               0.733918
Fitness_Endurance-Max_Stage               0.732822
FGC-FGC_GSD_Zone                          0.684211
FGC-FGC_GSND_Zone                         0.684211
FGC-FGC_GSD                               0.681652
FGC-FGC_GSND                              0.681287
Fitness_Endurance-Season                  0.539474
PAQ_C-PAQ_C_Total                         0.473684
PAQ_C-Season                              0.473684
BIA-BIA_Fat                               0.337354
BIA-BIA_SMM                               0.337354
BIA-BIA_BMC                               0.337354
BIA-BIA_BMI                               0.337354
BIA-BIA_BMR                               0.337354
BIA-BIA_DEE                               0.337354
BIA-BIA_ECW                    

In [7]:
threshold = .4
base_features = na_ratio[na_ratio < threshold].index

In [8]:
def handle_missing(df):
    cat_cols = df.select_dtypes(include='object').columns
    num_cols = df.select_dtypes(exclude='object').columns
    df[cat_cols] = df[cat_cols].fillna('missing')
    df[num_cols] = df[num_cols].fillna(df[num_cols].mean())
    return df

train_df = handle_missing(train_df)
test_df = handle_missing(test_df)

In [9]:
base_features = test_df.drop(columns='id').columns
base_features

Index(['Basic_Demos-Enroll_Season', 'Basic_Demos-Age', 'Basic_Demos-Sex',
       'CGAS-Season', 'CGAS-CGAS_Score', 'Physical-Season', 'Physical-BMI',
       'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
       'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
       'Fitness_Endurance-Season', 'Fitness_Endurance-Max_Stage',
       'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
       'FGC-Season', 'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
       'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU',
       'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
       'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone', 'BIA-Season',
       'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
       'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
       'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
       'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA

In [12]:
cat_features = train_df.select_dtypes(include='object').columns

scaler = StandardScaler()
le = LabelEncoder()

for catf in cat_features:
    train_df[catf] = le.fit_transform(train_df[catf]).astype(int)
    test_df[catf] = le.fit_transform(test_df[catf]).astype(int)
train_df[base_features] = scaler.fit_transform(train_df[base_features])
test_df[base_features] = scaler.fit_transform(test_df[base_features])

In [13]:
X, y = train_df.drop(columns='sii'), train_df.sii
X.shape, y.shape

((2736, 80), (2736,))

In [23]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, stratify=y, random_state=random_state)
clf = CatBoostClassifier(verbose=0)
clf.fit(X_train, y_train)
cohen_kappa_score(y_test, clf.predict(X_test))

1.0

In [19]:
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score, cross_validate
from sklearn.metrics import make_scorer, cohen_kappa_score

random_state = 42

cv = StratifiedShuffleSplit(n_splits=5, test_size=.2, random_state=random_state)
clf = CatBoostClassifier(verbose=0)
scoring = {
    'cohen_kappa_score': make_scorer(cohen_kappa_score), 
    'f1': 'f1_macro',
}
scores = cross_validate(clf, X, y, cv=cv, scoring=scoring)
for metric, metrics_scores in scores.items():
    if not metric.startswith('test_'): continue
    print(metric, metrics_scores.mean(), metrics_scores.std())

test_cohen_kappa_score 1.0 0.0
test_f1 1.0 0.0
