# Import

In [1]:
import pandas as pd
from sklearn.metrics import cohen_kappa_score, classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
# Allow up to 500 rows when a Series/DataFrame is displayed in a cell output.
pd.options.display.max_rows = 500
# Seed passed to the train/validation split further down.
random_state = 42

# Load data

In [2]:
# Read the competition's train and test tables from the Kaggle input directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/child-mind-institute-problematic-internet-use/train.csv',
        '../input/child-mind-institute-problematic-internet-use/test.csv',
    )
)

In [3]:
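# NOTE: disabled experiment. It would carve a local "test" set out of train_df
# using a `submission_id` list, which is not defined in the cells above.
# test_df = train_df[train_df.id.isin(submission_id)]
# test_df.drop(columns='sii', inplace=True)
# train_df = train_df[~train_df.id.isin(submission_id)]
# train_df.shape, test_df.shape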

In [4]:
# Ids that occur in both the train and the test table.
set(train_df['id']).intersection(test_df['id'])

{'00008ff9',
 '000fd460',
 '00105258',
 '00115b9f',
 '0016bb22',
 '001f3379',
 '0038ba98',
 '0068a485',
 '0069fbed',
 '0083e397',
 '0087dd65',
 '00abe655',
 '00ae59c9',
 '00af6387',
 '00bd4359',
 '00c0cd71',
 '00d56d4b',
 '00d9913d',
 '00e6167c',
 '00ebc35d'}

# Filter missing target

In [5]:
# Keep only the rows whose target `sii` is present.
has_target = train_df['sii'].notna()
train_df = train_df.loc[has_target]
train_df.shape

(2736, 82)

In [6]:
# Class distribution of the target among the labeled rows.
train_df['sii'].value_counts()

sii
0.0    1594
1.0     730
2.0     378
3.0      34
Name: count, dtype: int64

# Filter columns with too many missing values

In [9]:
# Candidate features: every test-set column except the id and (if present) the target.
base_features = [col for col in test_df.columns if col not in ('id', 'sii')]
# Fraction of missing values per candidate feature in the training rows, largest first.
missing_share = train_df[base_features].isnull().mean()
na_ratio = missing_share.sort_values(ascending=False)
na_ratio

PAQ_A-PAQ_A_Total                         0.867325
PAQ_A-Season                              0.867325
Physical-Waist_Circumference              0.823465
Fitness_Endurance-Time_Sec                0.733918
Fitness_Endurance-Time_Mins               0.733918
Fitness_Endurance-Max_Stage               0.732822
FGC-FGC_GSD_Zone                          0.684211
FGC-FGC_GSND_Zone                         0.684211
FGC-FGC_GSD                               0.681652
FGC-FGC_GSND                              0.681287
Fitness_Endurance-Season                  0.539474
PAQ_C-PAQ_C_Total                         0.473684
PAQ_C-Season                              0.473684
BIA-BIA_Fat                               0.337354
BIA-BIA_SMM                               0.337354
BIA-BIA_BMC                               0.337354
BIA-BIA_BMI                               0.337354
BIA-BIA_BMR                               0.337354
BIA-BIA_DEE                               0.337354
BIA-BIA_ECW                    

In [10]:
# Keep only the features whose missing-value fraction is strictly below the threshold.
threshold = 0.2
below_threshold = na_ratio.lt(threshold)
base_features = na_ratio.loc[below_threshold].index
na_ratio.loc[base_features]

CGAS-CGAS_Score                           0.144006
CGAS-Season                               0.144006
Physical-Systolic_BP                      0.094298
Physical-Diastolic_BP                     0.094298
Physical-HeartRate                        0.091374
SDS-SDS_Total_T                           0.077120
SDS-Season                                0.076389
SDS-SDS_Total_Raw                         0.076389
Physical-BMI                              0.076389
Physical-Height                           0.075292
Physical-Weight                           0.059942
Physical-Season                           0.051535
FGC-Season                                0.032529
PreInt_EduHx-computerinternet_hoursday    0.029971
PreInt_EduHx-Season                       0.006213
Basic_Demos-Age                           0.000000
Basic_Demos-Sex                           0.000000
Basic_Demos-Enroll_Season                 0.000000
dtype: float64

# Split categorical and numerical columns

In [11]:
# Partition the retained features by dtype: object columns are treated as
# categorical, everything else as numerical.
feature_frame = train_df[base_features]
cat_cols = feature_frame.select_dtypes(include=['object']).columns
num_cols = feature_frame.select_dtypes(exclude=['object']).columns

In [12]:
# Number of distinct (non-null) values in each categorical feature.
train_df.loc[:, cat_cols].nunique()

CGAS-Season                  4
SDS-Season                   4
Physical-Season              4
FGC-Season                   4
PreInt_EduHx-Season          4
Basic_Demos-Enroll_Season    4
dtype: int64

# Handle missing values

In [13]:
def handle_missing(df, cat_cols, num_cols, num_fill=None):
    """Return a copy of ``df`` with missing values filled in.

    Categorical columns get the literal label ``'missing'``; numerical columns
    get ``num_fill`` or, when that is not given, their own column means.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is not modified; a filled copy is returned.
    cat_cols : list-like of column labels
        Columns whose missing entries become ``'missing'``.
    num_cols : list-like of column labels
        Columns whose missing entries are replaced by a numeric fill value.
    num_fill : scalar, dict or pandas.Series, optional
        Fill value(s) for ``num_cols`` (anything ``DataFrame.fillna`` accepts).
        Pass statistics computed on the training data here to impute another
        frame consistently. Defaults to the column means of ``df`` itself.

    Returns
    -------
    pandas.DataFrame
        The imputed copy.
    """
    # Work on a copy so the caller's frame (possibly a filtered slice of
    # another frame) is never written into.
    filled = df.copy()

    if len(cat_cols):
        filled[cat_cols] = filled[cat_cols].fillna('missing')

    if len(num_cols):
        if num_fill is None:
            num_fill = filled[num_cols].mean()
        filled[num_cols] = filled[num_cols].fillna(num_fill)

    return filled

# Impute the labeled training rows and continue with the returned frame.
train_df = handle_missing(df=train_df, cat_cols=cat_cols, num_cols=num_cols)

# Split train test

In [14]:
# Feature matrix: the retained features (minus the target, should it be listed).
X = train_df[base_features].drop(columns='sii', errors='ignore')
# Target vector.
y = train_df['sii']
X.shape, y.shape

((2736, 18), (2736,))

In [15]:
# Hold out 20% of the labeled rows for validation, stratified on the target so
# each class keeps its proportion in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=random_state,
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2188, 18), (548, 18), (2188,), (548,))

# Encoder & scaler

In [16]:
scaler = StandardScaler()
le = LabelEncoder()  # kept defined: a later cell still references `le`
label_encoders = {}  # one fitted encoder per categorical column

# Work on copies so the encoded columns are not written into the frames
# returned by train_test_split.
X_train, X_test = X_train.copy(), X_test.copy()

for catf in cat_cols:
    # Learn the label -> code mapping on the training split only and reuse it
    # for the validation split. Re-fitting on the validation split could give
    # the same label a different code whenever the two splits do not contain
    # exactly the same set of labels.
    # transform() raises ValueError if the validation split holds a label that
    # never occurs in the training split.
    encoder = LabelEncoder().fit(X_train[catf])
    X_train[catf] = encoder.transform(X_train[catf])
    X_test[catf] = encoder.transform(X_test[catf])
    label_encoders[catf] = encoder

# Standardize with statistics estimated on the training split only.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_train.shape, X_test.shape

((2188, 18), (548, 18))

# Validation

In [17]:
# Fit a CatBoost classifier on the training split (training log silenced).
clf = CatBoostClassifier(verbose=0)
clf.fit(X_train, y_train)

# Predict the validation split once and reuse the result for both metrics;
# ravel() flattens the predictions in case they come back as an (n, 1) array.
val_pred = clf.predict(X_test).ravel()

# NOTE(review): this is the unweighted Cohen's kappa. If `sii` is meant to be
# ordinal, `weights='quadratic'` may be the more appropriate variant; confirm.
print(cohen_kappa_score(y_test, val_pred))
# zero_division=0 reports 0.0 for a class that is never predicted without
# emitting an undefined-metric warning.
print(classification_report(y_test, val_pred, zero_division=0))

0.18241907663640822
              precision    recall  f1-score   support

         0.0       0.68      0.80      0.74       319
         1.0       0.34      0.27      0.30       146
         2.0       0.29      0.20      0.23        76
         3.0       0.00      0.00      0.00         7

    accuracy                           0.57       548
   macro avg       0.33      0.32      0.32       548
weighted avg       0.53      0.57      0.54       548



In [18]:
# Keep the ids aside before the frame is reduced to the model features.
test_id = test_df.id
test_df = test_df[base_features].copy()

# Impute the test features the same way as the training frame: 'missing' for
# categoricals, training-set column means for numericals. train_df was
# mean-imputed earlier, so its column means equal the fill values used there.
test_df[cat_cols] = test_df[cat_cols].fillna('missing')
test_df[num_cols] = test_df[num_cols].fillna(train_df[num_cols].mean())

for catf in cat_cols:
    # Encode with the vocabulary of the training data instead of re-fitting an
    # encoder on the test set, which would number the labels by whatever
    # happens to be present in the test rows. Sorted order matches
    # LabelEncoder's code assignment.
    # NOTE(review): assumes every label in train_df also occurred in the
    # training split the model was fitted on; confirm.
    train_labels = sorted(train_df[catf].dropna().unique())
    label_to_code = {label: code for code, label in enumerate(train_labels)}
    # Labels never seen in training map to -1 rather than raising.
    test_df[catf] = test_df[catf].map(label_to_code).fillna(-1).astype(int)

# Apply the scaler fitted on the training split; keep the column names.
test_df = pd.DataFrame(scaler.transform(test_df), columns=test_df.columns)
# ravel() flattens the predictions in case they come back as an (n, 1) array.
test_df['sii'] = clf.predict(test_df).ravel().astype(int)
# Attach the ids positionally (row order is unchanged since they were set aside).
test_df['id'] = test_id.to_numpy()

In [None]:
# !rm -rf catboost_info

In [19]:
# Write the id/prediction pairs as the submission file, without the row index.
submission = test_df.loc[:, ['id', 'sii']]
submission.to_csv('submission.csv', index=False)

In [20]:
# Shell escape: report the on-disk size of the written submission file.
!du -hs submission.csv

4.0K	submission.csv
