## Import required libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier

## Read the CSV file

In [2]:
train_data = pd.read_csv("/kaggle/input/child-mind-institute-problematic-internet-use/train.csv", index_col="id")
test_data = pd.read_csv("/kaggle/input/child-mind-institute-problematic-internet-use/test.csv", index_col="id")

In [3]:
train_df = train_data.copy()
test_df = test_data.copy()

## Handle missing output sample

In [4]:
#Remove sample which not have sii
train_df = train_df.dropna(subset=['sii'])

## One-hot encoding process

In [5]:
train_df.head(20)

Unnamed: 0_level_0,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,...,PCIAT-PCIAT_18,PCIAT-PCIAT_19,PCIAT-PCIAT_20,PCIAT-PCIAT_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,sii
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00008ff9,Fall,5,0,Winter,51.0,Fall,16.877316,46.0,50.8,,...,4.0,2.0,4.0,55.0,,,,Fall,3.0,2.0
000fd460,Summer,9,0,,,Fall,14.03559,48.0,46.0,22.0,...,0.0,0.0,0.0,0.0,Fall,46.0,64.0,Summer,0.0,0.0
00105258,Summer,10,1,Fall,71.0,Fall,16.648696,56.5,75.6,,...,2.0,1.0,1.0,28.0,Fall,38.0,54.0,Summer,2.0,0.0
00115b9f,Winter,9,0,Fall,71.0,Summer,18.292347,56.0,81.6,,...,3.0,4.0,1.0,44.0,Summer,31.0,45.0,Winter,0.0,1.0
001f3379,Spring,13,1,Winter,50.0,Summer,22.279952,59.5,112.2,,...,1.0,2.0,1.0,34.0,Summer,40.0,56.0,Spring,0.0,1.0
0038ba98,Fall,10,0,,,Fall,19.66076,55.0,84.6,,...,4.0,1.0,0.0,20.0,Winter,27.0,40.0,Fall,3.0,0.0
00abe655,Fall,11,0,Summer,66.0,,,,,,...,0.0,1.0,0.0,10.0,Winter,42.0,59.0,Fall,0.0,0.0
00ae59c9,Fall,13,0,,,Winter,21.079065,57.75,100.0,,...,2.0,2.0,1.0,31.0,Fall,33.0,47.0,Fall,1.0,1.0
00c0cd71,Winter,7,0,Summer,51.0,Spring,29.315775,54.0,121.6,,...,3.0,3.0,1.0,58.0,Spring,35.0,50.0,Winter,2.0,2.0
00d56d4b,Spring,5,1,Summer,80.0,Spring,17.284504,44.0,47.6,,...,0.0,0.0,0.0,0.0,Spring,37.0,53.0,Spring,0.0,0.0


In [6]:
threshold = 0.5
missing_ratio = train_df.isnull().mean()
dropped_columns = missing_ratio[missing_ratio > threshold].index.tolist()

train_df.drop(columns=dropped_columns, inplace=True)
train_df.drop(columns=[col for col in train_df if 'PCIAT' in col], inplace=True)

test_df.drop(columns=dropped_columns, inplace=True)

In [7]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2736 entries, 00008ff9 to ffed1dd5
Data columns (total 48 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Basic_Demos-Enroll_Season               2736 non-null   object 
 1   Basic_Demos-Age                         2736 non-null   int64  
 2   Basic_Demos-Sex                         2736 non-null   int64  
 3   CGAS-Season                             2342 non-null   object 
 4   CGAS-CGAS_Score                         2342 non-null   float64
 5   Physical-Season                         2595 non-null   object 
 6   Physical-BMI                            2527 non-null   float64
 7   Physical-Height                         2530 non-null   float64
 8   Physical-Weight                         2572 non-null   float64
 9   Physical-Diastolic_BP                   2478 non-null   float64
 10  Physical-HeartRate                      2486 non-null 

In [8]:
int_cols = ['BIA-BIA_Activity_Level_num', 'FGC-FGC_GSD_Zone','FGC-FGC_GSND_Zone','BIA-BIA_Frame_num', 'PreInt_EduHx-computerinternet_hoursday']
categorical_int_cols = [col for col in int_cols if col not in dropped_columns]

categorical_str_cols = [col for col in train_df.columns if 'Season' in col and col not in dropped_columns]
categorical_cols = categorical_str_cols + categorical_int_cols

In [9]:
binary_cols = [col for col in train_df.columns if train_df[col].nunique() == 2]

In [10]:
numerical_cols = [col for col in train_df.columns if col != 'sii' and col not in categorical_cols and col not in binary_cols]

In [11]:
for col in categorical_cols:
    mode_value = train_df[col].mode()[0]
    train_df[col] = train_df[col].fillna(mode_value)
    train_df[col] = train_df[col].astype(object)

for col in binary_cols:
    mode_value = train_df[col].mode()[0]
    train_df[col] = train_df[col].fillna(mode_value)
    train_df[col] = train_df[col].astype(int)

for col in numerical_cols:
    mean_value = train_df[col].mean()
    train_df[col] = train_df[col].fillna(mean_value)


In [12]:
for col in categorical_cols:
    mode_value = test_df[col].mode()[0]
    test_df[col] = test_df[col].fillna(mode_value)
    test_df[col] = test_df[col].astype(object)

for col in binary_cols:
    mode_value = test_df[col].mode()[0]
    test_df[col] = test_df[col].fillna(mode_value)
    test_df[col] = test_df[col].astype(int)

for col in numerical_cols:
    mean_value = test_df[col].mean()
    test_df[col] = test_df[col].fillna(mean_value)


In [13]:
scaler = StandardScaler()

# Helper function
def standardize(df):
    columns_to_standardize = [col for col in numerical_cols]
    df[columns_to_standardize] = scaler.fit_transform(df[columns_to_standardize])
    return df

#Proceed with standardizing
train_df = standardize(train_df)
test_df = standardize(test_df)

In [14]:
#Helper function
def OneHot_Encoding(original_dataframe, feature_to_encode):
    dummies = pd.get_dummies(original_dataframe[[feature_to_encode]], dtype=int)
    original_dataframe = pd.concat([original_dataframe, dummies], axis=1)
    original_dataframe = original_dataframe.drop([feature_to_encode], axis=1)
    return original_dataframe

for col in categorical_cols:
    train_df = OneHot_Encoding(train_df, col)
    test_df = OneHot_Encoding(test_df, col)

In [15]:
train_df.head()

Unnamed: 0_level_0,Basic_Demos-Age,Basic_Demos-Sex,CGAS-CGAS_Score,Physical-BMI,Physical-Height,Physical-Weight,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,FGC-FGC_CU,...,BIA-BIA_Activity_Level_num_3.0,BIA-BIA_Activity_Level_num_4.0,BIA-BIA_Activity_Level_num_5.0,BIA-BIA_Frame_num_1.0,BIA-BIA_Frame_num_2.0,BIA-BIA_Frame_num_3.0,PreInt_EduHx-computerinternet_hoursday_0.0,PreInt_EduHx-computerinternet_hoursday_1.0,PreInt_EduHx-computerinternet_hoursday_2.0,PreInt_EduHx-computerinternet_hoursday_3.0
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00008ff9,-1.528487,0,-1.296014,-0.476635,-1.39205,-0.88136,0.0,-1.082896e-15,-8.672395e-16,-1.187686,...,0,0,0,1,0,0,0,0,0,1
000fd460,-0.361407,0,0.0,-1.07906,-1.110744,-0.995576,0.39973,-0.9009683,0.2973259,-0.888595,...,0,0,0,1,0,0,1,0,0,0
00105258,-0.069637,1,0.534609,-0.5251,0.084807,-0.291242,-0.362392,0.9278811,-0.007806868,0.806251,...,1,0,0,0,1,0,0,0,1,0
00115b9f,-0.361407,0,0.534609,-0.176658,0.01448,-0.148472,-0.743453,1.156487,-0.007806868,0.606857,...,1,0,0,0,1,0,1,0,0,0
001f3379,0.805674,1,-1.387545,0.668686,0.506766,0.579658,-0.743453,-0.6723621,-0.9232052,0.008676,...,0,0,0,0,1,0,1,0,0,0


In [16]:
# Remove feature which does not appear in test data, excluding 'sii'
train_miss = (set(train_df.columns) - set(test_df.columns)) - {'sii'}

train_df = train_df.drop(columns=train_miss)

In [17]:
train_df.head()

Unnamed: 0_level_0,Basic_Demos-Age,Basic_Demos-Sex,CGAS-CGAS_Score,Physical-BMI,Physical-Height,Physical-Weight,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,FGC-FGC_CU,...,PreInt_EduHx-Season_Winter,BIA-BIA_Activity_Level_num_2.0,BIA-BIA_Activity_Level_num_3.0,BIA-BIA_Activity_Level_num_5.0,BIA-BIA_Frame_num_1.0,BIA-BIA_Frame_num_2.0,PreInt_EduHx-computerinternet_hoursday_0.0,PreInt_EduHx-computerinternet_hoursday_1.0,PreInt_EduHx-computerinternet_hoursday_2.0,PreInt_EduHx-computerinternet_hoursday_3.0
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00008ff9,-1.528487,0,-1.296014,-0.476635,-1.39205,-0.88136,0.0,-1.082896e-15,-8.672395e-16,-1.187686,...,0,1,0,0,1,0,0,0,0,1
000fd460,-0.361407,0,0.0,-1.07906,-1.110744,-0.995576,0.39973,-0.9009683,0.2973259,-0.888595,...,0,1,0,0,1,0,1,0,0,0
00105258,-0.069637,1,0.534609,-0.5251,0.084807,-0.291242,-0.362392,0.9278811,-0.007806868,0.806251,...,0,0,1,0,0,1,0,0,1,0
00115b9f,-0.361407,0,0.534609,-0.176658,0.01448,-0.148472,-0.743453,1.156487,-0.007806868,0.606857,...,1,0,1,0,0,1,1,0,0,0
001f3379,0.805674,1,-1.387545,0.668686,0.506766,0.579658,-0.743453,-0.6723621,-0.9232052,0.008676,...,0,1,0,0,0,1,1,0,0,0


## Standardize process

## Handle missing cells

## Extract df

In [18]:
features = [col for col in train_df.columns if col != 'sii']
X = train_df[features]
y = train_df.sii

In [19]:
X.head()

Unnamed: 0_level_0,Basic_Demos-Age,Basic_Demos-Sex,CGAS-CGAS_Score,Physical-BMI,Physical-Height,Physical-Weight,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,FGC-FGC_CU,...,PreInt_EduHx-Season_Winter,BIA-BIA_Activity_Level_num_2.0,BIA-BIA_Activity_Level_num_3.0,BIA-BIA_Activity_Level_num_5.0,BIA-BIA_Frame_num_1.0,BIA-BIA_Frame_num_2.0,PreInt_EduHx-computerinternet_hoursday_0.0,PreInt_EduHx-computerinternet_hoursday_1.0,PreInt_EduHx-computerinternet_hoursday_2.0,PreInt_EduHx-computerinternet_hoursday_3.0
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00008ff9,-1.528487,0,-1.296014,-0.476635,-1.39205,-0.88136,0.0,-1.082896e-15,-8.672395e-16,-1.187686,...,0,1,0,0,1,0,0,0,0,1
000fd460,-0.361407,0,0.0,-1.07906,-1.110744,-0.995576,0.39973,-0.9009683,0.2973259,-0.888595,...,0,1,0,0,1,0,1,0,0,0
00105258,-0.069637,1,0.534609,-0.5251,0.084807,-0.291242,-0.362392,0.9278811,-0.007806868,0.806251,...,0,0,1,0,0,1,0,0,1,0
00115b9f,-0.361407,0,0.534609,-0.176658,0.01448,-0.148472,-0.743453,1.156487,-0.007806868,0.606857,...,1,0,1,0,0,1,1,0,0,0
001f3379,0.805674,1,-1.387545,0.668686,0.506766,0.579658,-0.743453,-0.6723621,-0.9232052,0.008676,...,0,1,0,0,0,1,1,0,0,0


## Split dataset

In [20]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

## Define and train model

In [21]:
# from sklearn.model_selection import GridSearchCV
# from sklearn.ensemble import RandomForestClassifier

# from sklearn.metrics import cohen_kappa_score, make_scorer


# def quadratic_weighted_kappa(y_true, y_pred):
#     return cohen_kappa_score(y_true, y_pred, weights="quadratic")


# qwk_scorer = make_scorer(quadratic_weighted_kappa)


# param_grid = {
#     'n_estimators': [150, 200, 250, 300],
#     'max_depth': [5, 8, 10],
#     'min_samples_split': [2, 3, 4, 5],
#     'min_samples_leaf': [1, 2, 3, 4],
#     'criterion':['entropy', 'gini'],
# }

# # Khởi tạo mô hình
# rf_model = RandomForestClassifier(
#     random_state=42,
#     class_weight='balanced',
#     max_features='sqrt',
# )

# # GridSearchCV với 3-fold cross-validation
# grid_search = GridSearchCV(
#     estimator=rf_model,
#     param_grid=param_grid,
#     scoring=qwk_scorer,
#     cv=3,
#     verbose=3,
#     n_jobs=-1
# )

# # Tìm kiếm
# grid_search.fit(X, y)

# print("Best parameters found:", grid_search.best_params_)
# print("Best cross-validation accuracy:", grid_search.best_score_)
# best_rf_model = grid_search.best_estimator_


In [22]:
#Validation model,train on X_val test

val_model = RandomForestClassifier(
    n_estimators=300,
    max_depth=8,
    max_features='sqrt',
    min_samples_split=5,
    min_samples_leaf=1,
    class_weight='balanced',
    random_state=42,
)

val_model.fit(X_train, y_train)

In [23]:
#Test model,train on all X
test_model = RandomForestClassifier(
    n_estimators=300,
    max_depth=8,
    max_features='sqrt',
    min_samples_split=5,
    min_samples_leaf=1,
    class_weight='balanced',
    random_state=42,
)

test_model.fit(X, y)

## Evaluation

In [24]:
#Evaluation function
def QWK(y_true, y_pred, n_classes):
    """
    Calculate the Quadratic Weighted Kappa (QWK) score.

    Parameters:
    y_true (list or numpy array): Actual values (ground truth).
    y_pred (list or numpy array): Predicted values.
    n_classes (int): Number of distinct classes/labels.

    Returns:
    float: QWK score.
    """
    # Create histogram matrix O (observed matrix)
    O = np.zeros((n_classes, n_classes), dtype=np.float64)
    for true, pred in zip(y_true, y_pred):
        O[true, pred] += 1

    # Create weight matrix W
    W = np.zeros((n_classes, n_classes), dtype=np.float64)
    for i in range(n_classes):
        for j in range(n_classes):
            W[i, j] = ((i - j) ** 2) / ((n_classes - 1) ** 2)

    # Create expected matrix E
    actual_hist = np.sum(O, axis=1)
    pred_hist = np.sum(O, axis=0)
    E = np.outer(actual_hist, pred_hist) / np.sum(O)

    # Calculate QWK
    numerator = np.sum(W * O)
    denominator = np.sum(W * E)
    kappa = 1 - (numerator / denominator)

    return kappa

In [25]:
val_preds = val_model.predict(X_val)
val_preds = np.array(val_preds).astype(int)

y_val = np.array(y_val).astype(int)

print(QWK(y_val, val_preds, 4))

0.3969221017044654


## Submit

In [26]:
# missing_columns = (set(X.columns) - set(test_df.columns))

# for col in missing_columns:
#     test_df[col] = 0

# test_df = test_df[X.columns]

In [27]:
test_df.head()

Unnamed: 0_level_0,Basic_Demos-Age,Basic_Demos-Sex,CGAS-CGAS_Score,Physical-BMI,Physical-Height,Physical-Weight,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,FGC-FGC_CU,...,PreInt_EduHx-Season_Winter,BIA-BIA_Activity_Level_num_2.0,BIA-BIA_Activity_Level_num_3.0,BIA-BIA_Activity_Level_num_5.0,BIA-BIA_Frame_num_1.0,BIA-BIA_Frame_num_2.0,PreInt_EduHx-computerinternet_hoursday_0.0,PreInt_EduHx-computerinternet_hoursday_1.0,PreInt_EduHx-computerinternet_hoursday_2.0,PreInt_EduHx-computerinternet_hoursday_3.0
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00008ff9,-1.583385,0,-1.723923,-0.775133,-1.294562,-1.551454,0.0,0.0,0.0,-1.420614,...,0,1,0,0,1,0,0,0,0,1
000fd460,-0.4819,0,0.0,-1.519641,-0.9226433,-1.813672,0.334979,-1.688635,0.296288,-0.930314,...,0,1,0,0,1,0,1,0,0,0
00105258,-0.206529,1,1.274204,-0.83503,0.6580092,-0.1966632,-0.417015,1.785129,-0.03628,1.848055,...,0,1,0,0,0,1,0,0,1,0
00115b9f,-0.4819,0,1.274204,-0.404408,0.5650296,0.1311088,-0.793012,2.219349,-0.03628,1.521188,...,1,0,1,0,0,1,1,0,0,0
0016bb22,1.996442,1,0.0,0.0,-1.321319e-15,7.7632e-16,0.0,0.0,0.0,0.0,...,0,1,0,0,0,1,0,0,1,0


In [28]:
test_preds = test_model.predict(test_df)
test_preds = np.array(test_preds).astype(int)

In [29]:
output = pd.DataFrame({'id': test_data.index,
                       'sii': test_preds})
output.to_csv('submission.csv', index=False)