In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

from colorama import Fore, Style
from IPython.display import clear_output
import warnings
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import VotingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [2]:
train = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/train.csv')
test = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/test.csv')
print(train['sii'].value_counts())

train = train.drop('id', axis=1)

featuresCols = ['Basic_Demos-Enroll_Season', 'Basic_Demos-Age', 'Basic_Demos-Sex',
                'CGAS-Season', 'CGAS-CGAS_Score', 'Physical-Season', 'Physical-BMI',
                'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
                'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
                'Fitness_Endurance-Season', 'Fitness_Endurance-Max_Stage',
                'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
                'FGC-Season', 'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
                'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU',
                'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
                'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone', 'BIA-Season','BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
                'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
                'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
                'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
                'BIA-BIA_TBW', 'PAQ_A-Season', 'PAQ_A-PAQ_A_Total', 'PAQ_C-Season',
                'PAQ_C-PAQ_C_Total', 'SDS-Season', 'SDS-SDS_Total_Raw',
                'SDS-SDS_Total_T', 'PreInt_EduHx-Season',
                'PreInt_EduHx-computerinternet_hoursday', 'sii']


train = train[featuresCols]
train = train.dropna(subset='sii')

numeric_cols = train.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = train.select_dtypes(include=['object', 'category']).columns
train[numeric_cols] = train[numeric_cols].replace([np.inf, -np.inf], np.nan)

def impute_data(train):
    # Xử lý các cột số bằng KNNImputer
    knn_imputer = KNNImputer(n_neighbors=5)
    train_numeric_imputed = pd.DataFrame(knn_imputer.fit_transform(train[numeric_cols]), 
                                         columns=numeric_cols, index=train.index)
    
    # Xử lý các cột categorical bằng SimpleImputer (chiến lược thay thế là 'most_frequent')
    simple_imputer = SimpleImputer(strategy='most_frequent')
    train_categorical_imputed = pd.DataFrame(simple_imputer.fit_transform(train[categorical_cols]), 
                                             columns=categorical_cols, index=train.index)
    
    # Gộp dữ liệu đã xử lý lại
    train_imputed = pd.concat([train_numeric_imputed, train_categorical_imputed], axis=1)
    
    return train_imputed

train = impute_data(train)

#season mapping
season_columns = ['Basic_Demos-Enroll_Season', 'CGAS-Season', 'Physical-Season', 
                  'Fitness_Endurance-Season', 'FGC-Season', 'BIA-Season', 
                  'PAQ_A-Season', 'PAQ_C-Season', 'SDS-Season', 'PreInt_EduHx-Season']

# Ánh xạ giá trị Season thành số
season_map = {'Fall': 0, 'Winter': 1, 'Spring': 2, 'Summer': 3, 'Missing': -1}

def map_season_values(df):
    for col in season_columns:
        df[col] = df[col].fillna('Missing')
        df[col] = df[col].map(season_map)
    return df

# Áp dụng cho train và test
train = map_season_values(train)
test = map_season_values(test)
print(test.shape)

sii
0.0    1594
1.0     730
2.0     378
3.0      34
Name: count, dtype: int64
(20, 59)


In [3]:
common_features = [col for col in featuresCols if col in test.columns]
X = train[common_features]
y = train['sii']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

numeric_cols = [col for col in common_features if train[col].dtype in ['int64', 'float64']]
categorical_cols = [col for col in common_features if train[col].dtype == 'object']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ]
)

#Tạo pipeline với bộ xử lý dữ liệu và mô hình
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=5,
        random_state=42
    ))
])

# Huấn luyện mô hình
model_pipeline.fit(X_train, y_train)

# Dự đoán và đánh giá
X_val = X_val[~y_val.isna()]
y_val = y_val.dropna()

y_pred = model_pipeline.predict(X_val)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

         0.0       0.71      0.87      0.78       336
         1.0       0.37      0.29      0.32       131
         2.0       0.42      0.21      0.28        72
         3.0       0.00      0.00      0.00         9

    accuracy                           0.63       548
   macro avg       0.37      0.34      0.35       548
weighted avg       0.58      0.63      0.59       548



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [4]:
common_features = [col for col in featuresCols if col in test.columns]
test['sii'] = model_pipeline.predict(test[common_features])

submission = test[['id', 'sii']]
submission.to_csv('submission.csv', index=False)