<a href="https://colab.research.google.com/github/ngtht71/ML-Childmind/blob/main/ChildMind_KTT_team.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
# Log in through kagglehub before the download cell below is run.
import kagglehub
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Value returned by kagglehub for the competition download.
# NOTE(review): the loading cells further down read from hard-coded
# '/kaggle/input/child-mind-institute-problematic-internet-use/...' paths and
# do not use this variable; confirm both locations agree in your environment.
child_mind_institute_problematic_internet_use_path = kagglehub.competition_download('child-mind-institute-problematic-internet-use')

print('Data source import complete.')


In [None]:
import numpy as np
import pandas as pd
import os
import random
from sklearn.base import clone
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import StratifiedKFold
from scipy.optimize import minimize
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
from colorama import Fore, Style
from IPython.display import clear_output
import warnings
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
import seaborn as sns
from matplotlib.ticker import PercentFormatter

# Data Preprocessing

In [None]:
# Summarise one time-series partition
def process_file(filename, dirname):
    """Read ``<dirname>/<filename>/part-0.parquet`` and summarise it.

    The ``step`` column is discarded, ``describe()`` is computed on what
    remains, and the resulting table is flattened to one dimension.

    Args:
        filename: Entry name inside ``dirname``; must contain an ``=``.
        dirname: Directory that holds the entry.

    Returns:
        tuple: ``(flattened describe() values, segment of filename after the
        first '=')``.
    """
    parquet_path = os.path.join(dirname, filename, 'part-0.parquet')
    series = pd.read_parquet(parquet_path).drop(columns='step')
    summary = series.describe().values.reshape(-1)
    file_id = filename.split('=')[1]
    return summary, file_id

def load_time_series(dirname) -> pd.DataFrame:
    """Build one row of summary statistics per entry listed in ``dirname``.

    Every entry is passed to ``process_file`` through a thread pool (with a
    tqdm progress bar). The flattened statistics become columns
    ``stat_0 .. stat_{k-1}`` and the identifier returned by ``process_file``
    is stored in an ``id`` column.
    """
    entries = os.listdir(dirname)

    def summarise(entry):
        return process_file(entry, dirname)

    with ThreadPoolExecutor() as pool:
        summaries = list(tqdm(pool.map(summarise, entries), total=len(entries)))

    stats, indexes = zip(*summaries)
    stat_names = [f"stat_{i}" for i in range(len(stats[0]))]
    frame = pd.DataFrame(stats, columns=stat_names)
    frame['id'] = indexes
    return frame

In [None]:
# Load the three tabular competition files: train, test and the sample submission
train, test, sample = (
    pd.read_csv(f'/kaggle/input/child-mind-institute-problematic-internet-use/{name}.csv')
    for name in ('train', 'test', 'sample_submission')
)

In [None]:
# Shape of the raw training table, followed by a preview of its first rows.
print(train.shape)
train.head()

In [None]:
# Shape of the raw test table, followed by a preview of its first rows.
print(test.shape)
test.head()

In [None]:
# Shape of the sample submission, followed by a preview of its first rows.
print(sample.shape)
sample.head()

## Visualizing the training data

In [None]:
# Bar chart: number of rows per value of the target 'sii'
plt.figure(figsize=(8, 5))
ax = sns.countplot(data=train, x='sii', palette='coolwarm')
ax.set_title('Distribution of Target Variable (sii)')
ax.set_xlabel('sii')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Only rows that carry a label in 'sii' are considered
supervised_usable = train.dropna(subset=['sii'])

# Per-column missing count and missing share, most-missing column first
missing_count = (
    supervised_usable.isnull().sum()
    .rename_axis('feature')
    .reset_index(name='null_count')
    .assign(null_ratio=lambda table: table['null_count'] / len(supervised_usable))
    .sort_values('null_count', ascending=False)
)

bar_positions = np.arange(len(missing_count))
missing_share = missing_count['null_ratio']

# Stacked horizontal bars: missing share first, available share on top of it
plt.figure(figsize=(6, 15))
ax = plt.gca()
ax.set_title(f'Missing values over the {len(supervised_usable)} samples which have a label sii')

ax.barh(bar_positions, missing_share, color='coral', label='missing')
ax.barh(bar_positions, 1 - missing_share, left=missing_share,
        color='darkseagreen', label='available')

# One tick per feature on y; x shown as a percentage between 0 and 100 %
ax.set_yticks(bar_positions)
ax.set_yticklabels(missing_count['feature'])
ax.xaxis.set_major_formatter(PercentFormatter(xmax=1, decimals=0))
ax.set_xlim(0, 1)
ax.legend()
plt.show()

In [None]:
# Pie chart: share of each value in 'Basic_Demos-Enroll_Season'
plt.figure(figsize=(5, 5))
season_counts = train['Basic_Demos-Enroll_Season'].value_counts()
season_counts.plot.pie(autopct='%1.1f%%', startangle=90, cmap='Set2')
plt.title('Enrollment Season Distribution', fontsize=14)
plt.ylabel('')  # hide the automatic y-axis label
plt.show()

In [None]:
# Pie Chart for 'Basic_Demos-Sex' column
plt.figure(figsize=(5, 5))  # Change size of chart
# value_counts() orders its result by frequency, so a positional `labels=[...]`
# list is not tied to the 0/1 codes and can end up on the wrong slice.
# Name each code first (0 -> Boy, 1 -> Girl, the same coding the per-gender
# bar chart uses) and let pandas label the slices from the index.
sex_counts = train['Basic_Demos-Sex'].map({0: 'Boy', 1: 'Girl'}).value_counts()
sex_counts.plot.pie(
    autopct='%1.1f%%',
    startangle=90,
    cmap='Set2',
)
plt.title('Sex Distribution', fontsize=14)
plt.ylabel('')  # Remove the label on y axis
plt.show()

In [None]:
# Share of each 'sii' level, drawn separately for each value of 'Basic_Demos-Sex'

# Tick labels for the four target levels
target_labels = ['None', 'Mild', 'Moderate', 'Severe']
bar_colors = ['green', 'coral']
sex_names = ['boys', 'girls']

# Two stacked panels sharing both axes
_, axs = plt.subplots(2, 1, sharex=True, sharey=True, figsize=(8, 6))

for sex, ax in zip(range(2), axs.ravel()):
    # Normalised counts of 'sii' for the rows of this sex code
    level_share = train.loc[train['Basic_Demos-Sex'] == sex, 'sii'].value_counts(normalize=True)

    ax.bar(level_share.index, level_share.values,
           color=bar_colors[sex], label=sex_names[sex])

    # Tick positions and labels on x; percentage formatting on y
    ax.set_xticks(np.arange(len(target_labels)))
    ax.set_xticklabels(target_labels)
    ax.yaxis.set_major_formatter(PercentFormatter(xmax=1, decimals=0))
    ax.set_ylabel('Proportion (%)')
    ax.legend()

# Figure title, plus an x label on the bottom panel only
plt.suptitle('Target Distribution by Gender')
axs.ravel()[1].set_xlabel('Severity Impairment Index (sii)')
plt.tight_layout()
plt.show()


In [None]:
# Keep only the float64 / int64 columns
numeric_data = train.select_dtypes(include=['float64', 'int64'])

# Pairwise correlation matrix of those columns
correlation_matrix = numeric_data.corr()

# Heatmap. Correlations are continuous and signed, so use a diverging colormap
# centred on 0 with fixed [-1, 1] limits. A qualitative palette such as 'Set2'
# has only a few unordered colours and makes the magnitudes unreadable.
plt.figure(figsize=(30, 30))
sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm', vmin=-1, vmax=1, center=0)
plt.title('Correlation Matrix')
plt.show()


## Merge Data

In [None]:
# Summarise the time-series files of the train split, then of the test split
train_ts, test_ts = (
    load_time_series(f"/kaggle/input/child-mind-institute-problematic-internet-use/series_{split}.parquet")
    for split in ("train", "test")
)

In [None]:
# Shape and first rows of the time-series summary table built for the train split.
print(train_ts.shape)
train_ts.head()

In [None]:
# Shape and first rows of the time-series summary table built for the test split.
print(test_ts.shape)
test_ts.head()

In [None]:
# Columns to keep from the tabular data: every test column except 'id',
# followed by the target 'sii' (so the printed count includes the target)
feature_col = test.columns.tolist()
del feature_col[feature_col.index("id")]   # raises ValueError when 'id' is absent
feature_col += ["sii"]
print(feature_col)
print("There are ", len(feature_col), "features.")

In [None]:
# Left-join the time-series summaries onto train and test by 'id'.
# For train, rows with fewer than 10 non-NaN values are dropped.
# 'id' is removed from both frames afterwards, so this cell cannot be re-run as is.
train = (
    train.merge(train_ts, how="left", on='id')
    .dropna(thresh=10, axis=0)
    .drop(columns='id')
)
test = test.merge(test_ts, how="left", on='id').drop(columns='id')

In [None]:
# Shape and first rows of train after the merge with the time-series summaries.
print(train.shape)
train.head()

In [None]:
# Shape and first rows of test after the merge with the time-series summaries.
print(test.shape)
test.head()

In [None]:
# Names of the time-series summary columns: all train_ts columns except 'id'
time_series_cols = train_ts.columns.tolist()
time_series_cols.pop(time_series_cols.index("id"))   # raises ValueError when 'id' is absent

In [None]:
# List the time-series summary column names and how many there are.
print(time_series_cols)
print(len(time_series_cols))

In [None]:
# Full column list = tabular columns saved earlier + time-series summary columns.
# A new list is built with `+` rather than aliasing `feature_col` and extending
# it in place, so `feature_col` stays untouched and re-running this cell does
# not append the time-series columns a second time.
print("Before add time series, there are", len(feature_col),"features.")
featuresCols = feature_col + time_series_cols
# print(featuresCols)
print("After add time series, there are ",len(featuresCols), "features.")

In [None]:
# Restrict train to the columns in featuresCols and drop rows whose 'sii' is missing
train = train[featuresCols].dropna(subset=['sii'])
print(train.shape)
train.head()

In [None]:
# Find the fields the data dictionary types as 'str'; they are converted to
# categorical further down
data_dictionary = pd.read_csv(
    '/kaggle/input/child-mind-institute-problematic-internet-use/data_dictionary.csv'
)[["Field", "Type"]]

# Keep a field only if it is typed 'str' AND it appears in featuresCols
is_string_feature = (data_dictionary['Type'] == 'str') & data_dictionary['Field'].isin(featuresCols)
filtered_data = data_dictionary[is_string_feature]

print(filtered_data)
filtered_data.shape

In [None]:
# Names of the fields typed 'str' in the data dictionary that are also in featuresCols.
category_list = filtered_data['Field'].tolist()
print(category_list)

In [None]:
# Replace NaN with 'Missing' and switch the dtype to category
def update(df):
    """Fill and retype every column named in the module-level ``category_list``.

    For each such column, missing values become the string ``'Missing'`` and
    the column is cast to ``category``. ``df`` is modified in place and also
    returned.
    """
    for column in category_list:
        df[column] = df[column].fillna('Missing').astype('category')
    return df

# update() modifies the frame it receives and returns that same frame, so these
# assignments rebind `train` / `test` to the objects they already referred to.
train = update(train)
test = update(test)

In [None]:
# Preview train after the string columns were filled and cast to category.
train.head()

In [None]:
# Helper that turns the distinct values of a column into integer codes
def create_mapping(column, dataset):
    """Return ``{value: code}`` for the distinct values of ``dataset[column]``.

    Codes are 0, 1, 2, ... in the order in which ``unique()`` yields the values.
    """
    codes = {}
    for position, value in enumerate(dataset[column].unique()):
        codes[value] = position
    return codes

# Encode every string feature with ONE value -> integer mapping shared by train
# and test. Building a separate mapping per frame (codes assigned by order of
# first appearance) can give the same category different codes in train and
# test, which silently corrupts the features seen at prediction time.
for col in category_list:
    # Column is already integer-coded (presumably this cell ran before): skip it
    if train[col].dtype.kind in 'i':
        print(f"{col} is already integer. Skipping mapping.")
        continue

    # Codes follow the order of first appearance in train ...
    mapping = create_mapping(col, train)
    # ... and values that occur only in test receive the next free codes
    for value in test[col].unique():
        if value not in mapping:
            mapping[value] = len(mapping)
    mapping_test = mapping  # same dictionary: test is encoded exactly like train

    print("Train: ", col, "have the map ", mapping)
    print("Test: ", col, "have the map ", mapping_test)
    # Cast to object first so .map() works on plain values, not on categories
    train[col] = train[col].astype(object).map(mapping).astype(int)
    test[col] = test[col].astype(object).map(mapping_test).astype(int)

In [None]:
def quadratic_weighted_kappa(y_true, y_pred):
    """Cohen's kappa between ``y_true`` and ``y_pred`` using quadratic weights."""
    score = cohen_kappa_score(y_true, y_pred, weights='quadratic')
    return score

def threshold_Rounder(oof_non_rounded, thresholds):
    """Turn continuous predictions into the classes 0-3 using three cut points.

    A value gets class ``k`` (0, 1 or 2) for the first ``thresholds[k]`` it is
    strictly below; a value below none of the three gets class 3.
    """
    below = [oof_non_rounded < thresholds[level] for level in range(3)]
    # np.select keeps the first matching condition, like the nested np.where form
    return np.select(below, [0, 1, 2], default=3)

def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    """Objective for the threshold search: the negated quadratic weighted kappa.

    The predictions are discretised with ``threshold_Rounder`` and scored
    against ``y_true``. The sign is flipped so that a minimiser maximises kappa.
    """
    return -quadratic_weighted_kappa(y_true, threshold_Rounder(oof_non_rounded, thresholds))

# Model Training

## Processing data for train, test and validation

In [None]:
# Constants
SEED = 42
n_splits = 5

# Seed Python's and NumPy's global random generators
random.seed(SEED)
np.random.seed(SEED)

# Silence all warnings
warnings.filterwarnings('ignore')

# Let pandas display every column
pd.set_option('display.max_columns', None)

In [None]:
# Feature matrix: every column of train except the label
X = train.drop(columns='sii').to_numpy()
# Label vector
y = train['sii'].to_numpy()
print(X.shape, y.shape, sep='\n')

In [None]:
# Convert the test DataFrame to a NumPy array named test_data
test_data = test.to_numpy()
print(test_data.shape)

In [None]:
# Mean-impute missing values: statistics are learned from X and then applied
# to both X and test_data.
# NOTE(review): the imputer is fitted on all of X before the cross-validation
# split, so each fold's validation rows contribute to the imputed means.
imputer = SimpleImputer(strategy='mean').fit(X)
X = imputer.transform(X)
test_data = imputer.transform(test_data)

In [None]:
# No resampling is applied: the *_res names refer to the same arrays as X and y
X_res = X
y_res = y

In [None]:
# Stratified, shuffled K-fold splitter used for validation
SKF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)

# Per-fold scores (train / validation)
train_S, test_S = [], []

# Out-of-fold predictions (raw and rounded) and one test-prediction column per fold
oof_non_rounded = np.zeros(len(y_res), dtype=float)
oof_rounded = np.zeros_like(oof_non_rounded, dtype=int)
test_preds = np.zeros((len(test_data), n_splits))

## Create parameter to create model

* Several models are used: LightGBM, XGBoost and CatBoost, combined through a Voting Regressor.
  * LightGBM: A gradient boosting framework known for its speed and efficiency when working with large datasets.
  * XGBoost: Another powerful gradient boosting model, used for structured data.
  * CatBoost: Optimized for categorical features without complex preprocessing.
  * Voting Regressor: An ensemble model that combines predictions from LightGBM, XGBoost, and CatBoost to leverage the strengths of each model to produce more accurate results.
    
* Cross-Validation: Stratified K-Fold cross-validation splits the data into training and validation sets while keeping the class distribution balanced in each fold. It is well suited to problems where the classes are imbalanced.
* Threshold Optimization: The minimize function from the scipy.optimize library is used to fine-tune the decision thresholds to map continuous predictions to discrete categories (None, Mild, Moderate, Severe).
* Quadratic Weighted Kappa (QWK): The performance of the models is evaluated using QWK, a measure of the degree of agreement between predicted and actual values, while taking into account the ordinal nature of the target variable.

In [None]:
# LightGBM hyper-parameters (GPU device)
Params = dict(
    learning_rate=0.046,
    max_depth=12,
    num_leaves=478,
    min_data_in_leaf=13,
    feature_fraction=0.893,
    bagging_fraction=0.784,
    bagging_freq=4,
    lambda_l1=10,
    lambda_l2=0.01,
    device='gpu',
)

# XGBoost hyper-parameters (histogram tree method on CUDA, seeded with SEED)
XGB_Params = dict(
    learning_rate=0.05,
    max_depth=6,
    n_estimators=200,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=1,
    reg_lambda=5,
    random_state=SEED,
    tree_method='hist',
    device='cuda',
)

# CatBoost hyper-parameters (GPU task, seeded with SEED).
# No categorical-feature list is declared in this dictionary.
CatBoost_Params = dict(
    learning_rate=0.05,
    depth=6,
    iterations=200,
    random_seed=SEED,
    verbose=0,
    l2_leaf_reg=100,
    task_type='GPU',
)



> [IMPROVEMENT] Changed the weights of the combined (voting) model from [4, 3, 4] to [20, 10, 15]



In [None]:
# Instantiate the three base regressors
Light = LGBMRegressor(n_estimators=200, verbose=-1, **Params)
XGB_Model = XGBRegressor(**XGB_Params)
CatBoost_Model = CatBoostRegressor(**CatBoost_Params)

# Weighted ensemble of the three (weights given in the same order as the estimators)
ensemble_members = [
    ('lightgbm', Light),
    ('xgboost', XGB_Model),
    ('catboost', CatBoost_Model),
]
voting_model = VotingRegressor(estimators=ensemble_members, weights=[20, 10, 15])

## Training model

In [None]:
for fold, (train_idx, val_idx) in enumerate(SKF.split(X_res, y_res)):
    # Rows of this fold used for fitting and for validation
    X_train, y_train = X_res[train_idx], y_res[train_idx]
    X_val, y_val = X_res[val_idx], y_res[val_idx]

    # Fit the ensemble on the fold's training rows
    model = voting_model
    model.fit(X_train, y_train)

    # Continuous predictions for the training rows and the validation rows
    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_val)
    y_val_pred_rounded = np.round(y_val_pred).astype(int)

    # Store the out-of-fold predictions, raw and rounded to the nearest integer
    oof_non_rounded[val_idx] = y_val_pred
    oof_rounded[val_idx] = y_val_pred_rounded

    # Quadratic weighted kappa on rounded predictions, train and validation
    train_kappa = quadratic_weighted_kappa(y_train, np.round(y_train_pred).astype(int))
    val_kappa = quadratic_weighted_kappa(y_val, y_val_pred_rounded)
    train_S.append(train_kappa)
    test_S.append(val_kappa)

    # This fold's prediction for the test set goes into its own column
    test_preds[:, fold] = model.predict(test_data)

    print(f"Fold {fold + 1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")

In [None]:
# Average of the per-fold scores appended to train_S / test_S by the training loop
# (test_S holds the validation-fold scores).
print(f"Mean Train QWK --> {np.mean(train_S):.4f}")
print(f"Mean Validation QWK ---> {np.mean(test_S):.4f}")



> [IMPROVEMENT] Changed the initial guess of KappaOptimizer from x0=[0.5, 1.49, 2.5] to x0=[0.5, 1.5, 2.5]





In [None]:
# Optimize the three class thresholds with the Nelder-Mead method.
# The labels passed to the objective are `y_res`: `oof_non_rounded` was sized
# from `y_res` and filled by index over the `y_res` folds, and the score below
# is computed against `y_res` too, so the same array is used throughout and the
# lengths stay aligned if a resampling step is ever introduced.
KappaOptimizer = minimize(evaluate_predictions,
                          x0=[0.5, 1.5, 2.5], args=(y_res, oof_non_rounded),
                          method='Nelder-Mead')

assert KappaOptimizer.success, "Optimization did not converge."
# Apply the tuned thresholds to the out-of-fold predictions and score them
oof_tuned = threshold_Rounder(oof_non_rounded, KappaOptimizer.x)
tKappa = quadratic_weighted_kappa(y_res, oof_tuned)
print(f"----> || Optimized QWK SCORE :: {Fore.CYAN}{Style.BRIGHT} {tKappa:.3f}{Style.RESET_ALL}")



> [IMPROVEMENT] Added `fold_weights` to take a weighted average of the per-fold test predictions before applying `threshold_Rounder`



In [None]:
# One weight per fold column of test_preds (five folds)
fold_weights = [1.25, 1.0, 1.0, 1.0, 1.0]
# Weighted average of the per-fold test predictions ...
weighted_total = np.dot(test_preds, fold_weights)
tpm = weighted_total / np.sum(fold_weights)
# ... converted to classes with the tuned thresholds
tpTuned = threshold_Rounder(tpm, KappaOptimizer.x)

In [None]:
# Out-of-fold diagnostics. `oof_rounded` is sized from and indexed over
# `y_res`, so every comparison below uses `y_res` to stay aligned with it.
# NOTE(review): these figures use plain rounding (`oof_rounded`), not the tuned
# thresholds applied to the submission; confirm that is the intended report.

# Count total misclassifications
misclassifications = np.sum(y_res != oof_rounded)
print(f"Total wrong validation predictions in train dataset: {misclassifications}")

# Final Classification Report
print("\nClassification Report (OOF Predictions):")
report = classification_report(y_res, oof_rounded, digits=4)
print(report)
with open("classification_report.txt", "w") as f:
    f.write(report)

# Confusion Matrix for Final OOF Predictions
conf_matrix = confusion_matrix(y_res, oof_rounded)
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=np.unique(y_res))
disp.plot(cmap=plt.cm.Blues)
plt.title("Confusion Matrix (OOF Predictions)")
plt.savefig("confusion_matrix_of_model.png")
plt.show()

In [None]:
# Pair the ids of the sample submission with the tuned test predictions.
# NOTE(review): this relies on `sample` listing ids in the same row order as
# the test table the predictions were made from; verify.
submission = sample[['id']].assign(sii=tpTuned)

# Submission Model

In [None]:
# Save submission: write it to 'submission.csv' without the index column,
# then show how many rows were assigned to each predicted class.
submission.to_csv('submission.csv', index=False)
print(submission['sii'].value_counts())