## Import required libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Single StandardScaler instance shared at module level; the `standardize`
# helper defined later in the notebook reads this global.
scaler = StandardScaler()

## Read the CSV file

In [2]:
# Load both competition tables, using the `id` column as the row index.
train_data = pd.read_csv(
    "/kaggle/input/child-mind-institute-problematic-internet-use/train.csv",
    index_col="id",
)
test_data = pd.read_csv(
    "/kaggle/input/child-mind-institute-problematic-internet-use/test.csv",
    index_col="id",
)

# All preprocessing happens on copies so the frames as read stay available.
train_df, test_df = train_data.copy(), test_data.copy()

## One-hot encoding process

In [3]:
#Helper function
def OneHot_Encoding(original_dataframe, feature_to_encode):
    """Replace one column with its one-hot indicator columns.

    Parameters:
    original_dataframe (pandas.DataFrame): Frame containing the column.
    feature_to_encode (str): Name of the column to encode.

    Returns:
    pandas.DataFrame: A new frame in which `feature_to_encode` is removed and
    its integer indicator columns are appended on the right. The input frame
    is not modified.
    """
    indicator_cols = pd.get_dummies(original_dataframe[[feature_to_encode]], dtype=int)
    # Append the indicators first, then remove the source column by name.
    widened = pd.concat([original_dataframe, indicator_cols], axis=1)
    return widened.drop(columns=[feature_to_encode])
    
# Encode every column whose name contains 'Season', in both frames.
# The column list is taken from the test frame.
categorical_cols = [name for name in test_df.columns if 'Season' in name]
for col in categorical_cols:
    train_df, test_df = [
        OneHot_Encoding(frame, col) for frame in (train_df, test_df)
    ]

In [4]:
# Drop every training column that has no counterpart in the test frame.
# The 'sii' column is exempt from removal.
missing_columns = set(train_df.columns).difference(test_df.columns, {'sii'})

train_df = train_df.drop(columns=missing_columns)

## Standardization

In [5]:
# Helper function
def _columns_to_standardize(df):
    """Return the columns of `df` that are neither 'sii' nor two-valued."""
    return [
        col for col in df.columns
        if col != 'sii' and df[col].nunique() != 2
    ]


def standardize(df, columns=None, fit=True):
    """Z-score selected columns of `df` using the module-level `scaler`.

    Parameters:
    df (pandas.DataFrame): Frame to scale. The selected columns are
        overwritten in place and the same frame is returned.
    columns (list or None): Columns to scale. When None, every column except
        'sii' and those with exactly two distinct non-null values is used
        (the original behaviour).
    fit (bool): When True (default, the original behaviour) the scaler is
        fitted on `df` before transforming. Pass False to reuse the
        statistics of a previous fit, e.g. when scaling the test frame with
        the training-set mean and variance.

    Returns:
    pandas.DataFrame: `df` with the selected columns standardized.
    """
    if columns is None:
        columns = _columns_to_standardize(df)
    columns = list(columns)
    if not columns:
        # Nothing to scale; avoid handing the scaler a zero-column frame.
        return df
    if fit:
        df[columns] = scaler.fit_transform(df[columns])
    else:
        df[columns] = scaler.transform(df[columns])
    return df

#Proceed with standardizing
# Choose the columns and fit the scaler on the training frame only, then apply
# the identical transform to the test frame. Re-fitting on the test frame
# would scale the two frames with different statistics, and a column that is
# two-valued in one frame but not in the other would be scaled in only one.
columns_to_scale = _columns_to_standardize(train_df)
train_df = standardize(train_df, columns=columns_to_scale, fit=True)
test_df = standardize(test_df, columns=columns_to_scale, fit=False)

## Handle missing values

In [6]:
# Keep only the training rows whose 'sii' value is present.
has_sii = train_df['sii'].notna()
train_df = train_df[has_sii]

In [7]:
# Fill each missing cell with the training-set mean of its feature.
# The test frame reuses the training means: imputing it with its own means
# gives it different statistics from the data the model is fitted on, and
# leaves NaNs behind in any column that is entirely missing in the test frame.
# Labels in `feature_means` that are not test columns (e.g. 'sii') are ignored.
feature_means = train_df.mean()
train_df = train_df.fillna(feature_means)
test_df = test_df.fillna(feature_means)

## Extract features and target

In [8]:
# Every column except 'sii' is a predictor; 'sii' is the target.
features = list(train_df.columns[train_df.columns != 'sii'])
X = train_df[features]
y = train_df['sii']

## Split dataset

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for validation; the seed is fixed so the
# split is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

## Define and train model

In [10]:
from sklearn.ensemble import RandomForestClassifier

# One hyper-parameter set shared by both forests, so the validation model and
# the final model cannot drift apart.
rf_params = dict(
    n_estimators=200,
    max_depth=10,
    max_features='sqrt',
    min_samples_split=5,
    min_samples_leaf=2,
    class_weight='balanced',
    random_state=42,
)

# `val_model` is fitted on the training split only and is used for scoring;
# `final_model` is fitted on all labelled rows and is used for the submission.
val_model = RandomForestClassifier(**rf_params)
final_model = RandomForestClassifier(**rf_params)

val_model.fit(X_train, y_train)
final_model.fit(X, y)

## Evaluation function

In [11]:
#Evaluation function
def quadratic_weighted_kappa(y_true, y_pred, n_classes):
    """
    Calculate the Quadratic Weighted Kappa (QWK) score.

    Parameters:
    y_true (list or numpy array): Actual integer labels in [0, n_classes).
    y_pred (list or numpy array): Predicted integer labels in [0, n_classes).
    n_classes (int): Number of distinct classes/labels (at least 2).

    Returns:
    float: QWK score. 1.0 when every sample carries the same label in both
    inputs (the chance-agreement term is zero there and agreement is
    perfect); NaN when the inputs are empty.

    Raises:
    ValueError: If n_classes < 2 or the inputs differ in shape.
    IndexError: If any label lies outside [0, n_classes) or is not an integer.
    """
    if n_classes < 2:
        raise ValueError("n_classes must be at least 2")

    true_labels = np.asarray(y_true)
    pred_labels = np.asarray(y_pred)
    if true_labels.shape != pred_labels.shape:
        # zip() would silently truncate to the shorter input.
        raise ValueError("y_true and y_pred must have the same shape")
    if true_labels.size == 0:
        return float("nan")

    lowest = min(true_labels.min(), pred_labels.min())
    highest = max(true_labels.max(), pred_labels.max())
    if lowest < 0 or highest >= n_classes:
        # Negative labels would otherwise wrap around to the last rows/columns.
        raise IndexError("labels must lie in the range [0, n_classes)")

    # Observed matrix O: O[i, j] counts samples with true class i predicted as j.
    # np.add.at accumulates correctly when the same (i, j) pair repeats.
    O = np.zeros((n_classes, n_classes), dtype=np.float64)
    np.add.at(O, (true_labels, pred_labels), 1)

    # Weight matrix W: squared class distance, normalised to [0, 1].
    class_ids = np.arange(n_classes)
    W = (class_ids[:, None] - class_ids[None, :]) ** 2 / (n_classes - 1) ** 2

    # Expected matrix E: outer product of the marginals, scaled to the sample count.
    actual_hist = np.sum(O, axis=1)
    pred_hist = np.sum(O, axis=0)
    E = np.outer(actual_hist, pred_hist) / np.sum(O)

    numerator = np.sum(W * O)
    denominator = np.sum(W * E)
    if denominator == 0:
        # Only possible when both inputs consist of one and the same class.
        return 1.0

    return float(1 - (numerator / denominator))

# Score the hold-out split. Both label vectors are cast to int because the
# metric uses them as matrix indices.
val_preds = np.asarray(val_model.predict(X_val)).astype(int)
y_val = np.asarray(y_val).astype(int)

val_score = quadratic_weighted_kappa(y_val, val_preds, 4)
print(val_score)

0.4024796628442614


## Submit

In [12]:
# Predict on exactly the columns the model was fitted on, in the same order.
# Passing the whole frame fails if the test frame carries a column that was
# not a training feature (e.g. a one-hot column for a category absent from
# the training data).
test_preds = final_model.predict(test_df[features])
test_preds = np.array(test_preds).astype(int)

In [13]:
# Build the two-column submission table (id, sii) and write it without the
# positional index.
output = (
    pd.DataFrame({'sii': test_preds}, index=test_data.index)
    .rename_axis('id')
    .reset_index()
)
output.to_csv('submission.csv', index=False)