Import required libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

In [2]:
scaler = StandardScaler()

Read the train CSV file

In [3]:
df = pd.read_csv("/kaggle/input/child-mind-institute-problematic-internet-use/train.csv", index_col="id")

Dataset overview

Delete row which not have output

In [4]:
df = df.dropna(subset=['sii'])

Remove columns which not appear in test.csv

One-hot encoding process

In [5]:
#Helper function
def encode_and_bind(original_dataframe, feature_to_encode):
    dummies = pd.get_dummies(original_dataframe[[feature_to_encode]], dtype=int)
    original_dataframe = pd.concat([original_dataframe, dummies], axis=1)
    original_dataframe = original_dataframe.drop([feature_to_encode], axis=1)
    return original_dataframe

In [6]:
# Proceed with encoding
categorical_cols = ['Basic_Demos-Enroll_Season', 'CGAS-Season', 'Physical-Season',
       'Fitness_Endurance-Season', 'FGC-Season', 'BIA-Season', 'PAQ_A-Season',
       'PAQ_C-Season', 'PCIAT-Season', 'SDS-Season', 'PreInt_EduHx-Season']
df_encoded = df
for col in categorical_cols:
    df_encoded = encode_and_bind(df_encoded, col)

In [7]:
binary_columns = [col for col in df_encoded.columns if df_encoded[col].nunique() == 2]
columns_to_standardize = [col for col in df_encoded.columns if col not in binary_columns and col != 'sii']
df_encoded[columns_to_standardize] = scaler.fit_transform(df_encoded[columns_to_standardize])

Bring output to end

In [8]:
cols = [col for col in df_encoded.columns if col != 'sii'] + ['sii']
df_encoded = df_encoded[cols]

In [9]:
#missing colums
missing_columns = ['PCIAT-PCIAT_01', 'PCIAT-PCIAT_02', 'PCIAT-PCIAT_03', 'PCIAT-PCIAT_04',
       'PCIAT-PCIAT_05', 'PCIAT-PCIAT_06', 'PCIAT-PCIAT_07', 'PCIAT-PCIAT_08',
       'PCIAT-PCIAT_09', 'PCIAT-PCIAT_10', 'PCIAT-PCIAT_11', 'PCIAT-PCIAT_12',
       'PCIAT-PCIAT_13', 'PCIAT-PCIAT_14', 'PCIAT-PCIAT_15', 'PCIAT-PCIAT_16',
       'PCIAT-PCIAT_17', 'PCIAT-PCIAT_18', 'PCIAT-PCIAT_19', 'PCIAT-PCIAT_20',
       'PCIAT-PCIAT_Total']
df_processed = df_encoded
for column in missing_columns:
    df_processed = df_processed.drop(columns=column)

In [10]:
features = [col for col in df_processed.columns if col != 'sii']
X = df_processed[features]
y = df_processed.sii

In [11]:
#Fill missing cells
X = X.fillna(X.mean())

In [12]:
exclusion_list = [
    "Fitness_Endurance-Season_Winter", "BIA-Season_Spring", 
    "PAQ_A-Season_Fall", "PAQ_A-Season_Spring", "PAQ_A-Season_Winter", 
    "PCIAT-Season_Fall", "PCIAT-Season_Spring", "PCIAT-Season_Summer", 
    "PCIAT-Season_Winter"
]
filtered_features = [feature for feature in features if feature not in exclusion_list]
X = X[filtered_features]

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
from sklearn.ensemble import RandomForestClassifier

# Define a random forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

### Evaluation function

In [15]:
#Evaluation function
def quadratic_weighted_kappa(y_true, y_pred, n_classes):
    """
    Calculate the Quadratic Weighted Kappa (QWK) score.

    Parameters:
    y_true (list or numpy array): Actual values (ground truth).
    y_pred (list or numpy array): Predicted values.
    n_classes (int): Number of distinct classes/labels.

    Returns:
    float: QWK score.
    """
    # Create histogram matrix O (observed matrix)
    O = np.zeros((n_classes, n_classes), dtype=np.float64)
    for true, pred in zip(y_true, y_pred):
        O[true, pred] += 1

    # Create weight matrix W
    W = np.zeros((n_classes, n_classes), dtype=np.float64)
    for i in range(n_classes):
        for j in range(n_classes):
            W[i, j] = ((i - j) ** 2) / ((n_classes - 1) ** 2)

    # Create expected matrix E
    actual_hist = np.sum(O, axis=1)
    pred_hist = np.sum(O, axis=0)
    E = np.outer(actual_hist, pred_hist) / np.sum(O)

    # Calculate QWK
    numerator = np.sum(W * O)
    denominator = np.sum(W * E)
    kappa = 1 - (numerator / denominator)

    return kappa

val_preds = rf_model.predict(X_val)
val_preds = np.array(val_preds).astype(int)

y_val = np.array(y_val).astype(int)

print(quadratic_weighted_kappa(y_val, val_preds, 4))

0.2810574298657813


### Handle test data

In [16]:
test_data = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/test.csv')

In [17]:
test_categorical_cols = ['Basic_Demos-Enroll_Season', 'CGAS-Season', 'Physical-Season',
       'Fitness_Endurance-Season', 'FGC-Season', 'BIA-Season', 'PAQ_A-Season',
       'PAQ_C-Season','SDS-Season', 'PreInt_EduHx-Season']
for col in test_categorical_cols:
    test_data = encode_and_bind(test_data, col)

In [18]:
binary_columns = [col for col in test_data.columns if test_data[col].nunique() == 2]
columns_to_standardize = [col for col in test_data.columns if col not in binary_columns and col != 'sii' and col != 'id']
test_data[columns_to_standardize] = scaler.fit_transform(test_data[columns_to_standardize])

In [19]:
test_X = test_data[filtered_features]

test_X = test_X.fillna(test_X.mean())

In [20]:
test_X.head()

Unnamed: 0,Basic_Demos-Age,Basic_Demos-Sex,CGAS-CGAS_Score,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,...,PAQ_C-Season_Summer,PAQ_C-Season_Winter,SDS-Season_Fall,SDS-Season_Spring,SDS-Season_Summer,SDS-Season_Winter,PreInt_EduHx-Season_Fall,PreInt_EduHx-Season_Spring,PreInt_EduHx-Season_Summer,PreInt_EduHx-Season_Winter
0,-1.583385,0,-1.090305,-0.6249326,-1.043709,-1.250822,4.662937e-16,-6.055762e-17,-5.366078e-16,-8.074349000000001e-17,...,0,0,0,0,0,0,1,0,0,0
1,-0.4819,0,-4.8572260000000006e-17,-1.225173,-0.7438588,-1.462229,-1.214286,0.2484271,-1.308011,0.2197332,...,0,0,1,0,0,0,0,0,1,0
2,-0.206529,1,0.8058773,-0.6732225,0.5305039,-0.1585549,4.662937e-16,-0.3092664,1.382755,-0.0269061,...,1,0,1,0,0,0,0,0,1,0
3,-0.4819,0,0.8058773,-0.3260439,0.4555414,0.1057033,4.662937e-16,-0.5881131,1.719101,-0.0269061,...,0,1,0,0,1,0,0,0,0,1
4,1.996442,1,-4.8572260000000006e-17,7.173749e-16,2.732857e-16,-8.540177e-16,4.662937e-16,-6.055762e-17,-5.366078e-16,-8.074349000000001e-17,...,0,0,0,0,0,0,0,0,0,0


In [21]:
test_preds = rf_model.predict(test_X)
test_preds = np.array(test_preds).astype(int)

### Submit

In [22]:
# Run the code to save predictions in the format used for competition scoring

output = pd.DataFrame({'id': test_data.id,
                       'sii': test_preds})
output.to_csv('submission.csv', index=False)