Import required libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Read the train CSV file

In [2]:
# Load the training table; the `id` column becomes the row index.
df = pd.read_csv(
    "/kaggle/input/child-mind-institute-problematic-internet-use/train.csv",
    index_col="id",
)

Dataset overview

Drop rows that have no target value (`sii`)

In [3]:
# Keep only the rows whose target `sii` is present.
df = df[df['sii'].notna()]

One-hot encoding process

In [4]:
# Helper: swap one column for its one-hot indicator columns.
def encode_and_bind(original_dataframe, feature_to_encode):
    """Return a copy of the frame with one column replaced by its dummies.

    Parameters
    ----------
    original_dataframe : pandas.DataFrame
        Frame containing the column to encode. It is not modified.
    feature_to_encode : str
        Label of the column to one-hot encode.

    Returns
    -------
    pandas.DataFrame
        The input columns minus ``feature_to_encode``, followed by the
        integer indicator columns that ``pd.get_dummies`` produces for it.
    """
    indicator_columns = pd.get_dummies(
        original_dataframe[[feature_to_encode]], dtype=int
    )
    # Append the indicators first, then remove the source column.
    combined = pd.concat([original_dataframe, indicator_columns], axis=1)
    return combined.drop(columns=[feature_to_encode])

In [5]:
# One-hot encode each seasonal categorical column, one column per pass.
categorical_cols = [
    'Basic_Demos-Enroll_Season',
    'CGAS-Season',
    'Physical-Season',
    'Fitness_Endurance-Season',
    'FGC-Season',
    'BIA-Season',
    'PAQ_A-Season',
    'PAQ_C-Season',
    'PCIAT-Season',
    'SDS-Season',
    'PreInt_EduHx-Season',
]
df_encoded = df
for season_col in categorical_cols:
    # Each call returns a new frame with `season_col` replaced by its dummies.
    df_encoded = encode_and_bind(df_encoded, season_col)

Move the target column (`sii`) to the end

In [6]:
# Put the target last: every other column in its current order, then `sii`.
cols = df_encoded.columns.drop('sii').tolist() + ['sii']
df_encoded = df_encoded.loc[:, cols]

Remove columns that do not appear in test.csv

In [7]:
# Columns to remove because they are missing from test.csv: the individual
# PCIAT item answers and their total.
missing_columns = ['PCIAT-PCIAT_01', 'PCIAT-PCIAT_02', 'PCIAT-PCIAT_03', 'PCIAT-PCIAT_04',
       'PCIAT-PCIAT_05', 'PCIAT-PCIAT_06', 'PCIAT-PCIAT_07', 'PCIAT-PCIAT_08',
       'PCIAT-PCIAT_09', 'PCIAT-PCIAT_10', 'PCIAT-PCIAT_11', 'PCIAT-PCIAT_12',
       'PCIAT-PCIAT_13', 'PCIAT-PCIAT_14', 'PCIAT-PCIAT_15', 'PCIAT-PCIAT_16',
       'PCIAT-PCIAT_17', 'PCIAT-PCIAT_18', 'PCIAT-PCIAT_19', 'PCIAT-PCIAT_20',
       'PCIAT-PCIAT_Total']
# Drop them in one call rather than copying the frame once per column.
# As before, a KeyError is raised if any listed column is absent.
df_processed = df_encoded.drop(columns=missing_columns)

In [8]:
# Split the processed frame into the feature matrix and the target vector.
features = df_processed.columns.drop('sii').tolist()
X = df_processed.loc[:, features]
y = df_processed['sii']

In [9]:
# Impute every missing cell with the mean of its own column.
X = X.fillna(value=X.mean(axis=0))

In [10]:
# One-hot columns deliberately left out of the model inputs.
exclusion_list = [
    "Fitness_Endurance-Season_Winter",
    "BIA-Season_Spring",
    "PAQ_A-Season_Fall",
    "PAQ_A-Season_Spring",
    "PAQ_A-Season_Winter",
    "PCIAT-Season_Fall",
    "PCIAT-Season_Spring",
    "PCIAT-Season_Summer",
    "PCIAT-Season_Winter",
]
# Keep the remaining features in their original order.
filtered_features = list(filter(lambda name: name not in exclusion_list, features))
X = X.loc[:, filtered_features]

In [11]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees; the fixed seed makes the fit repeatable.
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
# Train on the imputed, filtered training features.
rf_model.fit(X, y)

In [12]:
# Load the test table with `id` as the row index, the same way train.csv is
# loaded. Without index_col, `test_data.index` is a positional 0..n-1 range,
# so the submission's `id` column (built from that index) would contain row
# numbers instead of the participant ids.
test_data = pd.read_csv(
    '/kaggle/input/child-mind-institute-problematic-internet-use/test.csv',
    index_col="id",
)

In [13]:
# Seasonal categorical columns to one-hot encode in the test table
# ('PCIAT-Season' is not in this list).
test_categorical_cols = [
    'Basic_Demos-Enroll_Season',
    'CGAS-Season',
    'Physical-Season',
    'Fitness_Endurance-Season',
    'FGC-Season',
    'BIA-Season',
    'PAQ_A-Season',
    'PAQ_C-Season',
    'SDS-Season',
    'PreInt_EduHx-Season',
]
for season_col in test_categorical_cols:
    # Same helper as for the training frame: replace the column by its dummies.
    test_data = encode_and_bind(test_data, season_col)

In [14]:
# Align the test columns to the exact feature list (and order) the model was
# trained on. reindex() creates any column that is absent from the test table
# and fills it with 0 instead of raising KeyError.
# NOTE(review): this assumes the only columns that can be absent are one-hot
# season dummies (a season that never occurs in the test rows), for which 0
# is the correct value - confirm for the hidden test set.
test_X = test_data.reindex(columns=filtered_features, fill_value=0)

# Impute missing cells with the TRAINING column means, i.e. the same values
# used to fill the training matrix `X`. Test-set means would differ from what
# the model saw and are NaN for any column that is entirely missing in test.
test_X = test_X.fillna(X.mean())

In [15]:
# Predict the `sii` class for every test row with the fitted forest.
test_preds = rf_model.predict(X=test_X)

In [16]:
# Save the predictions in the two-column layout used for competition scoring:
# `id` taken from the test frame's index, `sii` from the model predictions.
output = pd.DataFrame({'id': test_data.index}).assign(sii=test_preds)
output.to_csv('submission.csv', index=False)