# Imports

In [2]:
import os
import logging
import json

import optuna

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from kaggle.api.kaggle_api_extended import KaggleApi

from utils import DepressionDataProcessor

from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score

from catboost import CatBoostClassifier

plt.style.use('ggplot')
%matplotlib inline

ISKAGGLE = os.environ.get("KAGGLE_KERNEL_RUN_TYPE", "") != ""

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")


# Data Loading

In [3]:

# Build the project's data processor for the Kaggle dataset slug below, then
# fetch the dataset (the log output of this cell reports a download through the
# Kaggle API followed by extraction).
processor = DepressionDataProcessor("anthonytherrien/depression-dataset")
processor.download_and_extract()

2025-02-20 23:22:58,882 - INFO - Downloading dataset using Kaggle API...


datasets\depression-dataset
Dataset URL: https://www.kaggle.com/datasets/anthonytherrien/depression-dataset


2025-02-20 23:23:01,429 - INFO - Dataset downloaded and extracted successfully.


In [4]:
# Load the dataset's CSV by file name through the processor.
# NOTE(review): presumably resolved relative to the extracted dataset
# directory — confirm in utils.DepressionDataProcessor.load_data.
raw_df = processor.load_data("depression_data.csv")

In [5]:

# Derive the modelling frame from raw_df via the processor's preprocess step
# (implemented in utils; not visible in this notebook).
processed_df = processor.preprocess(raw_df)
# Print column dtypes and non-null counts as a quick sanity check.
processed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 413768 entries, 0 to 413767
Data columns (total 20 columns):
 #   Column                           Non-Null Count   Dtype   
---  ------                           --------------   -----   
 0   Age                              413768 non-null  int64   
 1   Marital Status                   413768 non-null  category
 2   Education Level                  413768 non-null  category
 3   Number of Children               413768 non-null  category
 4   Smoking Status                   413768 non-null  category
 5   Physical Activity Level          413768 non-null  category
 6   Employment Status                413768 non-null  category
 7   Income                           413768 non-null  float64 
 8   Alcohol Consumption              413768 non-null  category
 9   Dietary Habits                   413768 non-null  category
 10  Sleep Patterns                   413768 non-null  category
 11  History of Mental Illness        413768 non-null  ca

In [6]:
# List the distinct values observed in every category-typed column.
for feature, column in processed_df.select_dtypes(include='category').items():
    print(f'{feature}: {column.unique()}')

Marital Status: ['Married', 'Widowed', 'Divorced', 'Single']
Categories (4, object): ['Divorced', 'Married', 'Single', 'Widowed']
Education Level: ['Bachelor's Degree', 'High School', 'Master's Degree', 'Associate Degree', 'PhD']
Categories (5, object): ['Associate Degree', 'Bachelor's Degree', 'High School', 'Master's Degree', 'PhD']
Number of Children: ['2', '1', '3', '0', '4']
Categories (5, object): ['0', '1', '2', '3', '4']
Smoking Status: ['Non-smoker', 'Former', 'Current']
Categories (3, object): ['Current', 'Former', 'Non-smoker']
Physical Activity Level: ['Active', 'Sedentary', 'Moderate']
Categories (3, object): ['Active', 'Moderate', 'Sedentary']
Employment Status: ['Unemployed', 'Employed']
Categories (2, object): ['Employed', 'Unemployed']
Alcohol Consumption: ['Moderate', 'High', 'Low']
Categories (3, object): ['High', 'Low', 'Moderate']
Dietary Habits: ['Moderate', 'Unhealthy', 'Healthy']
Categories (3, object): ['Healthy', 'Moderate', 'Unhealthy']
Sleep Patterns: ['Fair

In [7]:
# Columns of processed_df used as model inputs.
features = [
    'Income Bucket Employment Status',
    'Marital Status',
    'Number of Children',
    'Smoking Status',
    'Physical Activity Level',
    'Alcohol Consumption',
    'Dietary Habits',
    'Sleep Patterns',
    'History of Substance Abuse',
    'Family History of Depression',
    'Chronic Medical Conditions',
    'Age Bucket',
    'Attended University',
    'Education Level',
]
# Column the model is trained to predict.
target = 'History of Mental Illness'

# Prepare the data
X = processed_df[features]
y = processed_df[target]

# Split the data. Stratifying on the target keeps the class balance equal in
# both partitions; random_state is pinned so the hold-out set is identical on
# every Restart & Run All (without it each run evaluates on different rows).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Model Hyperparameter Tuning

In [8]:
# Positional indices of the category-typed columns; passed to CatBoost as
# cat_features when fitting.
categorical_features_indices = [
    X_train.columns.get_loc(col)
    for col in X_train.select_dtypes(include='category').columns
]

# Carve a validation split out of the training data for early stopping and
# trial scoring. random_state is pinned so every tuning run scores trials
# against the same rows; otherwise trial values are not comparable across runs.
X_train_sub, X_valid, y_train_sub, y_valid = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=42
)

# Ratio of class-0 frequency to class-1 frequency in the training subset, used
# as CatBoost's scale_pos_weight. The intermediate proportions get their own
# name so `class_weights` only ever holds the scalar ratio, never a dict.
class_proportions = y_train_sub.value_counts(normalize=True).to_dict()
class_weights = class_proportions[0] / class_proportions[1]

def objective(trial):
    """Train one CatBoost candidate and score it on the validation split.

    Hyperparameters are drawn from the Optuna ``trial``; the class ratio, GPU
    settings, eval metric and verbosity are fixed for every trial.

    Returns the support-weighted recall of the predictions on ``X_valid``
    (for single-label classification this value coincides with accuracy).
    """
    # Search space: sampled afresh for each trial.
    sampled = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1e-1, log=True),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 1),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10, log=True),
        'border_count': trial.suggest_int('border_count', 32, 255),
    }
    # Settings shared by all trials.
    fixed = {
        'scale_pos_weight': class_weights,
        'task_type': "GPU",
        'devices': '0',
        'eval_metric': 'Recall',
        'verbose': False,
    }

    candidate = CatBoostClassifier(**sampled, **fixed)
    candidate.fit(
        X_train_sub,
        y_train_sub,
        cat_features=categorical_features_indices,
        eval_set=(X_valid, y_valid),
        early_stopping_rounds=50,
    )

    valid_predictions = candidate.predict(X_valid)
    return recall_score(y_valid, valid_predictions, average='weighted')

# Seed the (default) TPE sampler so the sequence of suggested hyperparameters
# is repeatable between runs.
# NOTE(review): GPU training may still introduce small run-to-run differences.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=500)

# study.best_params only contains values drawn through trial.suggest_*, so
# re-attach every fixed setting the trials actually ran with. scale_pos_weight
# was previously dropped here, which meant the saved configuration did not
# reproduce the class weighting used during tuning.
best_params = study.best_params
best_params.update({
    'scale_pos_weight': float(class_weights),  # plain float keeps it JSON-serialisable
    'task_type': "GPU",
    'devices': '0',
    'eval_metric': 'Recall',
    'verbose': False,
})

[I 2025-02-20 23:23:03,118] A new study created in memory with name: no-name-87e42a6d-6855-4afa-9d0e-3d0128cd75f2
[I 2025-02-20 23:23:06,317] Trial 0 finished with value: 0.6167092125734484 and parameters: {'iterations': 317, 'depth': 7, 'learning_rate': 0.0014311659852842894, 'bagging_temperature': 0.832013166260799, 'l2_leaf_reg': 3.006147667167895, 'border_count': 245}. Best is trial 0 with value: 0.6167092125734484.
[I 2025-02-20 23:23:26,062] Trial 1 finished with value: 0.5600048336178118 and parameters: {'iterations': 643, 'depth': 9, 'learning_rate': 0.07448479616662573, 'bagging_temperature': 0.6119369408528859, 'l2_leaf_reg': 1.1973544902203124, 'border_count': 163}. Best is trial 0 with value: 0.6167092125734484.
[I 2025-02-20 23:23:27,775] Trial 2 finished with value: 0.5836291406733833 and parameters: {'iterations': 185, 'depth': 4, 'learning_rate': 0.09432682538265774, 'bagging_temperature': 0.004056058287597697, 'l2_leaf_reg': 3.070270573235277, 'border_count': 117}. Bes

In [9]:
# Write the tuned hyperparameters to a JSON file in the working directory.
with open('catboost_best_params.json', 'w') as params_file:
    params_file.write(json.dumps(best_params))