In [45]:
import numpy as np 
import pandas as pd 
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import xgboost 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler , OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, log_loss, roc_auc_score, make_scorer
from sklearn.model_selection import cross_val_predict
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
import optuna
import math
import optuna.visualization as vis
import kagglehub
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Decide between local or kaggle cloud storage     
'''
if 'kaggle' in os.listdir('/'):
    print('Kaggle environment')
    data_path = '/kaggle'
    KAGGLE_ENV = True
else:
    print('Local machine')
    data_path = 'kaggle'
    KAGGLE_ENV = False
    ''' 
    
KAGGLE_ENV = 'kaggle' in os.listdir('/')
data_path = '/kaggle' if KAGGLE_ENV else 'kaggle'
    
    
for dirname, _, filenames in os.walk(data_path):
    for filename in filenames:
        print(os.path.join(dirname, filename)) 

kaggle/working/submission.csv
kaggle/working/submission_0_92979.csv
kaggle/working/submission_0_93369.csv
kaggle/working/submission_final.csv
kaggle/working/submission_val_2.csv
kaggle/working/submission_optuna.csv
kaggle/working/submission_0_93337.csv
kaggle/working/submission_val.csv
kaggle/input/submission.csv
kaggle/input/depression-surveydataset-for-analysis/final_depression_dataset_1.csv
kaggle/input/playground-series-s4e11/sample_submission.csv
kaggle/input/playground-series-s4e11/test.csv
kaggle/input/playground-series-s4e11/train.csv


In [46]:
# load the data
df_train = pd.read_csv(data_path + '/input/playground-series-s4e11/train.csv')
df_test = pd.read_csv(data_path + '/input/playground-series-s4e11/test.csv')
df_sample_submission = pd.read_csv(data_path + '/input/playground-series-s4e11/sample_submission.csv')

df_original = pd.read_csv(data_path + '/input/depression-surveydataset-for-analysis/final_depression_dataset_1.csv')

# Prepare Data

In [47]:

df_original['Depression'] = df_original['Depression'].replace({'No': 0, 'Yes': 1})
df_original['Age'] = df_original['Age'].astype(float)
df_original['Work/Study Hours'] = df_original['Work/Study Hours'].astype(float)
df_original['Financial Stress'] = df_original['Financial Stress'].astype(float)

for column in df_train.select_dtypes(include=['object']).columns:
    df_train[column] = df_train[column].astype('category')
    
for column in df_test.select_dtypes(include=['object']).columns:
    df_test[column] = df_test[column].astype('category')

for column in df_original.select_dtypes(include=['object']).columns:
    df_original[column] = df_original[column].astype('category')  

X = df_train.drop('Depression', axis=1)
y = df_train['Depression']


# Model Section

## Simple XGB with all features, no cleaning at all

In [48]:
model = xgb.XGBClassifier(enable_categorical=True, 
                        eval_metric='logloss',
                        random_state=42)
model.fit(X,y)

y_pred = model.predict(df_test)

accuracy = accuracy_score(df_sample_submission['Depression'], y_pred)


print(f"Accuracy Score: {accuracy:.4f}")
print ("Atcually accuarcy is 0.92979 at kaggle")


if not KAGGLE_ENV:
    output = pd.DataFrame({'id': df_test['id'], 'Depression': y_pred})
    output.to_csv(data_path + '/working/submission_0_92979.csv', index=False)

Accuracy Score: 0.8433
Atcually accuarcy is 0.92979 at kaggle


## XGB Second Version

- drop id feature from train data
- merge df_train and original dataset
- split merged datset in train and val set

In [49]:
df_train = df_train.drop('id', axis=1)
df_mixed = pd.concat([df_train, df_original], ignore_index=True)

X = df_mixed.drop('Depression', axis=1)
y = df_mixed['Depression']

In [50]:
for column in X.select_dtypes(include=['object']).columns:
    X[column] = X[column].astype('category')

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

model = xgb.XGBClassifier(enable_categorical=True, 
                        eval_metric='logloss',
                        random_state=42)

model.fit(X_train, y_train)
y_pred = model.predict(X_valid)

accuracy = accuracy_score(y_valid, y_pred)
print(f"Accuracy on the validation set: {accuracy:.5f}")

df_test_features = df_test.drop('id', axis=1)
y_pred_test = model.predict(df_test_features)


# 0_93337
if not KAGGLE_ENV:
    output = pd.DataFrame({'id': df_test['id'], 'Depression': y_pred_test})
    output.to_csv(data_path + '/working/submission_0_93337.csv', index=False)

Accuracy on the validation set: 0.93337


## XGB + Hyperparameter Tuning with GridSearchCV
- started with big set of parameters for GridSearchCV - took with big GPU something about 3hours to calc

In [51]:

param_grid = {
    #, 5, 7, 10],  # Controls the depth of the trees
        'learning_rate': [0.1],#0.01, 0.05, 0.1, 0.2],  # Step size for updates
        'n_estimators': [300],#100, 200, 300],  # Number of trees
        'subsample': [0.8],#, 1.0],  # Fraction of data per tree
        'colsample_bytree': [0.8],#, 1.0],  # Fraction of features per tree
        'gamma': [0],#, 0.1, 0.2],  # Minimum loss reduction to make a split
        'tree_method': ['hist']  # GPU-friendly method
}
grid_search = GridSearchCV(
    estimator=XGBClassifier(enable_categorical=True, random_state=42, eval_metric='logloss'),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=-1  # all CPU cores
)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy (CV):", grid_search.best_score_)
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_valid)
accuracy = accuracy_score(y_valid, y_pred)
print(f"Accuracy on the validation set with optimized parameters: {accuracy:.5f}")

df_test_features = df_test.drop('id', axis=1)
y_pred_test = best_model.predict(df_test_features)

if not KAGGLE_ENV:
    output = pd.DataFrame({'id': df_test['id'], 'Depression': y_pred_test})
    output.to_csv('kaggle/working/submission_val_2.csv', index=False)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


4117.31s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
4117.51s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
4117.72s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
4117.92s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD

Best Parameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'n_estimators': 300, 'subsample': 0.8, 'tree_method': 'hist'}
Best Accuracy (CV): 0.9364856256024087
Accuracy on the validation set with optimized parameters: 0.93669


## Cross-Validation

In [52]:
# Best parameters from GridSearchCV
best_params = grid_search.best_params_

# Model with best parameters
model = xgb.XGBClassifier(
    **best_params,
    enable_categorical=True,
    eval_metric='logloss',
    random_state=42
)

# 5-fold cross-validation
scores = cross_val_score(
    model, X, y,  
    cv=5,  
    scoring=make_scorer(accuracy_score),
    n_jobs=-1
)

print(f"Cross-validation results: {scores}")
print(f"Average accuracy: {scores.mean():.5f}")

Cross-validation results: [0.93742147 0.937524   0.9350808  0.93714006 0.93933894]
Average accuracy: 0.93730


## Hyperparameter tuning with Optuna


In [53]:
import os
import pandas as pd
import xgboost as xgb
import optuna
from optuna.visualization import plot_param_importances, plot_optimization_history
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Split the data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the objective function for Optuna
def objective(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 1),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 1),
        'eval_metric': 'logloss'  # Internal metric for XGBoost
    }

    # Initialize and train the model
    model = xgb.XGBClassifier(**params, enable_categorical=True, random_state=42)
    model.fit(X_train, y_train)
    
    # Predictions and accuracy calculation
    y_pred = model.predict(X_valid)
    accuracy = accuracy_score(y_valid, y_pred)
    
    # Return 1 - accuracy to minimize in Optuna
    return 1 - accuracy

# Create and run the Optuna study
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

# Display the best parameters and score
print("Best parameters:", study.best_params)
print("Best accuracy:", 1 - study.best_value)

# Retrain the best model on the entire dataset
best_params = study.best_params
best_model = xgb.XGBClassifier(
    **best_params,
    enable_categorical=True,
    random_state=42
)

best_model.fit(X, y)

# Predictions for test data
df_test_features = df_test.drop('id', axis=1)
y_pred_test = best_model.predict(df_test_features)

# Save predictions for Kaggle if not in Kaggle environment
if not os.getenv('KAGGLE_KERNEL_RUN_TYPE'):
    output = pd.DataFrame({'id': df_test['id'], 'Depression': y_pred_test})
    output.to_csv('kaggle/working/submission_optuna.csv', index=False)

# Visualizations
plot_param_importances(study).show()
plot_optimization_history(study).show()


[I 2024-11-10 22:23:30,290] A new study created in memory with name: no-name-f021f83f-0c7a-4cdd-a00b-1274fd4f0325


[I 2024-11-10 22:23:42,038] Trial 0 finished with value: 0.0643934105821583 and parameters: {'max_depth': 7, 'learning_rate': 0.18533773825371938, 'n_estimators': 473, 'subsample': 0.8140482134457037, 'colsample_bytree': 0.820293706349073, 'gamma': 2.6480296365752642, 'min_child_weight': 4, 'reg_alpha': 0.6145657653666939, 'reg_lambda': 0.14080819285479573}. Best is trial 0 with value: 0.0643934105821583.
[I 2024-11-10 22:23:52,413] Trial 1 finished with value: 0.06613848945972356 and parameters: {'max_depth': 10, 'learning_rate': 0.191032962211525, 'n_estimators': 386, 'subsample': 0.6253554524659439, 'colsample_bytree': 0.6625650641285773, 'gamma': 2.7022419912243483, 'min_child_weight': 2, 'reg_alpha': 0.922723054243628, 'reg_lambda': 0.030735679231652435}. Best is trial 0 with value: 0.0643934105821583.
[I 2024-11-10 22:23:55,442] Trial 2 finished with value: 0.06215970961887474 and parameters: {'max_depth': 6, 'learning_rate': 0.23137965408037364, 'n_estimators': 402, 'subsample':

Best parameters: {'max_depth': 6, 'learning_rate': 0.03481022519760896, 'n_estimators': 287, 'subsample': 0.6199248986582387, 'colsample_bytree': 0.9646115128357537, 'gamma': 1.1748114496291104, 'min_child_weight': 3, 'reg_alpha': 0.16048663214285405, 'reg_lambda': 0.476652850573662}
Best accuracy: 0.9400390897668575
