In [184]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import mannwhitneyu
from scipy import stats

# Model
from sklearn.preprocessing import OneHotEncoder, StandardScaler, QuantileTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split, RandomizedSearchCV, KFold, RepeatedStratifiedKFold, StratifiedKFold, cross_val_score
from xgboost import XGBClassifier
from catboost import CatBoostClassifier, Pool, cv

# hyperparameter tuning
import optuna

In [375]:
try:
    kaggle_train_file_loc = "../input/playground-series-s4e11/train.csv"
    kaggle_test_file_loc = "../input/playground-series-s4e11/test.csv"
    df = pd.read_csv(kaggle_train_file_loc, index_col=0)
    df_test = pd.read_csv(kaggle_test_file_loc, index_col=0)
    
except:
    github_train_file_loc = "./dataset/train.csv"
    github_test_file_loc = "./dataset/test.csv"
    df = pd.read_csv(github_train_file_loc, index_col=0)
    df_test = pd.read_csv(github_test_file_loc, index_col=0)

# General Checks on the dataset

In [72]:
df.shape

(140700, 19)

In [73]:
df["Depression"].value_counts() / df.shape[0]

Depression
0    0.818287
1    0.181713
Name: count, dtype: float64

In [74]:
df.isna().sum()

Name                                          0
Gender                                        0
Age                                           0
City                                          0
Working Professional or Student               0
Profession                                36630
Academic Pressure                        112803
Work Pressure                             27918
CGPA                                     112802
Study Satisfaction                       112803
Job Satisfaction                          27910
Sleep Duration                                0
Dietary Habits                                4
Degree                                        2
Have you ever had suicidal thoughts ?         0
Work/Study Hours                              0
Financial Stress                              4
Family History of Mental Illness              0
Depression                                    0
dtype: int64

In [75]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 140700 entries, 0 to 140699
Data columns (total 19 columns):
 #   Column                                 Non-Null Count   Dtype  
---  ------                                 --------------   -----  
 0   Name                                   140700 non-null  object 
 1   Gender                                 140700 non-null  object 
 2   Age                                    140700 non-null  float64
 3   City                                   140700 non-null  object 
 4   Working Professional or Student        140700 non-null  object 
 5   Profession                             104070 non-null  object 
 6   Academic Pressure                      27897 non-null   float64
 7   Work Pressure                          112782 non-null  float64
 8   CGPA                                   27898 non-null   float64
 9   Study Satisfaction                     27897 non-null   float64
 10  Job Satisfaction                       112790 non-null  float

In [76]:
# X_train, X_valid, X_test, X_valid = train_test_split()

# Building a Preprocessing pipeline

1. Combine Profession
2. Combine Work + Academic Pressure
3. Study + Job Satisfaction
4. Clean Sleep Duration
5. Clean Degree
6. Separate Degree into Qualification and Field of Study

In [379]:
def combine_profession(row):
    if (row["Working Professional or Student"] != 'Student') and (pd.isna(row["Profession"])):
        # We make a huge assumption that working professionals with missing professions are unemployed.
        return 'Unemployed'
    elif row["Working Professional or Student"] != "Student":
        return row["Profession"]
    else:
        return "Student"

def combine_pressure(row):
    # Idea is to reduce the number of missing data since both columns are complementary.
    # Note that despite combining there are still rows that contain missing values and we will rely on simple imputation to fill missing values
    if pd.notna(row["Academic Pressure"]) and pd.isna(row["Work Pressure"]):
        return row["Academic Pressure"]
    elif pd.isna(row["Academic Pressure"]) and pd.notna(row["Work Pressure"]):
        return row["Work Pressure"]

def combine_satisfaction(row):
    # Idea is to reduce the number of missing data since both columns are complementary.
    # Note that despite combining there are still rows that contain missing values and we will rely on simple imputation to fill missing values
    
    if pd.notna(row["Study Satisfaction"]) and pd.isna(row["Job Satisfaction"]):
        return row["Study Satisfaction"]
    elif pd.isna(row["Study Satisfaction"]) and pd.notna(row["Job Satisfaction"]):
        return row["Job Satisfaction"]

def clean_sleep_duration(row):
    if row["Sleep Duration"] in ["Less than 5 hours", "3-4 hours", "1-3 hours", "1-2 hours",'2-3 hours', '3-4 hours', '4-5 hours']:
        return "Less than 5 hours"
    elif row["Sleep Duration"] in ["9-11 hours", '10-11 hours', '8-9 hours', "More than 8 hours"]:
        return "More than 8 hours"
    elif row["Sleep Duration"] in ["6-8 hours", "5-6 hours", "7-8 hours", "6-8 hours", '6-7 hours', '8 hours']:
        return "5 to 8 hours"
    else:
        return "Others"

# def clean_sleep_duration(row):
#     sleep={
#             "More than 8 hours":9,
#             'Less than 5 hours':4,
#             '5-6 hours':5.5,
#             '7-8 hours':7.5,
#             '1-2 hours':1.5,
#             '6-8 hours':7,
#             '4-6 hours':5,
#             '6-7 hours':6.5,
#             '10-11 hours':10.5,
#             '8-9 hours':8.5,
#             '9-11 hours':10,
#             '2-3 hours':2.5,
#             '3-4 hours':3.5,
#             'Moderate':6,
#             '4-5 hours':4.5,
#             '9-6 hours':7.5,
#             '1-3 hours':2,
#             '1-6 hours':4,
#             '8 hours':8,
#             '10-6 hours':8,
#             'Unhealthy':3,
#             'Work_Study_Hours':6,
#             '3-6 hours':3.5,
#             '9-5':7,
#             '9-5 hours':7,
#     }
#     return sleep.get(row["Sleep Duration"])


def clean_degree(row):
    degree = {
        "BCom": "B.Com",
        "B.Com": "B.Com",
        "B.Comm": "B.Com",
        "B.Tech": "B.Tech",
        "BTech": "B.Tech",
        "B.T": "B.Tech",
        "BSc": "B.Sc",
        "B.Sc": "B.Sc",
        "Bachelor of Science": "B.Sc",
        "BArch": "B.Arch",
        "B.Arch": "B.Arch",
        "BA": "B.A",
        "B.A": "B.A",
        "BBA": "BBA",
        "BB": "BBA",
        "BCA": "BCA",
        "BE": "BE",
        "BEd": "B.Ed",
        "B.Ed": "B.Ed",
        "BPharm": "B.Pharm",
        "B.Pharm": "B.Pharm",
        "BHM": "BHM",
        "LLB": "LLB",
        "LL B": "LLB",
        "LL BA": "LLB",
        "LL.Com": "LLB",
        "LLCom": "LLB",
        "MCom": "M.Com",
        "M.Com": "M.Com",
        "M.Tech": "M.Tech",
        "MTech": "M.Tech",
        "M_Tech": "M.Tech", 
        "M.T": "M.Tech",
        "MSc": "M.Sc",
        "M.Sc": "M.Sc",
        "Master of Science": "M.Sc",
        "MBA": "MBA",
        "MCA": "MCA",
        "MD": "MD",
        "ME": "ME",
        "MEd": "M.Ed",
        "M.Ed": "M.Ed",
        "MArch": "M.Arch",
        "M.Arch": "M.Arch",
        "MPharm": "M.Pharm",
        "M.Pharm": "M.Pharm",
        "MA": "MA",
        "M.A": "MA",
        "MHM": "MHM",
        "MPA": "MPA",
        "LLM": "LLM",
        "PhD": "PhD",
        "MBBS": "MBBS",
        "CA": "CA",
        "Class 12": "Class 12",
        "12th": "Class 12",
        "Class 11": "Class 11",
        "11th": "Class 11",
    }
    return degree.get(row["Degree"])

def create_qualification(row):
    if row["Degree"] in ["Class 12", "Class 11"]:
        return "High School"
    elif row["Degree"] in ["B.Ed", "B.Arch", "B.Com", "B.Pharm", "BCA", "BBA", "B.Sc", "LLB", "B.Tech", "B.A", "BE", "MBBS", "BHM"]:
        return "Bachelor"
    elif row["Degree"] in ["M.Ed", "MCA", "LLM", "M.Sc", "M.Tech", "M.Pharm", "MBA", "ME", "MD", "M.Com", "MA", "M.Arch", "MPA", "MHM"]:
        return "Masters"
    elif row["Degree"] == "PhD":
        return "PhD"

def create_field_of_study(row):
    if row["Degree"] in ["Class 12", "Class 11", "MPA"]:
        # MPA is given an unknown here because there is only 1 record of MPA
        return "General"
    elif row["Degree"] in ["PhD"]:
        return "Specialist"
    elif row["Degree"] in ["B.Ed", "M.Ed"]:
        return "Education"
    elif row["Degree"] in ["B.Arch", "M.Arch"]:
        return "Architecture"
    elif row["Degree"] in ["B.Com", "M.Com"]:
        return "Commerce"
    elif row["Degree"] in ["B.Pharm", "M.Pharm"]:
        return "Pharmacy"
    elif row["Degree"] in ["BCA", "MCA"]:
        return "Computer Application"
    elif row["Degree"] in ["BBA", "MBA"]:
        return "Business Administration"    
    elif row["Degree"] in ["B.Sc", "M.Sc"]:
        return "Science"
    elif row["Degree"] in ["B.Sc", "M.Sc"]:
        return "Science"
    elif row["Degree"] in ["LLB", "LLM"]:
        return "Law"
    elif row["Degree"] in ["B.Tech", "M.Tech"]:
        return "Technology"
    elif row["Degree"] in ["BHM", "MHM"]:
        return "Hospitality"
    elif row["Degree"] in ["B.A", "MA"]:
        return "Arts"
    elif row["Degree"] in ["ME", "BE"]:
        return "Engineering"
    elif row["Degree"] in ["MD", "MBBS"]:
        return "Medicine"


# def clean_diet(row):
#     diet = {
#         "More Healthy": 0,
#         "Healthy": 1,
#         "Less than Healthy": 2,
#         "Less Healthy": 2,
#         "Moderate": 3,
#         "Unhealthy": 4,
#         "No Healthy": 4,
#     }
#     if diet.get(row["Dietary Habits"]):
#         return diet.get(row["Dietary Habits"])
#     else:
#         return pd.NA

def manipulate_dataframe(df):
    result = df.copy()
    result["Profession"] = result.apply(combine_profession, axis=1)
    result["Pressure"] = result.apply(combine_pressure, axis=1)
    result["Satisfaction"] = result.apply(combine_satisfaction, axis=1)
    result["Sleep"] = result.apply(clean_sleep_duration, axis=1)
    # result["Diet"] = result.apply(clean_diet, axis=1)
    result["Degree"] = result.apply(clean_degree, axis=1)
    result["Qualification"] = result.apply(create_qualification, axis=1)
    result["Subject"] = result.apply(create_field_of_study, axis=1)

    # # this one omits the dietary habits
    # result = result.drop(["Name", 'Working Professional or Student', "Academic Pressure", "Work Pressure", "Study Satisfaction", "Job Satisfaction", "Sleep Duration", "CGPA", "Dietary Habits", "Degree"], axis=1)
    
    # # this variant includes Dietary Habits and Name
    # result = result.rename(columns={"Dietary Habits":"Diet"})
    # result = result.drop(['Working Professional or Student', "Academic Pressure", "Work Pressure", "Study Satisfaction", "Job Satisfaction", "Sleep Duration", "CGPA", "Degree"], axis=1)

    # # This variant includes Dietary Habits but omits name
    result = result.rename(columns={"Dietary Habits":"Diet"})
    result = result.drop(["Name", 'Working Professional or Student', "Academic Pressure", "Work Pressure", "Study Satisfaction", "Job Satisfaction", "Sleep Duration", "CGPA", "Degree"], axis=1)

    return result

In [384]:
prep_df = manipulate_dataframe(df)
X = prep_df.drop(columns=["Depression"])
y = prep_df.loc[:, "Depression"]

# X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.33, random_state=42)

In [387]:
# categorical values, require imputation, one hot encoding
cat_multiple_impute_ohe_list = ["Diet", "Qualification", "Subject"]
cat_multiple_impute_ohe_pipeline = Pipeline([
    ('imputer', SimpleImputer(missing_values=pd.NA, strategy='most_frequent')),
])

# numerical values, require imputation, quantile transform
num_qtrns_list = ["Work/Study Hours", "Age"]
num_impute_qtransform_pipeline = Pipeline([
    ('imputer', SimpleImputer(missing_values=pd.NA, strategy='most_frequent')),
    ('quantile_transform', StandardScaler())
])

# numerical values, no imputation, quantile transform
num_impute_qtranse_list = ["Pressure","Satisfaction","Financial Stress"]
num_qtransform_pipeline = Pipeline([
    ('imputer', SimpleImputer(missing_values=pd.NA, strategy='most_frequent')),
    ('quantile_transform', StandardScaler())
])

ct = ColumnTransformer([
    ('categorical_impute', cat_multiple_impute_ohe_pipeline, cat_multiple_impute_ohe_list),
    ('numerical_impute', num_impute_qtransform_pipeline, num_impute_qtranse_list),
    ('numerical_transform', num_qtransform_pipeline, num_qtrns_list)
], remainder='passthrough')

In [392]:
X_prep = pd.DataFrame(ct.fit_transform(X), columns=ct.get_feature_names_out())

col_names = []
for i in ct.get_feature_names_out():
    col_names.append(i.split("__")[1])

X_prep.columns = col_names

In [394]:
cat_features = [
    "Diet",
    "Qualification",
    "Subject",
    "Gender",
    "Have you ever had suicidal thoughts ?",
    "Family History of Mental Illness",
    "City",
    "Profession",
    "Sleep",
]

train_pool = Pool(X_prep, y, cat_features=cat_features)

### Optuna

In [398]:
def objective(trial):
    X_train, X_valid, y_train, y_valid = train_test_split(X_prep, y, test_size=0.1)

    params = {
        "loss_function": "Logloss",
        "depth": trial.suggest_int("depth", 4, 12),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.9),
        "iterations": trial.suggest_int("iterations", 100, 1500),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 10),
        "random_strength": trial.suggest_float("random_strength", 0.1, 10),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0, 1),
        "border_count": trial.suggest_int("border_count", 32, 255),
        "early_stopping_rounds": trial.suggest_int("early_stopping_rounds", 50, 51),
        # "custom_loss": trial.suggest_categorical("custom_loss", ["AUC"])
        # "grow_policy": trial.suggest_categorical("grow_policy", ["SymmetricTree", "Depthwise", "Lossguide"])
    }

    # processed_X = ct.fit_transform(X)
    train_pool = Pool(X_train, y_train, cat_features=cat_features)
    valid_pool = Pool(X_valid, y_valid, cat_features=cat_features)

    model = CatBoostClassifier(**params)
    model.fit(train_pool, eval_set=valid_pool, verbose=False)

    return model.best_score_["validation"]["Logloss"]

In [399]:
# Define EarlyStoppingCallback for Optuna
class EarlyStoppingCallback:
    def __init__(self, patience: int):
        self.patience = patience
        self.no_improvement_trials = 0
        self.best_value = -np.inf

    def __call__(self, study, trial):
        if study.best_value > self.best_value:
            self.best_value = study.best_value
            self.no_improvement_trials = 0
        else:
            self.no_improvement_trials += 1

        if self.no_improvement_trials >= self.patience:
            print(f"Early stopping triggered. No improvement after {self.patience} trials.")
            study.stop()

In [400]:
early_stopping = EarlyStoppingCallback(patience=200)
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=500, callbacks=[early_stopping])

[I 2024-11-28 15:19:57,271] A new study created in memory with name: no-name-ad589222-7e83-4757-bbc2-958be5b9005c
[I 2024-11-28 15:20:06,269] Trial 0 finished with value: 0.15162938689802619 and parameters: {'depth': 6, 'learning_rate': 0.7701325717477382, 'iterations': 1321, 'l2_leaf_reg': 8.194886238374218, 'random_strength': 4.901896728185532, 'bagging_temperature': 0.2012747763212519, 'border_count': 157, 'early_stopping_rounds': 50}. Best is trial 0 with value: 0.15162938689802619.
[I 2024-11-28 15:20:15,510] Trial 1 finished with value: 0.15684736139295427 and parameters: {'depth': 10, 'learning_rate': 0.7398348120036699, 'iterations': 1474, 'l2_leaf_reg': 1.9213206536038205, 'random_strength': 8.272026337013564, 'bagging_temperature': 0.4976240571885985, 'border_count': 248, 'early_stopping_rounds': 51}. Best is trial 0 with value: 0.15162938689802619.
[I 2024-11-28 15:21:26,634] Trial 2 finished with value: 0.14693769378508886 and parameters: {'depth': 10, 'learning_rate': 0.03

Early stopping triggered. No improvement after 200 trials.


In [401]:
best_params = study.best_params
# best_params = {'n_estimators': 623,
#  'max_depth': 6,
#  'learning_rate': 0.04337168717098735,
#  'subsample': 0.5762710438011511,
#  'colsample_bytree': 0.19790812731138763,
#  'gamma': 0.22686965140524085,
#  'min_child_weight': 3}
best_params

{'depth': 7,
 'learning_rate': 0.09245092826492232,
 'iterations': 547,
 'l2_leaf_reg': 6.749220199485208,
 'random_strength': 7.617718293873805,
 'bagging_temperature': 0.9069155038665843,
 'border_count': 53,
 'early_stopping_rounds': 50}

In [408]:
# optuna.visualization.plot_optimization_history(study)
# optuna.visualization.plot_parallel_coordinate(study)
# optuna.visualization.plot_slice(study)
optuna.visualization.plot_param_importances(study)

In [403]:
test_df = manipulate_dataframe(df_test)
test_df
orig_cols = test_df.columns

In [404]:
X_test_prep = pd.DataFrame(ct.transform(test_df), columns=col_names)
X_test_prep.head()

Unnamed: 0,Diet,Qualification,Subject,Pressure,Satisfaction,Financial Stress,Work/Study Hours,Age,Gender,City,Profession,Have you ever had suicidal thoughts ?,Family History of Mental Illness,Sleep
0,Moderate,Bachelor,Law,-0.732899,1.445772,0.007813,0.712923,1.018356,Male,Visakhapatnam,Judge,No,Yes,Less than 5 hours
1,Moderate,Bachelor,Education,-0.732899,0.734117,0.715218,-0.06557,1.422101,Female,Kolkata,Educational Consultant,No,No,Less than 5 hours
2,Moderate,Bachelor,Architecture,0.6938,-1.400847,0.715218,1.491416,1.018356,Male,Jaipur,Teacher,Yes,No,5 to 8 hours
3,Moderate,Bachelor,Science,1.407149,-1.400847,0.715218,0.972421,-1.404114,Female,Rajkot,Student,Yes,No,More than 8 hours
4,Moderate,Bachelor,Computer Application,1.407149,1.445772,0.715218,-0.844062,0.533862,Male,Kalyan,Teacher,Yes,No,5 to 8 hours


In [405]:
catboost_clf = CatBoostClassifier(**best_params)
catboost_clf.fit(X_prep, y, cat_features=cat_features)


0:	learn: 0.5563973	total: 57.3ms	remaining: 31.3s
1:	learn: 0.4635238	total: 105ms	remaining: 28.7s
2:	learn: 0.3918999	total: 152ms	remaining: 27.5s
3:	learn: 0.3560947	total: 179ms	remaining: 24.3s
4:	learn: 0.3210233	total: 229ms	remaining: 24.8s
5:	learn: 0.2902974	total: 280ms	remaining: 25.2s
6:	learn: 0.2628750	total: 325ms	remaining: 25s
7:	learn: 0.2474028	total: 373ms	remaining: 25.1s
8:	learn: 0.2405023	total: 400ms	remaining: 23.9s
9:	learn: 0.2275013	total: 452ms	remaining: 24.3s
10:	learn: 0.2203488	total: 505ms	remaining: 24.6s
11:	learn: 0.2156919	total: 541ms	remaining: 24.1s
12:	learn: 0.2103997	total: 583ms	remaining: 23.9s
13:	learn: 0.2051951	total: 629ms	remaining: 23.9s
14:	learn: 0.1992139	total: 677ms	remaining: 24s
15:	learn: 0.1926684	total: 727ms	remaining: 24.1s
16:	learn: 0.1895201	total: 771ms	remaining: 24s
17:	learn: 0.1866602	total: 806ms	remaining: 23.7s
18:	learn: 0.1845831	total: 848ms	remaining: 23.6s
19:	learn: 0.1830639	total: 885ms	remaining: 2

<catboost.core.CatBoostClassifier at 0x34799d690>

In [407]:
df_test["pred"] = catboost_clf.predict(X_test_prep)
# df_test.loc[:, ["pred"]].to_csv("submission.csv")
df_test.loc[:, ["pred"]].to_csv("try2_28nov.csv")

In [378]:
# df_test

Unnamed: 0_level_0,Name,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,pred
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
140700,Shivam,Male,53.0,Visakhapatnam,Working Professional,Judge,,2.0,,,5.0,Less than 5 hours,Moderate,LLB,No,9.0,3.0,Yes,0
140701,Sanya,Female,58.0,Kolkata,Working Professional,Educational Consultant,,2.0,,,4.0,Less than 5 hours,Moderate,B.Ed,No,6.0,4.0,No,0
140702,Yash,Male,53.0,Jaipur,Working Professional,Teacher,,4.0,,,1.0,7-8 hours,Moderate,B.Arch,Yes,12.0,4.0,No,0
140703,Nalini,Female,23.0,Rajkot,Student,,5.0,,6.84,1.0,,More than 8 hours,Moderate,BSc,Yes,10.0,4.0,No,1
140704,Shaurya,Male,47.0,Kalyan,Working Professional,Teacher,,5.0,,,5.0,7-8 hours,Moderate,BCA,Yes,3.0,4.0,No,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
234495,Zoya,Female,49.0,Jaipur,Working Professional,Pilot,,3.0,,,5.0,Less than 5 hours,Moderate,BSc,Yes,2.0,2.0,Yes,0
234496,Shlok,Male,29.0,Ahmedabad,Working Professional,Pilot,,5.0,,,1.0,7-8 hours,Moderate,BE,Yes,11.0,3.0,Yes,1
234497,Rishi,Male,24.0,Visakhapatnam,Student,,1.0,,7.51,4.0,,7-8 hours,Moderate,B.Tech,No,7.0,1.0,No,0
234498,Eshita,Female,23.0,Kalyan,Working Professional,Marketing Manager,,4.0,,,2.0,5-6 hours,Healthy,BA,Yes,7.0,5.0,Yes,1


### RandomSearchCV

In [1040]:
# clf_xgb = Pipeline([
#     ('preprocessing', ct),
#     ('model', XGBClassifier())
# ])
# param_dist = {
#     "model__n_estimators": stats.randint(100, 500),
#     "model__learning_rate": stats.uniform(0.01, 0.8),
#     "model__subsample": stats.uniform(0.2, 0.8),
#     "model__max_depth": stats.randint(1, 10),
#     "model__colsample_bytree": stats.uniform(0.1, 1),
#     "model__min_child_weight": [1, 2, 3, 4],
# }

# numFolds = 5
# kfold_5 = RepeatedStratifiedKFold(n_splits=numFolds, n_repeats=5)

# clf = RandomizedSearchCV(
#     clf_xgb,
#     param_distributions=param_dist,
#     cv=kfold_5,
#     n_iter=100,
#     scoring="roc_auc",
#     error_score=0,
#     verbose=3,
#     n_jobs=-1,
# )

# clf.fit(X, y)

In [376]:
df_test

Unnamed: 0_level_0,Name,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
140700,Shivam,Male,53.0,Visakhapatnam,Working Professional,Judge,,2.0,,,5.0,Less than 5 hours,Moderate,LLB,No,9.0,3.0,Yes
140701,Sanya,Female,58.0,Kolkata,Working Professional,Educational Consultant,,2.0,,,4.0,Less than 5 hours,Moderate,B.Ed,No,6.0,4.0,No
140702,Yash,Male,53.0,Jaipur,Working Professional,Teacher,,4.0,,,1.0,7-8 hours,Moderate,B.Arch,Yes,12.0,4.0,No
140703,Nalini,Female,23.0,Rajkot,Student,,5.0,,6.84,1.0,,More than 8 hours,Moderate,BSc,Yes,10.0,4.0,No
140704,Shaurya,Male,47.0,Kalyan,Working Professional,Teacher,,5.0,,,5.0,7-8 hours,Moderate,BCA,Yes,3.0,4.0,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
234495,Zoya,Female,49.0,Jaipur,Working Professional,Pilot,,3.0,,,5.0,Less than 5 hours,Moderate,BSc,Yes,2.0,2.0,Yes
234496,Shlok,Male,29.0,Ahmedabad,Working Professional,Pilot,,5.0,,,1.0,7-8 hours,Moderate,BE,Yes,11.0,3.0,Yes
234497,Rishi,Male,24.0,Visakhapatnam,Student,,1.0,,7.51,4.0,,7-8 hours,Moderate,B.Tech,No,7.0,1.0,No
234498,Eshita,Female,23.0,Kalyan,Working Professional,Marketing Manager,,4.0,,,2.0,5-6 hours,Healthy,BA,Yes,7.0,5.0,Yes


### Old way of catboost?

In [None]:
# def objective(trial):
#     params = {
#         "loss_function": "Logloss",
#         "task_type": "GPU",
#         "depth": trial.suggest_int("depth", 4, 12),
#         "learning_rate": trial.suggest_float("learning_rate", 0.01, 3),
#         "iterations": trial.suggest_int("iterations", 100, 1500),
#         "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 10),
#         "random_strength": trial.suggest_float("random_strength", 0.1, 10),
#         "bagging_temperature": trial.suggest_float("bagging_temperature", 0, 1),
#         "border_count": trial.suggest_int("border_count", 32, 255),
#         "early_stopping_rounds": trial.suggest_int("early_stopping_rounds", 50, 51),
#         # "custom_loss": trial.suggest_categorical("custom_loss", ["AUC"])
#         # "grow_policy": trial.suggest_categorical("grow_policy", ["SymmetricTree", "Depthwise", "Lossguide"])
#     }

#     # processed_X = ct.fit_transform(X)
#     train_pool = Pool(X, y, cat_features=cat_indices)

#     # model = CatBoostClassifier(**params)
#     # model.fit(train_pool, verbose=False)
#     # model.score

#     catboost_cv = cv(
#         params=params,
#         pool=train_pool,
#         fold_count=5,
#         shuffle=True,
#         verbose=100,
#         return_models=False
#     )

#     return catboost_cv.loc[catboost_cv.shape[0] - 1, "test-Logloss-mean"]