# 1. INTRODUCTION
<center>
<img src="https://images.unsplash.com/photo-1507652955-f3dcef5a3be5?q=80&w=2070&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D" width=1300 height=800 />
</center>

### 🧠 Depression Risk Prediction Analysis

### 📌 Problem Statement
This dataset was collected through an anonymous survey conducted between January and June 2023, focusing on understanding depression risk factors among adults. The survey targeted both working professionals and students, collecting comprehensive information about their demographic details, academic/work life, lifestyle factors, mental health history, and current mental well-being status.

### 📊 Dataset Description
The dataset consists of three files:
- **train.csv**: Training dataset with labeled depression risk
- **test.csv**: Test dataset for predictions
- **sample_submission.csv**: Template for submission format

### 📋 Column Descriptions

1. **Name**: Identifier for participants (anonymized)
2. **Gender**: Participant's gender identity
3. **Age**: Participant's age
4. **City**: Location of residence
5. **Working Professional or Student**: Current occupation category
6. **Profession**: Specific profession/field of work
7. **Degree**: Educational qualification
8. **CGPA**: Academic performance measure
9. **Academic Pressure**: Level of pressure from academic responsibilities
10. **Work Pressure**: Level of pressure from work responsibilities
11. **Study Satisfaction**: Level of satisfaction with studies
12. **Job Satisfaction**: Level of satisfaction with current job
13. **Work/Study Hours**: Daily hours spent on work/study
14. **Sleep Duration**: Average daily sleep hours
15. **Dietary Habits**: Eating patterns and food preferences
16. **Have you ever had suicidal thoughts ?**: History of suicidal ideation (Yes/No)
17. **Financial Stress**: Level of stress related to financial situation
18. **Family History of Mental Illness**: Presence of mental illness in family (Yes/No)

**Target Variable:** **Depression** — binary indicator of depression risk (1 = Yes, 0 = No)

### 📈 Evaluation Metric
- The model performance is evaluated using **Accuracy Score**
- Accuracy = (Number of Correct Predictions) / (Total Number of Predictions)

### 🎯 Objectives
1. Build a machine learning model to predict depression risk based on various life factors
2. Identify key contributors to mental health challenges
3. Analyze the relationship between different life aspects (academic, professional, personal) and depression risk
4. Create a reliable predictive model for early depression risk assessment
5. Understand the impact of lifestyle factors on mental health

# 2. IMPORTS

In [None]:
# Essential Python and System Libraries
import numpy as np
import pandas as pd
import string
import time
import sys
import os
from copy import deepcopy
from functools import partial
import gc
import warnings
warnings.filterwarnings('ignore')

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from prettytable import PrettyTable
from yellowbrick.classifier import ROCAUC, ClassificationReport
from yellowbrick.features import PCA as PCAVisualizer
%matplotlib inline
sns.set(style='darkgrid', font_scale=1.2)
pd.set_option('display.max_columns', None)

# Progress Tracking
from tqdm import tqdm
from tqdm.notebook import tqdm as tqdm_notebook
tqdm_notebook.get_lock().locks = []

# Missing Value Analysis
import missingno as msno

# Feature Engineering and Processing
from sklearn.preprocessing import (
    LabelEncoder, 
    StandardScaler, 
    MinMaxScaler,
    PowerTransformer, 
    FunctionTransformer,
    RobustScaler
)
from category_encoders import (
    OneHotEncoder, 
    OrdinalEncoder, 
    CountEncoder, 
    CatBoostEncoder,
    TargetEncoder
)
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Unsupervised Learning for Feature Engineering
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.decomposition import PCA, TruncatedSVD, FastICA
from sklearn.manifold import TSNE, Isomap
from sklearn.feature_selection import (
    SelectKBest,
    chi2,
    mutual_info_classif,
    RFE
)

# Statistical Analysis
import statsmodels.api as sm
from scipy import stats
from statsmodels.stats.outliers_influence import variance_inflation_factor
from scipy.optimize import minimize_scalar

!pip install spacy
!python -m spacy download en_core_web_md

import spacy
import numpy as np
from difflib import SequenceMatcher

# Model Selection and Evaluation
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    StratifiedKFold,
    KFold,
    GridSearchCV,
    RandomizedSearchCV
)
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
    precision_recall_curve,
    f1_score,
    precision_score,
    recall_score,
    log_loss,
    make_scorer
)

# Traditional Machine Learning Models
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    ExtraTreesClassifier,
    AdaBoostClassifier,
    BaggingClassifier,
    StackingClassifier,
    VotingClassifier, 
    HistGradientBoostingClassifier,

)

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import (
    LogisticRegression,
    SGDClassifier,
    RidgeClassifier,
    LogisticRegressionCV
)
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import (
    LinearDiscriminantAnalysis,
    QuadraticDiscriminantAnalysis
)

# Boosting Libraries
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier, Pool

# Deep Learning
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import (
    Dense, 
    Dropout, 
    BatchNormalization,
    Input,
    Concatenate
)
from tensorflow.keras.callbacks import (
    EarlyStopping,
    ModelCheckpoint,
    ReduceLROnPlateau
)
from tensorflow.keras.optimizers import Adam

# Scikit-learn compatible wrapper for Keras (custom implementation)
class KerasClassifierWrapper(tf.keras.Model):
    """Keras ``Model`` subclass exposing ``fit``/``predict``/``predict_proba``.

    ``fit`` and ``predict_proba`` forward directly to the ``tf.keras.Model``
    implementations; ``predict`` turns the base-class output into class
    indices with ``argmax``.
    """

    def fit(self, X, y, **kwargs):
        """Forward to ``tf.keras.Model.fit`` and return whatever it returns."""
        return super().fit(X, y, **kwargs)
    
    def predict(self, X):
        """Return the index of the largest value along axis 1 of the model output."""
        # NOTE(review): assumes the network emits one column per class; with a
        # single sigmoid output column argmax would always be 0 — confirm.
        return np.argmax(super().predict(X), axis=1)
    
    def predict_proba(self, X):
        """Return the raw output of ``tf.keras.Model.predict`` unchanged."""
        return super().predict(X)

# Hyperparameter Optimization
!pip install optuna
!pip install cmaes
import optuna
from optuna.samplers import TPESampler, CmaEsSampler

# Imbalanced Learning
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek, SMOTEENN
from imblearn.pipeline import Pipeline as ImbPipeline

# For reproducibility
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
tf.random.set_seed(RANDOM_STATE)

# 3. DATA

In [None]:
# Check for GPU availability
# NOTE(review): `global` at module (cell) level has no effect; `device` is a
# plain notebook-level variable either way.
global device
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    print("GPU is available")
    device = 'gpu'
    # Enable memory growth for GPU
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
else:
    print("GPU is not available")
    device = 'cpu'

# Define paths
KAGGLE_PATH = '/kaggle/input/depression-surveydataset-for-analysis'
TRAIN_PATH = '/kaggle/input/playground-series-s4e11/train.csv'
TEST_PATH = '/kaggle/input/playground-series-s4e11/test.csv'
ORIGINAL_PATH = f'{KAGGLE_PATH}/final_depression_dataset_1.csv'

# Load datasets
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)
original = pd.read_csv(ORIGINAL_PATH)

# Create copies for backup
# (taken before any column is dropped or added, so they keep the raw CSV content)
train_copy = train.copy()
test_copy = test.copy()
original_copy = original.copy()

# Drop ID columns if present
if 'id' in train.columns:
    train.drop(columns=['id'], inplace=True)
if 'id' in test.columns:
    test.drop(columns=['id'], inplace=True)

# Convert Depression from Yes/No to 1/0 in original dataset
# (Series.map turns any value other than 'Yes'/'No' into NaN)
depression_mapping = {'Yes': 1, 'No': 0}
original['Depression'] = original['Depression'].map(depression_mapping)

# Mark data sources
# (1 = row comes from the original survey file, 0 = competition train/test row)
original['original'] = 1
train['original'] = 0
test['original'] = 0

# Combine training data with original dataset
# (rows are stacked and the index is rebuilt as 0..n-1)
train = pd.concat([train, original], axis=0).reset_index(drop=True)

# Define target variable
targets = ['Depression']
target = targets[0]

print("Dataset Shapes:")
print(f"Training set: {train.shape}")
print(f"Test set: {test.shape}")

# Class balance of the target in the combined training frame, in percent
print("\nValue distribution (%) in Depression column:")
display(train[target].value_counts(normalize=True).round(3) * 100)

print("\nSample of training data:")
display(train.head())

## 3.1 MISSING VALUES

In [None]:
# Build a text table listing, for every column of `train`, its dtype and the
# percentage of missing values in the train and test frames.
table = PrettyTable()
table.title = "🔍 Missing Value Analysis for Train and Test Sets"
table.field_names = ['Column Name', 'Data Type', 'Train Missing %', 'Test Missing %']

# Set alignment for better readability
table.align['Column Name'] = 'l'  # left align column names
table.align['Data Type'] = 'l'    # left align data types
table.align['Train Missing %'] = 'r'  # right align percentages
table.align['Test Missing %'] = 'r'   # right align percentages

for column in train.columns:
    data_type = str(train[column].dtype)
    # Calculate missing percentages with 4 decimal places
    # NOTE: despite their names, the `non_null_count_*` variables hold the
    # *missing* percentage: 100 minus the share of non-null entries.
    non_null_count_train = format(100 - (train[column].count()/train.shape[0]*100), '.4f')
    if column != target:
        non_null_count_test = format(100 - (test[column].count()/test.shape[0]*100), '.4f')
    else:
        # The target column is not looked up in the test frame
        non_null_count_test = "NA"
    table.add_row([column, data_type, non_null_count_train, non_null_count_test])

print(table)

<font size="3">There are missing values; for now, I will leave them for the algorithms to handle.</font>

# 4. EDA

## 4.1 NUMERICAL FEATURES

In [None]:
# Set up the visualization style.
# Matplotlib renamed its bundled seaborn style sheets to "seaborn-v0_8*" in 3.6
# and removed the old "seaborn" name in 3.8, so prefer the new name when the
# installed version provides it and fall back to the old one otherwise.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
sns.set_palette("husl")

# Numerical feature columns: every int64/float64 column except the target
# and the data-source flag.
numerical_features = [
    column
    for column in train.select_dtypes(include=['int64', 'float64']).columns.tolist()
    if column not in ('Depression', 'original')
]

# 1. Target Distribution across Numerical Features
print("📊 Target Distribution Across Numerical Features")
# Two boxplots per row; the number of rows is rounded up.
n_rows = (len(numerical_features) + 1) // 2

# A figure with zero rows cannot be created, so skip plotting when there is
# nothing to plot.
if numerical_features:
    fig = plt.figure(figsize=(20, 5 * n_rows))

    for idx, feature in enumerate(numerical_features, 1):
        plt.subplot(n_rows, 2, idx)

        # Create boxplot of the feature split by target class
        sns.boxplot(x='Depression', y=feature, data=train)
        plt.title(f'Depression Distribution by {feature}', pad=20)
        plt.xlabel('Depression (0: No, 1: Yes)')
        plt.ylabel(feature)

    plt.tight_layout(pad=3.0)
    plt.show()

# Additional Statistical Summary for Numerical Features
print("\n📈 Statistical Summary of Numerical Features by Target")
for feature in numerical_features:
    print(f"\n{'-'*50}")
    print(f"Feature: {feature}")
    print(train.groupby('Depression')[feature].describe())


## 4.2 CATEGORICAL FEATURES

### CLEANING

In [None]:
def process_sleep_duration(duration):
    """Convert a free-text sleep duration into a ``(min_hours, max_hours)`` pair.

    Handled formats (case-insensitive, the word "hours" is ignored):

    * ``"more than X"``  -> ``(X, 12)``; 12 is the assumed upper bound.
    * ``"less than X"`` / ``"than X"`` -> ``(0, X)``.
    * a single number ``X`` -> ``(X, X)``; values above 24 are treated as
      weekly totals and divided by 7.
    * a range ``"A-B"`` -> ``(A, B)``; reversed ranges are swapped, and when the
      upper bound exceeds 24 both ends are treated as weekly totals.

    Parameters:
        duration: Raw cell value (string, number or missing).

    Returns:
        tuple: ``(min_hours, max_hours)``, or ``(nan, nan)`` when the value is
        missing, non-finite or cannot be parsed.
    """
    unparseable = (np.nan, np.nan)

    # Only conversion failures are treated as "unparseable"; the original bare
    # `except:` also swallowed KeyboardInterrupt/SystemExit.
    try:
        if pd.isna(duration):
            return unparseable

        text = str(duration).lower().replace('hours', '').strip()

        # Open-ended lower bound: "more than X"
        if 'more than' in text:
            low, high = float(text.replace('more than', '').strip()), 12.0  # assuming 12 as max

        # Open-ended upper bound: "less than X" (or a truncated "than X")
        elif 'than' in text:
            low, high = 0.0, float(text.replace('less than', '').replace('than', '').strip())

        # Single number
        elif '-' not in text:
            hours = float(text)
            if hours > 24:  # more than a day: assume a weekly total
                hours = hours / 7
            low, high = hours, hours

        # Range "A-B"; unpacking fails with ValueError unless there are exactly two parts
        else:
            low, high = (float(part) for part in text.split('-'))

            # Fix reversed ranges (like 9-6 to 6-9)
            if low > high:
                low, high = high, low

            # If range is too large, divide by 7 (weekly to daily)
            if high > 24:
                low, high = low / 7, high / 7
    except (ValueError, TypeError):
        return unparseable

    # float() accepts "nan"/"inf"; those are not usable durations.
    if not (np.isfinite(low) and np.isfinite(high)):
        return unparseable

    return low, high
        
# Function to transform categorical columns
def transform_categorical_columns(df):
    """Encode the binary survey columns and split sleep duration into bounds.

    The following changes are made on a copy of ``df`` (the input frame is
    left untouched):

    * ``family_history`` / ``suicidal``: 1 when the source answer is "yes"
      (case-insensitive), otherwise 0 (missing answers also become 0).
    * ``Working Professional or Student``: 1 for "Working Professional"
      (case-insensitive), 0 otherwise.
    * ``Gender``: 1 for ``'Male'``, 0 otherwise.
    * ``Dietary Habits``: values outside the accepted list become NaN.
    * ``Min_sleep`` / ``Max_sleep``: bounds parsed from ``Sleep Duration`` by
      ``process_sleep_duration``.

    Parameters:
        df (pandas.DataFrame): Frame with the raw survey columns.

    Returns:
        pandas.DataFrame: Transformed frame without the three source columns
        ``Family History of Mental Illness``,
        ``Have you ever had suicidal thoughts ?`` and ``Sleep Duration``.
    """
    # Work on a copy so the caller's frame is not modified as a side effect.
    df = df.copy()

    # Convert 'Yes'/'No' to 1/0 and rename columns
    df['family_history'] = (df['Family History of Mental Illness'].str.lower() == 'yes').astype(int)
    df['suicidal'] = (df['Have you ever had suicidal thoughts ?'].str.lower() == 'yes').astype(int)

    # The comparison value must be lower-case too: the original compared the
    # lower-cased column to 'Working Professional', which never matched and
    # produced a constant 0 column.
    df['Working Professional or Student'] = (
        df['Working Professional or Student'].str.lower() == 'working professional'
    ).astype(int)

    # Convert Gender to binary (Female = 0, Male = 1)
    df['Gender'] = (df['Gender'] == 'Male').astype(int)

    # Clean Dietary Habits - convert invalid categories to NaN
    valid_dietary = ['Healthy', 'Unhealthy', 'Less Healthy', 'More Healthy',
                     '5 Healthy', '5 Unhealthy', 'Moderate', 'No Healthy',
                     'Less than Healthy']
    df.loc[~df['Dietary Habits'].isin(valid_dietary), 'Dietary Habits'] = np.nan

    # Expand the (min, max) tuples into two columns
    df[['Min_sleep', 'Max_sleep']] = df['Sleep Duration'].apply(process_sleep_duration).apply(pd.Series)

    # Drop the original columns
    df = df.drop(['Family History of Mental Illness', 'Have you ever had suicidal thoughts ?', 'Sleep Duration'], axis=1)
    return df

# Apply transformations to both train and test
# (the returned frames replace the notebook-level `train` / `test` variables)
train = transform_categorical_columns(train)
test = transform_categorical_columns(test)

In [None]:
def clean_degree_and_related_columns(df):
    """
    Clean and standardize the 'Degree' column and rescue misplaced values.

    Degree text is compared in a normalised form (lower-case, punctuation and
    spaces removed). Values found in the standardization mapping are replaced
    by their canonical label; everything else (names, cities, job titles,
    numbers, unknown text) becomes NaN. Before that, values that clearly belong
    to another column are copied there when that column is missing:

    * job titles  -> 'Profession'
    * city names  -> 'City'
    * numbers in the range 0-10 -> 'CGPA'

    Parameters:
        df (pandas.DataFrame): Input DataFrame containing the 'Degree',
            'Profession', 'City' and 'CGPA' columns

    Returns:
        pandas.DataFrame: Cleaned copy of the DataFrame with standardized degrees
    """
    df = df.copy()

    def standardize_text(text):
        """Standardize text by converting to lowercase and removing punctuation/spaces"""
        if pd.isna(text):
            return ''
        text = str(text).lower()
        text = ''.join(c for c in text if c not in string.punctuation)
        return text.replace(' ', '')

    def to_cgpa(value):
        """Return ``value`` as a float when it is a number in [0, 10], else NaN"""
        if pd.isna(value) or isinstance(value, bool):
            return np.nan
        try:
            number = float(value)
        except (ValueError, TypeError):
            return np.nan
        return number if 0 <= number <= 10 else np.nan

    # Exact degree standardization mapping based on the data (raw spellings)
    raw_degree_standardization = {
        # Bachelor's Degrees
        "btech": "B.Tech",
        "b.tech": "B.Tech",
        "b tech": "B.Tech",
        "b b.tech": "B.Tech",
        "mechanical engineer": "B.Tech",

        "bcom": "B.Com",
        "b.com": "B.Com",
        "b_com": "B.Com",
        "b.m.com": "B.Com",
        "degree": "B.Com",  # Based on previous mapping

        "bsc": "B.Sc",
        "b.sc": "B.Sc",

        "barch": "B.Arch",
        "b.arch": "B.Arch",

        "bca": "BCA",
        "b.ca": "BCA",
        "b bca": "BCA",
        "bhca": "BCA",

        "ba": "BA",

        "bba": "BBA",
        "b.ba": "BBA",

        "be": "BE",

        "bed": "B.Ed",
        "b.ed": "B.Ed",
        "a.ed": "B.Ed",
        "e.ed": "B.Ed",
        "i.ed": "B.Ed",
        "g.ed": "B.Ed",
        "j.ed": "B.Ed",
        "k.ed": "B.Ed",

        "bpharm": "B.Pharm",
        "b.pharm": "B.Pharm",
        "b_pharm": "B.Pharm",
        "b._pharm": "B.Pharm",
        "s.pharm": "B.Pharm",

        "bhm": "BHM",
        "b.h": "BHM",
        "bh": "BHM",

        # Master's Degrees
        "mtech": "M.Tech",
        "m.tech": "M.Tech",

        "mcom": "M.Com",
        "m.com": "M.Com",

        "msc": "MSc",

        "mca": "MCA",
        "gca": "MCA",
        "rca": "MCA",
        "pca": "MCA",

        "ma": "MA",
        "m.": "MA",
        "m": "MA",

        "mba": "MBA",

        "me": "ME",

        "med": "M.Ed",
        "m.ed": "M.Ed",
        "m.m.ed": "M.Ed",
        "m.b.ed": "M.Ed",

        "march": "M.Arch",
        "m.arch": "M.Arch",

        "mpharm": "M.Pharm",
        "m.pharm": "M.Pharm",

        "mhm": "MHM",

        # Other Professional Degrees
        "llb": "LLB",
        "llm": "LLM",
        "phd": "PhD",
        "mbbs": "MBBS",
        "md": "MD",

        # School Education
        "class12": "Class 12",
        "class 12": "Class 12",
        "12th": "Class 12",
    }

    # Lookups are done on the normalised text, so the keys must be normalised
    # the same way; otherwise spellings such as "b.m.com" or
    # "mechanical engineer" could never match.
    degree_standardization = {
        standardize_text(raw): label
        for raw, label in raw_degree_standardization.items()
    }

    # Names found in the degree column
    names = {
        'advait', 'vibha', 'gagan', 'eshita', 'navya', 'bian', 'kavya',
        'vrinda', 'moham', 'magan', 'rupak', 'aadhya', 'banchal'
    }

    # Cities found in the degree column
    cities = {'pune', 'bhopal'}

    # Job titles found in the degree column (normalised for the same reason
    # as the mapping keys)
    job_titles = {
        standardize_text(title)
        for title in ('business analyst', 'travel consultant', 'financial analyst')
    }

    # Standardize degree text for comparison
    df['Degree_std'] = df['Degree'].apply(standardize_text)

    # Handle job titles: copy them to 'Profession' where that is missing
    job_mask = df['Degree_std'].apply(
        lambda text: any(job in text for job in job_titles)
    ).astype(bool)
    move_job = job_mask & df['Profession'].isna()
    df.loc[move_job, 'Profession'] = df.loc[move_job, 'Degree']

    # Handle CGPA values - specific values seen in data: '5.65', '3.0', '8.95', '20'
    # Only rows whose Degree really is a usable number are touched, and only
    # when CGPA is missing. The original mask also matched NaN degrees (NaN is
    # a float) and overwrote their existing CGPA with NaN.
    cgpa_from_degree = df['Degree'].apply(to_cgpa)
    fill_cgpa = cgpa_from_degree.notna() & df['CGPA'].isna()
    df.loc[fill_cgpa, 'CGPA'] = cgpa_from_degree[fill_cgpa]

    # Handle cities: copy them to 'City' where that is missing
    move_city = df['Degree_std'].isin(cities) & df['City'].isna()
    df.loc[move_city, 'City'] = df.loc[move_city, 'Degree']

    # Standardize degrees: known non-degree values are blanked first, then
    # everything that is not a key of the mapping (including numbers and
    # missing values) maps to NaN.
    non_degree_values = names | cities | job_titles
    degree_keys = df['Degree_std'].where(~df['Degree_std'].isin(non_degree_values))
    df['Degree'] = degree_keys.map(degree_standardization)

    # Drop temporary standardization column
    df = df.drop('Degree_std', axis=1)

    return df

# Clean the Degree column in both frames; the cleaned copies replace `train` / `test`
train = clean_degree_and_related_columns(train)
test = clean_degree_and_related_columns(test)

In [None]:

def clean_profession_column(df):
    """Standardize the 'Profession' column and rescue misplaced values.

    Works on a copy of ``df``. Profession text is compared in a normalised
    form (lower-case, punctuation and spaces removed):

    * city names are copied to 'City' where 'City' is missing;
    * degree abbreviations are written to 'Degree' where 'Degree' is missing;
    * names, cities, degrees and known junk entries become NaN;
    * known spelling variants are replaced by their canonical label;
    * values already in the accepted list are kept, anything else becomes NaN.

    Parameters:
        df (pandas.DataFrame): Frame with 'Profession', 'City' and 'Degree' columns

    Returns:
        pandas.DataFrame: Cleaned copy of the frame
    """
    df = df.copy()

    # Comparison key: lowercase, punctuation stripped, spaces removed
    lowered = df['Profession'].fillna('').astype(str).str.lower()
    no_punctuation = lowered.apply(
        lambda text: ''.join(ch for ch in text if ch not in string.punctuation)
    )
    df['Profession_std'] = no_punctuation.str.replace(' ', '')

    # 1. Names to convert to NaN
    names = {'manvi', 'pranav', 'samar', 'simran', 'yogesh', 'yuvraj'}

    # 2. Cities to be moved to City column if City is NaN
    cities = {'nagpur', 'patna', 'surat', 'visakhapatnam'}
    city_mask = df['Profession_std'].isin(cities)
    df.loc[city_mask & df['City'].isna(), 'City'] = df.loc[city_mask, 'Profession']

    # 3. Degrees to be moved to Degree column if Degree is NaN
    degrees = {
        'bcom': 'B.Com',
        'bed': 'B.Ed',
        'bpharm': 'B.Pharm',
        'bba': 'BBA',
        'bca': 'BCA',
        'be': 'BE',
        'llm': 'LLM',
        'med': 'M.Ed',
        'mpharm': 'M.Pharm',
        'mtech': 'M.Tech',
        'mba': 'MBA',
        'mbbs': 'MBBS',
        'mca': 'MCA',
        'md': 'MD',
        'me': 'ME',
        'phd': 'PhD'
    }

    # A row's key equals at most one dictionary key, so the "Degree is
    # missing" mask can be computed once up front.
    degree_missing = df['Degree'].isna()
    for key, label in degrees.items():
        df.loc[degree_missing & (df['Profession_std'] == key), 'Degree'] = label

    # 4. Invalid/nonsensical entries to convert to NaN
    invalid_entries = {
        '24th', '3m', 'moderate', 'no', 'name', 'profession', 'unhealthy',
        'unveil', 'familyvirar', 'dev'
    }

    # 5. Standardize similar professions
    profession_standardization = {
        'academic': 'Teacher',
        'medicaldoctor': 'Doctor',
        'financialanalyst': 'Financial Analyst',
        'finanancialanalyst': 'Financial Analyst',
        'businessanalyst': 'Business Analyst',
        'softwareengineer': 'Software Engineer',
        'civilengineer': 'Civil Engineer',
        'mechanicalengineer': 'Mechanical Engineer',
        'cityconsultant': 'City Consultant',
        'educationalconsultant': 'Educational Consultant',
        'familyconsultant': 'Family Consultant',
        'travelconsultant': 'Travel Consultant',
        'digitalmarketer': 'Digital Marketer',
        'marketingmanager': 'Marketing Manager',
        'hrmanager': 'HR Manager',
        'citymanager': 'City Manager',
        'graphicdesigner': 'Graphic Designer',
        'uxuidesigner': 'UX/UI Designer',
        'contentwriter': 'Content Writer',
        'investmentbanker': 'Investment Banker',
        'salesexecutive': 'Sales Executive',
        'customersupport': 'Customer Support',
        'researchanalyst': 'Research Analyst',
        'workingprofessional': 'Working Professional'
    }

    # Raw values that are already in their final form
    already_standard = (
        'Accountant', 'Analyst', 'Architect', 'Chef', 'Chemist',
        'Consultant', 'Data Scientist', 'Doctor', 'Electrician',
        'Entrepreneur', 'Judge', 'Lawyer', 'Manager', 'Pharmacist',
        'Pilot', 'Plumber', 'Researcher', 'Student', 'Surgeon',
        'Teacher', 'Unemployed',
    )

    # Everything that must never survive as a profession
    rejected = names | cities | invalid_entries | set(degrees)

    def resolve(row):
        """Return the cleaned profession for one row (NaN when not usable)."""
        raw, key = row['Profession'], row['Profession_std']
        if pd.isna(raw) or key == '' or key in rejected:
            return np.nan
        if key in profession_standardization:
            return profession_standardization[key]
        if raw in already_standard:
            return raw
        return np.nan

    # Apply standardization
    df['Profession'] = df.apply(resolve, axis=1)

    # Drop temporary column
    return df.drop('Profession_std', axis=1)

# Apply to both datasets
# (the cleaned copies replace the notebook-level `train` / `test` variables)
train = clean_profession_column(train)
test = clean_profession_column(test)


In [None]:
def clean_city_column(df):
    """Standardize the 'City' column on a copy of ``df``.

    City text is compared in a normalised form (lower-case, punctuation and
    spaces removed). Recognised spellings are replaced by the canonical city
    name; person names, known junk entries, missing values and anything
    unrecognised become NaN.

    Parameters:
        df (pandas.DataFrame): Frame with a 'City' column

    Returns:
        pandas.DataFrame: Cleaned copy of the frame
    """
    df = df.copy()

    # Comparison key: lowercase with punctuation and spaces deleted in one pass
    strip_table = str.maketrans('', '', string.punctuation + ' ')
    lowered = df['City'].fillna('').astype(str).str.lower()
    df['City_std'] = lowered.map(lambda text: text.translate(strip_table))

    # Valid Indian cities and their standardized names
    city_standardization = {
        'agra': 'Agra',
        'ahmedabad': 'Ahmedabad',
        'bangalore': 'Bangalore',
        'bhopal': 'Bhopal',
        'ghopal': 'Bhopal',
        'mhopal': 'Bhopal',
        'chennai': 'Chennai',
        'delhi': 'Delhi',
        'moredelhi': 'Delhi',
        'lessdelhi': 'Delhi',
        'faridabad': 'Faridabad',
        'ghaziabad': 'Ghaziabad',
        'khaziabad': 'Ghaziabad',
        'gurgaon': 'Gurgaon',
        'hyderabad': 'Hyderabad',
        'indore': 'Indore',
        'jaipur': 'Jaipur',
        'kalyan': 'Kalyan',
        'malyan': 'Kalyan',
        'nalyan': 'Kalyan',
        'kanpur': 'Kanpur',
        'iskanpur': 'Kanpur',
        'kolkata': 'Kolkata',
        'golkata': 'Kolkata',
        'molkata': 'Kolkata',
        'rolkata': 'Kolkata',
        'tolkata': 'Kolkata',
        'lucknow': 'Lucknow',
        'ludhiana': 'Ludhiana',
        'meerut': 'Meerut',
        'morena': 'Morena',
        'mumbai': 'Mumbai',
        'nagpur': 'Nagpur',
        'nashik': 'Nashik',
        'patna': 'Patna',
        'pune': 'Pune',
        'rajkot': 'Rajkot',
        'vaikot': 'Rajkot',
        'srinagar': 'Srinagar',
        'surat': 'Surat',
        'thane': 'Thane',
        'thani': 'Thane',
        'vadodara': 'Vadodara',
        'varanasi': 'Varanasi',
        'vasaivirar': 'Vasai-Virar',
        'sanvasaivirar': 'Vasai-Virar',
        'unirar': 'Vasai-Virar',
        'visakhapatnam': 'Visakhapatnam'
    }

    # Names to convert to NaN
    names = {
        'aaradhya', 'abhinav', 'aditi', 'aditya', 'aishwarya', 'anvi',
        'armaan', 'atharv', 'avni', 'ayansh', 'ayush', 'bhavna', 'chhavi',
        'dhruv', 'gaurav', 'harsh', 'harsha', 'hrithik', 'ira', 'ivaan',
        'jhanvi', 'kagan', 'kashish', 'keshav', 'khushi', 'krishna', 'leela',
        'mahi', 'malyansh', 'mihir', 'mira', 'nalini', 'nandini', 'parth',
        'pooja', 'pratham', 'pratyush', 'raghavendra', 'rashi', 'reyansh',
        'saanvi', 'sara', 'saurav', 'shrey', 'siddhesh', 'tushar', 'vaanya',
        'vaishnavi', 'vidhi', 'vidya', 'vikram'
    }

    # Invalid entries to convert to NaN
    invalid_entries = {
        '3.0', 'city', 'no', 'no12', 'chemist', 'lawyer', 'researcher',
        'mca', 'mcom', 'mtech', 'me', 'msc', 'lessthan5hours',
        'lessthan5kalyan', 'galesabad', 'ishanabad', 'ishkarsh', 'ithal',
        'itheg', 'kashk', 'kibara', 'krinda', 'moreadhyay', 'plata', 'unaly'
    }

    # Keys that are rejected before the city lookup is attempted
    rejected = names | invalid_entries

    def resolve(row):
        """Return the canonical city for one row (NaN when not a known city)."""
        raw, key = row['City'], row['City_std']
        if pd.isna(raw) or key == '' or key in rejected:
            return np.nan
        # Unknown keys fall through to NaN as well
        return city_standardization.get(key, np.nan)

    # Apply standardization
    df['City'] = df.apply(resolve, axis=1)

    # Drop temporary column
    return df.drop('City_std', axis=1)

# Apply to both datasets
# (the cleaned copies replace the notebook-level `train` / `test` variables)
train = clean_city_column(train)
test = clean_city_column(test)

In [None]:
# Columns of `train` that are stored with object dtype are treated as categorical
categorical_features = train.select_dtypes(include=['object']).columns.tolist()

# Number of distinct non-missing values per categorical column
train[categorical_features].nunique()

In [None]:
# Preview the first rows of the categorical columns
train[categorical_features].head()


In [None]:
# Matplotlib renamed its bundled seaborn style sheets to "seaborn-v0_8*" in 3.6
# and removed the old "seaborn" name in 3.8, so prefer the new name when the
# installed version provides it and fall back to the old one otherwise.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
sns.set_palette("husl")

# Plot for each categorical feature
for col in categorical_features:
    # 'Name' is an identifier column; it is not plotted
    if col == "Name":
        continue

    # Create contingency table with normalized values (percentages):
    # each row (category) sums to 1 across the Depression classes
    contingency_table = pd.crosstab(train[col], train['Depression'], normalize='index')

    # A column with no usable category/target pairs yields an empty table,
    # which cannot be plotted
    if contingency_table.empty:
        print(f"Skipping {col}: no data to plot")
        continue

    # Set style
    sns.set(style="whitegrid")

    # Create stacked bar plot
    contingency_table.plot(kind="bar",
                           stacked=True,
                           figsize=(20, 4))

    # Customize plot
    plt.title(f"Percentage Distribution of Depression across {col}")
    plt.xlabel(col)
    plt.ylabel("Percentage")
    plt.legend(title="Depression")
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

# 5. FEATURE ENGINEERING

## 5.1 TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import pandas as pd
import numpy as np

def process_categorical_features(categorical_features, train, test, n, p, random_state=42):
    """
    Process categorical features using TF-IDF and SVD without dropping original features.

    For every listed column the text is vectorised with TF-IDF (fitted on the
    training data only), reduced with TruncatedSVD, and the resulting
    components are appended as ``<column>_tfidf_<i>`` columns.

    Note: as a side effect the processed columns are converted to strings and
    their missing values are replaced by empty strings in the returned frames.

    Parameters:
        categorical_features (list): List of categorical column names to process
        train (pd.DataFrame): Training dataset
        test (pd.DataFrame): Test dataset
        n (int): Maximum number of features for TF-IDF
        p (int): Number of components for SVD
        random_state (int): Seed for TruncatedSVD so repeated runs give the
            same components

    Returns:
        tuple: Processed (train, test) datasets with original features preserved
    """
    # Create copies to avoid modifying original dataframes
    train = train.copy()
    test = test.copy()

    def tf_idf(train_df, test_df, column, n_features, n_components):
        """
        Apply TF-IDF and SVD transformation to a categorical column.

        Parameters:
            train_df (pd.DataFrame): Training data
            test_df (pd.DataFrame): Test data
            column (str): Column name to process
            n_features (int): Maximum number of features for TF-IDF
            n_components (int): Number of components for SVD

        Returns:
            tuple: Processed (train_df, test_df); on failure the frames are
            returned without the new component columns
        """
        # Handle missing values and convert to string
        train_df[column] = train_df[column].fillna('').astype(str)
        test_df[column] = test_df[column].fillna('').astype(str)

        try:
            # TF-IDF transformation (vocabulary learned from train only)
            vectorizer = TfidfVectorizer(max_features=n_features)
            vectors_train = vectorizer.fit_transform(train_df[column])
            vectors_test = vectorizer.transform(test_df[column])

            # SVD transformation, seeded for reproducibility
            svd = TruncatedSVD(n_components=n_components, random_state=random_state)
            x_pca_train = svd.fit_transform(vectors_train)
            x_pca_test = svd.transform(vectors_test)
        except Exception as e:
            # Deliberate best-effort: a column that cannot be vectorised
            # (e.g. empty vocabulary, too few terms for the requested number
            # of components) is reported and skipped.
            print(f"Error processing {column}: {str(e)}")
            return train_df, test_df

        # Create feature names
        feature_names = [f"{column}_tfidf_{i}" for i in range(n_components)]

        # Convert to DataFrame, reusing the source index so the column-wise
        # concat lines rows up even when the index is not 0..n-1
        tfidf_df_train = pd.DataFrame(x_pca_train, columns=feature_names, index=train_df.index)
        tfidf_df_test = pd.DataFrame(x_pca_test, columns=feature_names, index=test_df.index)

        # Concatenate with original dataframes
        train_df = pd.concat([train_df, tfidf_df_train], axis=1)
        test_df = pd.concat([test_df, tfidf_df_test], axis=1)

        print(f"Successfully processed {column}")
        return train_df, test_df

    # Process each categorical feature
    for column in categorical_features:
        # Only process if the column exists in both frames; the original
        # checked train only and raised KeyError when test lacked the column
        if column in train.columns and column in test.columns:
            train, test = tf_idf(train, test, column, n, p)
        else:
            print(f"Warning: Column {column} not found in the dataset")

    return train, test

n = 1000  # max features for TF-IDF
p = 5     # number of components for SVD
# Append the TF-IDF/SVD component columns for every categorical feature
train, test= process_categorical_features(categorical_features, train, test, n, p)


## 5.2  CATEGORICAL ENCODING

In [None]:
# Global variables
# NOTE(review): presumably updated while comparing encodings later in the
# notebook; nothing in the visible code writes to them — confirm.
overall_best_score = 0
overall_best_col = None

def OHE(train_df, test_df, cols, target):
    '''
    One hot encode `cols` on the row-wise concatenation of train and test.

    For every column the least frequent level (over train + test) is dropped
    as the reference level, and the remaining indicator columns are named
    "<level><col>_OHE". An indicator whose name already exists in the frame
    is discarded, keeping the existing column.

    Parameters
    ----------
    train_df, test_df : pd.DataFrame
        Frames to encode; neither input is modified.
    cols : list
        Column names to encode (the source columns are kept).
    target : str
        Target column name; removed from the returned test frame if present.

    Returns
    -------
    (train_ohe, test_ohe) : tuple of pd.DataFrame
        Independent copies. The train part keeps the train index, the test
        part gets a fresh 0..n-1 index.

    NOTE(review): when `test_df` lacks `target`, the concatenation fills it
    with NaN for the test rows, so an integer target presumably comes back as
    float in `train_ohe` -- confirm that downstream code accepts this.
    '''
    n_train = len(train_df)
    combined = pd.concat([train_df, test_df], axis=0)

    for col in cols:
        one_hot = pd.get_dummies(combined[col]).astype(int)
        # The rarest level becomes the implicit reference category.
        rarest_level = combined[col].value_counts().idxmin()
        one_hot = one_hot.drop(rarest_level, axis=1)
        one_hot.columns = [str(level) + col + "_OHE" for level in one_hot.columns]
        combined = pd.concat([combined, one_hot], axis="columns")
        combined = combined.loc[:, ~combined.columns.duplicated()]

    # Positional split with explicit copies: the returned frames do not alias
    # `combined`, so later column assignments cannot hit a view of it.
    train_ohe = combined.iloc[:n_train].copy()
    test_ohe = combined.iloc[n_train:].reset_index(drop=True)
    # errors="ignore": nothing to drop when neither input carried the target.
    test_ohe = test_ohe.drop(columns=[target], errors="ignore")
    return train_ohe, test_ohe

def high_freq_ohe(train, test, extra_cols, target, n_limit=50):
    '''
    One hot encoding for high cardinality features.

    Per column, only the `n_limit` most frequent training levels are kept;
    every other training level is replaced by the sentinel 9999 in both
    frames before OHE() runs. Afterwards every column whose name contains
    "9999" and every column that is constant in train is dropped from both
    frames. Works on copies and returns (train, test).
    '''
    train_work, test_work = train.copy(), test.copy()

    for col in extra_cols:
        level_counts = train_work[col].value_counts().to_dict()
        by_frequency = sorted(level_counts.items(), key=lambda item: item[1], reverse=True)
        # Levels ranked after the first n_limit are folded into the sentinel.
        rare_levels = [level for level, _ in by_frequency[n_limit:]]
        sentinel_map = dict(zip(rare_levels, np.full(len(rare_levels), 9999)))

        train_work[col] = train_work[col].replace(sentinel_map)
        test_work[col] = test_work[col].replace(sentinel_map)

    train_work, test_work = OHE(train_work, test_work, extra_cols, target)

    removable = [
        name for name in train_work.columns
        if "9999" in name or train_work[name].nunique() == 1
    ]
    train_work = train_work.drop(columns=removable)
    test_work = test_work.drop(columns=removable)

    return train_work, test_work

def handle_categorical_features(train, test, cat_cols):
    '''
    Initial processing of categorical features with proper handling of text categories.

    Non-object columns get a "<col>_cat" copy in which every level that
    occurs in only one of train / test is replaced by NaN.

    Object columns are rewritten in place as "<level>_<col>", with levels
    unseen in train becoming "Other_<col>". They also get a "<col>_count"
    column holding the training frequency of each level (0 for test levels
    that never occur in train).

    Parameters
    ----------
    train, test : pd.DataFrame
        Input frames; not modified.
    cat_cols : list
        Categorical column names to process.

    Returns
    -------
    (train_processed, test_processed, cat_cols_updated)
        Processed copies plus the list of column names to encode next
        ("<col>_cat" for non-object columns, "<col>" for object columns).
    '''
    cat_cols_updated = []
    train_processed = train.copy()
    test_processed = test.copy()

    for col in cat_cols:
        print(f"Processing {col}...")

        if train[col].dtype != "O":
            # Handle numeric categorical
            cat_name = f"{col}_cat"
            train_processed[cat_name] = train[col]
            test_processed[cat_name] = test[col]
            cat_cols_updated.append(cat_name)

            # Levels present in exactly one of the two frames.
            uncommon = list(set(train[col].unique()) ^ set(test[col].unique()))
            if uncommon:
                # Vectorised masking instead of a per-row Python membership test.
                train_processed[cat_name] = train_processed[cat_name].mask(
                    train_processed[cat_name].isin(uncommon))
                test_processed[cat_name] = test_processed[cat_name].mask(
                    test_processed[cat_name].isin(uncommon))

        else:
            # Handle string categorical
            cat_cols_updated.append(col)

            # Create category maps using only training data
            cat_map = {cat: f"{cat}_{col}" for cat in train[col].unique()}
            fallback = 'Other_' + col

            # Apply mapping to both train and test
            train_processed[col] = train[col].map(cat_map).fillna(fallback)
            test_processed[col] = test[col].map(cat_map).fillna(fallback)

            # Count encoding (will be used later); frequencies come from train only.
            level_counts = train[col].value_counts()
            train_processed[f"{col}_count"] = train[col].map(level_counts)
            test_processed[f"{col}_count"] = test[col].map(level_counts.to_dict()).fillna(0)

    return train_processed, test_processed, cat_cols_updated

def cat_encoding(train, test, cat_cols_updated, target):
    '''
    Main encoding function with feature selection using Accuracy Score.

    For every feature in `cat_cols_updated`:
      1. non-object features get a "<feature>_count" (training frequency) and
         a "<feature>_count_label" (frequency rank) column; object features
         reuse an already existing "<feature>_count" column if there is one;
      2. the feature is one-hot encoded (OHE for <= 25 levels, otherwise
         high_freq_ohe with n_limit=25) and the raw feature column is dropped;
      3. each count-style column is scored on its own with a 5-fold
         LightGBM classifier using accuracy, and the best one is reported.

    Side effects: updates the module-level `overall_best_score` /
    `overall_best_col` and prints a PrettyTable summary.

    Returns the encoded copies (train_copy, test_copy); the inputs are not
    modified.
    '''
    global overall_best_score
    global overall_best_col
    
    table = PrettyTable()
    table.field_names = ['Feature', 'Encoded Features', 'Accuracy Score']
    
    train_copy = train.copy()
    test_copy = test.copy()
    
    for feature in cat_cols_updated:
        print(f"Encoding {feature}...")
        
        # For numeric features
        if train[feature].dtype != 'O':
            # Count encoding: map each level to its training frequency
            dic = train[feature].value_counts().to_dict()
            train_copy[feature + "_count"] = train[feature].map(dic)
            test_copy[feature + "_count"] = test[feature].map(dic)
            
            # Rank encoding: level -> position in the value_counts() ordering
            dic2 = train[feature].value_counts().to_dict()
            list1 = np.arange(len(dic2.values()))
            dic3 = dict(zip(list(dic2.keys()), list1))
            
            # NOTE(review): replace() presumably leaves values that are not
            # keys of dic3 unchanged (unlike map) -- confirm this is intended
            # for unseen test levels.
            train_copy[feature+"_count_label"] = train[feature].replace(dic3).astype(float)
            test_copy[feature+"_count_label"] = test[feature].replace(dic3).astype(float)
            
            temp_cols = [feature + "_count", feature + "_count_label"]
            
        else:
            # For categorical features, use only count encoding
            # (the column is not created here; it is skipped below if absent)
            temp_cols = [feature + "_count"]
        
        # Handle high cardinality
        if train_copy[feature].nunique() <= 25:
            train_copy, test_copy = OHE(train_copy, test_copy, [feature], target)
        else:
            train_copy, test_copy = high_freq_ohe(train_copy, test_copy, [feature], target, n_limit=25)
        
        # The raw feature is no longer needed once its encodings exist
        train_copy = train_copy.drop(columns=[feature])
        test_copy = test_copy.drop(columns=[feature])
        
        # Cross validation
        kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
        acc_scores = []
        
        # Score every candidate encoding as a single-feature model
        for f in temp_cols:
            if f not in train_copy.columns:
                continue
                
            X = train_copy[[f]].values
            y = train_copy[target].values
            
            cv_scores = []
            for train_idx, val_idx in kf.split(X, y):
                X_train, y_train = X[train_idx], y[train_idx]
                x_val, y_val = X[val_idx], y[val_idx]
                model = lgb.LGBMClassifier(random_state=RANDOM_STATE,verbose=-1)
                
                model.fit(X_train, y_train)
                y_pred = model.predict(x_val)
                cv_scores.append(accuracy_score(y_val, y_pred))
                
            acc_scores.append((f, np.mean(cv_scores)))
            # Track the best single column across all features (module globals)
            if overall_best_score < np.mean(cv_scores):
                overall_best_score = np.mean(cv_scores)
                overall_best_col = f
        
        if acc_scores:  # Only if we have scores to process
            best_col, best_acc = sorted(acc_scores, key=lambda x: x[1], reverse=True)[0]
            
            # Feature selection based on correlation
            # NOTE(review): cols_to_drop is computed but never applied because
            # the drop below is commented out, so no column is removed here.
            available_cols = [col for col in temp_cols if col in train_copy.columns]
            if len(available_cols) > 1:
                corr = train_copy[available_cols].corr(method='pearson')
                corr_with_best_col = corr[best_col]
                cols_to_drop = [f for f in available_cols if corr_with_best_col[f] > 0.5 and f != best_col]
                
                # if cols_to_drop:
                #     train_copy = train_copy.drop(columns=cols_to_drop)
                #     test_copy = test_copy.drop(columns=cols_to_drop)
                    
            table.add_row([feature, best_col, best_acc])
    
    print(table)
    print("Overall best CV score: ", overall_best_score)
    return train_copy, test_copy

# Two-step categorical pipeline: relabel / count-encode first, then one-hot
# encode and score. The second call rebinds the notebook-level train / test.
train_processed, test_processed, cat_cols_updated = handle_categorical_features(train, test, categorical_features)
train, test = cat_encoding(train_processed, test_processed, cat_cols_updated, 'Depression')

## 5.3 ARITHMETIC FEATURES

In [None]:
def better_features(train, test, target, cols, best_score):
    """
    Greedy search for useful pairwise arithmetic features.

    For every column `cols[i]`, all products, both ratios (denominator
    shifted by 1e-5), differences and sums with each later column are built.
    Each candidate is scored alone with a 5-fold (plain, non-stratified)
    KFold LightGBM classifier on accuracy. The best candidate of the round
    is added to `train` / `test` in place when it is not perfectly
    correlated with an existing column and either its maximum absolute
    correlation is below 0.9 or it beats `best_score`.

    Returns (train, test, new_cols) where `new_cols` lists the added names.
    """
    new_cols = []
    folds = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

    for i in tqdm(range(len(cols)), desc='Generating Columns'):
        first = cols[i]
        train_candidates = {}
        test_candidates = {}

        for second in cols[i + 1:]:
            # Multiply
            name = first + '*' + second
            train_candidates[name] = train[first] * train[second]
            test_candidates[name] = test[first] * test[second]

            # Divide (first / second)
            name = first + '/' + second
            train_candidates[name] = train[first] / (train[second] + 1e-5)
            test_candidates[name] = test[first] / (test[second] + 1e-5)

            # Divide (second / first)
            name = second + '/' + first
            train_candidates[name] = train[second] / (train[first] + 1e-5)
            test_candidates[name] = test[second] / (test[first] + 1e-5)

            # Subtract
            name = first + '-' + second
            train_candidates[name] = train[first] - train[second]
            test_candidates[name] = test[first] - test[second]

            # Add
            name = first + '+' + second
            train_candidates[name] = train[first] + train[second]
            test_candidates[name] = test[first] + test[second]

        temp_df = pd.DataFrame(train_candidates)
        temp_df_test = pd.DataFrame(test_candidates)

        scored = []
        for candidate in temp_df.columns:
            fold_scores = []
            for fit_idx, val_idx in folds.split(train, train[target]):
                values = temp_df[candidate]
                X_fit = values.iloc[fit_idx].values.reshape(-1, 1)
                X_val = values.iloc[val_idx].values.reshape(-1, 1)
                y_fit = train[target].iloc[fit_idx]
                y_val = train[target].iloc[val_idx]

                clf = lgb.LGBMClassifier(random_state=RANDOM_STATE,verbose=-1)
                clf.fit(X_fit, y_fit)
                fold_scores.append(accuracy_score(y_val, clf.predict(X_val)))
            scored.append((candidate, np.mean(fold_scores)))

        # The last column has no partner, so there is nothing to score.
        if not scored:
            continue

        # max() returns the first best entry, matching a stable descending sort.
        best_col, best_acc = max(scored, key=lambda pair: pair[1])
        corr_with_other_cols = train.drop([target] + new_cols, axis=1).corrwith(temp_df[best_col])
        max_corr = corr_with_other_cols.abs().max()
        if (max_corr < 0.9 or best_acc > best_score) and max_corr != 1:
            train[best_col] = temp_df[best_col]
            test[best_col] = temp_df_test[best_col]
            new_cols.append(best_col)
            print(f"Added column '{best_col}' with Accuracy Score: {best_acc:.4f} & Correlation {max_corr:.4f}")

    return train, test, new_cols

In [None]:
# Candidate operands for the pairwise search: every column of `test` that
# takes more than two distinct values in `train`.
selected_features=[f for f in test.columns if train[f].nunique()>2]
len(selected_features)

# The exhaustive search is left commented out; the hard-coded list below is
# what gets applied.
# train, test,new_cols=better_features(train, test, target, selected_features, overall_best_score)
new_cols=['Work Pressure*Work/Study Hours',
 'CGPA*Financial Stress',
 'Job Satisfaction*Work/Study Hours',
 'Work/Study Hours*Profession_count',
 'Financial Stress*Profession_count',
 'Min_sleep*Profession_count']

<font size="3">We don't have to run the above algorithm every time; we can store the selected combinations and compute only the required columns. Unimportant columns and binary columns are not considered when assessing the combinations.</font>

In [None]:
def apply_arithmetic_operations(train_df, test_df, expressions_list):
    """
    Materialise stored "<left><op><right>" feature expressions as columns.

    An expression is resolved by looking for an operator position whose left
    and right substrings are both existing columns of `train_df`. This keeps
    column names that themselves contain an operator character (for example
    "Work/Study Hours") intact instead of splitting them apart.

    Operators are tried in the order +, -, *, / and, per operator, from left
    to right; the first valid split wins. Division adds 1e-5 to the
    denominator.

    Parameters
    ----------
    train_df, test_df : pd.DataFrame
        Frames to extend; both are modified in place.
    expressions_list : list of str
        Expressions to compute. Expressions that already exist as a column,
        or that cannot be resolved to two existing columns, are skipped
        silently.

    Returns
    -------
    (train_df, test_df)
        The same objects that were passed in.
    """
    operations = {
        '+': lambda left, right: left + right,
        '-': lambda left, right: left - right,
        '*': lambda left, right: left * right,
        '/': lambda left, right: left / (right + 1e-5),
    }

    def _resolve(expression):
        # Return (symbol, left, right) for the first split into two known columns.
        for symbol in operations:
            position = expression.find(symbol)
            while position != -1:
                left = expression[:position]
                right = expression[position + 1:]
                if left in train_df.columns and right in train_df.columns:
                    return symbol, left, right
                position = expression.find(symbol, position + 1)
        return None

    for expression in expressions_list:
        if expression in train_df.columns:
            continue

        resolved = _resolve(expression)
        if resolved is None:
            # Unknown operand(s): leave the frames untouched for this entry.
            continue

        symbol, left, right = resolved
        combine = operations[symbol]
        train_df[expression] = combine(train_df[left], train_df[right])
        test_df[expression] = combine(test_df[left], test_df[right])

    return train_df, test_df

# Add the stored arithmetic feature combinations to both frames.
train, test = apply_arithmetic_operations(train, test, new_cols)

# 6.1 FEATURE SELECTION

In [None]:
# Float-typed feature columns (target excluded) are the ones to standardise.
final_features=[f for f in train.columns if f not in [target] and train[f].dtype== float]
# Order-preserving de-duplication: unlike a set, this keeps the column order
# identical from run to run.
final_features=list(dict.fromkeys(final_features))

sc=StandardScaler()

# Fit the scaler on train only and reuse its statistics for test.
train_scaled=train.copy()
test_scaled=test.copy()
train_scaled[final_features]=sc.fit_transform(train[final_features])
test_scaled[final_features]=sc.transform(test[final_features])

In [None]:
def post_processor(train, test, target):
    """
    Drop feature columns that are exact duplicates of an earlier column.

    Duplicates are detected on `train` only. The first occurrence is kept
    and every later identical column is dropped from copies of both frames.

    Columns are bucketed by a fingerprint of their per-row hashes taken in
    row order, and a duplicate is only declared after `Series.equals`
    confirms it. Row order matters here: columns holding the same values in
    different rows are different features and are kept.

    Parameters
    ----------
    train, test : pd.DataFrame
        Input frames; not modified.
    target : str
        Target column in `train`; never considered for dropping.

    Returns
    -------
    (train_cop, test_cop) : tuple of pd.DataFrame
        Copies without the duplicated columns.
    """
    train_cop = train.copy()
    test_cop = test.copy()
    drop_cols = []

    # fingerprint -> names of the distinct columns kept so far with that fingerprint
    kept_by_fingerprint = {}

    for feature in train.drop(columns=[target]).columns:
        column = train_cop[feature]
        row_hashes = pd.util.hash_pandas_object(column, index=False)
        # Hashing the raw bytes keeps positional information that a plain
        # sum of the row hashes would discard.
        fingerprint = hash(row_hashes.to_numpy().tobytes())

        candidates = kept_by_fingerprint.setdefault(fingerprint, [])
        if any(column.equals(train_cop[kept]) for kept in candidates):
            drop_cols.append(feature)
        else:
            candidates.append(feature)

    print("Columns to drop:", drop_cols)

    train_cop = train_cop.drop(columns=drop_cols)
    test_cop = test_cop.drop(columns=drop_cols)

    return train_cop, test_cop

# De-duplicate the feature columns of the scaled frames.
train_cop, test_cop = post_processor(train_scaled, test_scaled, target)
# Alternative that skips the de-duplication step:
# train_cop, test_cop = train_scaled.copy(), test_scaled.copy()

In [None]:
# Final design matrices: features from the de-duplicated frames, labels from `train`.
X_train = train_cop.drop(columns=[target])
y_train = train[target]

X_test = test_cop.copy()

cat_features = X_train.select_dtypes(include=['object']).columns.tolist()

# Fill missing values BEFORE casting: astype(str) turns NaN into the literal
# string 'nan', after which fillna('None') has nothing left to fill.
X_train[cat_features]=X_train[cat_features].fillna('None').astype(str)
X_test[cat_features]=X_test[cat_features].fillna('None').astype(str)

print(X_train.shape, X_test.shape)

In [None]:

def get_most_important_features(X_train, y_train, n, model_input, device='cpu', RANDOM_STATE=42):
    """
    Get most important features using various boosting algorithms

    A single model is refitted on each of 5 shuffled KFold splits; the
    per-fold `feature_importances_` are averaged and used to rank the
    columns. The top 12 of the returned features are shown in a horizontal
    bar chart whose title carries the average validation accuracy.

    Parameters:
    -----------
    X_train : pd.DataFrame
        Training features
    y_train : pd.Series
        Target variable (indexed positionally, in the same row order as X_train)
    n : int
        Number of top features to return
    model_input : str
        Type of model to use: contains 'xgb' -> XGBoost, contains 'cat' ->
        CatBoost, anything else -> LightGBM
    device : str
        Device to use for computation ('cpu' or 'gpu')
    RANDOM_STATE : int
        Random seed for reproducibility

    Returns:
    --------
    list
        Top n most important features
    """
    # Set style. The bare 'seaborn' style name is missing from newer
    # Matplotlib releases, so fall back to the 'seaborn-v0_8' name and skip
    # styling when neither is present.
    for style_name in ('seaborn', 'seaborn-v0_8'):
        if style_name in plt.style.available:
            plt.style.use(style_name)
            break
    sns.set_palette("husl")

    # Model parameters
    xgb_params = {
        'n_jobs': -1,
        'tree_method': 'hist',
        'verbosity': 0,
        'random_state': RANDOM_STATE,
    }

    # NOTE(review): 'accuracy' may not be a metric name LightGBM recognises --
    # confirm; no eval set is passed here, so it is presumably unused anyway.
    lgb_params = {
        'boosting_type': 'gbdt',
        'random_state': RANDOM_STATE,
        'device': device.lower(),
        'verbose': -1,
        'n_jobs': -1,
        'metric': 'accuracy'
    }

    cb_params = {
        'task_type': device.upper(),
        'random_state': RANDOM_STATE,
        'verbose': False,
        'thread_count': -1
    }

    # GPU configurations
    if device.lower() == 'gpu':
        xgb_params.update({
            'tree_method': 'gpu_hist',
            'predictor': 'gpu_predictor'
        })

    # Model selection. `quiet_fit` records whether fit() is called with
    # verbose=False, so the fit call always matches the model actually built.
    model_key = model_input.lower()
    if 'xgb' in model_key:
        model = xgb.XGBClassifier(**xgb_params)
        quiet_fit = True
    elif 'cat' in model_key:
        model = CatBoostClassifier(**cb_params)
        quiet_fit = True
    else:
        model = lgb.LGBMClassifier(**lgb_params)
        quiet_fit = False

    # Cross validation setup
    kfold = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
    fold_accuracies = []
    feature_importances_list = []

    # Perform cross-validation
    for train_idx, val_idx in kfold.split(X_train):
        X_train_fold, X_val_fold = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]

        # Model fitting
        if quiet_fit:
            model.fit(X_train_fold, y_train_fold, verbose=False)
        else:
            model.fit(X_train_fold, y_train_fold)

        # Predictions and scoring
        y_pred = model.predict(X_val_fold)
        fold_accuracies.append(accuracy_score(y_val_fold, y_pred))
        feature_importances_list.append(model.feature_importances_)

    # Calculate average metrics
    avg_accuracy = np.mean(fold_accuracies)
    avg_feature_importances = np.mean(feature_importances_list, axis=0)

    # Get feature importance rankings
    feature_importance_list = list(zip(X_train.columns, avg_feature_importances))
    sorted_features = sorted(feature_importance_list, key=lambda x: x[1], reverse=True)
    top_n_features = [feature[0] for feature in sorted_features[:n]]
    display_features = top_n_features[:12]

    # Plotting
    plt.figure(figsize=(12, 8))

    # Create color palette
    colors = sns.color_palette("husl", n_colors=1)

    # Plot horizontal bars
    bars = plt.barh(range(len(display_features)),
                   [avg_feature_importances[X_train.columns.get_loc(feature)] for feature in display_features],
                   color=colors)

    # Customize plot
    plt.yticks(range(len(display_features)), display_features, fontsize=10)
    plt.xlabel('Feature Importance Score', fontsize=12)
    plt.ylabel('Features', fontsize=12)
    plt.title(f'Top {len(display_features)} Most Important Features\nAverage Accuracy: {avg_accuracy:.4f}',
              fontsize=14, pad=20)

    # Invert y-axis (most important on top) and add grid
    plt.gca().invert_yaxis()
    plt.grid(axis='x', linestyle='--', alpha=0.3)

    # Add value labels on bars
    for bar in bars:
        width = bar.get_width()
        plt.text(width + 0.001, bar.get_y() + bar.get_height()/2,
                f'{width:.3f}',
                ha='left', va='center', fontsize=10)

    # Adjust layout and display
    plt.tight_layout()
    plt.show()

    return top_n_features

In [None]:
# Rank features with each of the three boosters, asking for the top 100 every
# time. A model_input containing neither 'xgb' nor 'cat' (here 'lgbm')
# selects LightGBM.
n_imp_features_cat=get_most_important_features(X_train.reset_index(drop=True), y_train,100, 'cat', device=device)
n_imp_features_xgb=get_most_important_features(X_train.reset_index(drop=True), y_train,100, 'xgb',  device=device)
n_imp_features_lgbm=get_most_important_features(X_train.reset_index(drop=True), y_train,100, 'lgbm',  device=device)

In [None]:
# Union of the three rankings, de-duplicated while preserving first-seen
# order. A set would give a column order that can change between runs, which
# can change the result of models that subsample columns.
n_imp_features=list(dict.fromkeys(n_imp_features_xgb+n_imp_features_lgbm+n_imp_features_cat))
print(f"{len(n_imp_features)} features have been selected from three algorithms for the final model")

X_train=X_train[n_imp_features]
X_test=X_test[n_imp_features]

# 7. MODEL

## 7.1 ANNs

In [None]:
import tensorflow
import keras
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LeakyReLU, PReLU, ELU
from keras.layers import Dropout

from keras.utils import to_categorical

# Report whether TensorFlow can see at least one GPU device.
gpus = tensorflow.config.list_physical_devices('GPU')
print("GPU is available" if gpus else "GPU is not available")
    
    
def optimizer():
    """
    Build one instance of each optimizer the ANN builders can choose from.

    Returns the tuple (sgd, rms, nadam, adamW, adam).
    """
    tf_optimizers = tensorflow.keras.optimizers

    sgd = tf_optimizers.SGD(learning_rate=0.005, momentum=0.5, nesterov=True)
    rms = tf_optimizers.RMSprop()
    nadam = tf_optimizers.Nadam(
        learning_rate=0.001,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-07,
        name="Nadam",
    )
    adam = tf_optimizers.Adam()
    adamW = keras.optimizers.AdamW(
        learning_rate=0.002,
        weight_decay=0.001,
        beta_1=0.9,
        beta_2=0.999,
    )

    return sgd, rms, nadam, adamW, adam


# Leaky-ReLU style activation: Keras relu called with alpha=0.1.
lrelu = lambda x: tensorflow.keras.activations.relu(x, alpha=0.1)

from tensorflow.keras import backend as K

def root_mean_squared_log_error(y_true, y_pred):
    '''
    RMSLE between actuals & predictions: sqrt(mean((log|pred + 1| - log|true + 1|)^2)).
    '''
    log_pred = K.log(abs(y_pred+1))
    log_true = K.log(abs(y_true+1))
    squared_gap = K.square(log_pred - log_true)
    return K.sqrt(K.mean(squared_gap))
    
def root_mean_squared_error(y_true, y_pred):
    '''
    RMSE between actuals & predictions: sqrt(mean((pred - true)^2)).
    '''
    residual = y_pred - y_true
    mean_squared = K.mean(K.square(residual))
    return K.sqrt(mean_squared)

In [None]:
def init_ann1(num_classes, input_dim):
    '''
    Initialize a small artificial neural network (ANN) classifier.

    Architecture: Dense(16) -> Dropout(0.1) -> Dense(8) -> Dropout(0.1) ->
    Dense(4) -> Dropout(0.0) -> Dense(num_classes), ReLU hidden units with
    he_uniform initialisation. Compiled with binary cross-entropy, the SGD
    optimizer from optimizer() and an accuracy metric.

    Parameters
    ----------
    num_classes : int
        Number of output units. With 1 the head uses a sigmoid (single
        probability); with more units it uses softmax.
    input_dim : int
        Number of input features.

    Returns
    -------
    The compiled Sequential model.
    '''

    sgd, _rms, _nadam, _adamW, _adam = optimizer()

    # Softmax over a single unit is constantly 1.0, so a one-unit head could
    # never learn; use sigmoid in that case.
    output_activation = 'sigmoid' if num_classes == 1 else 'softmax'

    ann = Sequential()
    ann.add(Dense(16, input_dim=input_dim, kernel_initializer='he_uniform', activation='relu'))
    ann.add(Dropout(0.1))
    ann.add(Dense(8, kernel_initializer='he_uniform', activation='relu'))
    ann.add(Dropout(0.1))
    ann.add(Dense(4, kernel_initializer='he_uniform', activation='relu'))
    ann.add(Dropout(0.0))

    # Output layer sized to the requested number of classes
    ann.add(Dense(num_classes, kernel_initializer='he_uniform', activation=output_activation))

    # Compile the model with binary_crossentropy loss and accuracy metric
    ann.compile(loss='binary_crossentropy', optimizer=sgd, metrics=['accuracy'])

    return ann

def init_ann2(num_classes, input_dim):
    '''
    Initialize a wider ANN classifier.

    Architecture: Dense(128) -> Dropout(0.3) -> Dense(32) -> Dropout(0.1) ->
    Dense(4) -> Dropout(0.2) -> Dense(num_classes), ReLU hidden units with
    he_uniform initialisation. Compiled with binary cross-entropy, the SGD
    optimizer from optimizer() and an accuracy metric.

    Parameters
    ----------
    num_classes : int
        Number of output units. With 1 the head uses a sigmoid; with more
        units it uses softmax.
    input_dim : int
        Number of input features. The first layer is sized from this
        argument rather than from any notebook-level frame.

    Returns
    -------
    The compiled Sequential model.
    '''
    sgd, _rms, _nadam, _adamW, _adam = optimizer()

    # Softmax over a single unit is constantly 1.0; a one-unit head needs sigmoid.
    output_activation = 'sigmoid' if num_classes == 1 else 'softmax'

    ann2 = Sequential()
    ann2.add(Dense(128, input_dim=input_dim, kernel_initializer='he_uniform', activation='relu'))
    ann2.add(Dropout(0.3))
    ann2.add(Dense(32, kernel_initializer='he_uniform', activation='relu'))
    ann2.add(Dropout(0.1))
    ann2.add(Dense(4, kernel_initializer='he_uniform', activation='relu'))
    ann2.add(Dropout(0.2))

    ann2.add(Dense(num_classes, kernel_initializer='he_uniform', activation=output_activation))
    ann2.compile(loss='binary_crossentropy', optimizer=sgd, metrics=['accuracy'])

    return ann2



## 7.2 DEFINE MODELS

In [None]:
class Splitter:
    """
    Generator of stratified K-fold train/validation splits.

    `test_size` is stored but not used by split_data. When `kfold` is False,
    split_data yields nothing.
    """

    def __init__(self, test_size=0.2, kfold=True, n_splits=5):
        self.test_size = test_size
        self.kfold = kfold
        self.n_splits = n_splits

    def split_data(self, X, y, random_state_list):
        """
        Yield (X_train, X_val, y_train, y_val) for every fold of every seed.

        One shuffled StratifiedKFold is built per entry of
        `random_state_list`; X and y are indexed positionally with `.iloc`.
        """
        if not self.kfold:
            return

        for seed in random_state_list:
            folds = StratifiedKFold(n_splits=self.n_splits, random_state=seed, shuffle=True)
            for fit_idx, val_idx in folds.split(X, y):
                yield X.iloc[fit_idx], X.iloc[val_idx], y.iloc[fit_idx], y.iloc[val_idx]
                    
class Classifier:
    """
    Registry of pre-configured, unfitted classifiers for the ensemble.

    Parameters
    ----------
    n_estimators : int
        Number of boosting rounds / iterations for the models that read it
        (the XGBoost and CatBoost variants and HistGradientBoosting); several
        other models use their own fixed count.
    device : str
        'cpu' or 'gpu'. Passed on to LightGBM as given, upper-cased for
        CatBoost, and mapped to the matching XGBoost settings.
    random_state : int
        Seed forwarded to every model.

    Attributes
    ----------
    models : dict
        Model name -> estimator instance.
    len_models : int
        Number of entries in `models`.
    """

    def __init__(self, n_estimators=100, device="cpu", random_state=0):
        self.n_estimators = n_estimators
        self.device = device
        self.random_state = random_state
        self.models = self._define_model()
        self.len_models = len(self.models)

    def _define_model(self):
        """Build the name -> estimator dictionary from the parameter sets below."""

        # XGBoost parameters
        xgb_params = {
            'n_estimators': self.n_estimators,
            'learning_rate': 0.05,
            'max_depth': 4,
            'subsample': 0.8,
            'colsample_bytree': 0.1,
            'n_jobs': -1,
            'eval_metric': 'auc',
            'objective': 'binary:logistic',
            'tree_method': 'hist',
            'verbosity': 0,
            'random_state': self.random_state,
        }

        # Applied before xgb_params is copied, so xgb4 / xgb5 inherit it.
        if self.device == 'gpu':
            xgb_params['tree_method'] = 'gpu_hist'
            xgb_params['predictor'] = 'gpu_predictor'

        xgb_params2 = {
            'n_estimators': self.n_estimators,
            'gamma': 0.279,
            'max_depth': 10,
            'subsample': 0.325,
            'min_child_weight': 9,
            'colsample_bytree': 0.487,
            'learning_rate': 0.052,
            'reg_lambda': 0.0007,
            'reg_alpha': 0.371,
            'n_jobs': -1,
            'eval_metric': 'auc',
            'objective': 'binary:logistic',
            'tree_method': 'hist',
            'verbosity': 0,
            'random_state': self.random_state,
        }

        xgb_params3 = {
            'n_estimators': self.n_estimators,
            'gamma': 0.279,
            'max_depth': 10,
            'subsample': 0.325,
            'min_child_weight': 9,
            'colsample_bytree': 0.487,
            'learning_rate': 0.052,
            'reg_lambda': 0.0007,
            'reg_alpha': 0.371,
            'n_jobs': -1,
            'eval_metric': 'auc',
            'objective': 'binary:logistic',
            'tree_method': 'hist',
            'verbosity': 0,
            # Follow the requested device instead of always demanding CUDA,
            # so a CPU-only run does not depend on a GPU being present.
            'device': 'cuda' if self.device == 'gpu' else 'cpu',
            'booster': 'gbtree',
            'random_state': self.random_state,
        }

        xgb_params4 = xgb_params.copy()
        xgb_params4.update({
            'subsample': 0.789,
            'max_depth': 5,
            'learning_rate': 0.161,
            'colsample_bytree': 0.243
        })

        # NOTE(review): the objective is already 'binary:logistic' in
        # xgb_params, so 'xgb5' ends up configured identically to 'xgb'.
        xgb_params5 = xgb_params.copy()
        xgb_params5['objective'] = "binary:logistic"

        # LightGBM parameters
        lgb_params = {
            'n_estimators': 1024,
            'max_depth': 11,
            'min_samples_leaf': 26,
            'subsample': 0.912,
            'learning_rate': 0.029,
            'lambda_l1': 4.161,
            'lambda_l2': 2.636e-05,
            'colsample_bytree': 0.206,
            'objective': 'binary',
            'metric': 'auc',
            'boosting_type': 'gbdt',
            'device': self.device,
            'random_state': self.random_state,
            'verbose': -1
        }

        lgb_params2 = {
            'n_estimators': 1000,
            'max_depth': 6,
            'subsample': 0.743,
            'learning_rate': 0.049,
            'lambda_l1': 8.922e-05,
            'lambda_l2': 0.0018,
            'colsample_bytree': 0.392,
            'objective': 'binary',
            'metric': 'auc',
            'boosting_type': 'gbdt',
            'device': self.device,
            'random_state': self.random_state,
            'verbose': -1
        }

        lgb_params3 = {
            'n_estimators': 1000,
            'max_depth': 9,
            'subsample': 0.540,
            'learning_rate': 0.049,
            'lambda_l1': 1.749e-08,
            'lambda_l2': 3.837,
            'colsample_bytree': 0.319,
            'objective': 'binary',
            'metric': 'auc',
            'boosting_type': 'gbdt',
            'device': self.device,
            'random_state': self.random_state,
            'verbose': -1
        }

        # NOTE(review): lgb_params4 / lgb_params5 inherit lambda_l1 / lambda_l2
        # from lgb_params2 and add reg_alpha / reg_lambda on top; these look
        # like two spellings of the same regularisation settings -- confirm
        # which value LightGBM actually uses.
        lgb_params4 = lgb_params2.copy()
        lgb_params4.update({
            'subsample': 0.9,
            'reg_lambda': 0.876,
            'reg_alpha': 0.319,
            'max_depth': 9,
            'learning_rate': 0.107,
            'colsample_bytree': 0.1
        })

        lgb_params5 = lgb_params2.copy()
        lgb_params5.update({
            'subsample': 0.9,
            'reg_lambda': 0.512,
            'reg_alpha': 0.898,
            'max_depth': 11,
            'learning_rate': 0.081,
            'colsample_bytree': 0.1
        })

        # CatBoost parameters
        cb_params = {
            'iterations': self.n_estimators,
            'depth': 6,
            'learning_rate': 0.05,
            'l2_leaf_reg': 0.7,
            'random_strength': 0.2,
            'max_bin': 200,
            'od_wait': 65,
            'one_hot_max_size': 70,
            'grow_policy': 'Depthwise',
            'bootstrap_type': 'Bayesian',
            'od_type': 'Iter',
            'eval_metric': 'AUC',
            'loss_function': 'Logloss',
            'task_type': self.device.upper(),
            'random_state': self.random_state,
            'verbose': -1
        }

        cb_sym_params = cb_params.copy()
        cb_sym_params['grow_policy'] = 'SymmetricTree'

        cb_loss_params = cb_params.copy()
        cb_loss_params['grow_policy'] = 'Lossguide'

        cb_params2 = cb_params.copy()
        cb_params2.update({
            'learning_rate': 0.019,
            'depth': 9,
            'random_strength': 0.3,
            'one_hot_max_size': 10,
            'max_bin': 100,
            'l2_leaf_reg': 0.419
        })

        cb_params3 = {
            'iterations': self.n_estimators,
            'random_strength': 0.1,
            'one_hot_max_size': 70,
            'learning_rate': 0.008,
            'l2_leaf_reg': 0.3,
            'grow_policy': 'Depthwise',
            'depth': 9,
            # 'max_bin': 200,
            'od_wait': 65,
            'bootstrap_type': 'Bayesian',
            'od_type': 'Iter',
            'eval_metric': 'AUC',
            'loss_function': 'Logloss',
            'task_type': self.device.upper(),
            'random_state': self.random_state,
        }

        cb_params4 = cb_params.copy()
        cb_params4.update({
            'learning_rate': 0.143,
            'depth': 16,
            'random_strength': 0.596,
            # 'one_hot_max_size': 100,
            # 'max_bin': 150,
            'l2_leaf_reg': 0.384,
            'grow_policy': 'Lossguide'
        })

        # scikit-learn model parameters. dt_params, knn_params and adb_params
        # are only referenced by the disabled entries at the end of `models`.
        dt_params = {'criterion': 'gini', 'max_depth': 9, 'min_samples_split': 17, 'min_samples_leaf': 18, 'max_features': 0.843}
        etr_params = {'criterion': 'gini', 'max_depth': 16, 'min_samples_split': 11, 'min_samples_leaf': 1, 'max_features': 0.668, 'bootstrap': True}
        hist_params = {'learning_rate': 0.058, 'n_iter_no_change': 795, 'max_depth': 4, 'min_samples_leaf': 17, 'max_leaf_nodes': 98, 'l2_regularization': 1.923e-07}
        rf_params = {'max_depth': 16, 'min_samples_split': 18, 'min_samples_leaf': 2, 'max_features': 0.416}
        gbt_params = {'learning_rate': 0.136, 'max_depth': 7, 'min_samples_split': 17, 'min_samples_leaf': 15, 'subsample': 0.886, 'max_features': 0.611}
        knn_params = {'n_neighbors': 16, 'weights': 'uniform', 'p': 2, 'leaf_size': 13, 'algorithm': 'ball_tree'}
        adb_params = {'n_estimators': 957, 'learning_rate': 0.663}

        models = {
            'xgb':  xgb.XGBClassifier(**xgb_params),
            'xgb2': xgb.XGBClassifier(**xgb_params2),
            'xgb3': xgb.XGBClassifier(**xgb_params3),
            'xgb4': xgb.XGBClassifier(**xgb_params4),
            'xgb5': xgb.XGBClassifier(**xgb_params5),
            'lgb':  lgb.LGBMClassifier(**lgb_params),
            'lgb2': lgb.LGBMClassifier(**lgb_params2),
            'lgb3': lgb.LGBMClassifier(**lgb_params3),
            'lgb4': lgb.LGBMClassifier(**lgb_params4),
            'lgb5': lgb.LGBMClassifier(**lgb_params5),
            'cat':  CatBoostClassifier(**cb_params),
            'cat2': CatBoostClassifier(**cb_params2),
            'cat3': CatBoostClassifier(**cb_params3),
            'cat4': CatBoostClassifier(**cb_params4),
            "cat_sym": CatBoostClassifier(**cb_sym_params),
            "cat_loss": CatBoostClassifier(**cb_loss_params),
            'hist_gbm': HistGradientBoostingClassifier(max_iter=self.n_estimators, **hist_params, random_state=self.random_state),
            'rf': RandomForestClassifier(n_estimators=250, **rf_params, random_state=self.random_state),
            'gbdt': GradientBoostingClassifier(**gbt_params, n_estimators=100, random_state=self.random_state),
            # 'ada': AdaBoostClassifier(**adb_params, random_state=self.random_state),
            'etr': ExtraTreesClassifier(**etr_params, random_state=self.random_state),
            # 'dt': DecisionTreeClassifier(**dt_params, random_state=self.random_state),
            # 'knn': KNeighborsClassifier(**knn_params),
            # 'log_reg': LogisticRegression(max_iter=1000),
            # 'ridge': CalibratedClassifierCV(RidgeClassifierCV(alphas=[100.02]), method='sigmoid'),
            # 'elasticNet': LogisticRegressionCV(Cs=[0.044], l1_ratios=[0.977]),
            # 'ann1': init_ann1(1, X_test.shape[1]),  # Changed to 1 for binary
            # 'ann2': init_ann2(1, X_test.shape[1]),  # Changed to 1 for binary
        }

        return models

## 7.3 WEIGHTED ENSEMBLE

In [None]:
class OptunaWeights:
    """
    Learn per-model blending weights for binary-classification scores.

    An Optuna study searches one weight in [0, 1] per model so that the
    weighted average of the models' predictions maximises
    accuracy * ROC-AUC against the supplied labels.

    State is kept in plain instance attributes:
    - study: the Optuna study created by fit() (None before fitting)
    - weights: list with one learned weight per model (None before fitting)
    - random_state: seed passed to the CMA-ES sampler
    - n_trials: number of trials fit() runs

    `weights` is read as an attribute (e.g. `optweights.weights`). A method of
    the same name cannot coexist with it, because the instance attribute set
    in __init__ would always shadow the method.
    """

    def __init__(self, random_state, n_trials=5000):
        self.study = None
        self.weights = None
        self.random_state = random_state
        self.n_trials = n_trials

    def _objective(self, trial, y_true, y_preds):
        """
        Score one candidate weight vector; higher is better.

        Parameters:
        trial (optuna.trial.Trial): Trial used to sample one weight per model
        y_true (array-like): Ground-truth binary labels
        y_preds (list): One array of positive-class scores per model, all
                        with the same shape

        Returns:
        float: accuracy (at the best threshold) multiplied by ROC-AUC
        """
        # One weight per model, sampled independently in [0, 1]
        weights = [trial.suggest_float(f"weight{n}", 0, 1) for n in range(len(y_preds))]

        # Blend the models. np.average already divides by the sum of the
        # weights, so the blend stays on the same scale as the inputs.
        # The blend must NOT be divided by its row sum: every model supplies a
        # single positive-class column, so that would turn each value into 1.0
        # and make the score independent of the weights.
        weighted_pred = np.average(np.array(y_preds), axis=0, weights=weights)

        threshold = find_best_threshold(y_true, weighted_pred)
        weighted_pred_labels = (weighted_pred > threshold).astype(int)
        accuracy = accuracy_score(y_true, weighted_pred_labels)
        auc_score = roc_auc_score(y_true, weighted_pred)
        return accuracy * auc_score

    def fit(self, y_true, y_preds):
        """
        Run the Optuna search and store the best weights in self.weights.

        Parameters:
        y_true (array-like): Ground-truth binary labels
        y_preds (list): One array of positive-class scores per model
        """
        optuna.logging.set_verbosity(optuna.logging.ERROR)
        sampler = optuna.samplers.CmaEsSampler(seed=self.random_state)
        # The objective never reports intermediate values, so the pruner is
        # only kept to leave the study configuration unchanged.
        pruner = optuna.pruners.HyperbandPruner()
        self.study = optuna.create_study(
            sampler=sampler,
            pruner=pruner,
            study_name="OptunaWeights",
            direction='maximize',
        )
        objective_partial = partial(self._objective, y_true=y_true, y_preds=y_preds)
        self.study.optimize(objective_partial, n_trials=self.n_trials)
        self.weights = [self.study.best_params[f"weight{n}"] for n in range(len(y_preds))]

    def predict(self, y_preds):
        """
        Blend per-model predictions with the learned weights.

        Parameters:
        y_preds (list): One array of scores per model, in the same model
                        order that was used for fit()

        Returns:
        numpy.ndarray: Weighted average with the shape of a single model's
                       prediction array
        """
        assert self.weights is not None, 'OptunaWeights error, must be fitted before predict'
        weighted_pred = np.average(np.array(y_preds), axis=0, weights=self.weights)
        return weighted_pred

    def fit_predict(self, y_true, y_preds):
        """Fit the weights on (y_true, y_preds) and return the blend of y_preds."""
        self.fit(y_true, y_preds)
        return self.predict(y_preds)

def find_best_threshold(y_true, y_pred_probabilities):
    """
    Find the decision threshold that maximises accuracy for binary labels.

    Accuracy as a function of the threshold is a step function that only
    changes at the predicted scores themselves. A continuous scalar optimiser
    can stall on one of its flat plateaus, so every distinct cut between
    consecutive sorted scores is evaluated exactly instead.

    Parameters:
    y_true (array-like): Ground-truth labels encoded as 0/1
    y_pred_probabilities (array-like): One predicted score per label; any
                                       shape is accepted and flattened

    Returns:
    float: Threshold with the highest accuracy. It lies strictly between two
           neighbouring scores, so `score > t` and `score >= t` give the same
           labels. Ties are resolved in favour of the cut closest to 0.5.
           Returns 0.5 when there are no (non-NaN) scores.

    Raises:
    ValueError: If labels and scores do not hold the same number of values
    """
    labels = np.asarray(y_true).ravel()
    scores = np.asarray(y_pred_probabilities, dtype=float).ravel()
    if labels.shape != scores.shape:
        raise ValueError(
            f"y_true has {labels.size} values but y_pred_probabilities has {scores.size}"
        )

    # A NaN score compares False against every threshold, so such rows add
    # the same constant to every candidate and can be left out of the search.
    usable = ~np.isnan(scores)
    labels, scores = labels[usable], scores[usable]
    if scores.size == 0:
        return 0.5

    # Candidate cuts: one below every score ("all positive"), the midpoints
    # between neighbouring distinct scores, and one above every score
    # ("all negative"). The outer cuts stay at 0 and 1 when the scores allow it.
    distinct = np.unique(scores)
    below = 0.0 if distinct[0] > 0.0 else distinct[0] - 1.0
    above = 1.0 if distinct[-1] < 1.0 else distinct[-1] + 1.0
    candidates = np.concatenate(([below], (distinct[:-1] + distinct[1:]) / 2.0, [above]))

    # Count correct predictions per candidate with binary searches on the
    # sorted scores of each class (integer counts, so ties compare exactly).
    pos_scores = np.sort(scores[labels == 1])
    neg_scores = np.sort(scores[labels == 0])
    true_pos = pos_scores.size - np.searchsorted(pos_scores, candidates, side="left")
    true_neg = np.searchsorted(neg_scores, candidates, side="left")
    n_correct = true_pos + true_neg

    best = np.flatnonzero(n_correct == n_correct.max())
    chosen = best[np.argmin(np.abs(candidates[best] - 0.5))]
    return float(candidates[chosen])


def compute_means_dict(X_train, missing_cols=None):
    """
    Build a {column name: column mean} lookup from the training data.

    Parameters:
    X_train (pandas.DataFrame): Training data the means are taken from
    missing_cols (list): Columns to compute a mean for. When None, every
                        column of X_train that contains at least one NaN
                        is used

    Returns:
    dict: Mean of each selected column, keyed by column name
    """
    # Default selection: only the columns that actually have gaps
    if missing_cols is None:
        missing_cols = [name for name in X_train.columns if X_train[name].isna().any()]

    return {name: X_train[name].mean() for name in missing_cols}
def fill_missing_with_means(df, means_dict):
    """
    Fill missing values using a pre-computed means dictionary.

    Parameters:
    df (pandas.DataFrame): Input data (can be train or test); not modified
    means_dict (dict): Dictionary of column means computed from training data.
                       Keys that are not columns of df are ignored

    Returns:
    pandas.DataFrame: Copy of df with NaN replaced by the given means in the
                      listed columns
    """
    # Work on a copy so the caller's frame is left untouched
    df_filled = df.copy()

    for col, mean_val in means_dict.items():
        if col in df_filled.columns:
            # Assign the filled column back explicitly. Calling
            # fillna(inplace=True) on a column selection is chained in-place
            # assignment: pandas warns about it and, with Copy-on-Write
            # enabled, it no longer updates the parent frame at all.
            df_filled[col] = df_filled[col].fillna(mean_val)

    return df_filled

# Column means taken from the training data; used below to impute the
# training folds, the validation folds and the test set alike.
means_dict = compute_means_dict(X_train)


## 7.4 FIT MODELS

In [None]:
# --- Cross-validation / training configuration ---
kfold = True
n_splits = 5 if kfold else 1
random_state = 42
random_state_list = [RANDOM_STATE]
n_estimators = 1000
early_stopping_rounds = 300
verbose = False

splitter = Splitter(kfold=kfold, n_splits=n_splits)

# --- Accumulators filled by the training loop ---
# Out-of-fold ensemble predictions (one row per training sample) and the
# running average of the ensemble's test predictions.
oof_predss = pd.DataFrame(np.zeros((len(X_train), 1)))
test_predss = np.zeros((len(X_test), 1))
ensemble_score = []
ensemble_acc_score = []
weights = []
# Only models whose name is a key here are kept (deep-copied) after fitting
trained_models = {'xgb': []}
# Best ensemble threshold found on each fold
best_thresholds = []
   
# Imputed copy of the test set for the estimators that are fed mean-filled
# data. It does not depend on the fold, so it is built once up front.
X_test_filled = fill_missing_with_means(X_test, means_dict)

for i, (X_train_, X_val, y_train_, y_val) in enumerate(
    splitter.split_data(X_train, y_train, random_state_list=random_state_list)
):
    # i counts the splits yielded by the splitter; n and m are only used for
    # the FOLD / SEED labels in the log lines.
    n = i % n_splits
    m = i // n_splits

    # Build the set of models for this split.
    # NOTE(review): the models and the weight search are seeded with the fixed
    # `random_state`, while the log label shows random_state_list[m] - confirm
    # this is intended if more than one seed is ever used.
    classifier = Classifier(n_estimators, device, random_state)
    models = classifier.models

    # Mean-imputed frames for this split, shared by every model that needs them
    X_train_filled = fill_missing_with_means(X_train_, means_dict)
    X_val_filled = fill_missing_with_means(X_val, means_dict)

    oof_preds = []
    test_preds = []
    start_time_fold = time.time()

    # Train and predict with each model
    for name, model in models.items():
        start_time = time.time()
        # Gradient-boosting libraries get the raw frames (NaN included)
        is_booster = ('xgb' in name) or ('lgb' in name) or ('cat' in name)

        # Model fitting
        if 'lgb' in name:
            model.fit(X_train_, y_train_, eval_set=[(X_val, y_val)])
        elif 'cat' in name:
            model.fit(
                X_train_, y_train_,
                eval_set=[(X_val, y_val)],
                early_stopping_rounds=early_stopping_rounds,
                cat_features=cat_features,
                verbose=verbose,
            )
        elif 'xgb' in name:
            model.fit(
                X_train_, y_train_,
                eval_set=[(X_val, y_val)],
                early_stopping_rounds=early_stopping_rounds,
                verbose=verbose,
            )
        elif 'ann' in name:
            model.fit(
                X_train_filled, y_train_,
                validation_data=(X_val_filled, y_val),
                batch_size=16,
                epochs=10,
                verbose=verbose,
            )
        else:
            model.fit(X_train_filled, y_train_)

        if name in trained_models:
            trained_models[f'{name}'].append(deepcopy(model))

        # Make predictions: one (n_rows, 1) column of positive-class scores
        if 'ann' in name:
            test_pred = model.predict(X_test_filled).reshape(-1, 1)
            y_val_pred = model.predict(X_val_filled).reshape(-1, 1)
        elif is_booster:
            test_pred = model.predict_proba(X_test)[:, 1].reshape(-1, 1)
            y_val_pred = model.predict_proba(X_val)[:, 1].reshape(-1, 1)
        else:
            test_pred = model.predict_proba(X_test_filled)[:, 1].reshape(-1, 1)
            y_val_pred = model.predict_proba(X_val_filled)[:, 1].reshape(-1, 1)

        end_time = time.time()
        time_taken = end_time - start_time

        # Per-model accuracy is reported at the fixed 0.5 cut-off; only the
        # ensemble below gets a tuned threshold.
        y_val_pred_labels = (y_val_pred > 0.5).astype(int)
        accuracy = accuracy_score(y_val, y_val_pred_labels)

        print(f'{name} [FOLD-{n} SEED-{random_state_list[m]}] '
              f'Accuracy Score: {accuracy:.5f}, '
              f'time taken: {time_taken:.3f} secs')

        oof_preds.append(y_val_pred)
        test_preds.append(test_pred)

    # Optimize ensemble weights on this fold's validation predictions
    optweights = OptunaWeights(random_state=random_state)
    y_val_pred = optweights.fit_predict(y_val, oof_preds)
    oof_predss.loc[X_val.index] = np.array(y_val_pred).reshape(-1, 1)

    # Find optimal threshold for ensemble predictions
    ensemble_threshold = find_best_threshold(y_val, y_val_pred)
    best_thresholds.append(ensemble_threshold)

    # Calculate metrics using optimal threshold
    y_val_pred_labels = (y_val_pred > ensemble_threshold).astype(int)
    accuracy = accuracy_score(y_val, y_val_pred_labels)

    end_time_fold = time.time()
    time_taken = end_time_fold - start_time_fold

    print(f'Ensemble [FOLD-{n} SEED-{random_state_list[m]}] '
          f'-------------------> Accuracy Score: {accuracy:.5f} '
          f'(threshold: {ensemble_threshold:.3f}), '
          f'fold time taken: {time_taken:.5f} secs')

    ensemble_acc_score.append(accuracy)
    weights.append(optweights.weights)

    # Predict test data using ensemble weights; each split contributes an
    # equal share to the running average.
    test_preds = optweights.predict(test_preds)
    test_predss += test_preds / (n_splits * len(random_state_list))

    gc.collect()

# Calculate final predictions using average of best thresholds
# One decision threshold for the test set: the mean of the per-fold optima
final_threshold = np.mean(best_thresholds)
print(f"\nFinal threshold: {final_threshold:.3f}")

# Hard 0/1 labels as a flat vector
final_predictions = (test_predss > final_threshold).astype(int).ravel()

## 7.5 MODEL WEIGHTS

In [None]:
print(f"\nEnsemble CV Accuracy: {np.mean(ensemble_acc_score):.5f} ± {np.std(ensemble_acc_score):.5f}")

# Mean and spread of each model's ensemble weight across the folds.
# `weights` holds one weight list per fold, in the iteration order of `models`.
print('--- Model Weights ---')
mean_weights = np.mean(weights, axis=0)
std_weights = np.std(weights, axis=0)
for name, mean_weight, std_weight in zip(models, mean_weights, std_weights):
    print(f'{name}: {mean_weight:.5f} ± {std_weight:.5f}')

## 7.6 SUBMISSION

In [None]:
# Persist the out-of-fold ensemble predictions
oof_predss.columns = ['final_threshold']
oof_predss.to_csv('oof_predss_v22.csv', index=False)

submission = pd.read_csv('/kaggle/input/playground-series-s4e11/sample_submission.csv')

# First file: the averaged ensemble scores before thresholding
submission[target] = test_predss
submission.to_csv('submission_pred_v22.csv', index=False)

# Second file: the thresholded 0/1 labels
submission[target] = final_predictions
submission.to_csv('submission.csv', index=False)

submission.head()

## 7.7 MODE

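<font size="3">Combine this notebook's predictions with a selection of external submissions by taking the row-wise mode (majority vote), on the assumption that voting across several strong submissions may give a better score.</font>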

Thanks to the authors below for their work:
1) By [sunilkumarmuduli](https://www.kaggle.com/code/sunilkumarmuduli/rank-3-sol-sql-meets-machine-learning)
2) By [jiaoyouzhang](https://www.kaggle.com/code/jiaoyouzhang/mental-health-ensemble-0-94397)
3) By [chinmayadatt](https://www.kaggle.com/code/chinmayadatt/0-94434-ensemble-exploring-mental-health-data)
4) By [swandipsingha](https://www.kaggle.com/code/swandipsingha/mental-health-ps4e11)

In [None]:
# External submissions that take part in the vote
sub_ext1 = pd.read_csv("/kaggle/input/rank-3-sol-sql-meets-machine-learning/submission.csv")
sub_ext2 = pd.read_csv("/kaggle/input/mental-health-ensemble-0-94397/submission.csv")
sub_ext3 = pd.read_csv("/kaggle/input/0-94434-ensemble-exploring-mental-health-data/submission.csv")
sub_ext4 = pd.read_csv("/kaggle/input/mental-health-ps4e11/submission.csv")


In [None]:

def create_mode_submission(submissions_list, submission_template, target_column=target):
    """
    Create a new submission with the row-wise mode of several submissions.

    Rows are matched by DataFrame index (both when the predictions are stacked
    and when the result is written into the template), so all inputs are
    expected to share the template's index and row order.

    Parameters:
        submissions_list (list): List of DataFrames containing predictions
        submission_template (pd.DataFrame): Template submission DataFrame to use for structure
        target_column (str): Name of the target column to calculate mode for

    Returns:
        pd.DataFrame: Copy of the template whose target column holds the mode
                      predictions. When several values tie for the mode in a
                      row, the smallest tied value is used.
    """
    # Create a copy of the template submission
    final_submission = submission_template.copy()

    # Stack all predictions side by side, one column per submission
    all_predictions = pd.concat([df[target_column] for df in submissions_list], axis=1)

    # Row-wise mode. A row with tied values yields several modes, returned in
    # sorted order across extra columns, so column 0 holds the smallest one.
    mode_predictions = all_predictions.mode(axis=1)
    voted = mode_predictions.iloc[:, 0]

    # A tie in any row pads the other rows with NaN and upcasts the whole
    # result to float, which would turn integer labels into 0.0 / 1.0.
    # Restore the voters' dtype when they all agree on one and nothing is missing.
    vote_dtypes = {df[target_column].dtype for df in submissions_list}
    if len(vote_dtypes) == 1 and not voted.isna().any():
        voted = voted.astype(next(iter(vote_dtypes)))

    # Update the target column in the final submission
    final_submission[target_column] = voted

    return final_submission


# Voters: the four external submissions plus this notebook's own labels
submissions = [sub_ext1, sub_ext2, sub_ext3, sub_ext4, submission]
mode_submission = create_mode_submission(submissions, sub_ext1)

mode_submission.to_csv('submission_mode.csv', index=False)