In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE, SMOTENC
import joblib


In [2]:
# Insert this cell before using df
import pandas as pd
df = pd.read_csv('student_mental_health.csv')   # or the correct filename/path
df.columns = df.columns.str.strip()
df.head()

Unnamed: 0,1. Age,2. Gender,3. University,4. Department,5. Academic Year,6. Current CGPA,7. Did you receive a waiver or scholarship at your university?,"1. In a semester, how often you felt nervous, anxious or on edge due to academic pressure?","2. In a semester, how often have you been unable to stop worrying about your academic affairs?","3. In a semester, how often have you had trouble relaxing due to academic pressure?",...,"2. In a semester, how often have you been feeling down, depressed or hopeless?","3. In a semester, how often have you had trouble falling or staying asleep, or sleeping too much?","4. In a semester, how often have you been feeling tired or having little energy?","5. In a semester, how often have you had poor appetite or overeating?","6. In a semester, how often have you been feeling bad about yourself - or that you are a failure or have let yourself or your family down?","7. In a semester, how often have you been having trouble concentrating on things, such as reading the books or watching television?","8. In a semester, how often have you moved or spoke too slowly for other people to notice? Or you've been moving a lot more than usual because you've been restless?","9. In a semester, how often have you had thoughts that you would be better off dead, or of hurting yourself?",Depression Value,Depression Label
0,18-22,Female,"Independent University, Bangladesh (IUB)",Engineering - CS / CSE / CSC / Similar to CS,Fourth Year or Equivalent,2.50 - 2.99,No,1,1,1,...,2,1,1,2,1,1,1,1,11,Moderate Depression
1,18-22,Male,"Independent University, Bangladesh (IUB)",Engineering - CS / CSE / CSC / Similar to CS,First Year or Equivalent,3.80 - 4.00,No,2,2,1,...,1,1,1,1,1,1,1,1,9,Mild Depression
2,18-22,Male,"Independent University, Bangladesh (IUB)",Engineering - CS / CSE / CSC / Similar to CS,First Year or Equivalent,3.00 - 3.39,No,2,1,1,...,0,2,3,2,2,2,2,1,16,Moderately Severe Depression
3,18-22,Male,"Independent University, Bangladesh (IUB)",Engineering - CS / CSE / CSC / Similar to CS,First Year or Equivalent,3.40 - 3.79,No,2,1,1,...,1,1,1,1,1,1,1,1,9,Mild Depression
4,18-22,Male,"Independent University, Bangladesh (IUB)",Engineering - CS / CSE / CSC / Similar to CS,First Year or Equivalent,3.40 - 3.79,No,1,1,1,...,1,1,1,1,1,1,1,1,9,Mild Depression


In [3]:
X = df.drop(columns=["Depression Label"])   # replace target_column_name
y = df["Depression Label"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [4]:
import sklearn
from packaging import version
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

num_cols = X.select_dtypes(include=["int64","float64"]).columns.tolist()
cat_cols = X.select_dtypes(include=["object","category"]).columns.tolist()

num_pipe = Pipeline([("impute", SimpleImputer(strategy="median"))])

# Create OneHotEncoder in a version-safe way
ohe_kwargs = {"handle_unknown": "ignore"}
if version.parse(sklearn.__version__) >= version.parse("1.2"):
    ohe_kwargs["sparse_output"] = False
else:
    ohe_kwargs["sparse"] = False

cat_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(**ohe_kwargs))
])

preprocessor_onehot = ColumnTransformer(
    [("num", num_pipe, num_cols),
     ("cat", cat_pipe, cat_cols)],
    sparse_threshold=0  # force dense output
)

In [5]:
import sklearn
print(sklearn.__version__)

1.7.2


In [6]:
X_pre = preprocessor_onehot.fit_transform(X)
print(type(X_pre), getattr(X_pre, "shape", None))

<class 'numpy.ndarray'> (1977, 84)


In [9]:
from collections import Counter
# Quick diagnostic: show class distribution before resampling
print('Before resampling:', Counter(y_train))

# Define SMOTE safely (avoid using np.bincount on non-numeric labels)
smote = SMOTE(sampling_strategy=0.5, random_state=42)  # minority -> 50% of majority

# Build pipeline using the pre-defined smote instance
pipeline = ImbPipeline([
    ("preprocess", preprocessor_onehot),
    ("smote", smote),
    ("clf", RandomForestClassifier(n_estimators=200, random_state=42))
])

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


Before resampling: Counter({'Moderately Severe Depression': 396, 'Severe Depression': 390, 'Moderate Depression': 359, 'Mild Depression': 326, 'Minimal Depression': 75, 'No Depression': 35})


ValueError: "sampling_strategy" can be a float only when the type of target is binary. For multi-class, use a dict.

In [8]:
from collections import Counter
print('Class distribution (train):', Counter(y_train))
# If you need a custom sampling mapping, build it from Counter keys (labels):
# counts = Counter(y_train)
# majority = max(counts.values())
# sampling_map = {label: majority for label in counts}  # example mapping — adapt as needed
# smote = SMOTE(sampling_strategy=sampling_map, random_state=42)
# Otherwise use a safe scalar like 0.5 to avoid label-indexing issues
# smote = SMOTE(sampling_strategy=0.5, random_state=42)


Class distribution (train): Counter({'Moderately Severe Depression': 396, 'Severe Depression': 390, 'Moderate Depression': 359, 'Mild Depression': 326, 'Minimal Depression': 75, 'No Depression': 35})
