# Load the dataset, keeping only the first 105031 rows for model training

In [1]:
import pandas as pd

# Parse only the first 105031 data rows of the workbook.
df = pd.read_excel("Mental Health DataSet.xlsx", nrows=105031)

# Print the value distribution of every column, counting NaN as its own bucket.
for col, values in df.items():
    print(f"--- {col} ---")
    print(values.value_counts(dropna=False))
    print()


--- Gender ---
Gender
Male      52517
Female    52514
Name: count, dtype: int64

--- Country ---
Country
United States             65310
United Kingdom            16682
Canada                     7159
Australia                  2675
Ireland                    1891
Netherlands                1626
Sweden                     1294
Germany                    1023
India                       945
New Zealand                 776
South Africa                775
Belgium                     520
Poland                      519
France                      513
Brazil                      513
Switzerland                 341
Israel                      340
Italy                       340
Russia                      171
Greece                      170
Singapore                   170
Denmark                     170
Costa Rica                   86
Portugal                     86
Finland                      86
Czech Republic               85
Georgia                      85
Colombia                     85

# Remove NaN values and encode the categorical features: convert Gender, Country, Occupation, and Days_Indoors into numeric representations

In [2]:
# 1. Remove rows with any NaN values
df = df.dropna()

# 2. Map Days_Indoors to an ordinal integer (0 = least time indoors)
order = [
    "Go out Every day",  # least indoor
    "1-14 days",
    "15-30 days",
    "31-60 days",
    "More than 2 months" # most indoor
]
mapping = {cat: i for i, cat in enumerate(order)}
days_indoors = df['Days_Indoors'].map(mapping)

# .map() turns every category that is missing from `mapping` into NaN. This
# step runs after dropna(), so such NaNs would flow straight into the models;
# fail loudly instead of continuing with corrupted features.
unmapped = df.loc[days_indoors.isna(), 'Days_Indoors'].unique()
if len(unmapped) > 0:
    raise ValueError(f"Unexpected Days_Indoors categories: {list(unmapped)}")
df['Days_Indoors'] = days_indoors

# 3. Replace rare countries (fewer than `threshold` remaining rows) with 'Other'
threshold = 1000
country_counts = df['Country'].value_counts()
rare_countries = country_counts[country_counts < threshold].index

df['Country'] = df['Country'].mask(df['Country'].isin(rare_countries), 'Other')
print(df['Country'].value_counts())

# 4. One-hot encode the remaining true categoricals
nominals = [
    'Gender',
    'Country',
    'Occupation',
]

# Only run get_dummies if ALL nominal columns are still in the DataFrame;
# otherwise say which ones are missing rather than skipping silently.
missing_nominals = [col for col in nominals if col not in df.columns]
if not missing_nominals:
    df = pd.get_dummies(df, columns=nominals, drop_first=False, dtype=int)
else:
    print(f"Skipping one-hot encoding; missing columns: {missing_nominals}")

df.to_excel("cleaned_dataset.xlsx", index=False)



Country
United States     63581
United Kingdom    16510
Other              7371
Canada             6901
Australia          2675
Ireland            1891
Netherlands        1626
Sweden             1294
Germany            1023
Name: count, dtype: int64


# Train/test split to prepare for model training

In [6]:
from sklearn.model_selection import train_test_split


# Separate the target column from the feature matrix.
y = df['treatment']
X = df.drop(columns='treatment')

# Single hold-out split: 80% train / 20% test, stratified on the target and
# made reproducible by the fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape:  {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape:  {y_test.shape}")


X_train shape: (82297, 28)
X_test shape:  (20575, 28)
y_train shape: (82297,)
y_test shape:  (20575,)


# Decision Tree code

In [11]:
import numpy as np
from sklearn.metrics import confusion_matrix

class DecisionTreeClassifierScratch:
    """
    A simple Decision Tree classifier implemented from scratch.
    Supports 'entropy' or 'gini' as split criteria.

    Every internal node tests ``x[feature] <= threshold``: samples passing the
    test go to the left child, all others to the right child.

    Parameters
    ----------
    max_depth : int or None
        Depth at which nodes are forced to become leaves; ``None`` disables
        the depth limit.
    min_samples_split : int
        A node with fewer samples than this becomes a leaf. The same value is
        also the minimum number of samples each child of a split must receive.
    criterion : str
        'gini' selects Gini impurity; any other value selects entropy.
    """
    class Node:
        """Tree node: a leaf carries `value`, an internal node carries the split."""
        def __init__(self, feature=None, threshold=None, left=None, right=None, *, value=None):
            self.feature = feature      # column index tested at this node
            self.threshold = threshold  # samples with x[feature] <= threshold go left
            self.left = left
            self.right = right
            self.value = value          # predicted label; set only on leaves

    def __init__(self, max_depth=5, min_samples_split=10, criterion='entropy'):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.criterion = criterion
        self.tree = None  # root Node, populated by fit()

    def _entropy(self, y):
        """Return the Shannon entropy (in bits) of the label array `y`."""
        _, counts = np.unique(y, return_counts=True)
        probs = counts / counts.sum()
        # np.unique only reports labels that actually occur, so every
        # probability is strictly positive and log2 needs no epsilon guard
        # (an epsilon would make the entropy of a pure node slightly negative).
        return -np.sum(probs * np.log2(probs))

    def _gini(self, y):
        """Return the Gini impurity of the label array `y`."""
        _, counts = np.unique(y, return_counts=True)
        probs = counts / counts.sum()
        return 1 - np.sum(probs**2)

    def _impurity(self, y):
        """Return the impurity of `y` under the configured criterion."""
        return self._gini(y) if self.criterion == 'gini' else self._entropy(y)

    # Measures how much better the split separates the classes
    def _information_gain(self, y, y_left, y_right, parent_loss=None):
        """
        Return the impurity decrease obtained by splitting `y` into
        `y_left` and `y_right` (higher is better).

        `parent_loss` may carry the precomputed impurity of `y` so callers
        scoring many candidate splits of the same node do not recompute it.
        """
        if parent_loss is None:
            parent_loss = self._impurity(y)
        n = len(y)
        n_left, n_right = len(y_left), len(y_right)
        child_loss = (n_left/n)*self._impurity(y_left) + (n_right/n)*self._impurity(y_right)
        return parent_loss - child_loss

    # find the best feature and threshold to split the dataset
    def _best_split(self, X, y):
        """
        Search every feature and every unique value of that feature for the
        split with the largest positive gain.

        Returns (feature_index, threshold), or (None, None) when no candidate
        has positive gain and enough samples on both sides.
        """
        best_gain = 0
        best_feat, best_thresh = None, None
        n_samples, n_features = X.shape
        # The parent impurity is the same for every candidate split of this node.
        parent_loss = self._impurity(y)

        for feature_idx in range(n_features):
            column = X[:, feature_idx]
            for t in np.unique(column):
                left_mask = column <= t
                n_left = np.count_nonzero(left_mask)
                n_right = n_samples - n_left
                if n_left < self.min_samples_split or n_right < self.min_samples_split:
                    continue
                gain = self._information_gain(y, y[left_mask], y[~left_mask], parent_loss)
                # Strict comparison: on ties the first candidate found wins.
                if gain > best_gain:
                    best_gain, best_feat, best_thresh = gain, feature_idx, t

        return best_feat, best_thresh

    def _most_common_label(self, y):
        """Return the most frequent label in `y` (smallest label wins ties)."""
        classes, counts = np.unique(y, return_counts=True)
        return classes[np.argmax(counts)]

    def _build_tree(self, X, y, depth=0):
        """Recursively grow the subtree for the samples (X, y) at `depth`."""
        # Stopping conditions
        if (self.max_depth is not None and depth >= self.max_depth) \
           or len(np.unique(y)) == 1 \
           or len(y) < self.min_samples_split:
            leaf_value = self._most_common_label(y)
            return DecisionTreeClassifierScratch.Node(value=leaf_value)

        feat, thresh = self._best_split(X, y)
        if feat is None:
            # No split improves impurity: fall back to a majority-vote leaf.
            return DecisionTreeClassifierScratch.Node(value=self._most_common_label(y))

        left_mask = X[:, feat] <= thresh
        left = self._build_tree(X[left_mask], y[left_mask], depth+1)
        right = self._build_tree(X[~left_mask], y[~left_mask], depth+1)
        return DecisionTreeClassifierScratch.Node(feature=feat, threshold=thresh, left=left, right=right)

    def fit(self, X, y):
        """
        Build the tree using training data.
        X: array-like of shape (n_samples, n_features)
        y: array-like of shape (n_samples,)

        Returns self. Raises ValueError when X is not 2-D, when X and y differ
        in length, or when there are no samples.
        """
        X, y = np.array(X), np.array(y)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if len(X) != len(y):
            raise ValueError("X and y must contain the same number of samples")
        if len(y) == 0:
            raise ValueError("Cannot fit a decision tree on an empty dataset")
        self.tree = self._build_tree(X, y)
        return self

    def _traverse_tree(self, x, node):
        """Walk from `node` down to a leaf for sample `x` and return its label."""
        while node.value is None:
            node = node.left if x[node.feature] <= node.threshold else node.right
        return node.value

    def predict(self, X):
        """
        Predict class labels for samples in X.
        X: array-like of shape (n_samples, n_features)

        Raises RuntimeError when called before fit().
        """
        if self.tree is None:
            raise RuntimeError("This DecisionTreeClassifierScratch is not fitted yet; call fit() first")
        X = np.array(X)
        return np.array([self._traverse_tree(x, self.tree) for x in X])



# Metric calculations

In [8]:
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score
)

# --- Fit the from-scratch Decision Tree on the training split ---
tree = DecisionTreeClassifierScratch(criterion='entropy', max_depth=15, min_samples_split=10)
tree.fit(X_train.values, y_train.values)

# --- Predict on the held-out test split ---
y_pred = tree.predict(X_test.values)

# --- Confusion matrix and scalar metrics, all computed on the test split ---
cm = confusion_matrix(y_test, y_pred)
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# --- Report ---
print("Confusion Matrix:")
print(pd.DataFrame(
    cm,
    index=['Actual 0 (No Treatment)', 'Actual 1 (Treatment)'],
    columns=['Predicted 0', 'Predicted 1'],
))
print(f"\nAccuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1-Score:  {f1:.4f}")



Confusion Matrix:
                         Predicted 0  Predicted 1
Actual 0 (No Treatment)         5829         2917
Actual 1 (Treatment)            1836         9993

Accuracy:  0.7690
Precision: 0.7741
Recall:    0.8448
F1-Score:  0.8079


# Random Forest

In [9]:
from collections import Counter

class DecisionTreeClassifierRF(DecisionTreeClassifierScratch):
    """
    Decision tree variant for random forests: each split search looks at a
    random subset of `max_features` feature columns instead of all of them.
    The subset is drawn from a private RandomState seeded with `random_state`.
    """
    def __init__(self, max_depth=15, min_samples_split=20, criterion='entropy',
                 max_features=5, random_state=42):
        super().__init__(max_depth=max_depth, min_samples_split=min_samples_split, criterion=criterion)
        self.max_features = max_features
        self.random_state = random_state
        self.rng = np.random.RandomState(random_state)

    def _best_split(self, X, y):
        """
        Return (feature_index, threshold) of the highest-gain split among the
        sampled features, or (None, None) if no candidate has positive gain
        with at least `min_samples_split` samples on each side.
        """
        total_features = X.shape[1]

        # Draw the candidate columns; a falsy max_features (or one that is not
        # smaller than the column count) means every column is a candidate.
        if self.max_features and self.max_features < total_features:
            candidates = self.rng.choice(total_features, self.max_features, replace=False)
        else:
            candidates = np.arange(total_features)

        winner = (None, None)
        top_gain = 0
        for col_idx in candidates:
            column = X[:, col_idx]
            for cut in np.unique(column):
                goes_left = column <= cut
                goes_right = ~goes_left
                # Reject splits that leave either side with too few samples.
                if min(goes_left.sum(), goes_right.sum()) < self.min_samples_split:
                    continue
                gain = self._information_gain(y, y[goes_left], y[goes_right])
                if gain > top_gain:
                    top_gain = gain
                    winner = (col_idx, cut)
        return winner

class RandomForestClassifierScratch:
    """
    Random forest built from DecisionTreeClassifierRF trees.

    Each tree is trained on a random sample (drawn with replacement) of half
    the training rows and searches a random subset of features at every split.
    Predictions are the majority vote over all trees.

    Parameters
    ----------
    n_estimators : int
        Number of trees to train.
    max_depth, min_samples_split, criterion
        Passed through unchanged to every tree.
    max_features : int, 'sqrt', 'log2' or other
        Number of features each split may consider; see _get_max_features.
    random_state : int or None
        Seeds the row sampler; tree ``i`` is seeded with ``random_state + i``.
        ``None`` leaves everything unseeded.
    """
    def __init__(self, n_estimators=100, max_depth=None, min_samples_split=20,
                 criterion='entropy', max_features='sqrt', random_state=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.criterion = criterion
        self.max_features = max_features
        self.random_state = random_state
        self.trees = []
        self.rng = np.random.RandomState(random_state)

    def _get_max_features(self, n_features):
        """
        Resolve `max_features` to a concrete feature count.

        Integers (Python or numpy) are used as given; 'sqrt' and 'log2' are
        computed from `n_features` and never drop below 1; anything else
        means "use all features".
        """
        if isinstance(self.max_features, (int, np.integer)):
            return int(self.max_features)
        if self.max_features == 'sqrt':
            return max(1, int(np.sqrt(n_features)))
        if self.max_features == 'log2':
            return max(1, int(np.log2(n_features)))
        return n_features

    def fit(self, X, y):
        """
        Train `n_estimators` trees on (X, y) and return self.

        Any previously trained trees are discarded. Raises ValueError when
        there are no training samples.
        """
        X = np.array(X)
        y = np.array(y)
        n_samples, n_features = X.shape
        if n_samples == 0:
            raise ValueError("Cannot fit a random forest on an empty dataset")

        max_feats = self._get_max_features(n_features)
        # Each tree sees half as many rows as the training set, drawn with
        # replacement; keep at least one row so a tree is never fit on nothing.
        sample_size = max(1, n_samples // 2)

        self.trees = []
        for i in range(self.n_estimators):
            indices = self.rng.choice(n_samples, sample_size, replace=True)
            X_sample, y_sample = X[indices], y[indices]

            tree = DecisionTreeClassifierRF(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                criterion=self.criterion,
                max_features=max_feats,
                # Distinct seed per tree so their feature subsets differ.
                random_state=(self.random_state + i) if self.random_state is not None else None
            )
            tree.fit(X_sample, y_sample)
            self.trees.append(tree)
        return self

    def predict(self, X):
        """
        Return the majority-vote label for every row of X.

        Raises RuntimeError when the forest holds no trained trees, instead of
        silently returning an empty array.
        """
        if not self.trees:
            raise RuntimeError("This RandomForestClassifierScratch is not fitted yet; call fit() first")
        X = np.array(X)
        # Collect predictions from each tree: shape (n_trees, n_samples)
        all_preds = np.array([tree.predict(X) for tree in self.trees])
        # Majority vote per sample (one column of all_preds per sample)
        return np.array([Counter(preds).most_common(1)[0][0] for preds in all_preds.T])

In [12]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# --- Fit the from-scratch Random Forest on the training split ---
rf = RandomForestClassifierScratch(
    n_estimators=100, max_depth=15, min_samples_split=10,
    criterion='entropy', max_features='sqrt', random_state=42,
)
rf.fit(X_train.values, y_train.values)

# --- Predict on the held-out test split ---
y_pred_rf = rf.predict(X_test.values)

# --- Confusion matrix and scalar metrics, all computed on the test split ---
cm_rf = confusion_matrix(y_test, y_pred_rf)
accuracy_rf, precision_rf, recall_rf, f1_rf = (
    metric(y_test, y_pred_rf)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# --- Report ---
print("Random Forest Confusion Matrix:")
print(pd.DataFrame(
    cm_rf,
    columns=['Predicted 0', 'Predicted 1'],
    index=['Actual 0 (No Treatment)', 'Actual 1 (Treatment)'],
))
print(f"\nAccuracy:    {accuracy_rf:.4f}")
print(f"Precision:   {precision_rf:.4f}")
print(f"Recall:      {recall_rf:.4f}")
print(f"F1-Score:    {f1_rf:.4f}")



Random Forest Confusion Matrix:
                         Predicted 0  Predicted 1
Actual 0 (No Treatment)         5737         3009
Actual 1 (Treatment)            1548        10281

Accuracy:    0.7785
Precision:   0.7736
Recall:      0.8691
F1-Score:    0.8186


# AdaBoost

In [13]:
import numpy as np
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# --- Decision Tree Node ---
class TreeNode:
    """
    Node of the weighted decision tree used as the AdaBoost weak learner.

    A fresh node is an unconfigured internal node: no split, no children,
    no prediction, polarity +1.
    """
    def __init__(self):
        # Split description.
        self.feature = self.threshold = None
        self.polarity = 1
        # Child nodes.
        self.left = self.right = None
        # Leaf payload.
        self.is_leaf = False
        self.prediction = None

# --- Decision Tree as a weak learner ---
class DecisionTree:
    """
    Sample-weighted decision tree used as the AdaBoost weak learner.

    Labels must be encoded as -1 / +1. Every split is described by a feature,
    a threshold and a polarity:

    * polarity  1: samples with ``x[feature] <  threshold`` go left, the rest right
    * polarity -1: samples with ``x[feature] >  threshold`` go left, the rest right

    Splits are chosen by minimising the weighted error of the one-level rule
    "left side predicts -1, right side predicts +1".
    """
    def __init__(self, max_depth=2):
        self.max_depth = max_depth
        self.root = None  # root TreeNode, populated by fit()

    def fit(self, X, y, sample_weights):
        """Build the tree from features X, labels y in {-1, +1} and per-sample weights; returns self."""
        X, y, w = np.array(X), np.array(y), np.array(sample_weights)
        self.root = self._build_tree(X, y, w, depth=0)
        return self

    def _make_leaf(self, y, w):
        """Return a leaf predicting the weighted-majority label of (y, w); ties go to -1."""
        leaf = TreeNode()
        leaf.is_leaf = True
        pos_weight = np.sum(w[y == 1])
        neg_weight = np.sum(w[y == -1])
        leaf.prediction = 1 if pos_weight > neg_weight else -1
        return leaf

    def _build_tree(self, X, y, w, depth):
        """Recursively grow the subtree for the samples (X, y, w) at `depth`."""
        # Base cases: max depth reached or pure node
        if depth >= self.max_depth or len(np.unique(y)) == 1 or len(y) <= 1:
            return self._make_leaf(y, w)

        # Find best split
        best_feature, best_threshold, best_polarity, _ = self._find_best_split(X, y, w)

        if best_feature is None:  # No valid split found
            return self._make_leaf(y, w)

        # Split data
        column = X[:, best_feature]
        if best_polarity == 1:
            left_mask = column < best_threshold
            right_mask = column >= best_threshold
        else:
            left_mask = column > best_threshold
            right_mask = column <= best_threshold

        # A split that sends every sample to one child separates nothing.
        # Recursing would re-run the identical split search on the identical
        # data at every remaining level and end in leaves that all carry this
        # node's weighted majority, so return that leaf right away.
        if left_mask.all() or right_mask.all():
            return self._make_leaf(y, w)

        node = TreeNode()
        node.feature = best_feature
        node.threshold = best_threshold
        node.polarity = best_polarity

        # Recursively build subtrees; an empty side falls back to a leaf with
        # this node's weighted majority.
        if np.any(left_mask):
            node.left = self._build_tree(X[left_mask], y[left_mask], w[left_mask], depth + 1)
        else:
            node.left = self._make_leaf(y, w)

        if np.any(right_mask):
            node.right = self._build_tree(X[right_mask], y[right_mask], w[right_mask], depth + 1)
        else:
            node.right = self._make_leaf(y, w)

        return node

    def _find_best_split(self, X, y, w):
        """
        Exhaustively search features, their unique values and both polarities
        for the one-level rule with the smallest weighted error.

        Returns (feature, threshold, polarity, min_error); the first three are
        None when there is nothing to search. The first candidate found wins ties.
        """
        n_samples, n_features = X.shape
        min_error = float('inf')
        best_feature = None
        best_threshold = None
        best_polarity = None

        for feature_i in range(n_features):
            column = X[:, feature_i]
            for t in np.unique(column):
                for polarity in [1, -1]:
                    # Rule: the "left" side of the split predicts -1, the rest +1.
                    goes_left = column < t if polarity == 1 else column > t
                    preds = np.where(goes_left, -1, 1)

                    error = np.sum(w[preds != y])
                    if error < min_error:
                        min_error = error
                        best_polarity = polarity
                        best_threshold = t
                        best_feature = feature_i

        return best_feature, best_threshold, best_polarity, min_error

    def predict(self, X):
        """
        Return the -1 / +1 prediction for every row of X.

        Raises RuntimeError when called before fit().
        """
        if self.root is None:
            raise RuntimeError("This DecisionTree is not fitted yet; call fit() first")
        X = np.array(X)
        return np.array([self._predict_sample(self.root, sample) for sample in X])

    def _predict_sample(self, node, sample):
        """Walk from `node` down to a leaf for one sample and return its prediction."""
        while not node.is_leaf:
            value = sample[node.feature]
            if node.polarity == 1:
                go_left = value < node.threshold
            else:
                go_left = value > node.threshold
            node = node.left if go_left else node.right
        return node.prediction

# --- AdaBoost with Decision Trees ---
class AdaBoostTrees:
    """
    AdaBoost ensemble of sample-weighted DecisionTree weak learners.

    Training labels equal to 1 are the positive class; every other label is
    treated as negative. Predictions are returned as 0 / 1.

    Parameters
    ----------
    n_estimators : int
        Number of boosting rounds (one tree per round).
    max_depth : int
        Maximum depth of every weak-learner tree.
    """
    def __init__(self, n_estimators=100, max_depth=2):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.learners = []  # fitted DecisionTree objects, one per round
        self.alphas = []    # vote weight of each learner

    def fit(self, X, y):
        """
        Run `n_estimators` boosting rounds on (X, y) and return self.

        Learners from any earlier fit() call are discarded. Raises ValueError
        when there are no training samples.
        """
        X, y = np.array(X), np.array(y)
        # convert y to {-1, +1}
        y_signed = np.where(y == 1, 1, -1)
        n_samples = len(y_signed)
        if n_samples == 0:
            raise ValueError("Cannot fit AdaBoost on an empty dataset")
        # initialize sample weights uniformly
        w = np.ones(n_samples) / n_samples

        # Start from a clean slate so refitting does not stack new learners on
        # top of the ones from a previous fit.
        self.learners = []
        self.alphas = []

        for _ in range(self.n_estimators):
            tree = DecisionTree(max_depth=self.max_depth).fit(X, y_signed, w)
            preds = tree.predict(X)

            # weighted error, clipped away from 0 and 1 so alpha stays finite
            err = np.clip(np.sum(w[preds != y_signed]), 1e-10, 1-1e-10)
            alpha = 0.5 * np.log((1 - err) / err)

            # update weights: misclassified samples (y * pred == -1) gain weight
            w *= np.exp(-alpha * y_signed * preds)
            w /= w.sum()

            self.learners.append(tree)
            self.alphas.append(alpha)
        return self

    def predict(self, X):
        """
        Return 0 / 1 predictions for every row of X.

        A sample is labelled 1 only when the alpha-weighted vote is strictly
        positive. Raises RuntimeError when no learners have been trained.
        """
        if not self.learners:
            raise RuntimeError("This AdaBoostTrees is not fitted yet; call fit() first")
        X = np.array(X)
        # weighted sum of tree predictions
        learner_preds = np.array([alpha * learner.predict(X)
                                   for learner, alpha in zip(self.learners, self.alphas)])
        y_signed_pred = np.sign(np.sum(learner_preds, axis=0))
        # map back to {0,1}
        return np.where(y_signed_pred == 1, 1, 0)



In [None]:
# --- Fit AdaBoost (depth-5 weak trees, 100 rounds) on the training split ---
ab = AdaBoostTrees(max_depth=5, n_estimators=100)
ab.fit(X_train.values, y_train.values)

# --- Predict on the held-out test split ---
y_pred_ab = ab.predict(X_test.values)

# --- Evaluate on the test split ---
cm_ab = confusion_matrix(y_test, y_pred_ab)

print("AdaBoost Confusion Matrix:")
print(pd.DataFrame(
    cm_ab,
    columns=['Predicted 0', 'Predicted 1'],
    index=['Actual 0 (No Treatment)', 'Actual 1 (Treatment)'],
))
print(f"\nAccuracy:  {accuracy_score(y_test, y_pred_ab):.4f}")
print(f"Precision: {precision_score(y_test, y_pred_ab):.4f}")
print(f"Recall:    {recall_score(y_test, y_pred_ab):.4f}")
print(f"F1-Score:  {f1_score(y_test, y_pred_ab):.4f}")

AdaBoost Confusion Matrix:
                         Predicted 0  Predicted 1
Actual 0 (No Treatment)         5889         2857
Actual 1 (Treatment)            1447        10382

Accuracy:  0.7908
Precision: 0.7842
Recall:    0.8777
F1-Score:  0.8283
