# Random Forest Classification of Depression and Gender by Vocal Characteristics
In this project, we will train a Random Forest Classifier to determine whether a participant has depression based on speech data from a clinical interview. The dataset contains 107 participants, 63 of whom are male and 44 of whom are female. Of these participants, 30 have depression (17 female, 14 male). By varying the weights and features included in model training, we will demonstrate how different analysis techniques can have a significant impact on the model's classification performance. 

In [1]:
# **** MATH OPERATIONS ****
import numpy as np
import os

# **** DATA MANIPULATION ****
import pandas as pd
from sklearn.metrics import accuracy_score, balanced_accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt

In [4]:
# general functions
# Bar colour for each series label that bar_graph may be asked to draw.
color_dict = {
    'all features': 'mediumpurple',
    'top performing features (depression)': 'darkviolet',
    'top performing features (gender)': 'plum',
    # Original (misspelled) key kept so any caller still using it resolves.
    'top perfoming features (gender)': 'plum',
    're-weighted features': 'indigo',
    # Labels used by the debiasing comparison chart.
    'SBR features': 'indigo',
    'AER features': 'slateblue',
}
def load_features(path, labels_df):
    """Load every per-participant feature CSV in *path* and attach its labels.

    Each ``.csv`` file is read without a header row, so its columns are
    numbered 0..k-1. The participant id is parsed from the file name (the
    token between the first ``_`` and the following ``.``) and stored in a
    ``Participant_ID`` column.

    Args:
        path: Directory containing the feature files.
        labels_df: DataFrame with a ``Participant_ID`` column to merge on.

    Returns:
        One DataFrame holding the rows of all files, inner-joined with
        ``labels_df`` on ``Participant_ID``.

    Raises:
        ValueError: If *path* contains no ``.csv`` files.
    """
    dataframes = []

    # Sorted so the row order of the result does not depend on the filesystem.
    for file in sorted(os.listdir(path)):
        # splitext copes with names that have no extension at all, where
        # file.split('.')[1] would raise IndexError.
        if os.path.splitext(file)[1] != '.csv':
            continue
        df = pd.read_csv(os.path.join(path, file), header=None)
        participant_id = int(file.split("_")[1].split(".")[0])  # Get ID from file name
        df["Participant_ID"] = participant_id
        dataframes.append(df)

    if not dataframes:
        # pd.concat([]) raises ValueError too, but without naming the folder.
        raise ValueError(f"No .csv feature files found in {path!r}")

    # Combine into a single dataframe
    data_df = pd.concat(dataframes)
    return data_df.merge(labels_df, on="Participant_ID")

def analyze_results(test_df, display_results=True, pred_label='Depression', threshold = 0.05):
    """Score row-level predictions at participant level, overall and by gender.

    The ``predictions`` column is averaged per ``Participant_ID`` and a
    participant is assigned class 1 when that mean is at least *threshold*.
    The true label of a participant is the first ``pred_label`` value seen
    for them. Rows with ``Gender == 1`` are treated as male and rows with
    ``Gender == 0`` as female.

    Args:
        test_df: Frame with ``Participant_ID``, ``Gender``, ``predictions``
            and the ``pred_label`` column.
        display_results: When true, print each group's metrics and the EO.
        pred_label: Name of the ground-truth column.
        threshold: Cut-off applied to the per-participant mean prediction.

    Returns:
        ``(all_metrics, male_metrics, female_metrics, eo)`` where the first
        three are the dicts returned by ``calculate_accuracy`` and ``eo`` is
        ``1 - |TPR_male - TPR_female|``.
    """
    per_participant = test_df.groupby("Participant_ID")

    # Mean prediction per participant, turned into a hard 0/1 decision.
    mean_scores = per_participant["predictions"].mean()
    decisions = (mean_scores >= threshold).astype(int)

    # One ground-truth label per participant.
    truth = per_participant[pred_label].first()

    # Participant ids belonging to each gender group.
    gender = test_df["Gender"]
    male_ids = test_df.loc[gender == 1, "Participant_ID"].unique()
    female_ids = test_df.loc[gender == 0, "Participant_ID"].unique()

    overall = calculate_accuracy(
        truth, decisions, "All participants", display_results=display_results
    )
    if display_results:
        print("")

    male = calculate_accuracy(
        truth.loc[male_ids],
        decisions.loc[male_ids],
        "Male participants",
        display_results=display_results,
    )
    if display_results:
        print("")

    female = calculate_accuracy(
        truth.loc[female_ids],
        decisions.loc[female_ids],
        "Female participants",
        display_results=display_results,
    )

    # Equality of Opportunity: 1 minus the gap in true positive rate.
    tpr_gap = abs(male["tpr"] - female["tpr"])
    eo = 1 - tpr_gap
    if display_results:
        print(f"Equality of Opportunity (EO): {eo:.2f}")

    return overall, male, female, eo

def calculate_accuracy(y_true, y_pred, group, display_results=True):
    """Compute accuracy, balanced accuracy and the four confusion-matrix rates.

    Labels are expected to be binary, coded 0 (negative) and 1 (positive).

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        group: Name of the group, used only in the printed header.
        display_results: When true, print every metric to two decimals.

    Returns:
        Dict with keys ``accuracy``, ``balanced_accuracy``, ``tpr``, ``tnr``,
        ``fpr`` and ``fnr``. A rate whose denominator is zero (no positives,
        or no negatives, in ``y_true``) is ``nan``.
    """
    accuracy = accuracy_score(y_true, y_pred)
    balanced_accuracy = balanced_accuracy_score(y_true, y_pred)

    # Pin the label set so the matrix is always 2x2. Without it, a group in
    # which only one class occurs yields a 1x1 matrix and the four-way
    # unpacking below raises ValueError.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    def _rate(numerator, denominator):
        # Undefined rates become nan instead of a divide-by-zero warning.
        return numerator / denominator if denominator else float("nan")

    tpr = _rate(tp, tp + fn)  # True Positive Rate
    tnr = _rate(tn, tn + fp)  # True Negative Rate
    fpr = _rate(fp, fp + tn)  # False Positive Rate
    fnr = _rate(fn, fn + tp)  # False Negative Rate

    # Store metrics
    metrics = {
        "accuracy": accuracy,
        "balanced_accuracy": balanced_accuracy,
        "tpr": tpr,
        "tnr": tnr,
        "fpr": fpr,
        "fnr": fnr
    }
    if display_results:
        print(f"Metrics for {group}:")
        print(f"Accuracy: {accuracy:.2f}")
        print(f"Balanced Accuracy: {balanced_accuracy:.2f}")
        print(f"True Positive Rate (TPR): {tpr:.2f}")
        print(f"True Negative Rate (TNR): {tnr:.2f}")
        print(f"False Positive Rate (FPR): {fpr:.2f}")
        print(f"False Negative Rate (FNR): {fnr:.2f}")

    return metrics


def train_random_forest(df:pd.DataFrame, classification_feat:str,
                        test_size:float=.3, random_state:int=42):
    """Fit a class-balanced random forest that predicts *classification_feat*.

    Every column of ``df`` other than ``classification_feat`` is used as an
    input feature. A ``test_size`` share of the rows is held out and not used
    for fitting, so the model sees the same proportion of the data as before.

    Args:
        df: Feature columns plus the target column.
        classification_feat: Name of the target column.
        test_size: Share of rows excluded from fitting.
        random_state: Seed for both the row split and the forest, so repeated
            calls on the same data return the same model.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    # The split used to be unseeded, which made results differ between runs
    # even though the forest itself was seeded. The held-out part is dropped:
    # its accuracy was previously computed and then never used.
    train_df, _ = train_test_split(df, test_size=test_size, random_state=random_state)
    train_feat = train_df[classification_feat].values.tolist()
    train_arr = train_df.drop(columns=[classification_feat]).to_numpy()

    clf = RandomForestClassifier(random_state=random_state, class_weight='balanced')
    clf.fit(train_arr, train_feat)
    return clf

def bar_graph(vals:dict, measures:list, title:str):
    """Draw a grouped bar chart with one bar series per entry of *vals*.

    Args:
        vals: Maps a series label to its list of values, one per measure.
        measures: Tick labels for the groups along the x-axis.
        title: Chart title.

    A label found in ``color_dict`` gets that colour; any other label falls
    back to matplotlib's default colour cycle instead of raising KeyError.
    """
    x = np.arange(len(measures))  # the label locations
    n_series = max(len(vals), 1)
    # 0.2 as before for up to four series; narrower beyond that so that
    # neighbouring groups never overlap.
    width = min(0.2, 0.8 / n_series)

    fig, ax = plt.subplots(figsize=(12, 7))
    for position, (attribute, measurement) in enumerate(vals.items()):
        rects = ax.bar(x + width * position, measurement, width,
                       label=attribute, color=color_dict.get(attribute))
        ax.bar_label(rects, padding=5)

    # Add some text for labels, title and custom x-axis tick labels, etc.
    ax.set_ylabel('Accuracy')
    ax.set_title(title)
    # Centre each tick under its group whatever the number of series
    # (the fixed "x + width" offset was only centred for exactly three).
    ax.set_xticks(x + width * (n_series - 1) / 2, measures)
    ax.legend(loc='upper left', ncols=4)
    ax.set_ylim(0, 1)
    fig.tight_layout()
    plt.show()

In [3]:
# Read the participant labels, then the train and test feature folders.
labels_df = pd.read_csv("labels.csv")
train_df = load_features("features_train", labels_df)
test_df = load_features("features_test", labels_df)

# Feature names come from the first column of the description file.
feature_table = pd.read_csv(
    'feature_description.csv',
    encoding='ISO-8859-1',
    names=['feature', 'description'],
)
features = feature_table['feature'].values.tolist()

# Map the positional column labels 0..87 to their feature names.
col_names = {position: features[position] for position in range(88)}
train_df = train_df.rename(columns=col_names)
test_df = test_df.rename(columns=col_names)

## Problem (1a) Classification of Gender and Depression
TODO Fill in

### Problem (a.i) Depression Classification
TODO Fill in


In [None]:
# Depression classification on all named feature columns.
# The integer placeholders that stood here broke the later cells, which call
# .items() on these names and index them by metric name.
# NOTE(review): assumes "all features" means the 88 columns named via
# col_names (Gender excluded) -- confirm against the intended design.
all_feature_cols = list(col_names.values())
depression_model = train_random_forest(
    train_df.loc[:, all_feature_cols + ['Depression']], 'Depression'
)

# analyze_results expects the row-level predictions in a 'predictions' column.
test_df_all = test_df.copy()
test_df_all['predictions'] = depression_model.predict(
    test_df.loc[:, all_feature_cols].to_numpy()
)
(all_metrics_depression,
 male_metrics_depression,
 female_metrics_depression,
 eo_depression) = analyze_results(test_df_all)

### Problem (a.ii) Gender Classification
TODO fill in

In [None]:
# have these values at the end
# NOTE(review): placeholder only. Presumably this should end up holding the
# metrics dict of the gender classifier from (a.ii); no visible cell reads it.
all_metrics_gender = 0

### Problem (1b) Depression Feature Selection
In this section, we will repeat the same depression classification as above on only the features with the strongest correlation with depression.

In [None]:
# Rank every column (except the id and the target itself) by the absolute
# value of its correlation with the depression label.
depression_labels = train_df['Depression']
correlation_tups = [
    (col, train_df[col].corr(depression_labels))
    for col in train_df.columns
    if col not in ('Participant_ID', 'Depression')
]
# A NaN correlation (e.g. from a constant column) does not compare
# consistently, which would leave the sort order undefined; rank those last.
correlation_tups.sort(
    key=lambda tup: -1.0 if pd.isna(tup[1]) else abs(tup[1]),
    reverse=True,
)
top_twenty_feats = {name: round(corr, 3) for name, corr in correlation_tups[:20]}
print('Top twenty features correlated with depression: \n')
for key, val in top_twenty_feats.items():
    print(f'{key}: {val}')

Participants with depression more frequently spoke loudly (loudness_sma3) and with more fluctuation (spectralFlux). There is also a slight correlation between vocal range (semitone) and depression. Since the majority of participants with depression were female, gender is correlated with depression as well.

We will now train a Random Forest Classifier to predict depression using only the features most strongly correlated with depression. 

In [None]:
# Train and evaluate one depression model per feature-set size, using the
# features most strongly correlated with depression.
num_feats = range(10, 51, 5)  # sizes to try; shared by the loop and the plot
class_acc_dep_fs, bca_dep_fs = [], []
male_class_acc_dep_fs, male_bca_dep_fs = [], []
female_class_acc_dep_fs, female_bca_dep_fs = [], []
eo_dep_fs = []
for n in num_feats:
    # select top performing features (kept in a local name so the feature
    # name list loaded with the dataset is not overwritten)
    selected_cols = [name for name, _ in correlation_tups[:n]]
    filtered_df = train_df.loc[:, selected_cols + ['Depression']]
    # build random forest on these features
    best_rand_forest = train_random_forest(filtered_df, 'Depression')
    predictions = best_rand_forest.predict(test_df.loc[:, selected_cols].to_numpy())
    # store accuracies
    test_df_pred = test_df.copy()
    test_df_pred['predictions'] = predictions
    all_res, male_res, female_res, eo_res = analyze_results(test_df_pred, display_results=False)
    class_acc_dep_fs.append(all_res['accuracy'])
    bca_dep_fs.append(all_res['balanced_accuracy'])
    male_class_acc_dep_fs.append(male_res['accuracy'])
    male_bca_dep_fs.append(male_res['balanced_accuracy'])
    female_class_acc_dep_fs.append(female_res['accuracy'])
    female_bca_dep_fs.append(female_res['balanced_accuracy'])
    eo_dep_fs.append(eo_res)


# plot results: (values, legend label, colour, line style) per curve
curves = [
    (class_acc_dep_fs, "Classification Accuracy", 'darkviolet', 'solid'),
    (male_class_acc_dep_fs, "Male Classification Accuracy", 'royalblue', 'solid'),
    (female_class_acc_dep_fs, "Female Classification Accuracy", 'violet', 'solid'),
    (bca_dep_fs, "Balanced Classification Accuracy", 'darkviolet', 'dashed'),
    (male_bca_dep_fs, "Male Balanced Classification Accuracy", 'royalblue', 'dashed'),
    (female_bca_dep_fs, "Female Balanced Classification Accuracy", 'violet', 'dashed'),
    (eo_dep_fs, "Equality of Opportunity", 'darkviolet', 'dotted'),
]
fig, ax = plt.subplots(1, 1)
for values, label, color, linestyle in curves:
    ax.plot(num_feats, values, label=label, color=color, linestyle=linestyle)
ax.set_title('Depression Classification Metrics Over Feature Selection')
ax.set_xlabel('Number of Features Analyzed')
ax.set_ylabel('Classification Accuracy')
fig.tight_layout()
plt.legend(loc='upper right', bbox_to_anchor=(1.15, 0.5, 0.5, 0.5))
plt.show()

The above plot shows the depression classification accuracy measures for male and female participants as the number of features included in the model increases. With minimal (10) features included in the model training, the Random Forest Classifier performs much better on female participants than on male participants. As more features (with decreasing correlation) are added, the classification accuracy on male participants increases, while the classification accuracy on female participants decreases. As the number of features increases, the equality of opportunity nears 60% and the overall classification and balanced classification accuracy near 50%.

In [None]:
# Position of the best balanced accuracy overall and within each gender group.
best_ind = bca_dep_fs.index(max(bca_dep_fs))
best_male_ind = male_bca_dep_fs.index(max(male_bca_dep_fs))
best_female_ind = female_bca_dep_fs.index(max(female_bca_dep_fs))
measures = ['Acc', 'BCA', 'Male Acc', 'Male BCA', 'Female Acc', 'Female BCA', 'EO']

# Everything shown on the chart is rounded to two decimals.
all_metrics_depression = {x:round(y, 2) for x,y in all_metrics_depression.items()}
male_metrics = {x:round(y, 2) for x,y in male_metrics_depression.items()}
female_metrics = {x:round(y, 2) for x,y in female_metrics_depression.items()}
class_acc = [round(x, 2) for x in class_acc_dep_fs]
male_class_acc = [round(x, 2) for x in male_class_acc_dep_fs]
female_class_acc = [round(x, 2) for x in female_class_acc_dep_fs]
bca = [round(x, 2) for x in bca_dep_fs]
male_bca = [round(x, 2) for x in male_bca_dep_fs]
female_bca = [round(x, 2) for x in female_bca_dep_fs]
# EO was previously passed unrounded, so its bar label printed a long float.
eo_fs = [round(x, 2) for x in eo_dep_fs]

# NOTE(review): the male and female bars use each group's own best feature
# count, which may differ from best_ind; the 'all features' EO is the
# hard-coded constant .6 -- confirm both are intended.
vals = {
    'all features': [all_metrics_depression['accuracy'], all_metrics_depression['balanced_accuracy'], 
                     male_metrics['accuracy'], male_metrics['balanced_accuracy'], 
                     female_metrics['accuracy'], female_metrics['balanced_accuracy'], .6],
    'top performing features (depression)': [class_acc[best_ind], bca[best_ind], 
                          male_class_acc[best_male_ind], male_bca[best_male_ind],
                          female_class_acc[best_female_ind], female_bca[best_female_ind], eo_fs[best_ind]],     
}
bar_graph(vals, measures, 'Depression Classification Metrics With Varied Feature Selection Methods')

The balanced classification accuracy changes little between the best-performing feature-selection model and the original model. The overall classification accuracy is significantly higher with the original model than with the feature-selection model, but these values would likely converge as the number of features included in the analysis increases. 
The original model performs better at classifying the male participants, with a 9% difference in balanced classification accuracy. However, the feature-selection model performs better at classifying female participants, with a 10% increase in balanced classification accuracy. Furthermore, the Equality of Opportunity score increases by 20% with the feature-selection model. 

### Problem (1c) Gender Feature Selection
TODO Fill in

In [None]:
# fill in

# line graph

### Problem (1d) Mitigating Bias Via Removing Gender-Dependent Features
TODO Fill in 

In [None]:
# fill in

# bar graph comparing to part a

### Problem (1e) Mitigating Bias Via Other Approaches

#### Mitigating Bias via reweighting
Reweight the samples of female and male speakers in inverse proportion to each gender's share of the training data, and additionally apply the 'balanced' class weight in the random forest classifier.

In [None]:
# Sample-based reweighting: weight each row by the inverse of its gender's
# share of the training rows.

samples_per_gender = train_df['Gender'].value_counts()
print(samples_per_gender)

total_samples = train_df.shape[0]
inverse_representation = total_samples / samples_per_gender

# Copy of the training frame with one weight per row.
train_df_weights = train_df.assign(
    weights=train_df['Gender'].map(inverse_representation)
)
reweights = train_df_weights['weights']

In [ ]:
# Random forest fitted with the gender-based sample weights.
# NOTE(review): X_train, y_train_depression and X_test are not defined in the
# visible cells; presumably they are the feature/label arrays of train_df and
# test_df, row for row -- confirm where they come from.
model_sbr = RandomForestClassifier(random_state=42, class_weight='balanced')
model_sbr.fit(X_train, y_train_depression, sample_weight=reweights)

y_pred_sbr = model_sbr.predict(X_test)

# analyze_results groups a frame by Participant_ID and reads its Gender,
# label and 'predictions' columns, so the bare prediction array cannot be
# passed directly. Assumes X_test rows are in test_df order -- TODO confirm.
test_df_sbr = test_df.copy()
test_df_sbr['predictions'] = y_pred_sbr
all_sbr, male_sbr, female_sbr, eo_sbr = analyze_results(test_df_sbr)

#### Mitigating bias via autoencoder
Implement an autoencoder by training a multi-task neural network. During training, compute the losses for gender and depression prediction, and try to decrease the depression loss while increasing the gender loss, so that the output features represent depression well but gender poorly. To achieve this, use the negative of the binary cross-entropy for gender prediction. After training, take the decoded output as the transformed features.

In [ ]:
# Multi-task autoencoder: one shared hidden layer feeding a reconstruction
# head, a gender head and a depression head.

import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2

# 88 inputs -> 44-unit L2-regularised hidden layer.
input_layer = Input(shape=(88,))
encoded = Dense(
    44,
    activation='relu',
    kernel_regularizer=l2(0.01),
)(input_layer)

# Three heads on top of the shared hidden layer.
decoded = Dense(88, activation='sigmoid', name='decoded')(encoded)
gender_output = Dense(1, activation='sigmoid', name='gender')(encoded)
depression_output = Dense(1, name='depression')(encoded)

autoencoder = Model(
    inputs=input_layer,
    outputs=[decoded, gender_output, depression_output],
)


def negative_binary_crossentropy(y_true, y_pred):
    """Binary cross-entropy with its sign flipped."""
    cross_entropy = tf.keras.losses.binary_crossentropy(y_true, y_pred)
    return -cross_entropy


def mean_squared_error(y_true, y_pred):
    """Thin wrapper around the Keras mean squared error."""
    return tf.keras.losses.mean_squared_error(y_true, y_pred)


# One loss per named output head.
losses = dict(
    decoded=mean_squared_error,
    gender=negative_binary_crossentropy,
    depression=mean_squared_error,
)

autoencoder.compile(optimizer='adam', loss=losses)

# Targets are given in the same order as the model outputs.
training_targets = [X_train, y_train_gender, y_train_depression]
autoencoder.fit(X_train, training_targets, epochs=10, batch_size=32)

# Keep only the first output (the 'decoded' head) as the transformed features.
train_outputs = autoencoder.predict(X_train)
X_train_encoded = train_outputs[0]

In [ ]:
# Random forest fitted on the autoencoder-transformed training features.
model_aer = RandomForestClassifier(random_state=42, class_weight='balanced')
model_aer.fit(X_train_encoded, y_train_depression)

# The forest was fitted on the autoencoder's first ('decoded') output, so the
# test features must go through the same transform before prediction;
# predicting on raw X_test mixed two different feature spaces.
X_test_encoded = autoencoder.predict(X_test)[0]
y_pred_aer = model_aer.predict(X_test_encoded)

# analyze_results needs a frame with Participant_ID, Gender, the label and a
# 'predictions' column rather than the bare prediction array.
# Assumes X_test rows are in test_df order -- TODO confirm.
test_df_aer = test_df.copy()
test_df_aer['predictions'] = y_pred_aer
all_aer, male_aer, female_aer, eo_aer = analyze_results(test_df_aer)

In [ ]:
# bar_graph looks each series label up in color_dict; register the two
# debiasing labels so the lookup cannot raise KeyError (colours that are
# already configured are left untouched).
color_dict.setdefault('SBR features', 'indigo')
color_dict.setdefault('AER features', 'slateblue')


def _metric_row(overall, male, female, eo):
    """Return the seven chart values in `measures` order, rounded to 2 dp."""
    row = [
        overall['accuracy'], overall['balanced_accuracy'],
        male['accuracy'], male['balanced_accuracy'],
        female['accuracy'], female['balanced_accuracy'],
        eo,
    ]
    return [round(value, 2) for value in row]


# NOTE(review): the 'all features' EO is the hard-coded constant .6, as in
# the feature-selection chart -- confirm it matches the baseline run.
vals = {
    'all features': _metric_row(all_metrics_depression, male_metrics, female_metrics, .6),
    'SBR features': _metric_row(all_sbr, male_sbr, female_sbr, eo_sbr),
    'AER features': _metric_row(all_aer, male_aer, female_aer, eo_aer),
}
bar_graph(vals, measures, 'Depression Classification Metrics With Debiasing')

According to the graph, mitigating bias reduces the overall accuracy of the model. This might be because the original model is trained on imbalanced data, so it can achieve high accuracy by simply predicting the class that has more samples. Balanced accuracy, in contrast, is increased by these transformations. The accuracy on the male class, which has more samples, decreases as well; this makes sense because the male class accounts for a large share of the overall accuracy. Moreover, the performance of the autoencoder approach depends mostly on how well the autoencoder is trained: in this task the autoencoder struggles to reduce its losses stably, so the final performance varies considerably between runs.  
One interesting observation is that with the autoencoder the EO can reach 1, which suggests it is the better choice for mitigating bias.