In [57]:
# **** MATH OPERATIONS ****
import numpy as np

# **** DATA MANIPULATION ****
import pandas as pd


In [58]:
# Load labels data

labels_df = pd.read_csv("labels.csv")

In [59]:
def load_features(files, path, labels_df):
    dataframes = []
    
    for file in files:
        df = pd.read_csv(f"{path}/{file}", header=None)
        participant_id = int(file.split("_")[1].split(".")[0]) # Get ID from file name
        df["Participant_ID"] = participant_id
        dataframes.append(df)
        
    # Combine into a single dataframe
    data_df = pd.concat(dataframes)
    return data_df.merge(labels_df, on="Participant_ID")

In [106]:
train_files = ["spk_303.csv", "spk_304.csv", "spk_310.csv", "spk_313.csv", "spk_316.csv", "spk_317.csv", "spk_318.csv", "spk_319.csv", "spk_322.csv", "spk_324.csv", 
               "spk_326.csv", "spk_327.csv", "spk_328.csv", "spk_330.csv", "spk_333.csv", "spk_338.csv", "spk_339.csv", "spk_340.csv", "spk_341.csv", "spk_343.csv", 
               "spk_344.csv", "spk_345.csv", "spk_347.csv", "spk_350.csv", "spk_351.csv", "spk_352.csv", "spk_353.csv", "spk_356.csv", "spk_357.csv", "spk_358.csv", 
               "spk_360.csv", "spk_364.csv", "spk_366.csv", "spk_369.csv", "spk_370.csv", "spk_371.csv", "spk_372.csv", "spk_374.csv", "spk_375.csv", "spk_376.csv", 
               "spk_379.csv", "spk_380.csv", "spk_383.csv", "spk_385.csv", "spk_386.csv", "spk_391.csv", "spk_392.csv", "spk_397.csv", "spk_400.csv", "spk_401.csv", 
               "spk_402.csv", "spk_409.csv", "spk_412.csv", "spk_414.csv", "spk_416.csv", "spk_419.csv", "spk_423.csv", "spk_425.csv", "spk_426.csv", "spk_427.csv", 
               "spk_428.csv", "spk_429.csv", "spk_430.csv", "spk_433.csv", "spk_441.csv", "spk_443.csv", "spk_445.csv", "spk_447.csv", "spk_448.csv", "spk_449.csv", 
               "spk_454.csv", "spk_455.csv", "spk_456.csv", "spk_457.csv", "spk_459.csv", "spk_463.csv", "spk_464.csv", "spk_468.csv", "spk_471.csv", "spk_473.csv", 
               "spk_475.csv", "spk_478.csv", "spk_485.csv", "spk_486.csv", "spk_487.csv", "spk_488.csv", "spk_491.csv"]

test_files = ["spk_305.csv", "spk_312.csv", "spk_315.csv", "spk_320.csv", "spk_321.csv", "spk_325.csv", "spk_336.csv", "spk_348.csv", "spk_355.csv", "spk_362.csv", 
              "spk_363.csv", "spk_368.csv", "spk_393.csv", "spk_415.csv", "spk_434.csv", "spk_437.csv", "spk_444.csv", "spk_446.csv", "spk_474.csv", "spk_479.csv", ]

train_df = load_features(train_files,"features_train", labels_df)
test_df = load_features(test_files, "features_test", labels_df)

# Problem (a.i) Depression Classification

In [424]:
from sklearn.preprocessing import StandardScaler

# Handle missing values
train_df.fillna(train_df.mean(), inplace=True)
test_df.fillna(test_df.mean(), inplace=True)

# Separate features and labels
X_train = train_df.drop(["Participant_ID", "Depression", "Gender"], axis=1)
y_train_depression = train_df["Depression"]
y_train_gender = train_df["Gender"]

X_test = test_df.drop(["Participant_ID", "Depression", "Gender"], axis=1)
y_test_depression = test_df["Depression"]
y_test_gender = test_df["Gender"]

In [322]:
from sklearn.ensemble import RandomForestClassifier

# Depression Classifation
model_depression = RandomForestClassifier(random_state=42, class_weight='balanced')
model_depression.fit(X_train, y_train_depression)



In [242]:
# Gender Classification
model_gender = RandomForestClassifier(random_state=42, class_weight='balanced')
model_gender.fit(X_train, y_train_gender)

In [339]:
from sklearn.metrics import accuracy_score, balanced_accuracy_score, confusion_matrix

def calculate_accuracy(y_true, y_pred, gender):
    # Calculate Accuracy
    accuracy = accuracy_score(y_true, y_pred)
    
    # Calculate Balanced Accuracy
    balanced_accuracy = balanced_accuracy_score(y_true, y_pred)
    
    # Extract TP, FP, TN, FN
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    
    tpr = tp / (tp + fn) # True Positive Rate
    tnr = tn / (tn + fp) # True Negative Rate
    fpr = fp / (fp + tn) # False Positive Rate
    fnr = fn / (fn + tp) # False Negative Rate
    
    # Store metrics
    metrics = {
        "accuracy": accuracy,
        "balanced_accuracy": balanced_accuracy,
        "tpr": tpr,
        "tnr": tnr,
        "fpr": fpr,
        "fnr": fnr
    }
    
    print(f"Metrics for {gender}:")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Balanced Accuracy: {balanced_accuracy:.2f}")
    print(f"True Positive Rate (TPR): {tpr:.2f}")
    print(f"True Negative Rate (TNR): {tnr:.2f}")
    print(f"False Positive Rate (FPR): {fpr:.2f}")
    print(f"False Negative Rate (FNR): {fnr:.2f}")
    
    return metrics
    
    

In [423]:
from sklearn.metrics import accuracy_score, balanced_accuracy_score

# Evaluate Depression Classification
y_pred_depression = model_depression.predict(X_test)


In [384]:
test_df_depression = test_df.copy()
test_df_depression["predictions"] = y_pred_depression

In [399]:
# Group by participant and average predictions 
participant_predictions_depression = test_df_depression.groupby("Participant_ID")["predictions"].mean()

# Binarize the predictions from 0.5 threshold
threshold = 0.05
participant_predictions_depression_binarized = (participant_predictions_depression >= threshold).astype(int)

# Join aggregated predictions back with the depression labels
participant_labels_depression = test_df_depression.groupby("Participant_ID")["Depression"].first()

In [416]:
# Filter data by gender
male_participants = test_df_depression[test_df_depression["Gender"] == 1]["Participant_ID"].unique()
female_participants = test_df_depression[test_df_depression["Gender"] == 0]["Participant_ID"].unique()

In [417]:
# Calculate accuracies for all, male, and female participants
all_metrics_depression = calculate_accuracy(participant_labels_depression, participant_predictions_depression_binarized, "All participants")
print("")
male_metrics = calculate_accuracy(participant_labels_depression.loc[male_participants], participant_predictions_depression_binarized.loc[male_participants], "Male participants")
print("")
female_metrics = calculate_accuracy(participant_labels_depression.loc[female_participants], participant_predictions_depression_binarized.loc[female_participants], "Female participants")

Metrics for All participants:
Accuracy: 0.55
Balanced Accuracy: 0.58
True Positive Rate (TPR): 0.67
True Negative Rate (TNR): 0.50
False Positive Rate (FPR): 0.50
False Negative Rate (FNR): 0.33

Metrics for Male participants:
Accuracy: 0.58
Balanced Accuracy: 0.77
True Positive Rate (TPR): 1.00
True Negative Rate (TNR): 0.55
False Positive Rate (FPR): 0.45
False Negative Rate (FNR): 0.00

Metrics for Female participants:
Accuracy: 0.50
Balanced Accuracy: 0.47
True Positive Rate (TPR): 0.60
True Negative Rate (TNR): 0.33
False Positive Rate (FPR): 0.67
False Negative Rate (FNR): 0.40


In [418]:
# Calculate EO
eo = (1 - abs(male_metrics["tpr"] - female_metrics["tpr"]))

print(f"Equality of Opportunity (EO): {eo:.2f}")

Equality of Opportunity (EO): 0.60


# Problem (a.ii) Gender Classification

In [419]:
# Evaluate Gender Classification
y_pred_gender = model_gender.predict(X_test)

In [420]:
test_df_gender = test_df.copy()
test_df_gender["predictions"] = y_pred_gender

In [421]:
# Group by participant and average predictions 
participant_predictions_gender = test_df_gender.groupby("Participant_ID")["predictions"].mean()

# Binarize the predictions from 0.5 threshold
threshold = 0.15
participant_predictions_gender_binarized = (participant_predictions_gender >= threshold).astype(int)

# Join aggregated predictions back with the depression labels
participant_labels_gender = test_df_gender.groupby("Participant_ID")["Gender"].first()

In [422]:
# Calculate accuracies for all, male, and female participants
all_metrics_gender = calculate_accuracy(participant_labels_gender, participant_predictions_gender_binarized, "All participants")


Metrics for All participants:
Accuracy: 0.90
Balanced Accuracy: 0.88
True Positive Rate (TPR): 1.00
True Negative Rate (TNR): 0.75
False Positive Rate (FPR): 0.25
False Negative Rate (FNR): 0.00
