### Mounting data to the working directory
Upload your data folder into google drive. Once done run the cell below. Now if you open the `files` tab on the left, you will see that a new folder `drive` is mounted. You will be able find your data in the folder.

In [7]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


## Task: 13 labels for each sample that we need to predict from the embedding


- multi-label classification (NOT multi-class) --> things can have 2 labels
- Can train one model per label (binary)

In [3]:
import pickle
from imblearn.over_sampling import SMOTE
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, roc_auc_score, average_precision_score
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
with open('/content/drive/MyDrive/SPH6004_project/data/X_train.pkl', 'rb') as f:
    train_embeddings = pickle.load(f)

with open('/content/drive/MyDrive/SPH6004_project/data/y_train.pkl', 'rb') as f:
    train_labels = pickle.load(f)

with open('/content/drive/MyDrive/SPH6004_project/data/X_valid.pkl', 'rb') as f:
    valid_embeddings = pickle.load(f)

with open('/content/drive/MyDrive/SPH6004_project/data/y_valid.pkl', 'rb') as f:
    valid_labels = pickle.load(f)

with open('/content/drive/MyDrive/SPH6004_project/data/X_test.pkl', 'rb') as f:
    test_embeddings = pickle.load(f)

with open('/content/drive/MyDrive/SPH6004_project/data/y_test.pkl', 'rb') as f:
    test_labels = pickle.load(f)

In [None]:
train_embeddings = np.array(train_embeddings)
train_labels = np.array(train_labels)
valid_embeddings = np.array(valid_embeddings)
valid_labels = np.array(valid_labels)

train_embeddings.shape

(36502, 1376)

In [None]:
train_labels.shape

(36502, 13)

In [None]:
valid_embeddings.shape

(4563, 1376)

In [None]:
valid_labels.shape

(4563, 13)

In [None]:
test_embeddings.shape

(4562, 1376)

In [None]:
test_labels.shape

(4562, 13)

In [None]:
#check class balance in the training data
for label_idx in range(train_labels.shape[1]):
    total_samples = train_labels.shape[0]

    # Identify NaN values
    nan_mask = np.isnan(train_labels[:, label_idx])
    nan_count = np.sum(nan_mask)

    # Filter valid (non-NaN) labels
    valid_idx = ~nan_mask
    y_train = train_labels[valid_idx, label_idx].astype(int)

    # Count occurrences of each class (0 and 1)
    class_counts = np.bincount(y_train, minlength=2)  # Ensure [0, 1] bins exist

    print(f"Label {label_idx}:")
    print(f"  Class 0: {class_counts[0]} samples ({class_counts[0] / total_samples:.2%})")
    print(f"  Class 1: {class_counts[1]} samples ({class_counts[1] / total_samples:.2%})")
    print(f"  NaN: {nan_count} samples ({nan_count / total_samples:.2%})\n")

Label 0:
  Class 0: 24778 samples (67.88%)
  Class 1: 3222 samples (8.83%)
  NaN: 8502 samples (23.29%)

Label 1:
  Class 0: 25852 samples (70.82%)
  Class 1: 2982 samples (8.17%)
  NaN: 7668 samples (21.01%)

Label 2:
  Class 0: 25851 samples (70.82%)
  Class 1: 650 samples (1.78%)
  NaN: 10001 samples (27.40%)

Label 3:
  Class 0: 26884 samples (73.65%)
  Class 1: 928 samples (2.54%)
  NaN: 8690 samples (23.81%)

Label 4:
  Class 0: 25213 samples (69.07%)
  Class 1: 553 samples (1.51%)
  NaN: 10736 samples (29.41%)

Label 5:
  Class 0: 24784 samples (67.90%)
  Class 1: 788 samples (2.16%)
  NaN: 10930 samples (29.94%)

Label 6:
  Class 0: 24834 samples (68.03%)
  Class 1: 1050 samples (2.88%)
  NaN: 10618 samples (29.09%)

Label 7:
  Class 0: 24953 samples (68.36%)
  Class 1: 4306 samples (11.80%)
  NaN: 7243 samples (19.84%)

Label 8:
  Class 0: 26732 samples (73.23%)
  Class 1: 2971 samples (8.14%)
  NaN: 6799 samples (18.63%)

Label 9:
  Class 0: 24709 samples (67.69%)
  Class 1: 

In [None]:
#ignore the NA values and output class balance
for label_idx in range(train_labels.shape[1]):
    valid_idx = ~np.isnan(train_labels[:, label_idx])
    y_train = train_labels[valid_idx, label_idx].astype(int)

    class_counts = np.bincount(y_train)
    total_samples = len(y_train)

    print(f"Label {label_idx}:")
    for cls, count in enumerate(class_counts):
        print(f"  Class {cls}: {count} samples ({count/total_samples:.2%})")
    print()

Label 0:
  Class 0: 24778 samples (88.49%)
  Class 1: 3222 samples (11.51%)

Label 1:
  Class 0: 25852 samples (89.66%)
  Class 1: 2982 samples (10.34%)

Label 2:
  Class 0: 25851 samples (97.55%)
  Class 1: 650 samples (2.45%)

Label 3:
  Class 0: 26884 samples (96.66%)
  Class 1: 928 samples (3.34%)

Label 4:
  Class 0: 25213 samples (97.85%)
  Class 1: 553 samples (2.15%)

Label 5:
  Class 0: 24784 samples (96.92%)
  Class 1: 788 samples (3.08%)

Label 6:
  Class 0: 24834 samples (95.94%)
  Class 1: 1050 samples (4.06%)

Label 7:
  Class 0: 24953 samples (85.28%)
  Class 1: 4306 samples (14.72%)

Label 8:
  Class 0: 26732 samples (90.00%)
  Class 1: 2971 samples (10.00%)

Label 9:
  Class 0: 24709 samples (98.47%)
  Class 1: 384 samples (1.53%)

Label 10:
  Class 0: 26674 samples (93.86%)
  Class 1: 1745 samples (6.14%)

Label 11:
  Class 0: 26898 samples (98.34%)
  Class 1: 455 samples (1.66%)

Label 12:
  Class 0: 24940 samples (94.07%)
  Class 1: 1572 samples (5.93%)



class imbalance, better to use average precision for training?, and smote can help too probably

In [None]:
#get number of labels
num_labels = train_labels.shape[1]

In [None]:
# Train one model per label, each model for each label is a binary task
models = {}
best_params_per_label = {}

# Define param grid
param_list = [
    {"learning_rate": 0.01, "max_depth": 3, "n_estimators": 50},
    {"learning_rate": 0.01, "max_depth": 3, "n_estimators": 75},
    {"learning_rate": 0.01, "max_depth": 3, "n_estimators": 100},
    {"learning_rate": 0.01, "max_depth": 5, "n_estimators": 50},
    {"learning_rate": 0.01, "max_depth": 5, "n_estimators": 75},
    {"learning_rate": 0.01, "max_depth": 5, "n_estimators": 100},
    {"learning_rate": 0.01, "max_depth": 7, "n_estimators": 50},
    {"learning_rate": 0.01, "max_depth": 7, "n_estimators": 75},
    {"learning_rate": 0.01, "max_depth": 7, "n_estimators": 100},
    {"learning_rate": 0.01, "max_depth": -1, "n_estimators": 50},
    {"learning_rate": 0.01, "max_depth": -1, "n_estimators": 75},
    {"learning_rate": 0.01, "max_depth": -1, "n_estimators": 100},
    {"learning_rate": 0.01, "max_depth": 3, "n_estimators": 50},
    {"learning_rate": 0.1, "max_depth": 3, "n_estimators": 75},
    {"learning_rate": 0.1, "max_depth": 3, "n_estimators": 100},
    {"learning_rate": 0.1, "max_depth": 5, "n_estimators": 50},
    {"learning_rate": 0.1, "max_depth": 5, "n_estimators": 75},
    {"learning_rate": 0.1, "max_depth": 5, "n_estimators": 100},
    {"learning_rate": 0.1, "max_depth": 7, "n_estimators": 50},
    {"learning_rate": 0.1, "max_depth": 7, "n_estimators": 75},
    {"learning_rate": 0.1, "max_depth": 7, "n_estimators": 100},
    {"learning_rate": 0.1, "max_depth": -1, "n_estimators": 50},
    {"learning_rate": 0.1, "max_depth": -1, "n_estimators": 75},
    {"learning_rate": 0.1, "max_depth": -1, "n_estimators": 100},

]

for label_idx in range(num_labels):
    print(f"\nTraining model for label {label_idx + 1}/{num_labels}")

    # Filter for non-NA samples for this label in train and validation sets
    train_idx = ~np.isnan(train_labels[:, label_idx])
    val_idx = ~np.isnan(valid_labels[:, label_idx])

    x_train = train_embeddings[train_idx]
    y_train = train_labels[train_idx, label_idx].astype(int)

    x_val = valid_embeddings[val_idx]
    y_val = valid_labels[val_idx, label_idx].astype(int)

    # Check class distribution before SMOTE
    class_0, class_1 = np.bincount(y_train)
    print(f"  Before SMOTE - Class 0: {class_0}, Class 1: {class_1}")

    smote = SMOTE(sampling_strategy='auto', random_state=42)
    x_train, y_train = smote.fit_resample(x_train, y_train)

    # Check class distribution after SMOTE
    class_0_res, class_1_res = np.bincount(y_train)
    print(f"  After SMOTE  - Class 0: {class_0_res}, Class 1: {class_1_res}")

    best_pr_auc = 0
    best_model = None
    best_params = None

    for params in param_list:
        model = LGBMClassifier(objective="binary", boosting_type="gbdt", verbose=-1, **params)
        model.fit(x_train, y_train)

        val_probs = model.predict_proba(x_val)[:, 1]
        pr_auc = average_precision_score(y_val, val_probs)
        #auc = roc_auc_score(y_val, val_probs)

        print(f"  Params: {params}, Validation PR-AUC: {pr_auc:.4f}")

        if pr_auc > best_pr_auc:
            best_pr_auc = pr_auc
            best_model = model
            best_params = params

    print(f"Best Params for label {label_idx}: {best_params}, PR-AUC: {best_pr_auc:.4f}")
    models[label_idx] = best_model
    best_params_per_label[label_idx] = best_params


Training model for label 1/13
  Before SMOTE - Class 0: 24778, Class 1: 3222
  After SMOTE  - Class 0: 24778, Class 1: 24778
  Params: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 50}, Validation PR-AUC: 0.5145
  Params: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 75}, Validation PR-AUC: 0.5165
  Params: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100}, Validation PR-AUC: 0.5260
  Params: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 50}, Validation PR-AUC: 0.5114
  Params: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 75}, Validation PR-AUC: 0.5261
  Params: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 100}, Validation PR-AUC: 0.5319
  Params: {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 50}, Validation PR-AUC: 0.5171
  Params: {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 75}, Validation PR-AUC: 0.5288
  Params: {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 100}, Validation PR-AUC: 0.532

In [None]:
best_params_per_label

{0: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100},
 1: {'learning_rate': 0.1, 'max_depth': -1, 'n_estimators': 50},
 2: {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 100},
 3: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 50},
 4: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 75},
 5: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100},
 6: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 50},
 7: {'learning_rate': 0.01, 'max_depth': -1, 'n_estimators': 100},
 8: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 75},
 9: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 50},
 10: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 75},
 11: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100},
 12: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 75}}

In [None]:
 #initialise to have predictions for each label for each test
 predictions = np.zeros((test_embeddings.shape[0], num_labels))

In [None]:
for label_idx, model in models.items():
    print(f"Predicting for label {label_idx + 1}/{num_labels}")

    # Predict probabilities
    pred_probs = model.predict(test_embeddings)
    predictions[:, label_idx] = pred_probs

Predicting for label 1/13
Predicting for label 2/13
Predicting for label 3/13
Predicting for label 4/13
Predicting for label 5/13
Predicting for label 6/13
Predicting for label 7/13
Predicting for label 8/13
Predicting for label 9/13
Predicting for label 10/13
Predicting for label 11/13
Predicting for label 12/13
Predicting for label 13/13


In [None]:
# Convert predicted probabilities to binary (threshold = 0.5)
binary_predictions = (predictions > 0.5).astype(int)
masked_test_labels = np.where(np.isnan(test_labels), -1, test_labels)

# Compute classification report
for label_idx in range(num_labels):
    valid_idx = ~np.isnan(test_labels[:, label_idx])  # Get valid (non-NaN) indices
    y_true = test_labels[valid_idx, label_idx]
    y_pred = binary_predictions[valid_idx, label_idx]

    if len(y_true) > 0:  # Ensure we have valid samples
        print(f"\nLabel {label_idx} Metrics:")
        print(classification_report(y_true, y_pred, zero_division=0))
    else:
        print(f"\nLabel {label_idx}: No valid samples for evaluation.")


Label 0 Metrics:
              precision    recall  f1-score   support

         0.0       0.96      0.85      0.90      3046
         1.0       0.42      0.76      0.54       446

    accuracy                           0.84      3492
   macro avg       0.69      0.80      0.72      3492
weighted avg       0.89      0.84      0.86      3492


Label 1 Metrics:
              precision    recall  f1-score   support

         0.0       0.97      0.87      0.92      3202
         1.0       0.43      0.76      0.54       400

    accuracy                           0.86      3602
   macro avg       0.70      0.81      0.73      3602
weighted avg       0.91      0.86      0.88      3602


Label 2 Metrics:
              precision    recall  f1-score   support

         0.0       0.99      0.91      0.95      3167
         1.0       0.16      0.75      0.26        75

    accuracy                           0.90      3242
   macro avg       0.58      0.83      0.61      3242
weighted avg       0

In [None]:
pr_auc_scores = []

for label_idx in range(num_labels):
    valid_idx = ~np.isnan(test_labels[:, label_idx])

    if valid_idx.sum() > 0:  # Ensure there are valid labels
        pr_auc = average_precision_score(
            test_labels[valid_idx, label_idx],
            predictions[valid_idx, label_idx]
        )
        pr_auc_scores.append(pr_auc)
        print(f"Label {label_idx}: PR-AUC = {pr_auc:.4f}")

Label 0: PR-AUC = 0.3531
Label 1: PR-AUC = 0.3493
Label 2: PR-AUC = 0.1246
Label 3: PR-AUC = 0.2642
Label 4: PR-AUC = 0.0576
Label 5: PR-AUC = 0.0515
Label 6: PR-AUC = 0.1509
Label 7: PR-AUC = 0.3682
Label 8: PR-AUC = 0.4306
Label 9: PR-AUC = 0.0956
Label 10: PR-AUC = 0.1727
Label 11: PR-AUC = 0.0865
Label 12: PR-AUC = 0.2555


**without SMOTE**

In [None]:
#get number of labels
num_labels = train_labels.shape[1]

In [None]:
# Compare to no SMOTE
# Train one model per label, each model for each label is a binary task
models = {}
best_params_per_label = {}

# Define param grid
param_list = [
    {"learning_rate": 0.01, "max_depth": 3, "n_estimators": 50},
    {"learning_rate": 0.01, "max_depth": 3, "n_estimators": 75},
    {"learning_rate": 0.01, "max_depth": 3, "n_estimators": 100},
    {"learning_rate": 0.01, "max_depth": 5, "n_estimators": 50},
    {"learning_rate": 0.01, "max_depth": 5, "n_estimators": 75},
    {"learning_rate": 0.01, "max_depth": 5, "n_estimators": 100},
    {"learning_rate": 0.01, "max_depth": 7, "n_estimators": 50},
    {"learning_rate": 0.01, "max_depth": 7, "n_estimators": 75},
    {"learning_rate": 0.01, "max_depth": 7, "n_estimators": 100},
    {"learning_rate": 0.01, "max_depth": -1, "n_estimators": 50},
    {"learning_rate": 0.01, "max_depth": -1, "n_estimators": 75},
    {"learning_rate": 0.01, "max_depth": -1, "n_estimators": 100},
    {"learning_rate": 0.01, "max_depth": 3, "n_estimators": 50},
    {"learning_rate": 0.1, "max_depth": 3, "n_estimators": 75},
    {"learning_rate": 0.1, "max_depth": 3, "n_estimators": 100},
    {"learning_rate": 0.1, "max_depth": 5, "n_estimators": 50},
    {"learning_rate": 0.1, "max_depth": 5, "n_estimators": 75},
    {"learning_rate": 0.1, "max_depth": 5, "n_estimators": 100},
    {"learning_rate": 0.1, "max_depth": 7, "n_estimators": 50},
    {"learning_rate": 0.1, "max_depth": 7, "n_estimators": 75},
    {"learning_rate": 0.1, "max_depth": 7, "n_estimators": 100},
    {"learning_rate": 0.1, "max_depth": -1, "n_estimators": 50},
    {"learning_rate": 0.1, "max_depth": -1, "n_estimators": 75},
    {"learning_rate": 0.1, "max_depth": -1, "n_estimators": 100},

]

for label_idx in range(num_labels):
    print(f"\nTraining model for label {label_idx + 1}/{num_labels}")

    # Filter for non-NA samples for this label in train and validation sets
    train_idx = ~np.isnan(train_labels[:, label_idx])
    val_idx = ~np.isnan(valid_labels[:, label_idx])

    x_train = train_embeddings[train_idx]
    y_train = train_labels[train_idx, label_idx].astype(int)

    x_val = valid_embeddings[val_idx]
    y_val = valid_labels[val_idx, label_idx].astype(int)

    best_pr_auc = 0
    best_model = None
    best_params = None

    for params in param_list:
        model = LGBMClassifier(objective="binary", boosting_type="gbdt", verbose=-1, **params)
        model.fit(x_train, y_train)

        val_probs = model.predict_proba(x_val)[:, 1]
        pr_auc = average_precision_score(y_val, val_probs)
        #auc = roc_auc_score(y_val, val_probs)

        print(f"  Params: {params}, Validation PR-AUC: {pr_auc:.4f}")

        if pr_auc > best_pr_auc:
            best_pr_auc = pr_auc
            best_model = model
            best_params = params

    print(f"Best Params for label {label_idx}: {best_params}, PR-AUC: {best_pr_auc:.4f}")
    models[label_idx] = best_model
    best_params_per_label[label_idx] = best_params


Training model for label 1/13
  Params: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 50}, Validation PR-AUC: 0.5224
  Params: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 75}, Validation PR-AUC: 0.5270
  Params: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100}, Validation PR-AUC: 0.5297
  Params: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 50}, Validation PR-AUC: 0.5163
  Params: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 75}, Validation PR-AUC: 0.5235
  Params: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 100}, Validation PR-AUC: 0.5264
  Params: {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 50}, Validation PR-AUC: 0.5147
  Params: {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 75}, Validation PR-AUC: 0.5212
  Params: {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 100}, Validation PR-AUC: 0.5262
  Params: {'learning_rate': 0.01, 'max_depth': -1, 'n_estimators': 50}, Validation PR-AUC: 0.

In [None]:
best_params_per_label

{0: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100},
 1: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 100},
 2: {'learning_rate': 0.01, 'max_depth': -1, 'n_estimators': 50},
 3: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 100},
 4: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 50},
 5: {'learning_rate': 0.01, 'max_depth': -1, 'n_estimators': 100},
 6: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 50},
 7: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 75},
 8: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 75},
 9: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 75},
 10: {'learning_rate': 0.01, 'max_depth': -1, 'n_estimators': 100},
 11: {'learning_rate': 0.01, 'max_depth': -1, 'n_estimators': 50},
 12: {'learning_rate': 0.1, 'max_depth': -1, 'n_estimators': 50}}

In [None]:
 #initialise to have predictions for each label for each test
 predictions = np.zeros((test_embeddings.shape[0], num_labels))

In [None]:
for label_idx, model in models.items():
    print(f"Predicting for label {label_idx + 1}/{num_labels}")

    # Predict probabilities
    pred_probs = model.predict(test_embeddings)
    predictions[:, label_idx] = pred_probs

Predicting for label 1/13
Predicting for label 2/13
Predicting for label 3/13
Predicting for label 4/13
Predicting for label 5/13
Predicting for label 6/13
Predicting for label 7/13
Predicting for label 8/13
Predicting for label 9/13
Predicting for label 10/13
Predicting for label 11/13
Predicting for label 12/13
Predicting for label 13/13


In [None]:
# Convert predicted probabilities to binary (threshold = 0.5)
binary_predictions = (predictions > 0.5).astype(int)
masked_test_labels = np.where(np.isnan(test_labels), -1, test_labels)

# Compute classification report
for label_idx in range(num_labels):
    valid_idx = ~np.isnan(test_labels[:, label_idx])  # Get valid (non-NaN) indices
    y_true = test_labels[valid_idx, label_idx]
    y_pred = binary_predictions[valid_idx, label_idx]

    if len(y_true) > 0:  # Ensure we have valid samples
        print(f"\nLabel {label_idx} Metrics:")
        print(classification_report(y_true, y_pred, zero_division=0))
    else:
        print(f"\nLabel {label_idx}: No valid samples for evaluation.")


Label 0 Metrics:
              precision    recall  f1-score   support

         0.0       0.92      0.96      0.94      3046
         1.0       0.60      0.42      0.49       446

    accuracy                           0.89      3492
   macro avg       0.76      0.69      0.72      3492
weighted avg       0.88      0.89      0.88      3492


Label 1 Metrics:
              precision    recall  f1-score   support

         0.0       0.93      0.97      0.95      3202
         1.0       0.62      0.39      0.48       400

    accuracy                           0.91      3602
   macro avg       0.77      0.68      0.71      3602
weighted avg       0.89      0.91      0.90      3602


Label 2 Metrics:
              precision    recall  f1-score   support

         0.0       0.98      1.00      0.99      3167
         1.0       0.00      0.00      0.00        75

    accuracy                           0.98      3242
   macro avg       0.49      0.50      0.49      3242
weighted avg       0

In [None]:
pr_auc_scores = []

for label_idx in range(num_labels):
    valid_idx = ~np.isnan(test_labels[:, label_idx])

    if valid_idx.sum() > 0:  # Ensure there are valid labels
        pr_auc = average_precision_score(
            test_labels[valid_idx, label_idx],
            predictions[valid_idx, label_idx]
        )
        pr_auc_scores.append(pr_auc)
        print(f"Label {label_idx}: PR-AUC = {pr_auc:.4f}")

Label 0: PR-AUC = 0.3271
Label 1: PR-AUC = 0.3064
Label 2: PR-AUC = 0.0231
Label 3: PR-AUC = 0.0434
Label 4: PR-AUC = 0.0220
Label 5: PR-AUC = 0.0262
Label 6: PR-AUC = 0.0378
Label 7: PR-AUC = 0.3179
Label 8: PR-AUC = 0.4549
Label 9: PR-AUC = 0.0143
Label 10: PR-AUC = 0.0929
Label 11: PR-AUC = 0.0162
Label 12: PR-AUC = 0.1769


**Run lgbm with feature selection and smote for reduced feature set:**

In [4]:
import pickle
from imblearn.over_sampling import SMOTE
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, roc_auc_score, average_precision_score
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [10]:
with open('/content/drive/MyDrive/SPH6004_project/data/X_train_100d_UMAP_AE.pkl', 'rb') as f:
    train_embeddings = pickle.load(f)

with open('/content/drive/MyDrive/SPH6004_project/data/y_train.pkl', 'rb') as f:
    train_labels = pickle.load(f)

with open('/content/drive/MyDrive/SPH6004_project/data/X_valid_100d_UMAP_AE.pkl', 'rb') as f:
    valid_embeddings = pickle.load(f)

with open('/content/drive/MyDrive/SPH6004_project/data/y_valid.pkl', 'rb') as f:
    valid_labels = pickle.load(f)

with open('/content/drive/MyDrive/SPH6004_project/data/X_test_100d_UMAP_AE.pkl', 'rb') as f:
    test_embeddings = pickle.load(f)

with open('/content/drive/MyDrive/SPH6004_project/data/y_test.pkl', 'rb') as f:
    test_labels = pickle.load(f)

In [11]:
train_embeddings = np.array(train_embeddings)
train_labels = np.array(train_labels)
valid_embeddings = np.array(valid_embeddings)
valid_labels = np.array(valid_labels)

train_embeddings.shape

(36502, 100)

In [12]:
train_embeddings.shape

(36502, 100)

In [13]:
test_embeddings.shape

(4562, 100)

In [14]:
train_labels.shape

(36502, 13)

In [15]:
valid_labels.shape

(4563, 13)

In [16]:
test_labels.shape

(4562, 13)

In [17]:
#get number of labels
num_labels = train_labels.shape[1]

In [18]:
# Train one model per label, each model for each label is a binary task
models = {}
best_params_per_label = {}

# Define param grid
param_list = [
    {"learning_rate": 0.01, "max_depth": 3, "n_estimators": 50},
    {"learning_rate": 0.01, "max_depth": 3, "n_estimators": 75},
    {"learning_rate": 0.01, "max_depth": 3, "n_estimators": 100},
    {"learning_rate": 0.01, "max_depth": 5, "n_estimators": 50},
    {"learning_rate": 0.01, "max_depth": 5, "n_estimators": 75},
    {"learning_rate": 0.01, "max_depth": 5, "n_estimators": 100},
    {"learning_rate": 0.01, "max_depth": 7, "n_estimators": 50},
    {"learning_rate": 0.01, "max_depth": 7, "n_estimators": 75},
    {"learning_rate": 0.01, "max_depth": 7, "n_estimators": 100},
    {"learning_rate": 0.01, "max_depth": -1, "n_estimators": 50},
    {"learning_rate": 0.01, "max_depth": -1, "n_estimators": 75},
    {"learning_rate": 0.01, "max_depth": -1, "n_estimators": 100},
    {"learning_rate": 0.01, "max_depth": 3, "n_estimators": 50},
    {"learning_rate": 0.1, "max_depth": 3, "n_estimators": 75},
    {"learning_rate": 0.1, "max_depth": 3, "n_estimators": 100},
    {"learning_rate": 0.1, "max_depth": 5, "n_estimators": 50},
    {"learning_rate": 0.1, "max_depth": 5, "n_estimators": 75},
    {"learning_rate": 0.1, "max_depth": 5, "n_estimators": 100},
    {"learning_rate": 0.1, "max_depth": 7, "n_estimators": 50},
    {"learning_rate": 0.1, "max_depth": 7, "n_estimators": 75},
    {"learning_rate": 0.1, "max_depth": 7, "n_estimators": 100},
    {"learning_rate": 0.1, "max_depth": -1, "n_estimators": 50},
    {"learning_rate": 0.1, "max_depth": -1, "n_estimators": 75},
    {"learning_rate": 0.1, "max_depth": -1, "n_estimators": 100},

]

for label_idx in range(num_labels):
    print(f"\nTraining model for label {label_idx + 1}/{num_labels}")

    # Filter for non-NA samples for this label in train and validation sets
    train_idx = ~np.isnan(train_labels[:, label_idx])
    val_idx = ~np.isnan(valid_labels[:, label_idx])

    x_train = train_embeddings[train_idx]
    y_train = train_labels[train_idx, label_idx].astype(int)

    x_val = valid_embeddings[val_idx]
    y_val = valid_labels[val_idx, label_idx].astype(int)

    # Check class distribution before SMOTE
    class_0, class_1 = np.bincount(y_train)
    print(f"  Before SMOTE - Class 0: {class_0}, Class 1: {class_1}")

    smote = SMOTE(sampling_strategy='auto', random_state=42)
    x_train, y_train = smote.fit_resample(x_train, y_train)

    # Check class distribution after SMOTE
    class_0_res, class_1_res = np.bincount(y_train)
    print(f"  After SMOTE  - Class 0: {class_0_res}, Class 1: {class_1_res}")

    best_pr_auc = 0
    best_model = None
    best_params = None

    for params in param_list:
        model = LGBMClassifier(objective="binary", boosting_type="gbdt", verbose=-1, **params)
        model.fit(x_train, y_train)

        val_probs = model.predict_proba(x_val)[:, 1]
        pr_auc = average_precision_score(y_val, val_probs)
        #auc = roc_auc_score(y_val, val_probs)

        print(f"  Params: {params}, Validation PR-AUC: {pr_auc:.4f}")

        if pr_auc > best_pr_auc:
            best_pr_auc = pr_auc
            best_model = model
            best_params = params

    print(f"Best Params for label {label_idx}: {best_params}, PR-AUC: {best_pr_auc:.4f}")
    models[label_idx] = best_model
    best_params_per_label[label_idx] = best_params


Training model for label 1/13
  Before SMOTE - Class 0: 24778, Class 1: 3222
  After SMOTE  - Class 0: 24778, Class 1: 24778
  Params: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 50}, Validation PR-AUC: 0.4099
  Params: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 75}, Validation PR-AUC: 0.4351
  Params: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100}, Validation PR-AUC: 0.4664
  Params: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 50}, Validation PR-AUC: 0.4601
  Params: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 75}, Validation PR-AUC: 0.4651
  Params: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 100}, Validation PR-AUC: 0.4650
  Params: {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 50}, Validation PR-AUC: 0.4721
  Params: {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 75}, Validation PR-AUC: 0.4747
  Params: {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 100}, Validation PR-AUC: 0.476

In [19]:
 #initialise to have predictions for each label for each test
 predictions = np.zeros((test_embeddings.shape[0], num_labels))

In [20]:
for label_idx, model in models.items():
    print(f"Predicting for label {label_idx + 1}/{num_labels}")

    # Predict probabilities
    pred_probs = model.predict(test_embeddings)
    predictions[:, label_idx] = pred_probs

Predicting for label 1/13
Predicting for label 2/13
Predicting for label 3/13
Predicting for label 4/13
Predicting for label 5/13
Predicting for label 6/13
Predicting for label 7/13
Predicting for label 8/13
Predicting for label 9/13
Predicting for label 10/13
Predicting for label 11/13
Predicting for label 12/13
Predicting for label 13/13


In [21]:
# Convert predicted probabilities to binary (threshold = 0.5)
binary_predictions = (predictions > 0.5).astype(int)
masked_test_labels = np.where(np.isnan(test_labels), -1, test_labels)

# Compute classification report
for label_idx in range(num_labels):
    valid_idx = ~np.isnan(test_labels[:, label_idx])  # Get valid (non-NaN) indices
    y_true = test_labels[valid_idx, label_idx]
    y_pred = binary_predictions[valid_idx, label_idx]

    if len(y_true) > 0:  # Ensure we have valid samples
        print(f"\nLabel {label_idx} Metrics:")
        print(classification_report(y_true, y_pred, zero_division=0))
    else:
        print(f"\nLabel {label_idx}: No valid samples for evaluation.")


Label 0 Metrics:
              precision    recall  f1-score   support

         0.0       0.97      0.82      0.89      3046
         1.0       0.40      0.80      0.53       446

    accuracy                           0.82      3492
   macro avg       0.68      0.81      0.71      3492
weighted avg       0.89      0.82      0.84      3492


Label 1 Metrics:
              precision    recall  f1-score   support

         0.0       0.97      0.82      0.89      3202
         1.0       0.36      0.80      0.50       400

    accuracy                           0.82      3602
   macro avg       0.67      0.81      0.69      3602
weighted avg       0.90      0.82      0.85      3602


Label 2 Metrics:
              precision    recall  f1-score   support

         0.0       0.99      0.88      0.93      3167
         1.0       0.13      0.75      0.22        75

    accuracy                           0.88      3242
   macro avg       0.56      0.81      0.58      3242
weighted avg       0

In [22]:
pr_auc_scores = []

for label_idx in range(num_labels):
    valid_idx = ~np.isnan(test_labels[:, label_idx])

    if valid_idx.sum() > 0:  # Ensure there are valid labels
        pr_auc = average_precision_score(
            test_labels[valid_idx, label_idx],
            predictions[valid_idx, label_idx]
        )
        pr_auc_scores.append(pr_auc)
        print(f"Label {label_idx}: PR-AUC = {pr_auc:.4f}")

Label 0: PR-AUC = 0.3452
Label 1: PR-AUC = 0.3113
Label 2: PR-AUC = 0.1033
Label 3: PR-AUC = 0.2031
Label 4: PR-AUC = 0.0554
Label 5: PR-AUC = 0.0462
Label 6: PR-AUC = 0.1122
Label 7: PR-AUC = 0.3491
Label 8: PR-AUC = 0.4040
Label 9: PR-AUC = 0.0647
Label 10: PR-AUC = 0.1550
Label 11: PR-AUC = 0.0765
Label 12: PR-AUC = 0.2399
