## CC (Elia)

In [37]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsRestClassifier
from scipy.sparse import csr_matrix  # For sparse matrix

In [38]:
# Load data
output_folder = Path('../output')
X = np.load(output_folder / 'x_remb_CC.npy')
y_df = pd.read_csv(output_folder / 'y_df_CC.csv')
X_sparse = csr_matrix(X)

In [39]:
import ast

y_df['GO_term'] = y_df['GO_term'].apply(lambda x: list(ast.literal_eval(x)))

In [40]:
# Preprocess target (MultiLabelBinarizer for multi-label classification)
mlb = MultiLabelBinarizer(sparse_output=True)  # Enable sparse output
y = mlb.fit_transform(y_df['GO_term'])

In [41]:
print("X_sparse shape:", X_sparse.shape)
print("y shape:", len(y) if isinstance(y, list) else y.shape)

X_sparse shape: (84638, 274)
y shape: (84638, 678)


In [74]:
X_sparse

<84638x274 sparse matrix of type '<class 'numpy.float64'>'
	with 23190812 stored elements in Compressed Sparse Row format>

In [75]:
y

<84638x678 sparse matrix of type '<class 'numpy.int32'>'
	with 1109632 stored elements in Compressed Sparse Row format>

In [44]:
y_dense = y.toarray() 
label_counts = np.sum(y_dense, axis=0)

# Get the number of labels and the count of each label
labels = np.arange(y_dense.shape[1])  # Each label corresponds to a column index
label_counts = label_counts.reshape(-1)

## NN

In [61]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import MultiLabelBinarizer
from tensorflow.keras import regularizers

In [83]:
# Convert to dense NumPy arrays
X_array = X_sparse.toarray()
Y_array = y.toarray()

In [84]:
import tensorflow as tf
from tensorflow.keras import backend as K

def f1_score(y_true, y_pred):
    """Compute the F1 score for multilabel classification"""
    # Round predictions to nearest integer (0 or 1) for binary classification
    y_pred = K.round(y_pred)
    
    # True positives, false positives, false negatives
    tp = K.sum(y_true * y_pred, axis=0)
    fp = K.sum((1 - y_true) * y_pred, axis=0)
    fn = K.sum(y_true * (1 - y_pred), axis=0)
    
    # Precision and recall
    precision = tp / (tp + fp + K.epsilon())
    recall = tp / (tp + fn + K.epsilon())
    
    # F1 score
    f1 = 2 * (precision * recall) / (precision + recall + K.epsilon())
    
    # Return macro average of F1 score
    return K.mean(f1)


In [85]:
model = Sequential()

# Input layer - handling sparse matrix
model.add(Input(shape=(X_array.shape[1],)))  # Shape = (274,)

# First hidden layer with ReLU activation
model.add(Dense(1024, activation='relu', kernel_regularizer=regularizers.l2(0.01)))

# Second hidden layer with ReLU activation
model.add(Dense(512, activation='relu', kernel_regularizer=regularizers.l2(0.01)))

# Third hidden layer with ReLU activation
model.add(Dense(256, activation='relu', kernel_regularizer=regularizers.l2(0.01)))

# Output layer with sigmoid activation for multilabel classification
model.add(Dense(Y_array.shape[1], activation='sigmoid'))  # Output = 678

model.compile(optimizer=Adam(learning_rate=0.001),
              loss='binary_crossentropy',  # Use binary crossentropy for multilabel classification
              metrics=[f1_score])  # Optimize for F1 score

# Summary of the model
model.summary()


In [89]:
from tensorflow.keras.callbacks import EarlyStopping

X_array = X.toarray()
Y_array = Y.toarray()

# Define early stopping
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True
)

# Train the model
model.fit(
    X_array, Y_array,
    epochs=10,
    batch_size=64,
    validation_split=0.2,
    callbacks=[early_stopping]
)

AttributeError: 'numpy.ndarray' object has no attribute 'toarray'

In [46]:
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from skmultilearn.model_selection import iterative_train_test_split
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers

In [48]:
# Train-test split 22 MINUTES
X_train, X_test, y_train, y_test = train_test_split(X_sparse, y, test_size=0.2)

In [60]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.utils.class_weight import compute_class_weight


def build_model(input_dim, output_dim):
    model = models.Sequential()
    
    # Input layer and first hidden layer
    model.add(layers.InputLayer(input_shape=(input_dim,)))  # Adjusted to 274
    model.add(layers.Dense(512, activation='relu'))
    model.add(layers.Dropout(0.5))  # Higher dropout for regularization

    # Second hidden layer
    model.add(layers.Dense(256, activation='relu'))
    model.add(layers.Dropout(0.5))

    # Output layer with sigmoid activation for multilabel classification
    model.add(layers.Dense(output_dim, activation='sigmoid'))  # Adjusted to match number of labels
    
    # Compile the model
    model.compile(optimizer=Adam(learning_rate=1e-3), loss='binary_crossentropy', metrics=['AUC', 'Precision', 'Recall'])

    return model

X_train_dense = X_train.toarray()  # Convert to dense (if needed)
y_train_dense = y_train.toarray()

# Flatten the multilabel matrix to count occurrences of each label
flat_y = y_train_dense.flatten()

# Get the unique labels (0 and 1 in this case) and their counts
classes = np.unique(flat_y)

# Calculate class weights for multilabel classification
class_weights = compute_class_weight(
    class_weight='balanced', 
    classes=classes,  # Classes are the unique values in the flattened matrix (0 and 1)
    y=flat_y  # Flattened labels
)

# Convert the class weights into a dictionary (useful for Keras training)
class_weights_dict = {i: class_weights[i] for i in range(len(classes))}
# Calculate class weights for multilabel classification
class_weights = compute_class_weight(
    class_weight='balanced', 
    classes=np.arange(y_train_dense.shape[1]),  # Classes are the columns (one per label)
    y=np.argmax(y_train_dense, axis=1)  # Use the argmax to get the class for each sample
)

# Train the model with reduced learning rate and monitoring AUC
model = build_model(input_dim=274, output_dim=678)  # Adjusted output_dim

lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6)
model.fit(X_train_dense, y_train_dense, epochs=5, batch_size=256, validation_split=0.1, 
          class_weight=class_weights,
          callbacks=[EarlyStopping(monitor='val_loss', patience=5), lr_scheduler])


ValueError: classes should have valid labels that are in y

In [None]:
from sklearn.metrics import classification_report

# Make predictions on the test data
y_pred = model.predict(X_test_dense)

# Threshold the predicted probabilities to get binary values
threshold = 0.5
y_pred_bin = (y_pred > threshold).astype(int)

# Generate the classification report
report = classification_report(y_test_dense, y_pred_bin, target_names=[f"Class {i}" for i in range(678)])
print(report)

[1m529/529[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step
              precision    recall  f1-score   support

     Class 0       0.00      0.00      0.00        43
     Class 1       0.00      0.00      0.00        60
     Class 2       0.00      0.00      0.00        13
     Class 3       0.00      0.00      0.00        15
     Class 4       0.00      0.00      0.00        16
     Class 5       0.00      0.00      0.00        25
     Class 6       0.00      0.00      0.00       158
     Class 7       0.00      0.00      0.00        10
     Class 8       0.00      0.00      0.00       111
     Class 9       0.00      0.00      0.00        27
    Class 10       0.00      0.00      0.00        10
    Class 11       0.00      0.00      0.00        15
    Class 12       0.00      0.00      0.00       144
    Class 13       0.00      0.00      0.00        10
    Class 14       0.00      0.00      0.00        21
    Class 15       0.00      0.00      0.00        79
    Cl

  _warn_prf(average, modifier, msg_start, len(result))


In [32]:
# Convert scipy sparse matrices to TensorFlow sparse tensors
X_train_tf = tf.sparse.SparseTensor(
    indices=np.array(X_train.nonzero()).T,
    values=X_train.data,
    dense_shape=X_train.shape
)

y_train_tf = tf.sparse.SparseTensor(
    indices=np.array(y_train.nonzero()).T,
    values=y_train.data,
    dense_shape=y_train.shape
)

X_test_tf = tf.sparse.SparseTensor(
    indices=np.array(X_test.nonzero()).T,
    values=X_test.data,
    dense_shape=X_test.shape
)

y_test_tf = tf.sparse.SparseTensor(
    indices=np.array(y_test.nonzero()).T,
    values=y_test.data,
    dense_shape=y_test.shape
)

In [36]:
from tensorflow.keras.callbacks import EarlyStopping

# Convert sparse data to dense
X_train_dense = X_train_tf.todense()  # or .toarray()
X_test_dense = X_test_tf.todense()
y_train_dense = y_train_tf.todense()  # Ensure labels are dense
y_test_dense = y_test_tf.todense()

# Define and compile the model
model = Sequential([
    Dense(512, activation='relu', input_shape=(X_train_dense.shape[1],)),
    Dense(256, activation='relu'),
    Dense(1, activation='sigmoid')  # Adjust for binary classification
])

model.compile(
    optimizer=Adam(learning_rate=1e-3),
    loss='binary_crossentropy',
    metrics=['AUC']
)

# Early stopping
early_stop = EarlyStopping(monitor="val_AUC", patience=3, mode="max", restore_best_weights=True)

# Train the model
history = model.fit(
    X_train_dense,
    y_train_dense,
    epochs=20,
    batch_size=256,
    validation_data=(X_test_dense, y_test_dense),
    callbacks=[early_stop]
)


AttributeError: 'SparseTensor' object has no attribute 'todense'

In [28]:
# Final evaluation on test data
test_loss, test_accuracy, test_auc = model.evaluate(
    X_test_tf, tf.sparse.to_dense(y_test_tf)  # Ensure sparse tensors are converted to dense
)
print(f"Final Test Loss: {test_loss}")
print(f"Final Test Accuracy: {test_accuracy}")
print(f"Final Test AUC: {test_auc}")

# Get predictions
y_pred = model.predict(X_test_tf)

# Convert predictions to binary values (0 or 1) based on threshold
y_pred_binary = (y_pred > 0.5).astype(int)

# Convert sparse labels to dense for comparison
y_test_dense = tf.sparse.to_dense(y_test_tf).numpy()

# Generate classification report
report = classification_report(
    y_test_dense, 
    y_pred_binary, 
    target_names=[f"Class {i+1}" for i in range(y_test_dense.shape[1])],
    zero_division=0  # Avoid warnings for classes with no predictions
)
print("\nClassification Report:")
print(report)

[1m529/529[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - AUC: 0.9255 - accuracy: 0.8671 - loss: 0.0484
Final Test Loss: 0.048592835664749146
Final Test Accuracy: 0.8690335750579834
Final Test AUC: 0.9254587888717651
[1m529/529[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step

Classification Report:
              precision    recall  f1-score   support

     Class 1       0.00      0.00      0.00        43
     Class 2       0.00      0.00      0.00        61
     Class 3       0.00      0.00      0.00        25
     Class 4       0.00      0.00      0.00        16
     Class 5       0.00      0.00      0.00        17
     Class 6       0.00      0.00      0.00        35
     Class 7       0.00      0.00      0.00       127
     Class 8       0.00      0.00      0.00        13
     Class 9       0.00      0.00      0.00       132
    Class 10       0.00      0.00      0.00        31
    Class 11       0.00      0.00      0.00        12
    Class 12    

In [29]:
from sklearn.metrics import f1_score, roc_auc_score, classification_report

# Final evaluation on test data
test_loss, test_accuracy, test_auc = model.evaluate(
    X_test_tf, tf.sparse.to_dense(y_test_tf)
)
print(f"Final Test Loss: {test_loss}")
print(f"Final Test Accuracy: {test_accuracy}")
print(f"Final Test AUC: {test_auc}")

# Get predictions (probabilities)
y_pred = model.predict(X_test_tf)

# Convert sparse labels to dense
y_test_dense = tf.sparse.to_dense(y_test_tf).numpy()

# Function to tune threshold for F1 score
def tune_threshold(y_true, y_pred_prob):
    best_threshold = 0.5
    best_f1 = 0.0
    thresholds = [i * 0.01 for i in range(1, 100)]  # Thresholds from 0.01 to 0.99
    
    for threshold in thresholds:
        y_pred_binary = (y_pred_prob > threshold).astype(int)
        f1 = f1_score(y_true, y_pred_binary, average='micro')
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold
    
    return best_threshold, best_f1

# Tune threshold for F1
best_threshold, best_f1 = tune_threshold(y_test_dense, y_pred)
print(f"Optimal Threshold for F1: {best_threshold}")
print(f"Best F1 Score: {best_f1}")

# Apply the optimal threshold to predictions
y_pred_binary = (y_pred > best_threshold).astype(int)

# Calculate AUC for multi-label classification
auc_score = roc_auc_score(y_test_dense, y_pred, average='micro')
print(f"Final AUC Score: {auc_score}")

# Generate classification report
report = classification_report(
    y_test_dense, 
    y_pred_binary, 
    target_names=[f"Class {i+1}" for i in range(y_test_dense.shape[1])],
    zero_division=0
)
print("\nClassification Report:")
print(report)


[1m529/529[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 9ms/step - AUC: 0.9255 - accuracy: 0.8671 - loss: 0.0484
Final Test Loss: 0.048592835664749146
Final Test Accuracy: 0.8690335750579834
Final Test AUC: 0.9254587888717651
[1m529/529[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step


KeyboardInterrupt: 

In [44]:
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report
from tensorflow.keras.callbacks import EarlyStopping

y_dense = y.toarray()  # Convert sparse target matrix to dense array

# Train-test split (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(X_sparse, y_dense, test_size=0.2, random_state=42)

# Flatten the y_train array to handle multilabel
y_train_flattened = y_train.flatten()

# Compute class weights for multilabel problem
classes = np.unique(y_train_flattened)
class_weights = compute_class_weight('balanced', classes=classes, y=y_train_flattened)

# Create a dictionary of class weights (mapping each class to its corresponding weight)
class_weight_dict = dict(zip(classes, class_weights))

# Build the neural network model
model = Sequential()

# Input layer: input shape is 274 (embedding size)
model.add(Input(shape=(X_train.shape[1],)))

# Hidden layers
model.add(Dense(512, activation='relu', kernel_regularizer=regularizers.l2(0.001)))
model.add(Dense(256, activation='relu', kernel_regularizer=regularizers.l2(0.001)))

# Output layer: 678 classes, sigmoid activation for multilabel classification
model.add(Dense(y_train.shape[1], activation='sigmoid'))

# Compile the model
model.compile(optimizer=Adam(learning_rate=1e-3),
              loss='binary_crossentropy',  # Multilabel, so binary crossentropy
              metrics=["accuracy", "AUC", "f1_score"])

# Implement early stopping to stop training when validation performance stops improving
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model with class weights
history = model.fit(X_train, y_train, epochs=10, batch_size=256, validation_data=(X_test, y_test), 
                    class_weight=class_weight_dict, callbacks=[early_stopping])


# Get predictions and apply optimal threshold (e.g., 0.2 or 0.3 instead of 0.5)
y_pred = model.predict(X_test)
threshold = 0.2  # or experiment with other values
y_pred_binary = (y_pred > threshold).astype(int)

# Generate classification report
report = classification_report(y_test, y_pred_binary, target_names=[f"Class {i+1}" for i in range(y_test.shape[1])])
print("\nClassification Report:")
print(report)

Epoch 1/10
[1m265/265[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 18ms/step - AUC: 0.8597 - accuracy: 0.7175 - f1_score: 0.0016 - loss: 0.4054 - val_AUC: 0.9229 - val_accuracy: 0.8717 - val_f1_score: 0.0015 - val_loss: 0.0565
Epoch 2/10
[1m265/265[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - AUC: 0.9230 - accuracy: 0.8724 - f1_score: 0.0015 - loss: 0.0618 - val_AUC: 0.9238 - val_accuracy: 0.8717 - val_f1_score: 0.0015 - val_loss: 0.0518
Epoch 3/10
[1m265/265[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 13ms/step - AUC: 0.9238 - accuracy: 0.8691 - f1_score: 0.0015 - loss: 0.0581 - val_AUC: 0.9299 - val_accuracy: 0.8717 - val_f1_score: 0.0015 - val_loss: 0.0517
Epoch 4/10
[1m265/265[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 14ms/step - AUC: 0.9243 - accuracy: 0.8727 - f1_score: 0.0015 - loss: 0.0577 - val_AUC: 0.9274 - val_accuracy: 0.8717 - val_f1_score: 0.0015 - val_loss: 0.0501
Epoch 5/10
[1m265/265[0m [32m━━━━━━━━━━━━━━━━━━━━

  _warn_prf(average, modifier, msg_start, len(result))


In [41]:
'''
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers

y_dense = y.toarray()  # Convert sparse target matrix to dense array

# Train-test split (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(X_sparse, y_dense, test_size=0.2, random_state=42)

# Build the neural network model
model = Sequential()

# Input layer: input shape is 274 (embedding size)
model.add(Input(shape=(X_train.shape[1],)))

# Hidden layers
model.add(Dense(512, activation='relu', kernel_regularizer=regularizers.l2(0.001)))
model.add(Dense(256, activation='relu', kernel_regularizer=regularizers.l2(0.001)))

# Output layer: 678 classes, sigmoid activation for multilabel classification
model.add(Dense(y_train.shape[1], activation='sigmoid'))

# Compile the model
model.compile(optimizer=Adam(learning_rate=1e-3),
              loss='binary_crossentropy',  # Multilabel, so binary crossentropy
              metrics=["f1_score", "AUC", "accuracy"])  # F1 score, AUC, and accuracy as metrics

# Train the model and capture history
history = model.fit(X_train, y_train, epochs=10, batch_size=256, validation_data=(X_test, y_test))

# Plot accuracy by epoch
plt.figure(figsize=(10, 6))

# Plot training and validation accuracy
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Val Accuracy')

# Add labels and title
plt.title('Accuracy by Epoch')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.grid(True)

# Show plot
plt.show()

# Final evaluation on test data
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Final Test Loss: {test_loss}")
print(f"Final Test Accuracy: {test_accuracy}")

# Report summary
print("\nFinal Report:")
print("---------------")
print(f"Final Test Loss: {test_loss}")
print(f"Final Test Accuracy: {test_accuracy}")'''


'\nimport numpy as np\nimport tensorflow as tf\nfrom sklearn.preprocessing import MultiLabelBinarizer\nfrom sklearn.model_selection import train_test_split\nfrom scipy.sparse import csr_matrix\nimport matplotlib.pyplot as plt\nfrom tensorflow.keras.models import Sequential\nfrom tensorflow.keras.layers import Input, Dense\nfrom tensorflow.keras.optimizers import Adam\nfrom tensorflow.keras import regularizers\n\ny_dense = y.toarray()  # Convert sparse target matrix to dense array\n\n# Train-test split (80-20 split)\nX_train, X_test, y_train, y_test = train_test_split(X_sparse, y_dense, test_size=0.2, random_state=42)\n\n# Build the neural network model\nmodel = Sequential()\n\n# Input layer: input shape is 274 (embedding size)\nmodel.add(Input(shape=(X_train.shape[1],)))\n\n# Hidden layers\nmodel.add(Dense(512, activation=\'relu\', kernel_regularizer=regularizers.l2(0.001)))\nmodel.add(Dense(256, activation=\'relu\', kernel_regularizer=regularizers.l2(0.001)))\n\n# Output layer: 678 cl

## OnevsRest SGD

In [None]:
from sklearn.model_selection import GridSearchCV

# CAREFUL - TAKES ABOUT 30MIN TO RUN
# Train SGDClassifier with OneVsRestClassifier and parallel processing
# Define the parameter grid for SGDClassifier
param_grid = {
    'estimator__alpha': [0.0001, 0.001, 0.01],
    'estimator__penalty': ['l2', 'l1', 'elasticnet']
}

# Initialize GridSearchCV with OneVsRestClassifier and SGDClassifier
grid_search = GridSearchCV(
    OneVsRestClassifier(SGDClassifier(loss='log_loss', random_state=42, n_jobs=-1, max_iter=100, early_stopping=True), n_jobs=-1),
    param_grid,
    scoring='f1_micro',
    cv=10,
    verbose=1,
    n_jobs=-1
)

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Best parameters and estimator
print("Best parameters found: ", grid_search.best_params_)
sgd = grid_search.best_estimator_

Fitting 10 folds for each of 18 candidates, totalling 180 fits


In [None]:
# Evaluate
y_pred = sgd.predict(X_test)
classification_report(y_test, y_pred, target_names=mlb.classes_)

: 

In [9]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report
from catboost import CatBoostClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt

# Convert sparse matrix to dense (if necessary)
X_dense = X_sparse.toarray()
y_dense = y.toarray()  # Convert sparse target matrix to dense array

# Train-test split (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(X_dense, y_dense, test_size=0.2, random_state=42)

# Create a CatBoost model for multilabel classification
catboost_model = CatBoostClassifier(
    iterations=101,            # Number of boosting iterations
    learning_rate=0.05,         # Learning rate
    depth=4,                    # Depth of the trees
    loss_function='Logloss',    # Binary classification logloss
    cat_features=[],            # No categorical features in this example
    eval_metric='AUC',          # Use AUC metric for evaluation
    verbose=100                 # Print progress every 100 iterations
)

# Use MultiOutputClassifier to handle multilabel classification
multi_target_model = MultiOutputClassifier(catboost_model)

# Train the model
multi_target_model.fit(X_train, y_train)

# Make predictions
y_pred = multi_target_model.predict(X_test)

# Classification report for evaluation
print(classification_report(y_test, y_pred))

# Compute AUC for each label
from sklearn.metrics import roc_auc_score
auc = roc_auc_score(y_test, y_pred, average='macro', multi_class='ovr')
print(f'Macro AUC: {auc}')

# Plot the accuracy by epoch (for CatBoost, we don't have epochs directly, but we can plot after training)
# Since CatBoost doesn't give per-epoch training metrics directly, we plot evaluation over iterations
eval_results = catboost_model.get_evals_result()
plt.figure(figsize=(10, 6))
plt.plot(eval_results['learn']['AUC'], label='Train AUC')
plt.plot(eval_results['validation']['AUC'], label='Validation AUC')
plt.title('AUC by Iteration')
plt.xlabel('Iteration')
plt.ylabel('AUC')
plt.legend()
plt.grid(True)
plt.show()

0:	total: 187ms	remaining: 18.7s
100:	total: 2.34s	remaining: 0us
0:	total: 26.4ms	remaining: 2.64s
100:	total: 2.28s	remaining: 0us
0:	total: 27ms	remaining: 2.7s
100:	total: 2.31s	remaining: 0us
0:	total: 25.6ms	remaining: 2.56s
100:	total: 2.43s	remaining: 0us
0:	total: 22.6ms	remaining: 2.26s
100:	total: 2.35s	remaining: 0us
0:	total: 24.7ms	remaining: 2.47s
100:	total: 2.42s	remaining: 0us


KeyboardInterrupt: 

: 

## Other

In [10]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.multioutput import MultiOutputClassifier

# Assuming you have your features X_sparse and target labels y_dense (multilabel)
y_dense = y.toarray()  # Convert sparse target matrix to dense array

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_sparse, y_dense, test_size=0.2, random_state=42)

# Initialize LightGBM model
lgb_model = lgb.LGBMClassifier(objective='binary', metric='binary_logloss')

# Using MultiOutputClassifier to handle multilabel via One-vs-Rest
multi_target_model = MultiOutputClassifier(lgb_model, n_jobs=-1)

# Train the model
multi_target_model.fit(X_train, y_train)

# Make predictions
y_pred = multi_target_model.predict(X_test)

# Evaluate performance
print("Classification Report:\n", classification_report(y_test, y_pred))
print("AUC Score: ", roc_auc_score(y_test, multi_target_model.predict_proba(X_test), average='macro', multi_class='ovr'))

  _warn_prf(average, modifier, msg_start, len(result))


Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        44
           1       0.00      0.00      0.00        64
           2       0.00      0.00      0.00        26
           3       0.00      0.00      0.00        19
           4       0.00      0.00      0.00        14
           5       0.00      0.00      0.00        18
           6       0.00      0.00      0.00       116
           7       0.00      0.00      0.00        10
           8       0.00      0.00      0.00       140
           9       0.00      0.00      0.00        33
          10       0.00      0.00      0.00        11
          11       0.00      0.00      0.00        19
          12       0.00      0.00      0.00       155
          13       0.00      0.00      0.00         8
          14       0.00      0.00      0.00        20
          15       0.00      0.00      0.00        95
          16       0.00      0.00      0.00        48
   

ValueError: Found array with dim 3. None expected <= 2.