# Keras Multi-Output Classification - Bond Data
Learning exercise: Predicting level_2 and level_3 from bond characteristics

In [None]:
import pandas as pd


In [None]:
# Setup
!pip install keras --upgrade -q

In [None]:
import os

# The backend has to be selected before the first `import keras`.
os.environ["KERAS_BACKEND"] = "tensorflow"

import keras
import numpy as np
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Seed Python, NumPy and the backend in one call so weight initialisation and
# batch shuffling are repeatable from a fresh kernel.
SEED = 42
keras.utils.set_random_seed(SEED)

print(f"Keras version: {keras.__version__}")
print(f"Backend: {keras.backend.backend()}")

In [None]:
# Load data
# TODO: Upload your bond dataset (Excel workbook) so the relative path below resolves
df = pd.read_excel('mstar_core_bond_index_oct_data.xlsx')

# Quick data check
print(df.shape)
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None".
df.info()

In [None]:
# Show the distinct values and their frequencies for both targets and for
# the categorical rating feature.
for position, column in enumerate(('level_2', 'level_3', 'rating')):
    # Every section after the first starts with a blank separator line.
    lead = "" if position == 0 else "\n"
    print(f"{lead}{column} unique values:", df[column].unique())
    print(f"{column} counts:\n", df[column].value_counts())

In [None]:
# Pull the model inputs and the two classification targets out of the frame
feature_columns = ['oas', 'yield', 'duration', 'convexity', 'coupon']
X_numerical = df[feature_columns].values
X_rating, y_level2, y_level3 = (
    df[name].values for name in ('rating', 'level_2', 'level_3')
)

shape_report = (
    ("Numerical features", X_numerical),
    ("Rating feature", X_rating),
    ("Target level_2", y_level2),
    ("Target level_3", y_level3),
)
for caption, array in shape_report:
    print(f"{caption} shape:", array.shape)

# Distinct category counts over the whole frame, reused when sizing the model
num_classes_level2, num_classes_level3, num_ratings = (
    df[name].nunique() for name in ('level_2', 'level_3', 'rating')
)

print(f"\nNumber of classes - level_2: {num_classes_level2}, level_3: {num_classes_level3}")
print(f"Number of rating categories: {num_ratings}")

In [None]:
# Train/test split
# One call splits every input and target array, so their rows stay aligned.
split_parts = train_test_split(
    X_numerical, X_rating, y_level2, y_level3,
    test_size=0.2,
    random_state=42,
    stratify=y_level2,  # keep the level_2 class mix the same in both partitions
)
(X_num_train, X_num_test,
 X_rating_train, X_rating_test,
 y_level2_train, y_level2_test,
 y_level3_train, y_level3_test) = split_parts

n_train, n_test, n_total = X_num_train.shape[0], X_num_test.shape[0], len(df)
print("Training set size:", n_train)
print("Test set size:", n_test)
print(f"Split ratio: {n_train / n_total:.1%} train, {n_test / n_total:.1%} test")

In [None]:
# StringLookup layers WITHOUT an OOV token: with num_oov_indices=0 the integer
# indices start at 0 and correspond one-to-one to vocabulary entries.
rating_lookup = keras.layers.StringLookup(output_mode="int", num_oov_indices=0)
level2_lookup = keras.layers.StringLookup(output_mode="int", num_oov_indices=0)
level3_lookup = keras.layers.StringLookup(output_mode="int", num_oov_indices=0)

# The rating vocabulary belongs to a model input, so it is learned from the
# training split only.
# NOTE(review): with no OOV bucket, a rating that appears only in the test
# split will raise at lookup time - confirm this is the intended behaviour.
rating_lookup.adapt(X_rating_train)

# The label vocabularies are built from the full label columns. The split is
# stratified on level_2 only, so a rare level_3 class can end up exclusively
# in the test split; with no OOV index, encoding that label would fail.
# Using every label also keeps the vocabulary sizes in line with the class
# counts computed from the whole dataset.
level2_lookup.adapt(y_level2)
level3_lookup.adapt(y_level3)

print("Rating vocabulary size:", rating_lookup.vocabulary_size())
print("Level 2 vocabulary size:", level2_lookup.vocabulary_size())
print("Level 3 vocabulary size:", level3_lookup.vocabulary_size())

In [None]:
# Symbolic entry points of the functional model: five numeric columns and a
# single string-valued rating per sample.
input_numerical = keras.Input(name='numerical_features', shape=(5,))
input_rating = keras.Input(name='rating', shape=(1,), dtype='string')

print("Input layers created:")
for caption, tensor in (("Numerical input", input_numerical),
                        ("Rating input", input_rating)):
    print(f"  {caption}: {tensor}")

In [None]:
# Apply preprocessing layers
rating_encoded = rating_lookup(input_rating)

# Build the normalization layer here so the cell runs on a fresh kernel.
# Its per-feature mean and variance are learned from the training split only,
# keeping test-set statistics out of the model.
# NOTE(review): assumes the numeric columns contain no missing values - NaNs
# would propagate into the learned statistics; confirm against the data.
normalizer = keras.layers.Normalization(axis=-1)
normalizer.adapt(np.asarray(X_num_train, dtype="float32"))
numerical_normalized = normalizer(input_numerical)

print("Preprocessing applied:")
print(f"  Rating encoded: {rating_encoded}")
print(f"  Numerical normalized: {numerical_normalized}")

In [None]:
# Illustrate the string -> integer mapping held by the rating lookup layer
vocab = rating_lookup.get_vocabulary()
index_by_rating = {token: position for position, token in enumerate(vocab)}

print("Rating vocabulary mapping:")
for position, token in enumerate(vocab):
    print(f"  {position}: '{token}'")

divider = "=" * 50
print("\n" + divider)
print("Sample of original data (first 5 rows):")
print(divider)
print("\nNumerical features:")
print(X_num_train[:5])
print("\nRating values:")
print(X_rating_train[:5])
print("\nAfter encoding, these ratings would become:")
for sample in X_rating_train[:5]:
    # A value missing from the vocabulary is displayed as index 0.
    print(f"  '{sample}' -> {index_by_rating.get(sample, 0)}")

In [None]:
# One-hot encode the rating index before mixing it with the numeric features.
# The lookup assigns indices in vocabulary order, which carries no numeric
# meaning; feeding the raw index to a Dense layer would make the model treat
# it as a magnitude. The input has a trailing dimension of size 1, so the
# one-hot output has shape (None, vocabulary_size).
rating_one_hot = keras.layers.CategoryEncoding(
    num_tokens=rating_lookup.vocabulary_size(),
    output_mode="one_hot",
    name='rating_one_hot'
)(rating_encoded)

# Concatenate all features
combined_features = keras.layers.Concatenate()([rating_one_hot, numerical_normalized])

print("Features combined:")
print(f"  Combined shape: {combined_features}")

In [None]:
# Shared hidden layers feeding both output heads.
# Each layer's output keeps its own handle so the report below shows the
# tensor it names; `x` remains the final shared representation.
hidden_1 = keras.layers.Dense(64, activation='relu', name='hidden_1')(combined_features)
x = keras.layers.Dense(32, activation='relu', name='hidden_2')(hidden_1)

print("Hidden layers added:")
print(f"  After hidden_1 (64 units): {hidden_1}")
print(f"  After hidden_2 (32 units): {x}")

In [None]:
# Two softmax heads share the hidden representation `x`:
# one unit per level_2 class and one unit per level_3 class.
level2_head = keras.layers.Dense(
    num_classes_level2, activation='softmax', name='level_2_output'
)
level3_head = keras.layers.Dense(
    num_classes_level3, activation='softmax', name='level_3_output'
)

output_level2 = level2_head(x)
output_level3 = level3_head(x)

print("Output layers created:")
for caption, tensor in (("level_2 output", output_level2),
                        ("level_3 output", output_level3)):
    print(f"  {caption}: {tensor}")

In [None]:
# Assemble the functional model: two inputs in, two classification heads out
model_inputs = [input_numerical, input_rating]
model_outputs = [output_level2, output_level3]
model = keras.Model(inputs=model_inputs, outputs=model_outputs, name='bond_classifier')

print("Model created!\n\nModel summary:")
model.summary()

In [None]:
# Compile: both heads get the same loss and metric, keyed by output-layer name.
# The sparse loss is used because the targets are integer class indices.
head_names = ('level_2_output', 'level_3_output')
model.compile(
    optimizer='adam',
    loss={head: 'sparse_categorical_crossentropy' for head in head_names},
    metrics={head: ['accuracy'] for head in head_names},
)

print("Model compiled!\nReady to train.")

In [None]:
def _encode_labels(lookup, labels):
    """Return the integer vocabulary index of every label as a flat NumPy array.

    `labels` is reshaped to a single column before being passed through the
    lookup layer, and the result is flattened back to one dimension.
    """
    as_column = labels.reshape(-1, 1)
    return np.array(lookup(as_column)).reshape(-1)


# Encode target variables to integers
y_level2_train_encoded = _encode_labels(level2_lookup, y_level2_train)
y_level2_test_encoded = _encode_labels(level2_lookup, y_level2_test)

y_level3_train_encoded = _encode_labels(level3_lookup, y_level3_train)
y_level3_test_encoded = _encode_labels(level3_lookup, y_level3_test)

print("Targets encoded!")
print(f"Training samples: {len(y_level2_train_encoded)}")
print(f"Test samples: {len(y_level2_test_encoded)}")
print(f"\nSample encoded level_2 targets: {y_level2_train_encoded[:5]}")
print(f"Sample encoded level_3 targets: {y_level3_train_encoded[:5]}")

In [None]:
# Train the model
# Inputs are passed in the order the model declares them (numeric, rating);
# the ratings become a string tensor with one column.
train_inputs = [
    X_num_train,
    tf.constant(X_rating_train.reshape(-1, 1), dtype=tf.string),
]
# Targets are keyed by output-layer name.
train_targets = {
    'level_2_output': y_level2_train_encoded,
    'level_3_output': y_level3_train_encoded,
}

history = model.fit(
    train_inputs,
    train_targets,
    validation_split=0.2,
    epochs=20,
    batch_size=32,
    verbose=1,
)

print("\nTraining complete!")

In [None]:
import matplotlib.pyplot as plt

# 2x2 grid: one row per output head, one column per tracked quantity
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

head_rows = (('level_2_output', 'Level 2'), ('level_3_output', 'Level 3'))
metric_cols = (('loss', 'Loss'), ('accuracy', 'Accuracy'))

for row, (head_key, head_title) in enumerate(head_rows):
    for col, (metric_key, metric_title) in enumerate(metric_cols):
        ax = axes[row, col]
        series = f'{head_key}_{metric_key}'
        # Training curve first, then its validation counterpart
        ax.plot(history.history[series], label='Training')
        ax.plot(history.history[f'val_{series}'], label='Validation')
        ax.set_title(f'{head_title} {metric_title}')
        ax.set_xlabel('Epoch')
        ax.set_ylabel(metric_title)
        ax.legend()
        ax.grid(True)

plt.tight_layout()
plt.show()

In [None]:
# Evaluate on test data
test_inputs = [
    X_num_test,
    tf.constant(X_rating_test.reshape(-1, 1), dtype=tf.string),
]
test_targets = {
    'level_2_output': y_level2_test_encoded,
    'level_3_output': y_level3_test_encoded,
}

# return_dict=True gives each value under its metric name. The position of
# per-output losses and accuracies in the plain list result depends on Keras'
# internal metric ordering, so index-based access can mislabel the numbers.
test_results = model.evaluate(
    test_inputs,
    test_targets,
    verbose=1,
    return_dict=True
)

level2_accuracy = test_results['level_2_output_accuracy']
level3_accuracy = test_results['level_3_output_accuracy']

print("\n" + "="*50)
print("Test Results:")
print("="*50)
print(f"Total Loss: {test_results['loss']:.4f}")
print(f"Level 2 Loss: {test_results['level_2_output_loss']:.4f}")
print(f"Level 3 Loss: {test_results['level_3_output_loss']:.4f}")
print(f"Level 2 Accuracy: {level2_accuracy:.4f} ({level2_accuracy*100:.2f}%)")
print(f"Level 3 Accuracy: {level3_accuracy:.4f} ({level3_accuracy*100:.2f}%)")

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns

# Make predictions on test set
predictions = model.predict(
    [X_num_test, tf.constant(X_rating_test.reshape(-1, 1), dtype=tf.string)],
    verbose=0
)

# Get predicted classes (argmax of probabilities)
y_pred_level2 = np.argmax(predictions[0], axis=1)
y_pred_level3 = np.argmax(predictions[1], axis=1)

# Get class names; position i in each vocabulary is the name of class index i
level2_classes = level2_lookup.get_vocabulary()
level3_classes = level3_lookup.get_vocabulary()

# Pass the full index range explicitly. Without `labels`, scikit-learn keeps
# only classes present in the test truth or predictions, so a class missing
# from both would shrink the matrix (misaligning the tick labels) and make
# classification_report reject `target_names` for having the wrong length.
level2_labels = np.arange(len(level2_classes))
level3_labels = np.arange(len(level3_classes))

# Create confusion matrices
cm_level2 = confusion_matrix(y_level2_test_encoded, y_pred_level2, labels=level2_labels)
cm_level3 = confusion_matrix(y_level3_test_encoded, y_pred_level3, labels=level3_labels)

# Plot Level 2 confusion matrix
fig, axes = plt.subplots(1, 2, figsize=(18, 6))

sns.heatmap(cm_level2, annot=True, fmt='d', cmap='Blues',
            xticklabels=level2_classes, yticklabels=level2_classes,
            ax=axes[0])
axes[0].set_title('Level 2 Confusion Matrix')
axes[0].set_ylabel('True Label')
axes[0].set_xlabel('Predicted Label')

# Plot Level 3 confusion matrix
sns.heatmap(cm_level3, annot=True, fmt='d', cmap='Blues',
            xticklabels=level3_classes, yticklabels=level3_classes,
            ax=axes[1])
axes[1].set_title('Level 3 Confusion Matrix')
axes[1].set_ylabel('True Label')
axes[1].set_xlabel('Predicted Label')
# Rotate the Level 3 tick labels on that axes explicitly rather than through
# the implicit "current axes" of the pyplot state machine.
plt.setp(axes[1].get_xticklabels(), rotation=45, ha='right')
plt.setp(axes[1].get_yticklabels(), rotation=0)

plt.tight_layout()
plt.show()

# Print detailed classification report for level_3
print("\nLevel 3 Classification Report:")
print("="*80)
print(classification_report(y_level3_test_encoded, y_pred_level3,
                           labels=level3_labels,
                           target_names=level3_classes, zero_division=0))