# Legal Clause Semantic Similarity - Model Training and Evaluation

This notebook implements and compares two models for legal clause semantic similarity:
1. Siamese BiLSTM
2. Attention-based Encoder

---


## 1. Import and Setup


In [None]:
# Import required libraries
import itertools
import json
import os
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, precision_recall_curve, confusion_matrix
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Set random seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

# Set style for plots: try the style names from newest to oldest and keep the
# first one this matplotlib installation knows. plt.style.use raises OSError
# for an unknown style name, so only that is caught; a bare `except:` would
# also hide KeyboardInterrupt and unrelated errors.
for style_name in ('seaborn-v0_8', 'seaborn', 'default'):
    try:
        plt.style.use(style_name)
    except OSError:
        continue
    break
sns.set_palette("husl")

# Report the environment so results can be tied to a TF build / device
print("TensorFlow version:", tf.__version__)
print("GPU Available:", tf.config.list_physical_devices('GPU'))


## 2. Data Loading via preprocessing.py


In [None]:
# Import preprocessing functions
from preprocessing import load_data, create_pairs, prepare_tokenizer_and_sequences, split_data

# Load the raw clause data through the preprocessing helper
print("Loading data...")
df = load_data('archive')

# Build labelled clause pairs, requesting 50k positive and 50k negative examples
print("\nCreating pairs...")
pairs_df = create_pairs(
    df,
    num_positive_pairs=50000,
    num_negative_pairs=50000,
)

# Fit the tokenizer and convert both sides of every pair into model inputs
print("\nPreparing tokenizer and sequences...")
tokenizer, X1, X2, vocab_size = prepare_tokenizer_and_sequences(pairs_df, num_words=20000, maxlen=120)

# Targets are taken straight from the pair table
y = pairs_df['label'].values

# Train / validation / test split. test_indices is kept because the
# example-prediction section uses it to look rows up in pairs_df.
(
    X1_train, X1_val, X1_test,
    X2_train, X2_val, X2_test,
    y_train, y_val, y_test,
    test_indices,
) = split_data(X1, X2, y, test_size=0.15, val_size=0.15)

# One summary of every array shape produced above
print(
    "\nData shapes:",
    f"X1_train: {X1_train.shape}, X2_train: {X2_train.shape}",
    f"X1_val: {X1_val.shape}, X2_val: {X2_val.shape}",
    f"X1_test: {X1_test.shape}, X2_test: {X2_test.shape}",
    f"y_train: {y_train.shape}, y_val: {y_val.shape}, y_test: {y_test.shape}",
    f"Vocabulary size: {vocab_size}",
    f"Test indices shape: {test_indices.shape}",
    sep="\n",
)


## 3. Model 1: Siamese BiLSTM


In [None]:
def build_siamese_bilstm(vocab_size, embedding_dim=128, lstm_units=128, maxlen=120):
    """
    Construct the (uncompiled) Siamese BiLSTM similarity classifier.

    Both inputs go through one shared Embedding and one shared Bidirectional
    LSTM (return_sequences=False, so each input becomes a single vector).
    The two vectors u and v are merged as [u, v, |u - v|, u * v] and passed
    through Dense(128, relu) -> Dropout(0.3) -> Dense(64, relu) ->
    Dense(1, sigmoid).

    Args:
        vocab_size: Input dimension of the Embedding layer.
        embedding_dim: Output dimension of the Embedding layer.
        lstm_units: Number of units in each LSTM direction.
        maxlen: Length of each input sequence.

    Returns:
        A keras Model named 'Siamese_BiLSTM' with inputs [input1, input2]
        and a single sigmoid output.
    """
    # Two inputs, one per clause of the pair
    left_tokens = layers.Input(shape=(maxlen,), name='input1')
    right_tokens = layers.Input(shape=(maxlen,), name='input2')

    # Layers instantiated once and applied to both branches, so weights are shared
    shared_embedding = layers.Embedding(vocab_size, embedding_dim, mask_zero=True, name='embedding')
    shared_encoder = layers.Bidirectional(layers.LSTM(lstm_units, return_sequences=False), name='bilstm')

    # Embed both branches first, then encode both (same order as layer creation)
    left_embedded, right_embedded = (shared_embedding(tokens) for tokens in (left_tokens, right_tokens))
    u, v = (shared_encoder(embedded) for embedded in (left_embedded, right_embedded))

    # Pair features: [u, v, |u-v|, u*v]
    abs_difference = layers.Lambda(lambda pair: tf.abs(pair[0] - pair[1]))([u, v])
    product = layers.Lambda(lambda pair: pair[0] * pair[1])([u, v])
    pair_features = layers.Concatenate()([u, v, abs_difference, product])

    # Classification head
    hidden = layers.Dense(128, activation='relu', name='dense1')(pair_features)
    hidden = layers.Dropout(0.3, name='dropout1')(hidden)
    hidden = layers.Dense(64, activation='relu', name='dense2')(hidden)
    similarity = layers.Dense(1, activation='sigmoid', name='output')(hidden)

    return Model(inputs=[left_tokens, right_tokens], outputs=similarity, name='Siamese_BiLSTM')

# Instantiate the Siamese BiLSTM with the notebook's hyperparameters
print("Building Siamese BiLSTM model...")
bilstm_model = build_siamese_bilstm(
    vocab_size,
    embedding_dim=128,
    lstm_units=128,
    maxlen=120,
)

# Adam at 1e-3 with binary cross-entropy; accuracy is tracked as a metric
bilstm_model.compile(
    loss='binary_crossentropy',
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    metrics=['accuracy'],
)

# Layer-by-layer overview of the model
bilstm_model.summary()


In [None]:
# Stop once val_loss has gone 4 epochs without improving, and checkpoint the
# best full model (not just the weights) to bilstm_model.h5
bilstm_callbacks = [
    EarlyStopping(
        monitor='val_loss',
        patience=4,
        restore_best_weights=True,
        verbose=1,
    ),
    ModelCheckpoint(
        'bilstm_model.h5',
        monitor='val_loss',
        save_best_only=True,
        save_weights_only=False,
        verbose=1,
    ),
]

# Fit for at most 25 epochs; wall-clock time of the whole fit call is recorded
print("Training Siamese BiLSTM model...")
start_time = time.time()

bilstm_history = bilstm_model.fit(
    x=[X1_train, X2_train],
    y=y_train,
    batch_size=64,
    epochs=25,
    validation_data=([X1_val, X2_val], y_val),
    callbacks=bilstm_callbacks,
    verbose=1,
)

bilstm_training_time = time.time() - start_time
print(f"\nTraining completed in {bilstm_training_time:.2f} seconds")
print("Best model weights restored automatically by EarlyStopping")


## 4. Model 2: Attention-based Encoder


In [None]:
# Running id handed to each attention block so that its layers get unique names.
_ATTENTION_BLOCK_IDS = itertools.count(1)


def attention_layer(inputs, attention_dim=128, name_prefix=None):
    """
    Pool a sequence of vectors into one vector with learned attention weights.

    A Dense(attention_dim, tanh) -> Dense(1) scorer yields one score per
    position along axis 1; the scores are softmax-normalised over that axis
    and used as weights for a weighted sum of `inputs` over the same axis.

    Every call creates a fresh set of layers. Keras refuses to build a
    functional Model that contains two layers with the same name, so layer
    names carry a per-call prefix; with fixed names a model could only call
    this function once.

    NOTE(review): because each call creates its own layers, two calls inside
    one model do not share attention weights — confirm that is intended for
    the Siamese setup.
    NOTE(review): no padding mask is applied to the scores explicitly here;
    whether an upstream Keras mask reaches the Softmax layer through the
    Lambda squeeze should be verified for the Keras version in use.

    Args:
        inputs: Keras tensor; axis 1 is treated as the sequence axis and the
            last axis as features (assumed (batch, seq, features)).
        attention_dim: Width of the hidden scoring layer.
        name_prefix: Prefix for the names of the layers created by this call.
            When None, 'attention_<k>' is used, where k is a counter that
            increases by one on every such call.

    Returns:
        Keras tensor with the sequence axis summed out.
    """
    if name_prefix is None:
        name_prefix = f'attention_{next(_ATTENTION_BLOCK_IDS)}'

    # Score every timestep: (batch, seq, features) -> (batch, seq, 1)
    scores = layers.Dense(attention_dim, activation='tanh', name=f'{name_prefix}_dense1')(inputs)
    scores = layers.Dense(1, activation=None, name=f'{name_prefix}_dense2')(scores)

    # Drop the trailing singleton axis: (batch, seq, 1) -> (batch, seq)
    scores = layers.Lambda(lambda x: tf.squeeze(x, axis=-1), name=f'{name_prefix}_squeeze')(scores)

    # Normalise along the sequence dimension (axis=1)
    weights = layers.Softmax(axis=1, name=f'{name_prefix}_softmax')(scores)

    # Restore the trailing axis so the weights broadcast over features
    weights = layers.Lambda(lambda x: tf.expand_dims(x, axis=-1), name=f'{name_prefix}_expand')(weights)

    # Weight each timestep, then sum over the sequence axis for a fixed-size vector
    weighted = layers.Multiply(name=f'{name_prefix}_apply')([inputs, weights])
    pooled = layers.Lambda(lambda x: tf.reduce_sum(x, axis=1), name=f'{name_prefix}_sum')(weighted)

    return pooled

def build_attention_encoder(vocab_size, embedding_dim=128, lstm_units=128, maxlen=120, attention_dim=128):
    """
    Construct the (uncompiled) attention-pooled similarity classifier.

    Both inputs go through one shared Embedding and one shared Bidirectional
    LSTM that returns the full sequence of states. Each branch is then
    reduced to a single vector by its own call to attention_layer. The two
    vectors u and v are merged as [u, v, |u - v|, u * v] and passed through
    Dense(128, relu) -> Dropout(0.3) -> Dense(64, relu) -> Dense(1, sigmoid).

    Args:
        vocab_size: Input dimension of the Embedding layer.
        embedding_dim: Output dimension of the Embedding layer.
        lstm_units: Number of units in each LSTM direction.
        maxlen: Length of each input sequence.
        attention_dim: Forwarded to attention_layer as its attention_dim.

    Returns:
        A keras Model named 'Attention_Encoder' with inputs [input1, input2]
        and a single sigmoid output.
    """
    # Two inputs, one per clause of the pair
    left_tokens = layers.Input(shape=(maxlen,), name='input1')
    right_tokens = layers.Input(shape=(maxlen,), name='input2')

    # Layers instantiated once and applied to both branches, so weights are shared.
    # return_sequences=True keeps every timestep for the attention pooling below.
    shared_embedding = layers.Embedding(vocab_size, embedding_dim, mask_zero=True, name='embedding')
    shared_encoder = layers.Bidirectional(
        layers.LSTM(lstm_units, return_sequences=True),
        name='bilstm',
    )

    # Embed both branches first, then encode both (same order as layer creation)
    left_embedded, right_embedded = (shared_embedding(tokens) for tokens in (left_tokens, right_tokens))
    left_states, right_states = (shared_encoder(embedded) for embedded in (left_embedded, right_embedded))

    # Attention pooling, one call per branch
    u = attention_layer(left_states, attention_dim=attention_dim)
    v = attention_layer(right_states, attention_dim=attention_dim)

    # Pair features: [u, v, |u-v|, u*v]
    abs_difference = layers.Lambda(lambda pair: tf.abs(pair[0] - pair[1]))([u, v])
    product = layers.Lambda(lambda pair: pair[0] * pair[1])([u, v])
    pair_features = layers.Concatenate()([u, v, abs_difference, product])

    # Classification head
    hidden = layers.Dense(128, activation='relu', name='dense1')(pair_features)
    hidden = layers.Dropout(0.3, name='dropout1')(hidden)
    hidden = layers.Dense(64, activation='relu', name='dense2')(hidden)
    similarity = layers.Dense(1, activation='sigmoid', name='output')(hidden)

    return Model(inputs=[left_tokens, right_tokens], outputs=similarity, name='Attention_Encoder')

# Instantiate the attention encoder with the notebook's hyperparameters
print("Building Attention-based Encoder model...")
attention_model = build_attention_encoder(
    vocab_size,
    embedding_dim=128,
    lstm_units=128,
    maxlen=120,
    attention_dim=128,
)

# Adam at 1e-3 with binary cross-entropy; accuracy is tracked as a metric
attention_model.compile(
    loss='binary_crossentropy',
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    metrics=['accuracy'],
)

# Layer-by-layer overview of the model
attention_model.summary()


In [None]:
# Stop once val_loss has gone 4 epochs without improving, and checkpoint the
# best full model (not just the weights) to attention_model.h5
attention_callbacks = [
    EarlyStopping(
        monitor='val_loss',
        patience=4,
        restore_best_weights=True,
        verbose=1,
    ),
    ModelCheckpoint(
        'attention_model.h5',
        monitor='val_loss',
        save_best_only=True,
        save_weights_only=False,
        verbose=1,
    ),
]

# Fit for at most 25 epochs; wall-clock time of the whole fit call is recorded
print("Training Attention-based Encoder model...")
start_time = time.time()

attention_history = attention_model.fit(
    x=[X1_train, X2_train],
    y=y_train,
    batch_size=64,
    epochs=25,
    validation_data=([X1_val, X2_val], y_val),
    callbacks=attention_callbacks,
    verbose=1,
)

attention_training_time = time.time() - start_time
print(f"\nTraining completed in {attention_training_time:.2f} seconds")
print("Best model weights restored automatically by EarlyStopping")


## 5. Evaluation and Comparison


In [None]:
def evaluate_model(model, X1_test, X2_test, y_test, model_name):
    """
    Score a trained pair classifier on the test split.

    Probabilities come from model.predict on [X1_test, X2_test]; hard labels
    are obtained by thresholding them at 0.5.

    Args:
        model: Object exposing predict(inputs, batch_size=..., verbose=...).
        X1_test: First-clause inputs.
        X2_test: Second-clause inputs.
        y_test: Ground-truth labels.
        model_name: Label stored in the result under 'model_name'.

    Returns:
        dict with the scalar metrics ('accuracy', 'precision', 'recall',
        'f1_score', 'roc_auc'), the 'confusion_matrix', the ROC curve
        ('fpr', 'tpr'), the precision-recall curve ('precision_curve',
        'recall_curve'), the thresholded 'y_pred' and the raw 'y_pred_proba'
        as returned by model.predict.
    """
    probabilities = model.predict([X1_test, X2_test], batch_size=64, verbose=0)
    hard_labels = (probabilities > 0.5).astype(int).flatten()

    # Curves are built from the raw probabilities; thresholds are not kept
    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, probabilities)
    pr_precision, pr_recall, _ = precision_recall_curve(y_test, probabilities)

    return {
        'model_name': model_name,
        # Threshold-dependent metrics use the 0.5-thresholded labels
        'accuracy': accuracy_score(y_test, hard_labels),
        'precision': precision_score(y_test, hard_labels),
        'recall': recall_score(y_test, hard_labels),
        'f1_score': f1_score(y_test, hard_labels),
        # Ranking metric uses the probabilities
        'roc_auc': roc_auc_score(y_test, probabilities),
        'confusion_matrix': confusion_matrix(y_test, hard_labels),
        'fpr': false_pos_rate,
        'tpr': true_pos_rate,
        'precision_curve': pr_precision,
        'recall_curve': pr_recall,
        'y_pred': hard_labels,
        'y_pred_proba': probabilities,
    }

# Score each trained model on the held-out test split
print("Evaluating Siamese BiLSTM model...")
bilstm_results = evaluate_model(bilstm_model, X1_test, X2_test, y_test, "Siamese BiLSTM")

print("Evaluating Attention-based Encoder model...")
attention_results = evaluate_model(attention_model, X1_test, X2_test, y_test, "Attention Encoder")

# Side-by-side table: one row per model, one column per scalar metric.
# Maps each table column to the key it is read from in the result dicts.
column_keys = {
    'Model': 'model_name',
    'Accuracy': 'accuracy',
    'Precision': 'precision',
    'Recall': 'recall',
    'F1-Score': 'f1_score',
    'ROC-AUC': 'roc_auc',
}
comparison_df = pd.DataFrame({
    column: [results[key] for results in (bilstm_results, attention_results)]
    for column, key in column_keys.items()
})

print(
    "\n" + "="*80,
    "MODEL COMPARISON",
    "="*80,
    comparison_df.to_string(index=False),
    "="*80,
    sep="\n",
)


### Training Curves


In [None]:
# 2x2 figure: loss | accuracy on the top row, ROC | precision-recall below
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Per-epoch loss (train and validation) for both models
ax = axes[0, 0]
for curve, curve_label in [
    (bilstm_history.history['loss'], 'BiLSTM Train Loss'),
    (bilstm_history.history['val_loss'], 'BiLSTM Val Loss'),
    (attention_history.history['loss'], 'Attention Train Loss'),
    (attention_history.history['val_loss'], 'Attention Val Loss'),
]:
    ax.plot(curve, label=curve_label, linewidth=2)
ax.set(xlabel='Epoch', ylabel='Loss', title='Training and Validation Loss')
ax.legend()
ax.grid(True, alpha=0.3)

# Per-epoch accuracy (train and validation) for both models
ax = axes[0, 1]
for curve, curve_label in [
    (bilstm_history.history['accuracy'], 'BiLSTM Train Accuracy'),
    (bilstm_history.history['val_accuracy'], 'BiLSTM Val Accuracy'),
    (attention_history.history['accuracy'], 'Attention Train Accuracy'),
    (attention_history.history['val_accuracy'], 'Attention Val Accuracy'),
]:
    ax.plot(curve, label=curve_label, linewidth=2)
ax.set(xlabel='Epoch', ylabel='Accuracy', title='Training and Validation Accuracy')
ax.legend()
ax.grid(True, alpha=0.3)

# Test-set ROC curves, with the chance diagonal for reference
ax = axes[1, 0]
ax.plot(
    bilstm_results['fpr'], bilstm_results['tpr'],
    label=f"BiLSTM (AUC = {bilstm_results['roc_auc']:.3f})", linewidth=2,
)
ax.plot(
    attention_results['fpr'], attention_results['tpr'],
    label=f"Attention (AUC = {attention_results['roc_auc']:.3f})", linewidth=2,
)
ax.plot([0, 1], [0, 1], 'k--', label='Random', linewidth=1)
ax.set(xlabel='False Positive Rate', ylabel='True Positive Rate', title='ROC Curves')
ax.legend()
ax.grid(True, alpha=0.3)

# Test-set precision-recall curves
ax = axes[1, 1]
for curve_source, curve_label in [
    (bilstm_results, 'BiLSTM'),
    (attention_results, 'Attention'),
]:
    ax.plot(curve_source['recall_curve'], curve_source['precision_curve'], label=curve_label, linewidth=2)
ax.set(xlabel='Recall', ylabel='Precision', title='Precision-Recall Curves')
ax.legend()
ax.grid(True, alpha=0.3)

# Save a high-resolution copy before showing the figure
plt.tight_layout()
plt.savefig('training_curves.png', dpi=300, bbox_inches='tight')
plt.show()


### Confusion Matrices


In [None]:
# One heatmap per model, side by side
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# (axis, results dict, colour map, panel title) for each model
heatmap_panels = [
    (axes[0], bilstm_results, 'Blues', 'Siamese BiLSTM - Confusion Matrix'),
    (axes[1], attention_results, 'Greens', 'Attention Encoder - Confusion Matrix'),
]
for panel_ax, panel_results, panel_cmap, panel_title in heatmap_panels:
    sns.heatmap(
        panel_results['confusion_matrix'],
        annot=True,
        fmt='d',
        cmap=panel_cmap,
        ax=panel_ax,
        cbar_kws={'label': 'Count'},
    )
    panel_ax.set_xlabel('Predicted')
    panel_ax.set_ylabel('Actual')
    panel_ax.set_title(panel_title)

# Save a high-resolution copy before showing the figure
plt.tight_layout()
plt.savefig('confusion_matrices.png', dpi=300, bbox_inches='tight')
plt.show()


### Example Predictions


In [None]:
def analyze_predictions(model, X1_test, X2_test, y_test, pairs_df, test_indices, model_name, num_examples=5):
    """
    Print a few correctly and incorrectly classified test pairs.

    Predictions are obtained from model.predict and thresholded at 0.5. Up to
    num_examples hits and up to num_examples misses (in test-set order) are
    printed with their texts, labels, probability and a canned explanation.

    Args:
        model: Object exposing predict(inputs, batch_size=..., verbose=...).
        X1_test: First-clause inputs.
        X2_test: Second-clause inputs.
        y_test: Ground-truth labels, compared element-wise with predictions.
        pairs_df: DataFrame with 'text1' and 'text2' columns.
        test_indices: For each test position, the positional (iloc) row in
            pairs_df that the test sample came from.
        model_name: Name printed in the section headers.
        num_examples: Maximum number of examples per section.

    Returns:
        None. Output is written with print.
    """
    probabilities = model.predict([X1_test, X2_test], batch_size=64, verbose=0)
    predicted = (probabilities > 0.5).astype(int).flatten()

    # Test-set positions (0 .. len-1) of the first few hits and misses
    hit_positions = np.nonzero(predicted == y_test)[0][:num_examples]
    miss_positions = np.nonzero(predicted != y_test)[0][:num_examples]

    def clip(text):
        # Keep long clauses readable in the printed output
        return text[:150] + "..." if len(text) > 150 else text

    def explain_hit(true_label, pred_label):
        if true_label == 1:
            return "Reason: Semantic similarity correctly identified - both clauses are from same category"
        return "Reason: Correctly identified as different - clauses are from different categories"

    def explain_miss(true_label, pred_label):
        if true_label == 1 and pred_label == 0:
            return "Reason: False Negative - Similar clauses not recognized (possible lexical differences)"
        return "Reason: False Positive - Different clauses incorrectly matched (possible semantic overlap)"

    def report(title, positions, explain):
        # Shared printer for both sections; only the title and explanation differ
        print(f"\n{'='*80}")
        print(title)
        print(f"{'='*80}")
        for test_idx in positions:
            # Translate the test-set position into the pairs_df row
            orig_idx = test_indices[test_idx]
            pair_row = pairs_df.iloc[orig_idx]
            true_label = int(y_test[test_idx])
            pred_label = int(predicted[test_idx])
            proba = float(probabilities[test_idx][0])

            print(f"\nPair {orig_idx} (test idx {test_idx}):")
            print(f"Text1: {clip(pair_row['text1'])}")
            print(f"Text2: {clip(pair_row['text2'])}")
            print(f"True Label: {true_label}, Predicted: {pred_label}, Probability: {proba:.3f}")
            print(explain(true_label, pred_label))

    report(f"{model_name} - Correct Predictions", hit_positions, explain_hit)
    report(f"{model_name} - Incorrect Predictions", miss_positions, explain_miss)

# Print example hits and misses for each trained model, BiLSTM first
for analyzed_model, analyzed_name in (
    (bilstm_model, "Siamese BiLSTM"),
    (attention_model, "Attention Encoder"),
):
    analyze_predictions(
        analyzed_model, X1_test, X2_test, y_test, pairs_df, test_indices,
        analyzed_name, num_examples=5,
    )


## 6. Observations and Conclusion


### Performance Comparison

Based on the evaluation metrics:

**Which model generalizes better?**
- The model with higher validation accuracy and lower validation loss during training typically generalizes better.
- The model with higher test set ROC-AUC score shows better ability to distinguish between similar and dissimilar clauses.
- F1-score provides a balanced measure considering both precision and recall.

**Which is faster?**
- Training time comparison will be shown in the final metrics section below.
- Inference time is not measured in this notebook; the Siamese BiLSTM is generally expected to be faster at inference because its architecture is simpler (no attention layers).

**Observations about legal text challenges:**

1. **Legal Terminology**: Legal documents use specialized vocabulary and domain-specific terms that require careful semantic understanding.

2. **Long Dependencies**: Legal clauses often contain long sentences with complex dependencies that require models to capture long-range context.

3. **Subtle Differences**: Many legal clauses may appear similar but have critical differences (e.g., "shall" vs "may", specific conditions).

4. **Formal Language**: Legal text uses formal language patterns that differ from everyday text, making it challenging for models trained on general text.

5. **Context Sensitivity**: The same phrase might have different meanings in different legal contexts, requiring deep semantic understanding.

6. **Pair Creation Challenges**: Creating balanced positive/negative pairs is crucial - clauses from the same category should be semantically similar, but some categories may have more variation than others.

### Model Strengths and Weaknesses

**Siamese BiLSTM:**
- Strengths: Simpler architecture, faster training, good baseline performance
- Weaknesses: May struggle with long sequences; has no mechanism for weighting the most important words

**Attention-based Encoder:**
- Strengths: Can focus on important parts of the text, better handling of long sequences
- Weaknesses: More complex, potentially slower, may overfit with limited data

### Recommendations

1. **Data Augmentation**: Consider generating more training pairs, especially for underrepresented clause types.

2. **Hyperparameter Tuning**: Experiment with different embedding dimensions, LSTM units, and learning rates.

3. **Ensemble Methods**: Combining both models might improve performance.

4. **Domain-Specific Embeddings**: The embeddings used here are learned from scratch during training; embeddings pre-trained on legal text could help.

5. **Evaluation Metrics**: Consider additional metrics like MRR (Mean Reciprocal Rank) for ranking tasks.


In [None]:
# Repeat the metrics table so the notebook ends with the headline numbers
print(
    "\n" + "="*80,
    "FINAL METRICS COMPARISON",
    "="*80,
    comparison_df.to_string(index=False),
    "="*80,
    sep="\n",
)

# Wall-clock durations recorded around each model's fit call
print(
    "\nTraining Times:",
    f"BiLSTM: {bilstm_training_time:.2f} seconds",
    f"Attention: {attention_training_time:.2f} seconds",
    sep="\n",
)

# NOTE(review): the two .h5 files are written by the ModelCheckpoint callbacks;
# tokenizer.json and pairs.csv are not written by any cell in this notebook —
# presumably preprocessing.py produces them; verify.
print(
    "\nModels saved:",
    "- bilstm_model.h5",
    "- attention_model.h5",
    "- tokenizer.json",
    "- pairs.csv",
    sep="\n",
)
