# 2. Federated Learning Training

**Project:** IoT Network Attack Detection using Federated Learning  
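**Algorithm:** FedAvg (Federated Averaging) or FedMade (contribution-score-based aggregation), selected via `aggregation.method` in the training config  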
**Author:** Nguyen Duc Thang

---

## 📋 Objectives

1. Load preprocessed client data
2. Create global model (TensorFlow DNN or PyTorch TabTransformer, depending on `framework` in the config)
3. Initialize Federated Server and Clients
4. Run FL training loop (30-50 rounds)
   - Server broadcasts model → Clients train locally → Server aggregates (FedAvg or FedMade)
5. Save trained model and training history

---

## 🎯 Expected Outputs

- `../Output/models/global_model.h5` (TensorFlow path) or `../Output/models/global_model.pth` (PyTorch path)
- `../Output/metrics/training_history.json`


## 1. Setup and Imports


In [1]:
# Standard libraries
import os
import sys
import numpy as np
import json
import yaml
import matplotlib.pyplot as plt
from datetime import datetime

# Framework-agnostic imports
import torch  # PyTorch (only used if framework='pytorch')

# TensorFlow/Keras
import tensorflow as tf
from tensorflow import keras

# Import our utility modules
from utils import data_utils, model_utils, fl_utils
from utils.fedmade_aggregation import fedmade_aggregate_with_fallback

# Set random seeds for reproducibility.
# All three libraries are seeded because either backend may be selected later
# via config['framework']; without the PyTorch seed the TabTransformer path
# (weight init, DataLoader shuffling) would differ between runs.
np.random.seed(42)
tf.random.set_seed(42)
torch.manual_seed(42)

# Check GPU availability.
# NOTE: this is TensorFlow's view of the hardware only; the PyTorch path
# picks its own device separately via torch.cuda.is_available().
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    print(f"‚úÖ GPU available: {len(gpus)} device(s)")
    for gpu in gpus:
        print(f"   {gpu}")
else:
    print("‚ö†Ô∏è  No GPU found. Training will use CPU (slower).")

print(f"\n‚úÖ TensorFlow version: {tf.__version__}")
print(f"‚úÖ Keras version: {keras.__version__}")

2025-12-30 17:30:28.865621: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


‚ö†Ô∏è  No GPU found. Training will use CPU (slower).

‚úÖ TensorFlow version: 2.11.0
‚úÖ Keras version: 2.11.0


## 2. Load Configuration


In [2]:
# Read the YAML training configuration into the global `config` dict,
# then echo the settings the rest of the notebook depends on.
config_path = 'configs/training_config.yaml'

with open(config_path, 'r') as config_file:
    config = yaml.safe_load(config_file)

print("üìÑ Configuration loaded:")

# Federated-learning schedule
print(f"\nüîß FL Settings:")
print(f"   Number of clients: {config['num_clients']}")
print(f"   Number of rounds: {config['num_rounds']}")
print(f"   Local epochs: {config['local_epochs']}")
print(f"   Batch size: {config['batch_size']}")

# Model section of the config
print(f"\nüèóÔ∏è  Model Architecture:")
model_cfg = config['model']
print(f"   Input dim: {model_cfg['input_dim']}")
print(f"   Hidden layers: {model_cfg['hidden_layers']}")
print(f"   Output classes: {model_cfg['num_classes']}")
print(f"   Dropout rate: {model_cfg['dropout_rate']}")

# Optimizer section of the config
print(f"\n‚öôÔ∏è  Optimizer:")
optimizer_cfg = config['optimizer']
print(f"   Type: {optimizer_cfg['type']}")
print(f"   Learning rate: {optimizer_cfg['learning_rate']}")

üìÑ Configuration loaded:

üîß FL Settings:
   Number of clients: 5
   Number of rounds: 30
   Local epochs: 10
   Batch size: 256

üèóÔ∏è  Model Architecture:
   Input dim: 46
   Hidden layers: [128, 64, 32]
   Output classes: 34
   Dropout rate: 0.3

‚öôÔ∏è  Optimizer:
   Type: adam
   Learning rate: 0.001


In [3]:
# ============================================================================
# LOAD AGGREGATION CONFIGURATION
# ============================================================================
# Defines three globals used by later cells:
#   agg_method                  - value of config['aggregation']['method']
#   fedmade_config              - FedMade settings dict (empty for other methods
#                                 when the config has no 'fedmade' section)
#   contribution_scores_history - list filled during FedMade training, else None

aggregation_config = config['aggregation']
agg_method = aggregation_config['method']

if agg_method == 'fedmade':
    # FedMade needs its settings; a missing section is a real config error.
    fedmade_config = aggregation_config['fedmade']
else:
    # Other methods never read these settings, so a config without a
    # 'fedmade' section must not fail with KeyError here.
    fedmade_config = aggregation_config.get('fedmade') or {}

print("="*80)
print(f"CHI·∫æN L∆Ø·ª¢C AGGREGATION: {agg_method.upper()}")
print("="*80)

if agg_method == 'fedmade':
    print(f"\n‚öôÔ∏è  C·∫•u h√¨nh FedMade:")
    print(f"   Contribution threshold: {fedmade_config['contribution_threshold']}")
    print(f"   Accuracy weight: {fedmade_config['accuracy_weight']}")
    print(f"   Loss weight: {fedmade_config['loss_weight']}")
    print(f"   Verbose logging: {fedmade_config['verbose']}")
    print(f"\nüí° FedMade s·∫Ω t√≠nh ƒëi·ªÉm ƒë√≥ng g√≥p ƒë·ªông d·ª±a tr√™n performance c·ªßa clients")
else:
    print(f"\nüìä S·ª≠ d·ª•ng FedAvg (trung b√¨nh c·ªông chu·∫©n)")

# Initialize storage for contribution scores (only tracked for FedMade)
contribution_scores_history = [] if agg_method == 'fedmade' else None


CHI·∫æN L∆Ø·ª¢C AGGREGATION: FEDMADE

‚öôÔ∏è  C·∫•u h√¨nh FedMade:
   Contribution threshold: 0.2
   Accuracy weight: 0.7
   Loss weight: 0.3
   Verbose logging: True

üí° FedMade s·∫Ω t√≠nh ƒëi·ªÉm ƒë√≥ng g√≥p ƒë·ªông d·ª±a tr√™n performance c·ªßa clients


## 3. Load Preprocessed Data


In [4]:
# Directory holding the preprocessed per-client and test splits
data_dir = '../Output/data'

print("üìÇ Loading client data...\n")

# Read each client's split and keep it under its 'client_<i>' name
client_datasets = {}
for client_idx in range(config['num_clients']):
    client_name = f'client_{client_idx}'
    client_datasets[client_name] = data_utils.load_client_data(data_dir, client_name)
    sample_count = len(client_datasets[client_name]['X'])
    print(f"   ‚úì {client_name}: {sample_count:,} samples")

# The held-out test split is read through the same loader under the name 'test'
test_data = data_utils.load_client_data(data_dir, 'test')
X_test, y_test = test_data['X'], test_data['y']
print(f"   ‚úì test: {len(X_test):,} samples")

print(f"\n‚úÖ All data loaded successfully!")

üìÇ Loading client data...

   ‚úì client_0: 1,193,393 samples
   ‚úì client_1: 197,234 samples
   ‚úì client_2: 323,594 samples
   ‚úì client_3: 215,564 samples
   ‚úì client_4: 164,893 samples
   ‚úì test: 523,670 samples

‚úÖ All data loaded successfully!


## 4. Create Global Model


In [5]:
# ============================================================================
# CREATE GLOBAL MODEL (Framework-Agnostic)
# ============================================================================
# Builds `global_model` for the backend named in the config. On the PyTorch
# path this cell also defines `feature_config` and `device`, which later
# cells read.

# Backend selection; TensorFlow is used when the config omits 'framework'
framework = config.get('framework', 'tensorflow')

print(f"üîß Selected Framework: {framework.upper()}")
print(f"{'='*80}\n")

if framework != 'pytorch':
    # ---------- TensorFlow DNN path ----------
    print("üèóÔ∏è  Creating TensorFlow DNN model...\n")

    # Build + compile via the project helper, then show its summary
    global_model = model_utils.create_and_compile_model(config)
    model_utils.print_model_summary(global_model)

else:
    # ---------- PyTorch TabTransformer path ----------
    print("üèóÔ∏è  Creating PyTorch TabTransformer model...\n")

    from utils.model_utils_pytorch import create_tabtransformer_from_config
    import os
    import json

    # Feature configuration is read from disk, or synthesized and persisted
    # when the file does not exist yet.
    feature_config_path = '../Output/models/feature_config.json'

    if not os.path.exists(feature_config_path):
        print(f"‚ö†Ô∏è  Feature config not found at: {feature_config_path}")
        print("   Creating a default feature_config.json (safe fallback)...")

        # Start from whatever the training config already provides
        feature_config = dict(config.get('features', {}) or {})
        input_dim = int(config.get('model', {}).get('input_dim', 46))

        # Without explicit cardinalities, fall back to the project default
        # split: the first 20 columns are treated as categorical
        # (see utils/fl_utils_pytorch.split_features).
        if not feature_config.get('categorical_cardinalities'):
            num_categorical = min(20, input_dim)
            feature_config.update({
                'categorical_indices': list(range(num_categorical)),
                'numerical_indices': list(range(num_categorical, input_dim)),
                'categorical_cardinalities': [50] * num_categorical,
            })

        # Derived counts are always recomputed from the cardinality list
        categorical_count = len(feature_config.get('categorical_cardinalities', []))
        feature_config['num_categorical'] = categorical_count
        feature_config['num_numerical'] = input_dim - feature_config['num_categorical']
        feature_config['total_features'] = input_dim

        os.makedirs(os.path.dirname(feature_config_path), exist_ok=True)
        with open(feature_config_path, 'w') as config_out:
            json.dump(feature_config, config_out, indent=2)

        print(f"   ‚úì Saved: {feature_config_path}")
    else:
        with open(feature_config_path, 'r') as config_in:
            feature_config = json.load(config_in)

    print(f"üìä Feature Configuration:")
    print(f"   Categorical features: {feature_config['num_categorical']}")
    print(f"   Numerical features: {feature_config['num_numerical']}")
    print(f"   Total features: {feature_config['total_features']}\n")

    # The model factory reads the feature layout from the training config
    config['features'] = feature_config
    global_model = create_tabtransformer_from_config(config)

    # Move the model to the GPU when one is visible to PyTorch
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    global_model.to(device)
    print(f"\nüíª Device: {device}")

    if device == 'cpu':
        print("   ‚ö†Ô∏è  Using CPU. Training will be slower.")

print(f"\n{'='*80}")

üîß Selected Framework: PYTORCH

üèóÔ∏è  Creating PyTorch TabTransformer model...

‚ö†Ô∏è  Feature config not found at: ../Output/models/feature_config.json
   Creating a default feature_config.json (safe fallback)...
   ‚úì Saved: ../Output/models/feature_config.json
üìä Feature Configuration:
   Categorical features: 20
   Numerical features: 26
   Total features: 46


üèóÔ∏è  Created TabTransformer model:
   Categorical features: 20
   Numerical features: 26
   Total features: 46
   Embedding dimension: 32
   Transformer layers: 2
   Attention heads: 4
   Output classes: 34
   Total parameters: 66,178
   Trainable parameters: 66,178
   Model size: 258.51 KB (FP32)

üíª Device: cpu
   ‚ö†Ô∏è  Using CPU. Training will be slower.



## 5. Initialize Federated Server and Clients


In [6]:
# ============================================================================
# INITIALIZE FEDERATED SERVER AND CLIENTS (Framework-Agnostic)
# ============================================================================
# PyTorch path: defines `client_loaders` and `test_loader`.
# TensorFlow path: defines `server` and `clients`.

if framework != 'pytorch':
    # ---------- TensorFlow path ----------

    # The server wraps the global model
    print("üñ•Ô∏è  Initializing Federated Server...")
    server = fl_utils.FederatedServer(model=global_model)
    print("   ‚úì Server initialized\n")

    # One FederatedClient per preprocessed client split
    print("üë• Initializing Federated Clients...")
    clients = []
    for client_idx in range(config['num_clients']):
        client_name = f'client_{client_idx}'
        client_data = client_datasets[client_name]

        clients.append(
            fl_utils.FederatedClient(
                client_id=client_idx,
                X_train=client_data['X'],
                y_train=client_data['y']
            )
        )

    print(f"\n‚úÖ {len(clients)} clients initialized!")

else:
    # ---------- PyTorch path ----------
    from utils.fl_utils_pytorch import create_data_loaders

    print("üì° Creating PyTorch DataLoaders for clients...")

    # One shuffled loader per client, in client-index order
    client_loaders = []
    for client_idx in range(config['num_clients']):
        client_name = f'client_{client_idx}'
        client_data = client_datasets[client_name]

        client_loaders.append(
            create_data_loaders(
                X=client_data['X'],
                y=client_data['y'],
                batch_size=config['batch_size'],
                shuffle=True
            )
        )
        print(f"   ‚úì {client_name}: {len(client_data['X']):,} samples")

    # The test loader is not shuffled
    test_loader = create_data_loaders(
        X=X_test,
        y=y_test,
        batch_size=config['batch_size'],
        shuffle=False
    )
    print(f"   ‚úì test: {len(X_test):,} samples")

    print(f"\n‚úÖ {len(client_loaders)} PyTorch DataLoaders created!")

üì° Creating PyTorch DataLoaders for clients...
   ‚úì client_0: 1,193,393 samples
   ‚úì client_1: 197,234 samples
   ‚úì client_2: 323,594 samples
   ‚úì client_3: 215,564 samples
   ‚úì client_4: 164,893 samples
   ‚úì test: 523,670 samples

‚úÖ 5 PyTorch DataLoaders created!


## 6. Run Federated Learning Training

This is the main training loop. It will take several hours depending on:

- Dataset size
- Number of rounds
- Hardware (GPU vs CPU)

**Estimated time:**

- With GPU: 4-6 hours (full dataset, 30 rounds)
- With CPU: 8-12 hours (full dataset, 30 rounds)
- With 10% sample: 30-60 minutes


In [None]:
# ============================================================================
# RUN FEDERATED LEARNING TRAINING (Framework-Agnostic)
# ============================================================================
# Produces `training_history` (keys 'round', 'accuracy', 'loss') plus the
# timing globals `start_time`, `end_time` and `training_duration`.

# Wall-clock start of the run
start_time = datetime.now()
print(f"üïê Training started at: {start_time.strftime('%Y-%m-%d %H:%M:%S')}\n")

if framework != 'pytorch':
    # ---------- TensorFlow training ----------
    training_history = fl_utils.federated_training_loop(
        server=server,
        clients=clients,
        X_test=X_test,
        y_test=y_test,
        num_rounds=config['num_rounds'],
        local_epochs=config['local_epochs'],
        batch_size=config['batch_size'],
        verbose=1
    )

else:
    # ---------- PyTorch training ----------
    from utils.fl_utils_pytorch import federated_training_loop_pytorch

    # FedMade settings are only forwarded when FedMade is the chosen method
    history = federated_training_loop_pytorch(
        global_model=global_model,
        client_data_loaders=client_loaders,
        test_loader=test_loader,
        num_rounds=config['num_rounds'],
        local_epochs=config['local_epochs'],
        learning_rate=config['optimizer']['learning_rate'],
        device=device,
        num_categorical=feature_config['num_categorical'],
        categorical_cardinalities=feature_config['categorical_cardinalities'],
        verbose=True,
        aggregation_method=agg_method,
        aggregation_config=fedmade_config if agg_method == 'fedmade' else None,
        client_metrics_history=contribution_scores_history
    )

    # Keep only the keys the TensorFlow history exposes, in the same order
    training_history = {key: history[key] for key in ('round', 'accuracy', 'loss')}

# Wall-clock end of the run and elapsed time
end_time = datetime.now()
training_duration = end_time - start_time

print(f"\nüïê Training completed at: {end_time.strftime('%Y-%m-%d %H:%M:%S')}")
print(f"‚è±Ô∏è  Total training time: {training_duration}")
print(f"   ({training_duration.total_seconds() / 60:.2f} minutes)")

üïê Training started at: 2025-12-30 17:31:03


FEDERATED LEARNING TRAINING (PyTorch + TabTransformer)
Number of clients: 5
Number of rounds: 30
Local epochs per round: 10
Learning rate: 0.001
Device: cpu


ROUND 1/30
üì° Distributing global model to 5 clients...

   Client 0 training...


## 7. Visualize Training Progress


In [None]:
# Draw accuracy and loss per round side by side, then report the last
# round's metrics. Defines `final_accuracy` and `final_loss` for later cells.
rounds = training_history['round']
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
accuracy_ax, loss_ax = axes[0], axes[1]

# Left panel: accuracy, with the 95% target drawn as a dashed reference line
accuracy_ax.plot(rounds, training_history['accuracy'],
                 marker='o', linewidth=2, markersize=6)
accuracy_ax.set_title('Global Model Accuracy vs Round', fontsize=14, fontweight='bold')
accuracy_ax.set_xlabel('Round', fontsize=12)
accuracy_ax.set_ylabel('Accuracy', fontsize=12)
accuracy_ax.grid(True, alpha=0.3)
accuracy_ax.set_ylim([0, 1])
accuracy_ax.axhline(y=0.95, color='r', linestyle='--', linewidth=2, label='Target (95%)')
accuracy_ax.legend()

# Right panel: loss
loss_ax.plot(rounds, training_history['loss'],
             marker='o', linewidth=2, markersize=6, color='orange')
loss_ax.set_title('Global Model Loss vs Round', fontsize=14, fontweight='bold')
loss_ax.set_xlabel('Round', fontsize=12)
loss_ax.set_ylabel('Loss', fontsize=12)
loss_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Metrics of the last recorded round
final_accuracy = training_history['accuracy'][-1]
final_loss = training_history['loss'][-1]

print(f"\nüìä Final Metrics:")
print(f"   Accuracy: {final_accuracy:.4f} ({final_accuracy*100:.2f}%)")
print(f"   Loss: {final_loss:.4f}")

if final_accuracy < 0.95:
    print(f"\n‚ö†Ô∏è  Target accuracy (>95%) not yet achieved.")
    print(f"   Consider increasing num_rounds or tuning hyperparameters.")
else:
    print(f"\n‚úÖ Target accuracy (>95%) achieved!")

## 8. Save Trained Model


In [None]:
# ============================================================================
# SAVE TRAINED MODEL (Framework-Agnostic)
# ============================================================================
# Writes the trained global model under ../Output/models and defines
# `model_path` and `model_size_mb`.

# Make sure the output directory exists
models_dir = '../Output/models'
os.makedirs(models_dir, exist_ok=True)

if framework != 'pytorch':
    # ---------- TensorFlow: file name comes from the config ----------
    model_path = os.path.join(models_dir, config['paths']['global_model'])
    server.model.save(model_path)
    print(f"   üíæ Global model saved to: {model_path}")

else:
    # ---------- PyTorch: only the state dict is written ----------
    model_path = os.path.join(models_dir, 'global_model.pth')
    torch.save(global_model.state_dict(), model_path)

    print(f"üíæ PyTorch model saved to: {model_path}")

# Size on disk in MiB, computed once for whichever branch ran
model_size_mb = os.path.getsize(model_path) / (1024 ** 2)

print(f"\n‚úÖ Model saved successfully!")
print(f"   Path: {model_path}")
print(f"   Size: {model_size_mb:.2f} MB")

In [None]:
# ============================================================================
# SAVE FEDMADE METRICS
# ============================================================================
# Writes the per-round contribution scores collected during FedMade training
# to the JSON file named in the FedMade config. Three cases are reported
# separately so "FedMade not used" is no longer printed when FedMade *was*
# enabled but nothing was recorded.

if agg_method != 'fedmade':
    print("\n‚è≠Ô∏è  B·ªè qua l∆∞u FedMade metrics (kh√¥ng s·ª≠ d·ª•ng FedMade)")
elif not contribution_scores_history:
    # FedMade ran (or was configured) but the history list is empty/None.
    print("\nWARNING: FedMade is enabled but no contribution scores were recorded; nothing to save.")
else:
    print("\n" + "="*80)
    print("L∆ØU FEDMADE METRICS")
    print("="*80)

    metrics_dir = fedmade_config['output_dir']
    os.makedirs(metrics_dir, exist_ok=True)

    # Save the contribution scores history
    scores_path = os.path.join(metrics_dir, fedmade_config['contribution_scores_json'])
    with open(scores_path, 'w') as f:
        json.dump(contribution_scores_history, f, indent=2)

    print(f"\nüíæ FedMade metrics ƒë√£ l∆∞u:")
    print(f"   - Contribution scores: {scores_path}")
    print(f"   - T·ªïng s·ªë rounds: {len(contribution_scores_history)}")

    print(f"\n‚úÖ C√≥ th·ªÉ s·ª≠ d·ª•ng scores n√†y ƒë·ªÉ v·∫Ω heatmap contribution")


## 9. Save Training History


In [None]:
# Assemble the run configuration, timing information, per-round history and
# final metrics into one dict and write it as JSON under ../Output/metrics.
# Defines `history_path` for the summary cell.

# `gpus` is TensorFlow's device list and says nothing about the PyTorch run;
# on the PyTorch path the training device chosen earlier is what was used.
if framework == 'pytorch':
    gpu_used = device == 'cuda'
else:
    gpu_used = len(gpus) > 0

history_to_save = {
    'config': {
        'num_clients': config['num_clients'],
        'num_rounds': config['num_rounds'],
        'local_epochs': config['local_epochs'],
        'batch_size': config['batch_size'],
        'learning_rate': config['optimizer']['learning_rate'],
        'model_architecture': config['model']['hidden_layers']
    },
    'training_info': {
        'start_time': start_time.strftime('%Y-%m-%d %H:%M:%S'),
        'end_time': end_time.strftime('%Y-%m-%d %H:%M:%S'),
        'duration_seconds': training_duration.total_seconds(),
        'gpu_used': gpu_used
    },
    'history': {
        'round': training_history['round'],
        'loss': [float(x) for x in training_history['loss']],  # Convert to float for JSON
        'accuracy': [float(x) for x in training_history['accuracy']]
    },
    'final_metrics': {
        'accuracy': float(final_accuracy),
        'loss': float(final_loss)
    }
}

# Save to JSON; the file name comes from config['paths']['training_history']
output_metrics_dir = '../Output/metrics'
os.makedirs(output_metrics_dir, exist_ok=True)

history_path = os.path.join(output_metrics_dir, config['paths']['training_history'])
with open(history_path, 'w') as f:
    json.dump(history_to_save, f, indent=2)

print(f"üíæ Training history saved to: {history_path}")

## 10. Quick Evaluation on Test Set


In [None]:
# Reload the saved model from disk and sanity-check it.
# Defines `test_loss` and `test_accuracy`, which the summary cell prints.
#
# The original cell assumed a Keras model (load_model / evaluate / predict).
# On the PyTorch path `model_path` is a bare state_dict, so that code cannot
# run; the PyTorch branch below verifies the checkpoint instead.
print("üîç Loading saved model for verification...")

if framework == 'pytorch':
    # Compare every tensor on disk with the in-memory global model so a
    # truncated or stale checkpoint is detected without needing the model's
    # constructor or forward signature.
    saved_state = torch.load(model_path, map_location='cpu')
    live_state = global_model.state_dict()
    mismatched = [
        name for name, tensor in live_state.items()
        if name not in saved_state
        or not torch.equal(saved_state[name], tensor.detach().cpu())
    ]
    if mismatched:
        raise RuntimeError(
            f"Saved checkpoint {model_path} does not match the trained model "
            f"(first mismatching entries: {mismatched[:5]})"
        )
    print(f"   Checkpoint verified: {len(live_state)} tensors match the trained model")

    # NOTE(review): the training loop was given `test_loader`, so the last
    # round's global metrics are presumably test-set metrics; confirm against
    # federated_training_loop_pytorch before relying on these numbers.
    print("\nReusing final-round global metrics from training_history (PyTorch path)...")
    test_loss = float(training_history['loss'][-1])
    test_accuracy = float(training_history['accuracy'][-1])
else:
    loaded_model = model_utils.load_model(model_path)

    # Evaluate on test set
    print("\nüìä Evaluating on test set...")
    test_loss, test_accuracy = loaded_model.evaluate(X_test, y_test, verbose=0)

print(f"\n‚úÖ Test Set Results:")
print(f"   Loss: {test_loss:.4f}")
print(f"   Accuracy: {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")

if framework == 'pytorch':
    # The TabTransformer's forward() inputs are not visible from this
    # notebook, so no per-sample predictions are attempted here.
    print("\nSample predictions skipped for the PyTorch model.")
else:
    # Generate predictions for a few samples
    print(f"\nüîÆ Sample predictions (first 10 test samples):")
    sample_predictions = loaded_model.predict(X_test[:10], verbose=0)
    predicted_classes = np.argmax(sample_predictions, axis=1)
    true_classes = y_test[:10]

    # zip() stops early, so a test set with fewer than 10 rows no longer
    # raises IndexError.
    for sample_no, (predicted, actual) in enumerate(zip(predicted_classes, true_classes), start=1):
        match = "‚úì" if predicted == actual else "‚úó"
        print(f"   {match} Sample {sample_no}: Predicted={predicted}, True={actual}")

## 11. Summary


In [None]:
# Final report: configuration, wall-clock time, test metrics and output paths.
# Reads globals set by earlier cells (config, training_duration,
# test_accuracy, test_loss, model_path, history_path).
print("="*80)
print("FEDERATED LEARNING TRAINING SUMMARY")
print("="*80)

print(f"\nüîß Training Configuration:")
print(f"   Clients: {config['num_clients']}")
print(f"   Rounds: {config['num_rounds']}")
print(f"   Local epochs: {config['local_epochs']}")
print(f"   Batch size: {config['batch_size']}")
print(f"   Learning rate: {config['optimizer']['learning_rate']}")

print(f"\n‚è±Ô∏è  Training Time:")
print(f"   Duration: {training_duration}")
print(f"   ({training_duration.total_seconds() / 60:.2f} minutes)")
print(f"   ({training_duration.total_seconds() / 3600:.2f} hours)")

print(f"\nüìä Final Results:")
print(f"   Test Accuracy: {test_accuracy*100:.2f}%")
print(f"   Test Loss: {test_loss:.4f}")

if test_accuracy >= 0.95:
    print(f"\n‚úÖ SUCCESS: Target accuracy (>95%) achieved!")
else:
    print(f"\n‚ö†Ô∏è  Target accuracy (>95%) not achieved.")
    print(f"   Gap: {(0.95 - test_accuracy)*100:.2f}%")
    print(f"\nüí° Suggestions:")
    print(f"   - Increase num_rounds to 50")
    print(f"   - Reduce learning_rate to 0.0005")
    # The old hard-coded "to 7" was a *decrease* for any config already
    # using more local epochs, so the advice is now relative to the
    # configured value.
    print(f"   - Increase local_epochs above {config['local_epochs']}")

print(f"\nüíæ Output Files:")
print(f"   Model: {model_path}")
print(f"   History: {history_path}")

print(f"\nüìù Next step: Run 3_Model_Evaluation_Export.ipynb")
print(f"   for detailed metrics and visualizations.")
print("="*80)