<a href="https://colab.research.google.com/github/mallelamanojkumar90/AIML/blob/main/Week5_Day2_NN_Architecture_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Week 5, Day 2: Neural Network Architecture and Training

## Learning Objectives
- Understand different neural network architectures
- Learn optimization techniques
- Master regularization methods
- Practice building and training networks

## Topics Covered
1. Network Architectures
2. Optimization Methods
3. Regularization Techniques
4. Model Evaluation

In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam, SGD, RMSprop
from tensorflow.keras.regularizers import l1, l2
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## 1. Network Architectures

In [None]:
def compare_architectures():
    """Compare shallow, deep and wide dense networks on a synthetic regression task.

    Generates 1000 samples with 10 standard-normal features and a target equal
    to the sum of the squared features plus Gaussian noise, holds out 20% as a
    test set, standardizes the features, and trains three ReLU networks
    (Adam optimizer, MSE loss, 50 epochs, 20% validation split). A figure with
    the train/validation loss curves and a bar chart of the test loss is shown.

    Returns:
        None. Progress lines are printed and a matplotlib figure is displayed.
    """
    # Seed Python, NumPy and TensorFlow together: seeding NumPy alone leaves
    # the weight initialisation (and therefore every curve) different per run.
    tf.keras.utils.set_random_seed(42)

    # Generate synthetic data
    n_features = 10
    X = np.random.randn(1000, n_features)
    y = np.sum(X**2, axis=1) + np.random.normal(0, 0.1, 1000)

    # Split and scale data (explicit random_state instead of implicit global state)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # The scaler is fitted on the training split only, then reused for the test split.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Hidden-layer widths for each architecture under comparison.
    hidden_layouts = {
        'Shallow': (32,),
        'Deep': (16, 16, 16, 16),
        'Wide': (64, 64),
    }

    def build_network(hidden_units):
        """Return a ReLU MLP with the given hidden widths and one linear output."""
        # An explicit Input replaces the deprecated `input_shape=` layer argument.
        layers = [tf.keras.Input(shape=(n_features,))]
        layers.extend(Dense(units, activation='relu') for units in hidden_units)
        layers.append(Dense(1))
        return Sequential(layers)

    # Create models
    models = {
        name: build_network(hidden_units)
        for name, hidden_units in hidden_layouts.items()
    }

    # Compile, train and record the per-epoch history of every model
    histories = {}
    for name, model in models.items():
        model.compile(optimizer='adam', loss='mse')
        print(f"\nTraining {name} Network:")
        history = model.fit(X_train_scaled, y_train,
                            validation_split=0.2,
                            epochs=50,
                            verbose=0)
        histories[name] = history.history

    # Plot training curves
    plt.figure(figsize=(12, 5))

    plt.subplot(121)
    for name, history in histories.items():
        plt.plot(history['loss'], label=f'{name} (train)')
        plt.plot(history['val_loss'], '--', label=f'{name} (val)')
    plt.title('Training History')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.grid(True)

    # Compare final performance on the held-out test split
    results = [
        {
            'Architecture': name,
            'Test Loss': model.evaluate(X_test_scaled, y_test, verbose=0),
        }
        for name, model in models.items()
    ]

    plt.subplot(122)
    results_df = pd.DataFrame(results)
    sns.barplot(data=results_df, x='Architecture', y='Test Loss')
    plt.title('Test Performance')
    plt.grid(True)

    plt.tight_layout()
    plt.show()

compare_architectures()

## 2. Optimization Methods

In [None]:
def compare_optimizers():
    """Compare SGD, Adam and RMSprop on the same network and synthetic data.

    Generates 1000 samples with 5 standard-normal features and a target equal
    to the sum of the squared features plus Gaussian noise, holds out 20% as a
    test set, standardizes the features, and trains one two-hidden-layer ReLU
    network per optimizer (learning rate 0.01, MSE loss, 50 epochs, 20%
    validation split). The train/validation loss curves are plotted together
    and each optimizer's test MSE is printed.

    Returns:
        None. Progress lines are printed and a matplotlib figure is displayed.
    """
    seed = 42
    n_features = 5

    # Seed Python, NumPy and TensorFlow together so the run is repeatable.
    tf.keras.utils.set_random_seed(seed)

    # Generate data
    X = np.random.randn(1000, n_features)
    y = np.sum(X**2, axis=1) + np.random.normal(0, 0.1, 1000)

    # Split and scale data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=seed
    )

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Define optimizers (same learning rate for all three)
    optimizers = {
        'SGD': SGD(learning_rate=0.01),
        'Adam': Adam(learning_rate=0.01),
        'RMSprop': RMSprop(learning_rate=0.01)
    }

    # Create and train models
    histories = {}
    for name, optimizer in optimizers.items():
        # Re-seed before building each network so every optimizer starts from
        # the same initial weights; otherwise differences between the curves
        # mix optimizer behaviour with initialisation luck.
        tf.keras.utils.set_random_seed(seed)
        model = Sequential([
            # An explicit Input replaces the deprecated `input_shape=` argument.
            tf.keras.Input(shape=(n_features,)),
            Dense(32, activation='relu'),
            Dense(32, activation='relu'),
            Dense(1)
        ])

        model.compile(optimizer=optimizer, loss='mse')

        print(f"\nTraining with {name}:")
        history = model.fit(X_train_scaled, y_train,
                            validation_split=0.2,
                            epochs=50,
                            verbose=0)
        histories[name] = history.history

        # Use the held-out split that was previously built but never evaluated.
        test_loss = model.evaluate(X_test_scaled, y_test, verbose=0)
        print(f"{name} test MSE: {test_loss:.4f}")

    # Plot results
    plt.figure(figsize=(10, 5))
    for name, history in histories.items():
        plt.plot(history['loss'], label=f'{name} (train)')
        plt.plot(history['val_loss'], '--', label=f'{name} (val)')

    plt.title('Optimizer Comparison')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.grid(True)
    plt.show()

compare_optimizers()

## 3. Regularization Techniques

In [None]:
def compare_regularization():
    """Compare no regularization, L2, dropout and batch normalization.

    Generates 1000 samples with 20 standard-normal features where only the
    first 5 features drive the target (sum of their squares plus Gaussian
    noise), holds out 20% as a test set, standardizes the features, and
    trains four variants of a 64-64 ReLU network (Adam, MSE loss, 50 epochs,
    20% validation split). Training and validation MSE curves are plotted
    side by side and each variant's test MSE is printed.

    The curves show the MSE metric rather than the compiled loss: the L2
    model's loss also contains the weight penalty, so plotting `loss`
    directly would not be comparable across the variants.

    Returns:
        None. Progress lines are printed and a matplotlib figure is displayed.
    """
    # Seed Python, NumPy and TensorFlow together so the run is repeatable.
    tf.keras.utils.set_random_seed(42)

    # Generate data
    n_features = 20
    X = np.random.randn(1000, n_features)
    y = np.sum(X[:, :5]**2, axis=1) + np.random.normal(0, 0.1, 1000)

    # Split and scale data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    def build_model(l2_factor=None, make_extra_layer=None):
        """Return the 64-64 ReLU network with optional L2 penalty / extra layers.

        `make_extra_layer`, when given, is called once per hidden layer and the
        layer it returns is inserted directly after that hidden layer.
        """
        # An explicit Input replaces the deprecated `input_shape=` argument.
        layers = [tf.keras.Input(shape=(n_features,))]
        for _ in range(2):
            regularizer = l2(l2_factor) if l2_factor is not None else None
            layers.append(
                Dense(64, activation='relu', kernel_regularizer=regularizer)
            )
            if make_extra_layer is not None:
                layers.append(make_extra_layer())
        layers.append(Dense(1))
        return Sequential(layers)

    # Define models with different regularization
    models = {
        'Base': build_model(),
        'L2': build_model(l2_factor=0.01),
        'Dropout': build_model(make_extra_layer=lambda: Dropout(0.3)),
        'BatchNorm': build_model(make_extra_layer=BatchNormalization)
    }

    # Train models
    histories = {}
    for name, model in models.items():
        # Track plain MSE next to the loss; the explicit metric name fixes the
        # history keys to 'mse' / 'val_mse'.
        model.compile(optimizer='adam', loss='mse',
                      metrics=[tf.keras.metrics.MeanSquaredError(name='mse')])

        print(f"\nTraining {name} Model:")
        history = model.fit(X_train_scaled, y_train,
                            validation_split=0.2,
                            epochs=50,
                            verbose=0)
        histories[name] = history.history

        # evaluate() returns [loss, mse] because one metric is compiled in.
        _, test_mse = model.evaluate(X_test_scaled, y_test, verbose=0)
        print(f"{name} test MSE: {test_mse:.4f}")

    # Plot results
    plt.figure(figsize=(15, 5))

    plt.subplot(121)
    for name, history in histories.items():
        plt.plot(history['mse'], label=f'{name} (train)')
    plt.title('Training MSE')
    plt.xlabel('Epoch')
    plt.ylabel('MSE')
    plt.legend()
    plt.grid(True)

    plt.subplot(122)
    for name, history in histories.items():
        plt.plot(history['val_mse'], label=name)
    plt.title('Validation MSE')
    plt.xlabel('Epoch')
    plt.ylabel('MSE')
    plt.legend()
    plt.grid(True)

    plt.tight_layout()
    plt.show()

compare_regularization()

## Practical Exercises

In [None]:
# Exercise 1: Architecture Design

def architecture_exercise():
    """Build and display the noisy-circle dataset for the architecture exercise.

    Draws 1000 angles uniformly from [0, 2*pi), turns them into noisy
    (sin, cos) coordinates, and defines the target as
    sin(2t) + 0.5*cos(3t) plus Gaussian noise. The points are shown as a
    scatter plot coloured by target, then the task list is printed.

    Returns:
        None. The dataset stays local; the exercise code goes at the end.
    """
    np.random.seed(42)
    sample_count = 1000

    def jitter():
        # Fresh Gaussian noise (std 0.1), one value per sample.
        return np.random.normal(0, 0.1, sample_count)

    # Generate complex dataset. The draw order (angles, x-noise, y-noise,
    # target-noise) determines the data produced under the fixed seed.
    angle = np.random.uniform(0, 2*np.pi, sample_count)
    first_coord = np.sin(angle) + jitter()
    second_coord = np.cos(angle) + jitter()
    X = np.column_stack([first_coord, second_coord])
    y = np.sin(2*angle) + 0.5*np.cos(3*angle) + jitter()

    # Plot data
    plt.figure(figsize=(10, 5))
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap='viridis')
    plt.colorbar(label='Target')
    plt.title('Complex Dataset')
    plt.show()

    instructions = (
        "Task: Design a neural network architecture",
        "1. Experiment with different depths and widths",
        "2. Try different activation functions",
        "3. Add appropriate regularization",
        "4. Compare performance",
    )
    for line in instructions:
        print(line)

    # Your code here

architecture_exercise()

In [None]:
# Exercise 2: Optimization Challenge

def optimization_exercise():
    """Build and display the bumpy target surface for the optimization exercise.

    Samples 1000 points uniformly from [-5, 5] x [-5, 5] and defines the
    target as sin(x0)*cos(x1) + 0.1*(x0^2 + x1^2) plus Gaussian noise. The
    noise-free surface is drawn as a 20-level contour plot, then the task
    list is printed.

    Returns:
        None. The dataset stays local; the exercise code goes at the end.
    """
    def surface(a, b):
        # Noise-free target: an oscillating term on top of a quadratic bowl.
        return np.sin(a) * np.cos(b) + 0.1 * (a**2 + b**2)

    # Generate data with multiple local minima
    np.random.seed(42)
    X = np.random.uniform(-5, 5, (1000, 2))
    noise = np.random.normal(0, 0.1, 1000)
    y = surface(X[:, 0], X[:, 1]) + noise

    # Plot target function on a 100 x 100 grid over the sampled square
    axis = np.linspace(-5, 5, 100)
    xx, yy = np.meshgrid(axis, axis)
    zz = surface(xx, yy)

    plt.figure(figsize=(10, 8))
    plt.contour(xx, yy, zz, levels=20)
    plt.colorbar(label='Target')
    plt.title('Complex Target Function')
    plt.show()

    instructions = (
        "Task: Optimize neural network training",
        "1. Try different optimizers",
        "2. Tune learning rates",
        "3. Implement learning rate scheduling",
        "4. Compare convergence",
    )
    for line in instructions:
        print(line)

    # Your code here

optimization_exercise()

## MCQ Quiz

1. What is the advantage of deep networks?
   - a) Faster training
   - b) Fewer parameters
   - c) Hierarchical feature learning
   - d) Simpler optimization

2. Which optimizer is most commonly used as the default choice for training neural networks?
   - a) SGD
   - b) Adam
   - c) RMSprop
   - d) Adagrad

3. What does L2 regularization do?
   - a) Adds bias
   - b) Penalizes large weights
   - c) Speeds up training
   - d) Increases capacity

4. What is the purpose of dropout?
   - a) Feature selection
   - b) Prevent overfitting
   - c) Speed up training
   - d) Improve accuracy

5. When should you use batch normalization?
   - a) Only in shallow networks
   - b) Only in deep networks
   - c) With any architecture
   - d) Never with dropout

6. What is the vanishing gradient problem?
   - a) Loss becomes zero
   - b) Gradients become too small
   - c) Network becomes too large
   - d) Learning rate is too high

7. Which is NOT a way to prevent overfitting?
   - a) Dropout
   - b) L2 regularization
   - c) Increasing model size
   - d) Early stopping

8. What does learning rate scheduling do?
   - a) Increases model capacity
   - b) Adjusts learning rate during training
   - c) Selects best architecture
   - d) Prevents overfitting

9. Which is true about skip connections?
   - a) Only used in CNNs
   - b) Help gradient flow
   - c) Increase parameters
   - d) Slow down training

10. What is the benefit of wide networks?
    - a) Fewer parameters
    - b) Better generalization
    - c) More feature capacity
    - d) Faster training

Answers: 1-c, 2-b, 3-b, 4-b, 5-c, 6-b, 7-c, 8-b, 9-b, 10-c