In [4]:
from tensorflow.keras.mixed_precision import set_global_policy
from tensorflow.keras import layers
import keras
import tensorflow as tf

**Mixed Precision Training**

In [2]:
# Switch the global Keras dtype policy to 'mixed_float16'. Layers created after
# this call use that policy unless they are given an explicit `dtype`.
set_global_policy('mixed_float16')

In [3]:
# Assemble the classifier layer by layer. The hidden layer takes its dtype
# from the global policy; the output layer is pinned to float32 explicitly.
model = keras.Sequential()
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(10, activation='softmax', dtype='float32'))  # float32 output

**Gradient Accumulation**

In [5]:
class GradientAccumulationModel(keras.Model):
    """Keras model whose ``train_step`` updates weights once every N batches.

    Each batch's gradients are divided by ``accumulation_steps`` and summed
    into per-variable accumulators. After ``accumulation_steps`` batches the
    summed gradients are handed to the optimizer and the accumulators and the
    counter are reset.

    This class defines no layers and no ``call``; subclasses supply the
    forward pass.

    Args:
        accumulation_steps: Number of batches to accumulate before each
            optimizer update. Must be at least 1.

    Raises:
        ValueError: If ``accumulation_steps`` is smaller than 1.
    """

    def __init__(self, accumulation_steps=4):
        super().__init__()
        if accumulation_steps < 1:
            raise ValueError(
                f"accumulation_steps must be >= 1, got {accumulation_steps}"
            )
        self.accumulation_steps = accumulation_steps
        # The counter must live in a tf.Variable: Keras traces train_step into
        # a graph, so a plain Python int would only change while tracing and
        # the "apply" branch would never be reached on later steps.
        self.accumulation_counter = tf.Variable(
            0, dtype=tf.int32, trainable=False, name="accumulation_counter"
        )
        # Created on the first train_step, once the trainable variables exist.
        self.accumulated_gradients = None

    def train_step(self, data):
        """Run one batch: accumulate gradients and apply them every N-th call.

        Args:
            data: An ``(x, y)`` pair of inputs and targets.

        Returns:
            A dict ``{'loss': loss}`` holding this batch's loss.
        """
        x, y = data
        with tf.GradientTape() as tape:
            y_pred = self(x, training=True)
            loss = self.compiled_loss(y, y_pred)
        # NOTE(review): no loss scaling is performed here. If this model is
        # trained under a float16 mixed-precision policy, confirm whether the
        # optimizer in use expects a scaled loss / unscaled gradients.

        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)

        if self.accumulated_gradients is None:
            # This Python branch runs only while tracing the first step.
            # init_scope lifts variable creation out of the traced graph.
            with tf.init_scope():
                self.accumulated_gradients = [
                    tf.Variable(tf.zeros(var.shape, dtype=var.dtype), trainable=False)
                    for var in trainable_vars
                ]

        for accumulator, grad in zip(self.accumulated_gradients, gradients):
            if grad is None:
                # Variable did not contribute to the loss; nothing to add.
                continue
            dense_grad = tf.convert_to_tensor(grad)  # densifies IndexedSlices
            accumulator.assign_add(
                dense_grad / tf.cast(self.accumulation_steps, dense_grad.dtype)
            )

        self.accumulation_counter.assign_add(1)

        def _apply_and_reset():
            # Snapshot the accumulators as tensors before they are cleared.
            summed = [tf.convert_to_tensor(acc) for acc in self.accumulated_gradients]
            self.optimizer.apply_gradients(zip(summed, trainable_vars))
            for acc in self.accumulated_gradients:
                acc.assign(tf.zeros_like(acc))
            self.accumulation_counter.assign(0)
            return tf.constant(True)

        # tf.cond keeps the decision inside the graph so it is re-evaluated on
        # every step rather than frozen at trace time.
        tf.cond(
            tf.equal(self.accumulation_counter, self.accumulation_steps),
            _apply_and_reset,
            lambda: tf.constant(False),
        )

        return {'loss': loss}

**Learning Rate Scheduling**

In [6]:
# Cosine annealing
# Cosine-decay schedule: starts at 1e-4 and decays over 1000 optimizer steps.
lr_schedule = keras.optimizers.schedules.CosineDecay(
    decay_steps=1_000,
    initial_learning_rate=1e-4,
)

In [8]:
# One-cycle policy
def one_cycle_lr(epoch, max_epochs, max_lr=0.001):
    """Triangular one-cycle learning rate for the given epoch.

    The rate rises linearly from 0 at epoch 0 to ``max_lr`` at the midpoint
    (``max_epochs / 2``) and then falls linearly back to 0 at ``max_epochs``.
    Both halves share the same peak, so the schedule is continuous at the
    midpoint.

    Args:
        epoch: Zero-based epoch index.
        max_epochs: Total number of epochs in the cycle. Must be positive.
        max_lr: Peak learning rate reached at the midpoint.

    Returns:
        The learning rate for ``epoch`` as a float, never negative.

    Raises:
        ValueError: If ``max_epochs`` is not positive.
    """
    if max_epochs <= 0:
        raise ValueError(f"max_epochs must be positive, got {max_epochs}")

    progress = epoch / max_epochs
    # Compare against the true midpoint (not max_epochs // 2) so that an odd
    # max_epochs cannot push the ramp-down branch above the peak.
    if progress < 0.5:
        scale = 2 * progress
    else:
        scale = 2 - 2 * progress
    # Clamp so epochs outside [0, max_epochs] never yield a negative rate.
    return max_lr * max(0.0, scale)
    
# LearningRateScheduler calls its schedule as schedule(epoch, current_lr), so
# one_cycle_lr cannot be passed directly: the current learning rate would land
# in the `max_epochs` parameter. Bind the total epoch count explicitly.
ONE_CYCLE_EPOCHS = 10  # placeholder: set to the `epochs` value passed to model.fit
lr_callback = keras.callbacks.LearningRateScheduler(
    lambda epoch, _current_lr: one_cycle_lr(epoch, ONE_CYCLE_EPOCHS)
)