In [None]:
def calculate_layers(V, H, I, N, h=None, g_size=None):
    """Print a per-component parameter-count breakdown and return the total.

    The total is ``V*H`` (embedding) + ``N`` blocks + one final norm of size
    ``H``. Each block is counted as two norms of size ``H``, the Q/K/V/O
    projections, and an MLP of ``3*H*I`` weights. No separate output/LM-head
    term is added.

    Parameters:
    - V: multiplied with H for the embedding table (presumably vocabulary size).
    - H: hidden dimension; every projection is sized from ``H*H``.
    - I: MLP inner dimension (presumably the intermediate size).
    - N: number of repeated blocks.
    - h: accepted for interface compatibility but never read by this function
      (presumably the number of attention heads).
    - g_size: optional divisor applied to the K and V projections only
      (presumably the number of query heads sharing one K/V head). ``None``
      means K and V are full ``H*H``.

    Returns:
    - The total parameter count. It is an ``int`` whenever the inputs are
      ints and ``g_size`` divides ``H*H`` exactly; otherwise the true-division
      result is kept, so the numeric value matches the previous behaviour.

    Raises:
    - ZeroDivisionError: if ``g_size`` is 0.
    """
    hidden_sq = H * H

    # Embedding Layer
    embedding_layer = V * H
    print(f"Embedding Layer: {embedding_layer}")

    # RMS Norm Layer
    rms_norm_layer = H
    print(f"RMS Norm Layer: {rms_norm_layer}")

    # Query (Q), Key (K), Value (V)
    q = hidden_sq  # Or can be H * h * (H / h), but it simplifies to H * H
    if g_size is None:
        k = v = hidden_sq
    else:
        # Parameter counts are whole numbers: keep them as exact ints when the
        # division is exact, instead of letting "/" turn everything into float.
        quotient, remainder = divmod(hidden_sq, g_size)
        k = v = quotient if remainder == 0 else hidden_sq / g_size
    print(f"Query (Q): {q}")
    print(f"Key (K): {k}")
    print(f"Value (V): {v}")

    # Output (O)
    o = hidden_sq
    print(f"Output (O): {o}")

    # MLP calculation: two H x I matrices plus one I x H matrix
    mlp = 2 * H * I + I * H
    print(f"MLP: {mlp}")

    # Attention Blocks
    attention_blocks = N * (2 * rms_norm_layer + q + k + v + o + mlp)
    print(f"Attention Blocks: {attention_blocks}")

    # All Layers (Sum of all components, including the final norm)
    all_layers = embedding_layer + attention_blocks + rms_norm_layer

    print(f"All Layers: {all_layers}")
    return all_layers

# Example: parameter breakdown for one configuration, with K/V at full size (g_size=1).
# NOTE(review): the saved output below shows "RMS Norm Layer: 1280", which a call
# with H=2048 cannot produce; the cell appears to have been edited after it last
# ran. Re-run to refresh the output.
calculate_layers(V=128256, H=2048, I=4480, N=12, g_size=1)

Embedding Layer: 164167680
RMS Norm Layer: 1280
Query (Q): 1638400
Key (K): 1638400.0
Value (V): 1638400.0
Output (O): 1638400
MLP: 17203200
Attention Blocks: 285112320.0
All Layers: 449281280.0


449281280.0

In [8]:
import json
# Relative path to the model configuration JSON that is passed to
# MFU_calculation_with_config in the call at the end of this cell.
config_path = './configs/model_configs/llama_750M_config.json'
def MFU_calculation_with_config(config_path, batch_size, sequence_length, number_of_GPU, GPU_peak_TFLOPS, iteration_time):
    """
    Calculates Model FLOPs Utilization (MFU) for a given model and hardware setup, using a configuration file.

    MFU is computed as (3 * forward-pass FLOPs per iteration) divided by
    (number_of_GPU * GPU_peak_TFLOPS * iteration_time), expressed as a percentage.

    Parameters:
    - config_path (str): Path to the model's configuration JSON file. It must
      provide non-zero 'vocab_size', 'num_attention_heads', 'hidden_size',
      'intermediate_size' and 'num_hidden_layers'.
    - batch_size (int): Batch size used in training.
    - sequence_length (int): Sequence length of the input data.
    - number_of_GPU (int): Number of GPUs used. Must be positive.
    - GPU_peak_TFLOPS (float): Theoretical peak TFLOPs of a single GPU. Must be positive.
    - iteration_time (float): Time taken for one iteration (in seconds). Must be positive.

    Returns:
    - str: MFU as a percentage rounded to 4 decimal places, e.g. 'MFU: 31.9310%'.

    Raises:
    - ValueError: if a required configuration key is missing or zero, or if
      any of number_of_GPU, GPU_peak_TFLOPS, iteration_time is not positive.
    - OSError / json.JSONDecodeError: if the file cannot be read or parsed.
    """
    # These three values form the denominator of the utilisation ratio; zero
    # would divide by zero and a negative value would yield a meaningless MFU.
    if number_of_GPU <= 0 or GPU_peak_TFLOPS <= 0 or iteration_time <= 0:
        raise ValueError("number_of_GPU, GPU_peak_TFLOPS and iteration_time must all be positive.")

    # Load configuration from the JSON file (JSON text is UTF-8 by specification)
    with open(config_path, 'r', encoding='utf-8') as config_file:
        config = json.load(config_file)

    required_keys = (
        'vocab_size',            # v: vocabulary size
        'num_attention_heads',   # n: number of attention heads
        'hidden_size',           # h: hidden state dimension
        'intermediate_size',     # i: SwiGLU projection dimension
        'num_hidden_layers',     # N: number of layers
    )

    # A key that is absent or set to 0 is treated as missing; name the culprits
    missing = [key for key in required_keys if not config.get(key, 0)]
    if missing:
        raise ValueError(
            "Configuration file is missing one or more required parameters: " + ", ".join(missing)
        )
    v, n, h, i, N = (config[key] for key in required_keys)

    b = batch_size
    s = sequence_length

    # FLOPs calculation for one forward pass.
    # Per layer (presumably: 6bsh^2 Q/K/V projections, 4bs^2h attention matmuls,
    # 3bs^2n softmax, 2bsh^2 output projection - TODO confirm against the
    # reference derivation), plus 6bshi for the MLP and 2bshv for the vocabulary
    # projection.
    # NOTE(review): K/V projections are counted at full hidden size; no
    # grouped-query setting is read from the config.
    flops_per_forward = (
        N * (6 * b * s * h**2 + 4 * b * s**2 * h + 3 * b * s**2 * n + 2 * b * s * h**2)
        + N * (6 * b * s * h * i)
        + 2 * b * s * h * v
    )

    # Forward-backward pass is roughly 3 times the forward pass FLOPs
    flops_per_forward_backward = (3 * flops_per_forward) / 10**12  # Convert to TFLOPs

    # Peak TFLOPs the hardware could deliver during one iteration
    GPU_TFLOPs_per_iteration = number_of_GPU * GPU_peak_TFLOPS * iteration_time

    # Calculate MFU
    MFU = (flops_per_forward_backward / GPU_TFLOPs_per_iteration) * 100

    # Return MFU rounded to 4 decimal places
    return f"MFU: {MFU:.4f}%"

# Example run: MFU for the configuration at config_path on 4 GPUs
MFU_calculation_with_config(
    config_path=config_path,
    batch_size=240,
    sequence_length=1024,
    number_of_GPU=4,
    GPU_peak_TFLOPS=149.7,
    iteration_time=6.3,
)

'MFU: 31.9310%'

In [3]:
# NOTE(review): scratch arithmetic; the meaning of 36 and 4 is not stated in
# this notebook. Name the operands or remove the cell.
36 * 4

144

In [4]:
# Constants
seconds_per_day = 24 * 60 * 60
training_days = 2         # Length of the compute budget in days
gpu_flops = 149.7 * 10**12  # Peak throughput of one GPU in FLOP/s (149.7 TFLOP/s)
day_time = seconds_per_day * training_days  # Total time budget in seconds (two days, not one)
num_gpus = 4              # Number of GPUs
mfu = 0.24                # Model utilization factor (fraction of peak actually used)

# Calculate utilized FLOPs (C): peak FLOP/s x seconds x GPUs x utilization
C = gpu_flops * day_time * num_gpus * mfu
print(f"Utilized FLOPs (C): {C:.2e}")

# Define functions for N_opt(C), D_opt(C), and L_opt(C)
def N_opt(C):
    """Return the optimal parameter count for a utilized-FLOPs budget C.

    Evaluates the hard-coded power law 0.6 * C**0.45. The source of the
    coefficients is not cited in this notebook.
    """
    coefficient, exponent = 0.6, 0.45
    return coefficient * C ** exponent

def D_opt(C):
    """Return the optimal dataset size for a utilized-FLOPs budget C.

    Evaluates the hard-coded power law 0.29 * C**0.53. The source of the
    coefficients is not cited in this notebook.
    """
    coefficient, exponent = 0.29, 0.53
    return coefficient * C ** exponent

def L_opt(C):
    """Return the optimal loss for a utilized-FLOPs budget C.

    Evaluates the hard-coded fit 1070 * C**-0.154 + 1.7: a decaying power-law
    term plus a constant offset of 1.7. The source of the coefficients is not
    cited in this notebook.
    """
    scale, exponent, offset = 1070, -0.154, 1.7
    return scale * C ** exponent + offset

# Evaluate the three fits at the compute budget C
N_opt_value, D_opt_value, L_opt_value = N_opt(C), D_opt(C), L_opt(C)

# Report: parameter count in millions, dataset size in billions, loss in scientific notation
print(f"N_opt(C): {N_opt_value / 10**6} Million")
print(f"D_opt(C): {D_opt_value / 10**9} Billion")
print(f"L_opt(C): {L_opt_value:.2e}")

Utilized FLOPs (C): 2.48e+19
N_opt(C): 320.5662505403389 Million
D_opt(C): 5.51781946390067 Billion
L_opt(C): 2.80e+00


In [5]:
# Divides the compute budget C by 6 * D_opt_value and scales the result by 1e6.
# NOTE(review): presumably solves C ~= 6 * N * D for N (in millions) given the
# dataset size from D_opt. TODO confirm the intended approximation.
C / (6 * D_opt_value) / 10**6

750.0980463529183

In [6]:
# NOTE(review): scratch ratio; the meaning of 7 and 17.3 is not stated in this
# notebook. Name the operands or remove the cell.
7/17.3

0.40462427745664736