# Setup

In [10]:
import subprocess
import tempfile
import os
import struct
import numpy as np
# import matplotlib.pyplot as plt
from typing import Tuple
from pathlib import Path
import time

In [2]:
### SET CWD TO REPO ROOT

os.chdir(Path.cwd().parent)
print("CWD now:", Path.cwd())

CWD now: /workspaces/LMUL-Hardware-Acceleration


# Utilities

In [3]:
def float_to_bf16(f: float) -> int:
    """Convert float32 to bfloat16 (16-bit)"""
    # Handle special cases
    if np.isnan(f):
        return 0x7FC0  # NaN
    if np.isinf(f):
        return 0xFF80 if f < 0 else 0x7F80  # +/- Infinity
    
    # Clamp to float32 range to avoid overflow
    f = np.clip(f, -3.4e38, 3.4e38)
    
    # Get float32 bits
    f32_bits = struct.unpack('>I', struct.pack('>f', np.float32(f)))[0]
    # BF16 = top 16 bits of FP32
    bf16_bits = (f32_bits >> 16) & 0xFFFF
    return bf16_bits

def bf16_to_float(bf16: int) -> float:
    """Convert bfloat16 to float32"""
    # Shift back to FP32 position
    f32_bits = (bf16 & 0xFFFF) << 16
    return struct.unpack('>f', struct.pack('>I', f32_bits))[0]

def unpack_bf16(bf16: int) -> Tuple[int, int, int]:
    """Unpack BF16 into sign, exponent, mantissa"""
    sign = (bf16 >> 15) & 1
    exp = (bf16 >> 7) & 0xFF
    mant = bf16 & 0x7F
    return sign, exp, mant

def print_bf16(bf16: int, label: str = ""):
    """Pretty print BF16 representation"""
    sign, exp, mant = unpack_bf16(bf16)
    f = bf16_to_float(bf16)
    print(f"{label:20s} = 0x{bf16:04X} | s={sign} e={exp:3d} m={mant:3d} | float={f:+.6e}")

print("✓ BF16 utilities ready")

✓ BF16 utilities ready


In [4]:
class LMULTester:
    """Test LMUL hardware with clock-based simulation"""
    
    def __init__(self, verilog_file='rtl/top_lmul.v'):
        self.verilog_file = verilog_file
        
    def test(self, a_bf16: int, b_bf16: int) -> int:
        """
        Test LMUL multiplication in hardware
        
        Args:
            a_bf16: First operand as BF16 integer (16-bit)
            b_bf16: Second operand as BF16 integer (16-bit)
            
        Returns:
            result_bf16: Result as BF16 integer (16-bit)
        """
        # Create testbench with clock
        testbench = f'''
`timescale 1ns/1ps

module tb;
    reg clk;
    reg rstn;
    reg i_valid;
    wire i_ready;
    reg [15:0] i_a;
    reg [15:0] i_b;
    wire o_valid;
    reg o_ready;
    wire [15:0] o_p;
    
    // Instantiate top module
    top_lmul dut (
        .clk(clk),
        .rstn(rstn),
        .i_valid(i_valid),
        .i_ready(i_ready),
        .i_a(i_a),
        .i_b(i_b),
        .o_valid(o_valid),
        .o_ready(o_ready),
        .o_p(o_p)
    );
    
    // Clock generation
    initial clk = 0;
    always #5 clk = ~clk;  // 10ns period = 100MHz
    
    initial begin
        // Reset
        rstn = 0;
        i_valid = 0;
        i_a = 0;
        i_b = 0;
        o_ready = 1;  // Always ready to receive
        
        #20;
        rstn = 1;
        #10;
        
        // Apply inputs
        i_a = 16'h{a_bf16:04x};
        i_b = 16'h{b_bf16:04x};
        i_valid = 1;
        
        // Wait for i_ready
        @(posedge clk);
        while (!i_ready) @(posedge clk);
        
        // Deassert valid after acceptance
        @(posedge clk);
        i_valid = 0;
        
        // Wait for output valid
        while (!o_valid) @(posedge clk);
        
        // Display result
        $display("%h", o_p);
        
        #20;
        $finish;
    end
    
    // Timeout
    initial begin
        #1000;
        $display("ERROR: Timeout");
        $finish;
    end
endmodule
'''
        
        # Write testbench
        with tempfile.NamedTemporaryFile(mode='w', suffix='.v', delete=False) as f:
            tb_file = f.name
            f.write(testbench)
        
        try:
            # Compile
            out_file = '/tmp/lmul_sim.out'
            compile_result = subprocess.run(
                ['iverilog', '-o', out_file, '-I', 'rtl', 
                 self.verilog_file, 'rtl/lmul_bf16.v', tb_file],
                capture_output=True,
                text=True
            )
            
            if compile_result.returncode != 0:
                print("STDERR:", compile_result.stderr)
                raise RuntimeError(f"Compilation failed:\n{compile_result.stderr}")
            
            # Run simulation
            sim_result = subprocess.run(
                ['vvp', out_file],
                capture_output=True,
                text=True,
                timeout=5
            )
            
            # Extract result
            output = sim_result.stdout.strip()
            if output.startswith("ERROR"):
                raise RuntimeError(f"Simulation error: {output}")
            
            result = int(output, 16)
            return result
            
        finally:
            # Cleanup
            if os.path.exists(tb_file):
                os.remove(tb_file)
            if os.path.exists(out_file):
                os.remove(out_file)

tester = LMULTester()
print("✓ LMULTester ready")

✓ LMULTester ready


In [5]:
def bf16_multiply_reference(a_bf16: int, b_bf16: int) -> int:
    """
    Reference BF16 multiplication using actual floating point
    """
    a_float = bf16_to_float(a_bf16)
    b_float = bf16_to_float(b_bf16)
    result_float = a_float * b_float
    result_bf16 = float_to_bf16(result_float)
    return result_bf16

print("✓ Reference multiplication ready")

✓ Reference multiplication ready


# Tests

### Accuracy

In [8]:
print("=" * 60)
print("BASIC TESTS")
print("=" * 60)

test_cases = [
    (1.0, 1.0, "1 * 1"),
    (2.0, 3.0, "2 * 3"),
    (0.5, 0.5, "0.5 * 0.5"),
    (1.5, 2.0, "1.5 * 2.0"),
    (-2.0, 3.0, "-2 * 3"),
    (10.0, 0.1, "10 * 0.1"),
    (0.0, 5.0, "0 * 5 (zero)"),
    (1e-10, 1e-10, "tiny * tiny (underflow)"),
    (1e15, 1e15, "large * large (overflow test)"),
]

for a_float, b_float, desc in test_cases:
    a_bf16 = float_to_bf16(a_float)
    b_bf16 = float_to_bf16(b_float)
    
    # Hardware result
    hw_result_bf16 = tester.test(a_bf16, b_bf16)
    hw_result_float = bf16_to_float(hw_result_bf16)
    
    # Reference result
    ref_result_bf16 = bf16_multiply_reference(a_bf16, b_bf16)
    ref_result_float = bf16_to_float(ref_result_bf16)
    
    # Compare
    match = "✓" if hw_result_bf16 == ref_result_bf16 else "✗"
    
    print(f"\n{desc}:")
    print(f"  Input A:     {a_float:+.6e} (0x{a_bf16:04X})")
    print(f"  Input B:     {b_float:+.6e} (0x{b_bf16:04X})")
    print(f"  HW Result:   {hw_result_float:+.6e} (0x{hw_result_bf16:04X})")
    print(f"  Ref Result:  {ref_result_float:+.6e} (0x{ref_result_bf16:04X})")
    print(f"  Match: {match}")

BASIC TESTS

1 * 1:
  Input A:     +1.000000e+00 (0x3F80)
  Input B:     +1.000000e+00 (0x3F80)
  HW Result:   +1.000000e+00 (0x3F80)
  Ref Result:  +1.000000e+00 (0x3F80)
  Match: ✓

2 * 3:
  Input A:     +2.000000e+00 (0x4000)
  Input B:     +3.000000e+00 (0x4040)
  HW Result:   +6.000000e+00 (0x40C0)
  Ref Result:  +6.000000e+00 (0x40C0)
  Match: ✓

0.5 * 0.5:
  Input A:     +5.000000e-01 (0x3F00)
  Input B:     +5.000000e-01 (0x3F00)
  HW Result:   +2.500000e-01 (0x3E80)
  Ref Result:  +2.500000e-01 (0x3E80)
  Match: ✓

1.5 * 2.0:
  Input A:     +1.500000e+00 (0x3FC0)
  Input B:     +2.000000e+00 (0x4000)
  HW Result:   +3.000000e+00 (0x4040)
  Ref Result:  +3.000000e+00 (0x4040)
  Match: ✓

-2 * 3:
  Input A:     -2.000000e+00 (0xC000)
  Input B:     +3.000000e+00 (0x4040)
  HW Result:   -6.000000e+00 (0xC0C0)
  Ref Result:  -6.000000e+00 (0xC0C0)
  Match: ✓

10 * 0.1:
  Input A:     +1.000000e+01 (0x4120)
  Input B:     +1.000000e-01 (0x3DCC)
  HW Result:   +9.218750e-01 (0x3F6C)

### Speed

In [11]:
# Prepare test data
num_ops = 100
np.random.seed(42)
test_data = []
for i in range(num_ops):
    a_float = np.random.uniform(-10, 10)
    b_float = np.random.uniform(-10, 10)
    a_bf16 = float_to_bf16(a_float)
    b_bf16 = float_to_bf16(b_float)
    test_data.append((a_bf16, b_bf16, a_float, b_float))

In [12]:
# Benchmark 1: LMUL Hardware (via simulation)
print("1. LMUL Hardware (via iverilog simulation):")
start = time.time()
hw_results = []
for a_bf16, b_bf16, _, _ in test_data:
    result = tester.test(a_bf16, b_bf16)
    hw_results.append(bf16_to_float(result))
hw_time = time.time() - start
hw_per_op = hw_time / num_ops

print(f"   Total time: {hw_time:.3f}s")
print(f"   Per operation: {hw_per_op*1000:.2f}ms")
print(f"   Throughput: {num_ops/hw_time:.2f} ops/sec")

1. LMUL Hardware (via iverilog simulation):
   Total time: 0.773s
   Per operation: 7.73ms
   Throughput: 129.39 ops/sec


In [13]:
# Benchmark 2: Python float32 multiplication
print("\n2. Python float32 multiplication:")
start = time.time()
py_results = []
for _, _, a_float, b_float in test_data:
    result = a_float * b_float
    py_results.append(result)
py_time = time.time() - start
py_per_op = py_time / num_ops

print(f"   Total time: {py_time:.6f}s")
print(f"   Per operation: {py_per_op*1e6:.2f}μs")
print(f"   Throughput: {num_ops/py_time:.0f} ops/sec")


2. Python float32 multiplication:
   Total time: 0.000206s
   Per operation: 2.06μs
   Throughput: 485452 ops/sec


In [14]:
# Benchmark 3: NumPy vectorized multiplication
print("\n3. NumPy vectorized multiplication:")
a_array = np.array([d[2] for d in test_data], dtype=np.float32)
b_array = np.array([d[3] for d in test_data], dtype=np.float32)

start = time.time()
np_results = a_array * b_array
np_time = time.time() - start
np_per_op = np_time / num_ops

print(f"   Total time: {np_time:.6f}s")
print(f"   Per operation: {np_per_op*1e6:.2f}μs")
print(f"   Throughput: {num_ops/np_time:.0f} ops/sec")


3. NumPy vectorized multiplication:
   Total time: 0.001236s
   Per operation: 12.36μs
   Throughput: 80877 ops/sec


In [22]:
# Summary comparison
print("\n" + "-" * 60)
print("SPEED COMPARISON:")
print("-" * 60)
print(f"Python FP32:      1.00x (baseline)")
print(f"NumPy vectorized: {py_time/np_time:.2f}x faster than Python")
print(f"LMUL simulation:  {py_time/hw_time:.2f}x faster than Python")


------------------------------------------------------------
SPEED COMPARISON:
------------------------------------------------------------
Python FP32:      1.00x (baseline)
NumPy vectorized: 0.17x faster than Python
LMUL simulation:  0.00x faster than Python


Our LMUL speed performance is limited by overhead