# SAM8905 Multiplier Pipeline Timing Analysis

## Question: Does the SAM8905 need to emulate a 2-cycle multiplier delay?

From the programmer's guide (Section 7.4):
- **WXY to RP**: 2 cycles minimum
- **WXY to WACC**: 3 cycles minimum

This notebook analyzes all 4 Keyfox10 FX reverb algorithms to determine:
1. Are WXY→RP/WACC timing constraints always respected?
2. Are there overlapping WXY instructions that would require pipeline emulation?
3. Can continuous (immediate) multiplication work, or do we need a delay pipeline?

In [1]:
import sys
sys.path.insert(0, '..')

from sam8905_aram_decoder import decode_instruction, decode_algorithm, format_instruction

print("SAM8905 instruction decoder loaded")

SAM8905 instruction decoder loaded


In [2]:
# All 4 Keyfox10 FX Reverb Algorithms (22kHz mode, 64 instructions each)
#
# Each table below holds one algorithm's instruction words in program-counter
# order: list index 0 is PC00 and index 63 is PC63. The analysis cells decode
# every word with decode_instruction() and index these lists directly by PC.

# ALG 0: Input conditioning - reads input, writes to SRAM delay buffers
aram_alg0 = [
    0x00F7, 0x607F, 0x58BF, 0x5A5F, 0x30BF, 0x5DDF, 0x082D, 0x593F,
    0x5ADF, 0x58F7, 0x406F, 0x2CDF, 0x48BF, 0x58F7, 0x42DF, 0x749F,
    0x68F7, 0x38FD, 0x7FFB, 0x7FFB, 0x7EFB, 0x7EFB, 0x7FFF, 0x406F,
    0x50BF, 0x42DF, 0x683F, 0x7A3F, 0x7A3F, 0x7A3F, 0x7A3F, 0x7A3F,
    0x7A3F, 0x7A3F, 0x7AF7, 0x7FFB, 0x7FFB, 0x7EFB, 0x7EFB, 0x7FFF,
    0x78FD, 0x18EF, 0x58F7, 0x7FFF, 0x50EF, 0x08FD, 0x24DF, 0x7FFF,
    0x20F7, 0x287F, 0x00EF, 0x7CBF, 0x2ADF, 0x20F7, 0x707F, 0x7CBF,
    0x28EF, 0x78FD, 0x10F7, 0x7A7F, 0x7CBF, 0x6A5B, 0x7FFF, 0x7FFF
]

# ALG 1: Diffusion/scatter - heavy DAC output (13 WACC)
# NOTE(review): the timing analysis in this notebook lists 18 PCs with WACC set
# for this table - confirm what the "13" above is meant to count.
aram_alg1 = [
    0x30EF, 0x48FD, 0x6ADF, 0x703F, 0x0000, 0x6BDF, 0x38EF, 0x50FC,
    0x687F, 0x7CBE, 0x18F7, 0x7A7F, 0x40EF, 0x58FC, 0x7CBE, 0x6ADF,
    0x18F7, 0x00BF, 0x307F, 0x32CE, 0x7CBE, 0x18F7, 0x7A7F, 0x48EF,
    0x60FC, 0x7CBE, 0x6ADF, 0x18F7, 0x00BF, 0x387F, 0x3ACE, 0x7CBE,
    0x18F7, 0x7A7F, 0x50EF, 0x38FC, 0x7CBE, 0x6ADF, 0x18F7, 0x00BF,
    0x407F, 0x42CE, 0x0800, 0x18F7, 0x7A7F, 0x58EF, 0x60FC, 0x7CBE,
    0x6ADF, 0x687F, 0x7CBE, 0x28F7, 0x40EF, 0x7A5F, 0x30F7, 0x487F,
    0x7A7F, 0x7CBF, 0x6A5B, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF
]

# ALG 2: Main reverb output with stereo panning (WXY+WSP at PC07)
aram_alg2 = [
    0x00EF, 0x48FD, 0x00F7, 0x28BF, 0x0800, 0x18EF, 0x40FD, 0x29B7,
    0x287F, 0x7CBE, 0x0800, 0x0800, 0x38EF, 0x50FC, 0x7CBE, 0x6ADF,
    0x18F7, 0x00BF, 0x307F, 0x32CE, 0x7CBE, 0x18F7, 0x7A7F, 0x40EF,
    0x58FC, 0x7CBE, 0x6ADF, 0x18F7, 0x00BF, 0x387F, 0x3ACE, 0x7CBE,
    0x28F7, 0x7A7F, 0x48EF, 0x60FC, 0x7CBE, 0x6ADF, 0x687F, 0x7CBE,
    0x18F7, 0x7A7F, 0x50EF, 0x38FC, 0x7CBE, 0x6ADF, 0x687F, 0x7CBE,
    0x28F7, 0x7A7F, 0x58EF, 0x60FC, 0x7CBE, 0x6ADF, 0x687F, 0x7CBE,
    0x28F7, 0x407F, 0x7A7F, 0x7CBF, 0x6A5B, 0x7FFF, 0x7FFF, 0x7FFF
]

# ALG 3: All-pass filter (MUTED - mix_l=mix_r=0)
aram_alg3 = [
    0x00EF, 0x48FD, 0x00F7, 0x28BF, 0x7CBE, 0x39F7, 0x287F, 0x10EF,
    0x7CBE, 0x6ADF, 0x38EF, 0x0800, 0x50FD, 0x18F7, 0x00BF, 0x307F,
    0x32CE, 0x7CBE, 0x18F7, 0x7A7F, 0x40EF, 0x58FC, 0x7CBE, 0x6ADF,
    0x18F7, 0x00BF, 0x387F, 0x3ACE, 0x7CBE, 0x28F7, 0x7A7F, 0x48EF,
    0x60FC, 0x0800, 0x6ADF, 0x687F, 0x7CBE, 0x18F7, 0x7A7F, 0x50EF,
    0x38FC, 0x7CBE, 0x6ADF, 0x687F, 0x7CBE, 0x28F7, 0x7A7F, 0x58EF,
    0x60FC, 0x7CBE, 0x6ADF, 0x687F, 0x7CBE, 0x28F7, 0x407F, 0x7A7F,
    0x7CBF, 0x6A5B, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF
]

# Display name -> instruction table. The keys are printed as section headers by
# the analysis cells, and the insertion order here is the order in which the
# algorithms are analysed and reported.
algorithms = {
    'ALG 0 (Input)': aram_alg0,
    'ALG 1 (Diffusion)': aram_alg1,
    'ALG 2 (Reverb)': aram_alg2,
    'ALG 3 (All-pass)': aram_alg3
}

# Only aram_alg0's length is reported; the other tables are not length-checked here.
print(f"Loaded {len(algorithms)} algorithms, {len(aram_alg0)} instructions each")

Loaded 4 algorithms, 64 instructions each


In [3]:
def _check_read_timing(label, reader_positions, wxy_positions, min_gap, no_wxy_note=""):
    """Print and collect WXY-to-reader gaps that are shorter than *min_gap*.

    Args:
        label: Name of the reading operation as shown in the report ("RP"/"WACC").
        reader_positions: PCs of the instructions that consume the product.
        wxy_positions: PCs of all WXY instructions, in ascending order.
        min_gap: Minimum number of cycles required between WXY and the reader.
        no_wxy_note: Text appended to the line printed when no WXY precedes
            the reader.

    Returns:
        A list of ``(reader_pc, last_wxy_pc, gap)`` tuples, one per violation.
    """
    print(f"\n--- {label} Timing (require {min_gap}+ cycles from WXY) ---")
    violations = []
    for reader_pc in reader_positions:
        # Strictly earlier WXY only: a WXY in the same instruction as the
        # reader is not counted as its source.
        last_wxy = max((w for w in wxy_positions if w < reader_pc), default=None)
        if last_wxy is None:
            print(f"  PC{reader_pc:02d}: {label}, no preceding WXY{no_wxy_note}")
            continue
        gap = reader_pc - last_wxy
        if gap < min_gap:
            violations.append((reader_pc, last_wxy, gap))
            status = "✗ VIOLATION"
        else:
            status = "✓"
        print(f"  PC{reader_pc:02d}: {label}, last WXY at PC{last_wxy:02d}, gap={gap} {status}")
    return violations


def analyze_multiplier_timing(name, aram, *, decode=None, rp_min_gap=2,
                              wacc_min_gap=3, wxy_min_spacing=3):
    """Analyze WXY→RP and WXY→WACC timing for an algorithm.

    Decodes every instruction word, prints a human-readable timing report and
    returns the collected positions and violations.

    Args:
        name: Display name printed in the report header.
        aram: Sequence of instruction words; the index is the program counter.
        decode: Callable mapping an instruction word to an object exposing
            ``wxy``, ``emitter`` and ``wacc``. Defaults to the notebook's
            ``decode_instruction``.
        rp_min_gap: Minimum cycles required between a WXY and a following RP.
        wacc_min_gap: Minimum cycles required between a WXY and a following WACC.
        wxy_min_spacing: Consecutive WXY instructions closer than this many
            cycles are reported as overlapping.

    Returns:
        A dict with keys ``'wxy'``, ``'rp'``, ``'wacc'`` (lists of PCs),
        ``'rp_violations'`` and ``'wacc_violations'`` (lists of
        ``(reader_pc, last_wxy_pc, gap)``), and ``'overlaps'`` (list of
        ``(first_wxy_pc, second_wxy_pc, gap)``).
    """
    if decode is None:
        # Resolved at call time so the default follows the notebook's import.
        decode = decode_instruction

    print(f"\n{'='*60}")
    print(f"  {name}")
    print(f"{'='*60}")

    # Find all WXY, RP, and WACC positions (one decode per instruction)
    wxy_positions = []
    rp_positions = []
    wacc_positions = []
    for pc, inst in enumerate(aram):
        di = decode(inst)
        if di.wxy:
            wxy_positions.append(pc)
        if di.emitter == 'RP':
            rp_positions.append(pc)
        if di.wacc:
            wacc_positions.append(pc)

    print(f"\nWXY at PCs:  {wxy_positions}")
    print(f"RP at PCs:   {rp_positions}")
    print(f"WACC at PCs: {wacc_positions}")

    # RP and WACC share the same check; only the threshold and wording differ.
    rp_violations = _check_read_timing(
        "RP", rp_positions, wxy_positions, rp_min_gap,
        no_wxy_note=" (reads initial/previous frame)",
    )
    wacc_violations = _check_read_timing(
        "WACC", wacc_positions, wxy_positions, wacc_min_gap,
    )

    # Check for overlapping WXY (consecutive WXY closer than wxy_min_spacing)
    print(f"\n--- WXY Spacing (overlaps within {wxy_min_spacing - 1} cycles?) ---")
    overlaps = []
    for earlier, later in zip(wxy_positions, wxy_positions[1:]):
        gap = later - earlier
        if gap < wxy_min_spacing:
            overlaps.append((earlier, later, gap))
            print(f"  ⚠ PC{earlier:02d} → PC{later:02d}, gap={gap} cycle(s)")
        else:
            print(f"  ✓ PC{earlier:02d} → PC{later:02d}, gap={gap} cycles")

    return {
        'wxy': wxy_positions,
        'rp': rp_positions,
        'wacc': wacc_positions,
        'rp_violations': rp_violations,
        'wacc_violations': wacc_violations,
        'overlaps': overlaps
    }

print("Analysis function defined")

Analysis function defined


In [4]:
# Run the timing analysis once per algorithm; reports are keyed by display name
results = {
    alg_name: analyze_multiplier_timing(alg_name, alg_words)
    for alg_name, alg_words in algorithms.items()
}


  ALG 0 (Input)

WXY at PCs:  [0, 9, 13, 16, 34, 42, 48, 53, 58]
RP at PCs:   [5, 11, 15, 46, 51, 55, 60]
WACC at PCs: []

--- RP Timing (require 2+ cycles from WXY) ---
  PC05: RP, last WXY at PC00, gap=5 ✓
  PC11: RP, last WXY at PC09, gap=2 ✓
  PC15: RP, last WXY at PC13, gap=2 ✓
  PC46: RP, last WXY at PC42, gap=4 ✓
  PC51: RP, last WXY at PC48, gap=3 ✓
  PC55: RP, last WXY at PC53, gap=2 ✓
  PC60: RP, last WXY at PC58, gap=2 ✓

--- WACC Timing (require 3+ cycles from WXY) ---

--- WXY Spacing (overlaps within 2 cycles?) ---
  ✓ PC00 → PC09, gap=9 cycles
  ✓ PC09 → PC13, gap=4 cycles
  ✓ PC13 → PC16, gap=3 cycles
  ✓ PC16 → PC34, gap=18 cycles
  ✓ PC34 → PC42, gap=8 cycles
  ✓ PC42 → PC48, gap=6 cycles
  ✓ PC48 → PC53, gap=5 cycles
  ✓ PC53 → PC58, gap=5 cycles

  ALG 1 (Diffusion)

WXY at PCs:  [4, 10, 16, 21, 27, 32, 38, 42, 43, 51, 54]
RP at PCs:   [9, 14, 20, 25, 31, 36, 47, 50, 57]
WACC at PCs: [4, 7, 9, 13, 14, 19, 20, 24, 25, 30, 31, 35, 36, 41, 42, 46, 47, 50]

--- RP Timi

In [5]:
# Summary of the per-algorithm reports collected in `results`
banner = "=" * 60
print("\n" + banner)
print("  SUMMARY")
print(banner)

# Totals across every analysed algorithm, one per report category
total_rp_violations, total_wacc_violations, total_overlaps = (
    sum(len(report[key]) for report in results.values())
    for key in ('rp_violations', 'wacc_violations', 'overlaps')
)

print(f"\nRP timing violations (gap < 2):   {total_rp_violations}")
print(f"WACC timing violations (gap < 3): {total_wacc_violations}")
print(f"WXY overlaps (gap < 3):           {total_overlaps}")

# Each detail section is printed only when its category is non-empty
if total_rp_violations:
    print("\n⚠ RP VIOLATIONS FOUND:")
    for name, report in results.items():
        for reader_pc, source_pc, cycles in report['rp_violations']:
            print(f"  {name}: PC{reader_pc:02d} reads RP only {cycles} cycle(s) after WXY at PC{source_pc:02d}")

if total_wacc_violations:
    print("\n⚠ WACC VIOLATIONS FOUND:")
    for name, report in results.items():
        for reader_pc, source_pc, cycles in report['wacc_violations']:
            print(f"  {name}: PC{reader_pc:02d} uses WACC only {cycles} cycle(s) after WXY at PC{source_pc:02d}")

if total_overlaps:
    print("\n⚠ WXY OVERLAPS FOUND:")
    for name, report in results.items():
        for first_pc, second_pc, cycles in report['overlaps']:
            print(f"  {name}: WXY at PC{first_pc:02d} and PC{second_pc:02d} only {cycles} cycle(s) apart")

print("\n" + "-" * 60)
# Any finding at all (violation or overlap) selects the cautious conclusion
if total_rp_violations or total_wacc_violations or total_overlaps:
    print("✗ Some timing constraints violated!")
    print("\nCONCLUSION: Pipeline emulation MAY be required.")
else:
    print("✓ All timing constraints satisfied!")
    print("✓ No overlapping WXY instructions!")
    print("\nCONCLUSION: Continuous (immediate) multiplication should work correctly.")
    print("            Pipeline emulation is NOT required for these algorithms.")


  SUMMARY

RP timing violations (gap < 2):   0
WACC timing violations (gap < 3): 5
WXY overlaps (gap < 3):           4

⚠ WACC VIOLATIONS FOUND:
  ALG 2 (Reverb): PC04 uses WACC only 2 cycle(s) after WXY at PC02
  ALG 2 (Reverb): PC09 uses WACC only 2 cycle(s) after WXY at PC07
  ALG 2 (Reverb): PC11 uses WACC only 1 cycle(s) after WXY at PC10
  ALG 2 (Reverb): PC13 uses WACC only 2 cycle(s) after WXY at PC11
  ALG 3 (All-pass): PC04 uses WACC only 2 cycle(s) after WXY at PC02

⚠ WXY OVERLAPS FOUND:
  ALG 1 (Diffusion): WXY at PC42 and PC43 only 1 cycle(s) apart
  ALG 2 (Reverb): WXY at PC02 and PC04 only 2 cycle(s) apart
  ALG 2 (Reverb): WXY at PC10 and PC11 only 1 cycle(s) apart
  ALG 3 (All-pass): WXY at PC11 and PC13 only 2 cycle(s) apart

------------------------------------------------------------
✗ Some timing constraints violated!

CONCLUSION: Pipeline emulation MAY be required.


## When Would Pipeline Emulation Matter?

Pipeline emulation would only be needed if the code does:

```
PC N:   WXY (X=a, Y=b)     ; Start multiplication a*b
PC N+1: WXY (X=c, Y=d)     ; Start multiplication c*d (overlapping!)
PC N+2: RP                 ; Should read a*b, NOT c*d
```

With **continuous multiplication** (current implementation):
- Each cycle ends with `mul_result = X * Y`
- RP reads whatever is in `mul_result` at that moment
- If WXY at PC N+1 overwrites X,Y, then RP at PC N+2 reads c*d instead of a*b

With **pipeline emulation**:
- `mul_result` is delayed by 2 cycles from when X,Y were set
- Even if WXY at PC N+1 starts a new multiplication, RP at PC N+2 still reads the old result

**The algorithms avoid this by:**
1. Always waiting 2+ cycles between WXY and RP
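2. Never reading (via RP) the result of the first WXY of a closely spaced pair: WXY instructions do occur within 2 cycles of each other (4 such pairs are reported above), but the next RP always picks up the result of a later WXY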

In [6]:
# Detailed look at the tightest timing sequences (gap == 2)
print("=" * 60)
print("  DETAILED: Sequences with WXY→RP gap of exactly 2 cycles")
print("=" * 60)
print()

for name, aram in algorithms.items():
    # Decode each word once and reuse the result for both scans and printing
    decoded = [decode_instruction(word) for word in aram]
    wxy_positions = [pc for pc, di in enumerate(decoded) if di.wxy]

    # Collect (wxy_pc, rp_pc) pairs where RP sits exactly at the 2-cycle minimum
    tight_sequences = []
    for pc, di in enumerate(decoded):
        if di.emitter != 'RP':
            continue
        last_wxy = max((w for w in wxy_positions if w < pc), default=None)
        if last_wxy is not None and pc - last_wxy == 2:
            tight_sequences.append((last_wxy, pc))

    if not tight_sequences:
        print(f"\n{name}: No tight (gap=2) sequences")
        continue

    print(f"\n{name}:")
    for wxy_pc, rp_pc in tight_sequences:
        # Show the WXY, the instruction between, and the RP that reads the product.
        # NOTE(review): the recorded output shows each hex word twice, which
        # suggests format_instruction() already includes it - confirm.
        shown = ((wxy_pc, ""), (wxy_pc + 1, ""), (rp_pc, " ← reads mul_result"))
        for shown_pc, suffix in shown:
            print(f"  PC{shown_pc:02d}: {aram[shown_pc]:04X}  {format_instruction(decoded[shown_pc])}{suffix}")
        print()

  DETAILED: Sequences with WXY→RP gap of exactly 2 cycles


ALG 0 (Input):
  PC09: 58F7  58F7  RM 11, <WXY>
  PC10: 406F  406F  RM 8, <WA, WPHI>
  PC11: 2CDF  2CDF  RP 5, <WM> ← reads mul_result

  PC13: 58F7  58F7  RM 11, <WXY>
  PC14: 42DF  42DF  RADD 8, <WM>
  PC15: 749F  749F  RP 14, <WB, WM> ← reads mul_result

  PC53: 20F7  20F7  RM 4, <WXY>
  PC54: 707F  707F  RM 14, <WA>
  PC55: 7CBF  7CBF  RP, <WB> ← reads mul_result

  PC58: 10F7  10F7  RM 2, <WXY>
  PC59: 7A7F  7A7F  RADD, <WA>
  PC60: 7CBF  7CBF  RP, <WB> ← reads mul_result


ALG 1 (Diffusion): No tight (gap=2) sequences

ALG 2 (Reverb):
  PC07: 29B7  29B7  RM 5, <WB, WXY, WSP>
  PC08: 287F  287F  RM 5, <WA>
  PC09: 7CBE  7CBE  RP, <WB, WACC> ← reads mul_result


ALG 3 (All-pass):
  PC02: 00F7  00F7  RM 0, <WXY>
  PC03: 28BF  28BF  RM 5, <WB>
  PC04: 7CBE  7CBE  RP, <WB, WACC> ← reads mul_result



In [7]:
# Detailed investigation of WACC violations
print("=" * 70)
print("  WACC VIOLATION ANALYSIS")
print("=" * 70)
print()
print("The programmer's guide says WACC needs 3 cycles, but we found cases with")
print("gaps of 1-2 cycles. Let's examine what's actually happening:")
print()

# (display name, instruction table, [(wxy_pc, wacc_pc), ...]) for each flagged case
violations = [
    ('ALG 2 (Reverb)', aram_alg2, [(2, 4), (7, 9), (10, 11), (11, 13)]),
    ('ALG 3 (All-pass)', aram_alg3, [(2, 4)]),
]

for name, aram, cases in violations:
    divider = "=" * 60
    print(f"\n{divider}")
    print(f"  {name}")
    print(divider)

    for wxy_pc, wacc_pc in cases:
        gap = wacc_pc - wxy_pc
        print(f"\n  Case: WXY at PC{wxy_pc:02d} → WACC at PC{wacc_pc:02d} (gap={gap})")
        print("  " + "-" * 50)

        # List every instruction from the WXY through the WACC, tagging the
        # ones that set WXY and/or WACC
        for pc, word in enumerate(aram[wxy_pc:wacc_pc + 1], start=wxy_pc):
            di = decode_instruction(word)
            tags = ((di.wxy, "← WXY"), (di.wacc, "← WACC"))
            marker_str = " ".join(text for flag, text in tags if flag)
            print(f"    PC{pc:02d}: {word:04X}  {format_instruction(di):<35} {marker_str}")

        # An RP emitter on the WACC instruction means both consume mul_result
        if decode_instruction(aram[wacc_pc]).emitter == 'RP':
            print(f"\n    Note: PC{wacc_pc:02d} has BOTH RP and WACC")
            for detail in (
                "    - RP puts mul_result on bus (for WB)",
                "    - WACC accumulates mul_result to audio output",
                "    - If mul_result isn't ready, BOTH read wrong value!",
            ):
                print(detail)

  WACC VIOLATION ANALYSIS

The programmer's guide says WACC needs 3 cycles, but we found cases with
gaps of 1-2 cycles. Let's examine what's actually happening:


  ALG 2 (Reverb)

  Case: WXY at PC02 → WACC at PC04 (gap=2)
  --------------------------------------------------
    PC02: 00F7  00F7  RM 0, <WXY>                   ← WXY
    PC03: 28BF  28BF  RM 5, <WB>                    
    PC04: 0800  0800  RM 1, <WA, WB, WM, WPHI, WXY, clearB, WWF, WACC> ← WXY ← WACC

  Case: WXY at PC07 → WACC at PC09 (gap=2)
  --------------------------------------------------
    PC07: 29B7  29B7  RM 5, <WB, WXY, WSP>          ← WXY
    PC08: 287F  287F  RM 5, <WA>                    
    PC09: 7CBE  7CBE  RP, <WB, WACC>                ← WACC

    Note: PC09 has BOTH RP and WACC
    - RP puts mul_result on bus (for WB)
    - WACC accumulates mul_result to audio output
    - If mul_result isn't ready, BOTH read wrong value!

  Case: WXY at PC10 → WACC at PC11 (gap=1)
  -------------------------------

In [9]:
# Clarify the timing semantics (text-only cell: prints the interpretation)
rule = "=" * 60
print(rule)
print("  TIMING CLARIFICATION")
print(rule)
print("""
From programmer's guide:
- WXY to RP:   2 cycles → at N+2, NEW result is ready
- WXY to WACC: 3 cycles → at N+3, NEW result is ready for accumulation

This means:
- RP with gap=2: Reads the CORRECT (new) multiplication result ✓
- RP with gap<2: Would read OLD result (but we found NO such cases)

- WACC with gap=3+: Reads the CORRECT (new) result ✓  
- WACC with gap<3:  Reads an OLDER result (we found 5 such cases)

The WACC violations suggest either:
1. Intentional reading of a previous multiplication result
2. Or WACC timing constraint is less strict than documented

Since RP has NO violations, the algorithms respect the 2-cycle multiplier latency.
""")

# Closing remarks, one printed line each
for remark in (
    "\nKey finding: All RP reads have gap >= 2",
    "This confirms the 2-cycle multiplier delay is real and respected.",
    "\nFor continuous multiplication to work, we need to ensure:",
    "- mul_result reflects X*Y from 2 cycles ago, not current X*Y",
):
    print(remark)

  TIMING CLARIFICATION

From programmer's guide:
- WXY to RP:   2 cycles → at N+2, NEW result is ready
- WXY to WACC: 3 cycles → at N+3, NEW result is ready for accumulation

This means:
- RP with gap=2: Reads the CORRECT (new) multiplication result ✓
- RP with gap<2: Would read OLD result (but we found NO such cases)

- WACC with gap=3+: Reads the CORRECT (new) result ✓  
- WACC with gap<3:  Reads an OLDER result (we found 5 such cases)

The WACC violations suggest either:
1. Intentional reading of a previous multiplication result
2. Or WACC timing constraint is less strict than documented

Since RP has NO violations, the algorithms respect the 2-cycle multiplier latency.


Key finding: All RP reads have gap >= 2
This confirms the 2-cycle multiplier delay is real and respected.

For continuous multiplication to work, we need to ensure:
- mul_result reflects X*Y from 2 cycles ago, not current X*Y


## Final Conclusion

### SAM8905 Pipeline Requirements

**Total pipeline latency: ~4 cycles**
1. SRAM fetch: ~2 cycles (85 ns SRAM access at ~35 ns/cycle ≈ 2.4 cycles; the analysis assumes 2)
2. Multiplication: ~2 cycles (as documented)

### Evidence from ALG 1 Analysis

When accounting for the full pipeline, all WACC instructions have **4-6 cycle gaps** from the X-setter (WPHI/WWF/WXY) whose result they actually use.

The apparent "1-cycle gaps" we saw earlier were misleading - WACC was reading the result from an **earlier** X-setter that had completed its pipeline, not the immediately preceding one.

### Emulation Requirements

For accurate emulation, we need:

1. **X register pipeline**: WPHI/WWF with external waveforms should update X after ~2 cycles delay
2. **Multiplication pipeline**: mul_result should reflect X*Y from ~2 cycles ago

Or equivalently, a combined 4-stage pipeline where WACC/RP read results from 4 cycles back.

### Current Implementation Status

The current "continuous multiplication" approach computes mul_result immediately. This may work if the algorithms are designed to always have sufficient gaps, but it's not hardware-accurate.

For full accuracy, implement a delay pipeline:
```cpp
// Track X values through pipeline
m_x_pipeline[3] = m_x_pipeline[2];
m_x_pipeline[2] = m_x_pipeline[1]; 
m_x_pipeline[1] = m_x_pipeline[0];
m_x_pipeline[0] = new_x_from_waveform;

// Use pipelined X for multiplication
m_mul_result = m_x_pipeline[2] * m_y;  // 2-cycle delayed X
```

In [10]:
# Critical check: When WXY instructions overlap, does RP ever try to read the FIRST one's result?
print("=" * 60)
print("  CRITICAL: Do overlapping WXY sequences affect RP reads?")
print("=" * 60)
print()

# (short name, instruction table, [(first_wxy_pc, second_wxy_pc), ...])
overlaps_info = [
    ('ALG 1', aram_alg1, [(42, 43)]),
    ('ALG 2', aram_alg2, [(2, 4), (10, 11)]),
    ('ALG 3', aram_alg3, [(11, 13)]),
]

for name, aram, overlaps in overlaps_info:
    print(f"\n{name}:")

    # Decode once, then derive the RP and WXY program counters
    decoded = [decode_instruction(word) for word in aram]
    rp_positions = [pc for pc, di in enumerate(decoded) if di.emitter == 'RP']
    wxy_positions = [pc for pc, di in enumerate(decoded) if di.wxy]

    for wxy1, wxy2 in overlaps:
        print(f"\n  Overlap: WXY at PC{wxy1:02d} → PC{wxy2:02d}")

        # First RP after the second WXY of the pair, if there is one
        rp_pc = next((p for p in rp_positions if p > wxy2), None)
        if rp_pc is None:
            continue

        # The WXY this RP uses: the latest one at least 2 cycles before it
        used_wxy = max((w for w in wxy_positions if rp_pc - w >= 2), default=None)
        if used_wxy is None:
            continue
        gap = rp_pc - used_wxy

        if used_wxy == wxy1:
            print(f"  ⚠ RP at PC{rp_pc:02d} reads FIRST WXY at PC{wxy1:02d} (gap={gap})")
            print(f"     This would require pipeline emulation!")
            continue

        print(f"  ✓ RP at PC{rp_pc:02d} reads WXY at PC{used_wxy:02d} (gap={gap})")
        if used_wxy >= wxy2:
            print(f"     First WXY at PC{wxy1:02d} is NOT read by RP - no conflict")

print("\n" + "=" * 60)
print("If no RP reads the FIRST WXY of an overlapping pair,")
print("continuous multiplication works correctly.")
print("=" * 60)

  CRITICAL: Do overlapping WXY sequences affect RP reads?


ALG 1:

  Overlap: WXY at PC42 → PC43
  ✓ RP at PC47 reads WXY at PC43 (gap=4)
     First WXY at PC42 is NOT read by RP - no conflict

ALG 2:

  Overlap: WXY at PC02 → PC04
  ✓ RP at PC09 reads WXY at PC07 (gap=2)
     First WXY at PC02 is NOT read by RP - no conflict

  Overlap: WXY at PC10 → PC11
  ✓ RP at PC14 reads WXY at PC11 (gap=3)
     First WXY at PC10 is NOT read by RP - no conflict

ALG 3:

  Overlap: WXY at PC11 → PC13
  ✓ RP at PC17 reads WXY at PC13 (gap=4)
     First WXY at PC11 is NOT read by RP - no conflict

If no RP reads the FIRST WXY of an overlapping pair,
continuous multiplication works correctly.


In [14]:
# Re-analyze with SRAM fetch latency in mind
print("=" * 60)
print("  SRAM Fetch Latency Analysis")
print("=" * 60)
print("""
From the docs:
- Clock: 28.322 MHz → ~35ns per instruction cycle
- SRAM access time: 85ns → ~2-3 cycles
- Doc says 250ns waveform fetch = 7 cycles total

Note: 22kHz mode has same cycle time as 44kHz - just 64 instructions 
per sample instead of 32.

Pipeline for external waveform read:
  Cycle N:   WPHI/WWF sets address (WF, PHI)
  Cycle N+1: SRAM fetch in progress
  Cycle N+2: X register updated with SRAM data
  Cycle N+3: Multiplication available? (X*Y)
  Cycle N+4: WACC can use result

So WACC needs ~4 cycles gap from the WPHI/WWF that sets the X it uses.
""")

# Re-analyze ALG 1 - what X does each WACC actually use?
print("\nALG 1: Which WPHI/WWF result does each WACC use?")
print("(Assuming 2-cycle SRAM latency + 2-cycle multiply latency = 4 total)")
print("-" * 60)

aram = aram_alg1

# Collect (pc, "KIND+KIND") for every instruction with WPHI, WWF or WXY set
x_setters = []
for pc, inst in enumerate(aram):
    di = decode_instruction(inst)
    flagged = ((di.wphi, "WPHI"), (di.wwf, "WWF"), (di.wxy, "WXY"))
    kinds = [label for flag, label in flagged if flag]
    if kinds:
        x_setters.append((pc, "+".join(kinds)))

# Only the first 15 setters are shown to keep the line readable
print("X-setting instructions:", [f"PC{p:02d}:{t}" for p, t in x_setters[:15]])
print()

# For each WACC, pick the latest X-setter that is at least 4 cycles older
for pc, inst in enumerate(aram):
    if not decode_instruction(inst).wacc:
        continue
    # x_setters is in ascending PC order, so the last match is the most recent
    ready = [(p, t) for p, t in x_setters if pc - p >= 4]
    if not ready:
        print(f"PC{pc:02d} WACC → uses X from previous frame")
        continue
    setter_pc, setter_type = ready[-1]
    gap = pc - setter_pc
    print(f"PC{pc:02d} WACC → uses {setter_type} from PC{setter_pc:02d} (gap={gap})")

  SRAM Fetch Latency Analysis

From the docs:
- Clock: 28.322 MHz → ~35ns per instruction cycle
- SRAM access time: 85ns → ~2-3 cycles
- Doc says 250ns waveform fetch = 7 cycles total

Note: 22kHz mode has same cycle time as 44kHz - just 64 instructions 
per sample instead of 32.

Pipeline for external waveform read:
  Cycle N:   WPHI/WWF sets address (WF, PHI)
  Cycle N+1: SRAM fetch in progress
  Cycle N+2: X register updated with SRAM data
  Cycle N+3: Multiplication available? (X*Y)
  Cycle N+4: WACC can use result

So WACC needs ~4 cycles gap from the WPHI/WWF that sets the X it uses.


ALG 1: Which WPHI/WWF result does each WACC use?
(Assuming 2-cycle SRAM latency + 2-cycle multiply latency = 4 total)
------------------------------------------------------------
X-setting instructions: ['PC00:WPHI', 'PC01:WWF', 'PC04:WPHI+WWF+WXY', 'PC06:WPHI', 'PC07:WWF', 'PC10:WXY', 'PC12:WPHI', 'PC13:WWF', 'PC16:WXY', 'PC19:WPHI', 'PC21:WXY', 'PC23:WPHI', 'PC24:WWF', 'PC27:WXY', 'PC30:WPHI']

P