### **Karatsuba Formula in GF(2)**

Given $ A(x), B(x) $ with at most $ n $ coefficients each (degree $ < n $), split at $ m = \lceil n/2 \rceil $:


$$
\begin{aligned}
A_0 &= A \bmod x^m, \quad &A_1 &= \lfloor A / x^m \rfloor \\
B_0 &= B \bmod x^m, \quad &B_1 &= \lfloor B / x^m \rfloor \\
z_0 &= A_0 \cdot B_0 \\
z_2 &= A_1 \cdot B_1 \\
z_1 &= (A_0 + A_1) \cdot (B_0 + B_1) \\
C(x) &= z_2 \cdot x^{2m} \oplus (z_1 \oplus z_0 \oplus z_2) \cdot x^m \oplus z_0
\end{aligned}
$$


where $ \oplus $ denotes coefficient-wise XOR (i.e., addition in GF(2)).

---

### **Example**

Let $ A(x) = x^3 + x + 1 $ → binary `1011` → integer `11`  
Let $ B(x) = x^2 + 1 $ → binary `0101` → integer `5`

Split at $ m = 2 $:
- $ A_0 = x + 1 $ (`11` = 3), $ A_1 = x $ (`10` = 2)
- $ B_0 = 1 $ (`01` = 1), $ B_1 = 1 $ (`01` = 1)

Compute:
- $ z_0 = A_0 \cdot B_0 = 3 \cdot 1 = x + 1 $
- $ z_2 = A_1 \cdot B_1 = 2 \cdot 1 = x $
- $ A_0 + A_1 = (x+1) + x = 1 $
- $ B_0 + B_1 = 1 + 1 = 0 $
- $ z_1 = (A_0 + A_1) \cdot (B_0 + B_1) = 1 \cdot (0) = 0 $
- $ z_1 \oplus z_0 \oplus z_2 = 0 \oplus (x+1) \oplus x = 1 $

Then:
$$
\begin{aligned}
C(x) &= z_2 \cdot x^{2m} \oplus (z_1 \oplus z_0 \oplus z_2) \cdot x^m \oplus z_0 \\
     &= x \cdot x^4 \oplus 1 \cdot x^2 \oplus (x+1) = x^5 \oplus x^2 \oplus x \oplus 1
\end{aligned}
$$

Which matches $ (x^3+x+1)(x^2+1) = x^5 + x^2 + x + 1 $.

---

This is the **exact mathematical foundation** for a recursive or iterative Karatsuba GF(2) multiplier.

In [1]:
import galois

GF2 = galois.GF(2)

def gf2_mul_to_int(a: int, b: int) -> int:
    """Return the GF(2) polynomial product of ``a`` and ``b`` as an integer.

    Each argument is converted to a polynomial over ``GF2`` with
    ``galois.Poly.Int``; the two polynomials are multiplied and the product
    is converted back with ``int``.
    """
    # Build both operand polynomials in argument order, then multiply.
    lhs, rhs = (galois.Poly.Int(value, field=GF2) for value in (a, b))
    return int(lhs * rhs)

# Test
# result = gf2_mul_to_int(8, 8)  # 8 = x^3, so x^3 * x^3 = x^6 = 64
# print(f"8 * 8 in GF(2) = {result}")  # prints: 8 * 8 in GF(2) = 64

In [2]:
def schoolbook_mul32(a: int, b: int) -> int:
    """
    Carry-less (GF(2)) product of two polynomials packed into integers.

    Only the low 32 bits of ``b`` are scanned; for every set bit the
    correspondingly shifted copy of ``a`` is XORed into the accumulator.
    The accumulator is truncated to 64 bits before it is returned.
    """
    acc = 0
    shifted_a = a          # a << i for the current bit position i
    remaining_b = b        # b >> i for the current bit position i
    for _ in range(32):
        if remaining_b & 1:
            acc ^= shifted_a
        shifted_a <<= 1
        remaining_b >>= 1
    return acc & 0xFFFFFFFFFFFFFFFF

In [3]:
def popcount_parity(addr):
    """
    Bank select by popcount parity.

    Returns 0 if the number of set bits in ``addr`` is even, 1 if it is odd.
    HW equivalent: XOR-reduce of all address bits.

    Raises:
        ValueError: If ``addr`` is negative (a negative address has no
            fixed-width bit pattern to reduce).
    """
    if addr < 0:
        raise ValueError(f"address must be non-negative, got {addr}")
    # Counting the '1' characters of the binary form and keeping the low bit
    # gives the same value as XOR-ing the bits one by one.
    return bin(addr).count("1") & 1

In [9]:
# ════════════════════════════════════════════════════════════════════
# KARATSUBA GF(2) HARDWARE MODEL (evil/odious memory layout)
# ════════════════════════════════════════════════════════════════════
def karatsuba_gf2_hw_model(a_evil, a_odious, b_evil, b_odious, n_words=2048):
    """Software model of an explicit-stack Karatsuba multiplier over GF(2).

    Each operand is ``n_words`` 32-bit words split over two banks: logical
    word address ``addr`` lives in the "evil" bank when
    ``popcount_parity(addr) == 0`` and in the "odious" bank otherwise, at
    bank row ``addr >> 1``.  Operand sums needed for the middle Karatsuba
    product are written to scratch banks that use the same addressing.

    Args:
        a_evil, a_odious: Evil / odious banks of operand A.
        b_evil, b_odious: Evil / odious banks of operand B.
        n_words: Operand length in words; must be a power of two >= 2.

    Returns:
        list[int]: ``2 * n_words`` product words, least-significant first.

    Raises:
        AssertionError: If ``n_words`` is not a power of two >= 2.
    """
    assert n_words % 2 == 0
    # Every level halves the operand.  A size that is not a power of two
    # eventually becomes odd and the halving silently drops a word; a size of
    # zero would re-push zero-sized work forever.
    assert n_words >= 2 and (n_words & (n_words - 1)) == 0, \
        "n_words must be a power of two >= 2"

    bank_depth = n_words // 2
    n_words_2x = 2 * n_words

    # Scratch banks holding (A_lo ^ A_hi, B_lo ^ B_hi) for the P1 products.
    a_scr_evil = [0] * bank_depth
    a_scr_odious = [0] * bank_depth
    b_scr_evil = [0] * bank_depth
    b_scr_odious = [0] * bank_depth

    # Finished sub-products keyed by (size_words, start_logical).
    result_mem = {}
    # Work items: (size_words, start_logical, stage); stage 0 = split/leaf,
    # stage 1 = combine.
    stack = [(n_words, 0, 0)]

    def read_input(logical_addr):
        bank = popcount_parity(logical_addr)   # 0 evil, 1 odious
        baddr = logical_addr >> 1
        if bank == 0:
            return a_evil[baddr], b_evil[baddr]
        return a_odious[baddr], b_odious[baddr]

    def read_scratch(logical_addr):
        bank = popcount_parity(logical_addr)
        baddr = logical_addr >> 1
        if bank == 0:
            return a_scr_evil[baddr], b_scr_evil[baddr]
        return a_scr_odious[baddr], b_scr_odious[baddr]

    def write_scratch(logical_addr, a_val, b_val):
        bank = popcount_parity(logical_addr)
        baddr = logical_addr >> 1
        if bank == 0:
            a_scr_evil[baddr] = a_val
            b_scr_evil[baddr] = b_val
        else:
            a_scr_odious[baddr] = a_val
            b_scr_odious[baddr] = b_val

    while stack:
        size, start, stage = stack.pop()
        half = size >> 1

        # Start addresses >= n_words are tags for operands held in scratch.
        use_scratch = start >= n_words
        base_addr = (start - n_words) if use_scratch else start
        read_operand = read_scratch if use_scratch else read_input

        if stage == 0:
            if size == 1:
                # Leaf: one 32x32 carry-less multiply -> two result words.
                a_val, b_val = read_operand(base_addr)
                prod = schoolbook_mul32(a_val, b_val)
                result_mem[(size, start)] = [
                    prod & 0xFFFFFFFF,
                    (prod >> 32) & 0xFFFFFFFF,
                ]
            else:
                # Scratch region for this size: [n_words - size, n_words - half).
                scratch_base_local = n_words - size
                scratch_logical = n_words_2x - size

                for i in range(half):
                    a_lo, b_lo = read_operand(base_addr + i)
                    a_hi, b_hi = read_operand(base_addr + half + i)
                    write_scratch(scratch_base_local + i,
                                  a_lo ^ a_hi, b_lo ^ b_hi)

                # Pushed in reverse so execution order is P0, P2, P1, combine.
                stack.append((size, start, 1))
                stack.append((half, scratch_logical, 0))  # P1
                stack.append((half, start + half, 0))     # P2
                stack.append((half, start, 0))            # P0
        else:
            # Each child result is consumed exactly once, so remove it here
            # instead of keeping every intermediate product alive.
            p0 = result_mem.pop((half, start))
            p2 = result_mem.pop((half, start + half))
            p1 = result_mem.pop((half, n_words_2x - size))

            # All three children are `size` words long (2 * half).
            res = [0] * (size << 1)
            for i, (w0, w1, w2) in enumerate(zip(p0, p1, p2)):
                res[i] ^= w0                    # P0 at word offset 0
                res[half + i] ^= w0 ^ w1 ^ w2   # middle term at offset half
                res[size + i] ^= w2             # P2 at offset 2 * half

            result_mem[(size, start)] = res

    return result_mem[(n_words, 0)]



In [None]:
def karatsuba_gf2_hw_model(a_evil, a_odious, b_evil, b_odious, n_words=2048,
                           *, debug=False):
    """Explicit-stack Karatsuba GF(2) multiplier model using a result stack.

    Each operand is ``n_words`` 32-bit words split over two banks: logical
    word address ``addr`` lives in the "evil" bank when
    ``popcount_parity(addr) == 0`` and in the "odious" bank otherwise, at
    bank row ``addr >> 1``.  Finished sub-products are kept on a stack in
    depth-first completion order rather than in a keyed dictionary.

    Args:
        a_evil, a_odious: Evil / odious banks of operand A.
        b_evil, b_odious: Evil / odious banks of operand B.
        n_words: Operand length in words; must be a power of two >= 2.
        debug: When true, print the depth of both stacks after every work
            item.  Off by default because the line is emitted once per work
            item.

    Returns:
        list[int]: ``2 * n_words`` product words, least-significant first.

    Raises:
        AssertionError: If ``n_words`` is not a power of two >= 2.
    """
    assert n_words % 2 == 0
    # A non-power-of-two size eventually becomes odd and the halving drops a
    # word; a size of zero would re-push zero-sized work forever.
    assert n_words >= 2 and (n_words & (n_words - 1)) == 0, \
        "n_words must be a power of two >= 2"

    bank_depth = n_words // 2
    n_words_2x = 2 * n_words

    # Scratch banks holding (A_lo ^ A_hi, B_lo ^ B_hi) for the P1 products.
    a_scr_evil = [0] * bank_depth
    a_scr_odious = [0] * bank_depth
    b_scr_evil = [0] * bank_depth
    b_scr_odious = [0] * bank_depth

    # Completed sub-products (word lists) in DFS completion order.
    res_stack = []
    # Work items: (size_words, start_logical, stage); stage 0 = split/leaf,
    # stage 1 = combine.
    stack = [(n_words, 0, 0)]

    def read_input(logical_addr):
        bank = popcount_parity(logical_addr)   # 0 evil, 1 odious
        baddr = logical_addr >> 1
        if bank == 0:
            return a_evil[baddr], b_evil[baddr]
        return a_odious[baddr], b_odious[baddr]

    def read_scratch(logical_addr):
        bank = popcount_parity(logical_addr)
        baddr = logical_addr >> 1
        if bank == 0:
            return a_scr_evil[baddr], b_scr_evil[baddr]
        return a_scr_odious[baddr], b_scr_odious[baddr]

    def write_scratch(logical_addr, a_val, b_val):
        bank = popcount_parity(logical_addr)
        baddr = logical_addr >> 1
        if bank == 0:
            a_scr_evil[baddr] = a_val
            b_scr_evil[baddr] = b_val
        else:
            a_scr_odious[baddr] = a_val
            b_scr_odious[baddr] = b_val

    while stack:
        size, start, stage = stack.pop()
        half = size >> 1

        # Start addresses >= n_words are tags for operands held in scratch.
        use_scratch = start >= n_words
        base_addr = (start - n_words) if use_scratch else start
        read_operand = read_scratch if use_scratch else read_input

        if stage == 0:
            if size == 1:
                # Leaf: one 32x32 carry-less multiply -> two result words.
                a_val, b_val = read_operand(base_addr)
                prod = schoolbook_mul32(a_val, b_val)
                res_stack.append([prod & 0xFFFFFFFF,
                                  (prod >> 32) & 0xFFFFFFFF])
            else:
                # Scratch region for this size: [n_words - size, n_words - half).
                scratch_base_local = n_words - size
                scratch_logical = n_words_2x - size

                for i in range(half):
                    a_lo, b_lo = read_operand(base_addr + i)
                    a_hi, b_hi = read_operand(base_addr + half + i)
                    write_scratch(scratch_base_local + i,
                                  a_lo ^ a_hi, b_lo ^ b_hi)

                # Push the combine marker first, then the children, so the
                # popped execution order is P0, P2, P1, combine.
                stack.append((size, start, 1))             # combine
                stack.append((half, scratch_logical, 0))   # P1
                stack.append((half, start + half, 0))      # P2
                stack.append((half, start, 0))             # P0
        else:
            # Children completed in the order P0, P2, P1, so they come off
            # the result stack as P1, P2, P0.
            p1 = res_stack.pop()
            p2 = res_stack.pop()
            p0 = res_stack.pop()

            # All three children are `size` words long (2 * half).
            res = [0] * (size << 1)
            for i, (w0, w1, w2) in enumerate(zip(p0, p1, p2)):
                res[i] ^= w0                    # P0 at word offset 0
                res[half + i] ^= w0 ^ w1 ^ w2   # middle term at offset half
                res[size + i] ^= w2             # P2 at offset 2 * half

            res_stack.append(res)

        if debug:
            print("work_stack:", len(stack), "res_stack:", len(res_stack))

    # The only remaining entry is the full product.
    return res_stack.pop()


In [5]:
def int_to_words(val: int, word_size=32) -> list:
    """Split a non-negative integer into fixed-width words, low word first.

    Args:
        val: Non-negative integer to split.
        word_size: Width of each word in bits; must be positive.

    Returns:
        list: Words of ``word_size`` bits each, least-significant first, up
        to and including the most significant non-zero word.  ``[0]`` when
        ``val`` is zero.

    Raises:
        ValueError: If ``val`` is negative or ``word_size`` is not positive.
    """
    # A negative value never reaches 0 under arithmetic right shift, and a
    # zero word size never shrinks the value: both would loop forever.
    if val < 0:
        raise ValueError(f"val must be non-negative, got {val}")
    if word_size <= 0:
        raise ValueError(f"word_size must be positive, got {word_size}")

    mask = (1 << word_size) - 1
    words = []
    while val:
        words.append(val & mask)
        val >>= word_size
    return words or [0]

verification

In [6]:
# 1. Load each operand directly into a Python integer.
#    Each file is read in full, surrounding whitespace is stripped, and the
#    remaining text is parsed as one hexadecimal number (no loop or chunking
#    required).
with open("A_full.hex", "r") as f:
    a_int = int(f.read().strip(), 16)

with open("B_full.hex", "r") as f:
    b_int = int(f.read().strip(), 16)

# Report the bit length of each operand as a quick sanity check.
print(f"Loaded A: {a_int.bit_length()} bits")
print(f"Loaded B: {b_int.bit_length()} bits")

Loaded A: 65536 bits
Loaded B: 65536 bits


In [None]:
# Split both operands into 32-bit words (int_to_words default word size),
# least-significant word first.
a_words = int_to_words(val=a_int)
# Optional dumps; run them only after both lists below/above are defined.
# print("a_words =", [hex(w) for w in a_words])
# print("b_words =", [hex(w) for w in b_words])
b_words = int_to_words(val=b_int)

In [13]:
def karatsuba_gf2_hw_model(a_evil, a_odious, b_evil, b_odious, n_words=2048):
    """Explicit-stack Karatsuba GF(2) multiplier model with paged results.

    Operand words are fetched from two banks per operand: logical address
    ``addr`` selects the bank with ``popcount_parity(addr)`` (0 = evil,
    1 = odious) and the row with ``addr >> 1``.  Sub-products are stored in a
    pool of 1024-word pages; a result is referred to by a handle
    ``(page_ids, length_in_words)`` and the pages of consumed children are
    returned to the pool for reuse.

    Args:
        a_evil, a_odious: Evil / odious banks of operand A.
        b_evil, b_odious: Evil / odious banks of operand B.
        n_words: Operand length in words (asserted to be even and to have at
            most one bit set).

    Returns:
        list[int]: ``2 * n_words`` product words, least-significant first.
    """
    assert n_words % 2 == 0
    assert (n_words & (n_words - 1)) == 0
    rows_per_bank = n_words // 2
    scratch_tag_end = 2 * n_words

    # Banks are indexed by parity: [0] is evil, [1] is odious.
    a_inputs = (a_evil, a_odious)
    b_inputs = (b_evil, b_odious)
    a_scratch = ([0] * rows_per_bank, [0] * rows_per_bank)
    b_scratch = ([0] * rows_per_bank, [0] * rows_per_bank)

    # ---- page pool for results -------------------------------------------
    PAGE_WORDS = 1024
    pages = []       # every page ever created, each a list of PAGE_WORDS words
    spare_ids = []   # stack of page indices available for reuse

    def new_handle(length_words):
        """Reserve enough pages for ``length_words`` words; return a handle."""
        page_count = (length_words + PAGE_WORDS - 1) // PAGE_WORDS
        ids = []
        for _ in range(page_count):
            if spare_ids:
                # Reused pages are not cleared; callers write every word.
                ids.append(spare_ids.pop())
            else:
                pages.append([0] * PAGE_WORDS)
                ids.append(len(pages) - 1)
        return (ids, length_words)

    def store(handle, idx, value):
        page_no, offset = divmod(idx, PAGE_WORDS)
        pages[handle[0][page_no]][offset] = value

    def load(handle, idx):
        page_no, offset = divmod(idx, PAGE_WORDS)
        return pages[handle[0][page_no]][offset]

    def release(handle):
        spare_ids.extend(handle[0])

    # ---- banked operand access -------------------------------------------
    def fetch(logical_addr, from_scratch):
        bank = popcount_parity(logical_addr)
        row = logical_addr >> 1
        if from_scratch:
            return a_scratch[bank][row], b_scratch[bank][row]
        return a_inputs[bank][row], b_inputs[bank][row]

    def stash(logical_addr, a_val, b_val):
        bank = popcount_parity(logical_addr)
        row = logical_addr >> 1
        a_scratch[bank][row] = a_val
        b_scratch[bank][row] = b_val

    # ---- depth-first schedule --------------------------------------------
    pending = [(n_words, 0, 0)]   # (size, start, stage)
    done = []                     # result handles in completion order

    while pending:
        size, start, stage = pending.pop()
        half = size >> 1

        # Start values >= n_words are tags for operands held in scratch.
        in_scratch = start >= n_words
        base = start - n_words if in_scratch else start

        if stage:
            # Children finished as P0, P2, P1, so they pop as P1, P2, P0.
            mid_h = done.pop()    # P1
            high_h = done.pop()   # P2
            low_h = done.pop()    # P0

            child_words = 2 * half
            out_words = 2 * size
            out_h = new_handle(out_words)

            # Low part is P0; everything above it starts at zero.
            for i in range(out_words):
                store(out_h, i, load(low_h, i) if i < child_words else 0)

            # Middle term (P0 ^ P1 ^ P2) is XORed in at word offset `half`.
            for i in range(child_words):
                cross = load(low_h, i) ^ load(mid_h, i) ^ load(high_h, i)
                pos = half + i
                store(out_h, pos, load(out_h, pos) ^ cross)

            # P2 is XORed in at word offset 2 * half.
            for i in range(child_words):
                pos = child_words + i
                store(out_h, pos, load(out_h, pos) ^ load(high_h, i))

            # Child pages go back to the pool for later results.
            release(low_h)
            release(mid_h)
            release(high_h)

            done.append(out_h)
            continue

        if size == 1:
            # Leaf: one 32x32 carry-less multiply -> two result words.
            a_val, b_val = fetch(base, in_scratch)
            prod = schoolbook_mul32(a_val, b_val)
            leaf_h = new_handle(2)
            store(leaf_h, 0, prod & 0xFFFFFFFF)
            store(leaf_h, 1, (prod >> 32) & 0xFFFFFFFF)
            done.append(leaf_h)
            continue

        # Split: write (lo ^ hi) of both operands to this size's scratch area.
        sum_base = n_words - size
        sum_tag = scratch_tag_end - size
        for i in range(half):
            a_lo, b_lo = fetch(base + i, in_scratch)
            a_hi, b_hi = fetch(base + half + i, in_scratch)
            stash(sum_base + i, a_lo ^ a_hi, b_lo ^ b_hi)

        # Pushed in reverse so execution order is P0, P2, P1, combine.
        pending.append((size, start, 1))
        pending.append((half, sum_tag, 0))        # P1
        pending.append((half, start + half, 0))   # P2
        pending.append((half, start, 0))          # P0

    final_h = done.pop()
    # The final pages are intentionally left allocated.
    return [load(final_h, i) for i in range(final_h[1])]


In [14]:
# ── Verify: karatsuba_gf2_hw_model matches the gf2_mul_to_int reference ──
# Word lists for both operands, least-significant word first.
aw = int_to_words(a_int)
bw = int_to_words(b_int)

# ════════════════════════════════════════════════════════════════════
# BRANCHLESS EVIL/ODIOUS SPLIT (direct formula, no if/else)
# ════════════════════════════════════════════════════════════════════
def split_evil_odious(words: list) -> tuple:
    """Distribute ``words`` over an evil and an odious bank without branching.

    For every pair index ``k`` the word at ``2k + parity(k)`` goes to the evil
    bank and the word at ``2k + 1 - parity(k)`` goes to the odious bank, where
    ``parity`` is ``popcount_parity``.  Returns ``(evil, odious)``, each of
    length ``len(words) // 2``.
    """
    pair_count = len(words) // 2
    parities = [popcount_parity(k) for k in range(pair_count)]
    # evil(k) = 2k + parity(k)
    evil = [words[2 * k + p] for k, p in enumerate(parities)]
    # odious(k) = 2k + 1 - parity(k)
    odious = [words[2 * k + 1 - p] for k, p in enumerate(parities)]
    return evil, odious

# Distribute each operand's words over its evil / odious banks.
a_evil, a_odious = split_evil_odious(aw)
b_evil, b_odious = split_evil_odious(bw)


# Product from the hardware model (default n_words) and from the reference.
# NOTE: int_to_words stops at the most significant non-zero word, so the
# equality below also requires the top word of the product to be non-zero.
result = karatsuba_gf2_hw_model(a_evil, a_odious, b_evil, b_odious)
c_ref  = int_to_words(gf2_mul_to_int(a_int, b_int))


# Dump both word lists in hex and report whether they are identical.
print("hw ver   :", [hex(w) for w in result])
print("reference:", [hex(w) for w in c_ref])
print("hw == ref:", result == c_ref)

hw ver   : ['0x71024370', '0x655f6a41', '0x802134de', '0x8ed7ad94', '0x45e39edd', '0x9efc0237', '0x1f53d90', '0xfa5dfd1c', '0xc98055d1', '0x2def3b62', '0xd83d6b43', '0x5fa7f792', '0x6e033507', '0x819940a1', '0xdc3bb848', '0xa22b886c', '0x34e36ac7', '0xb6f7a3df', '0x6e7c8f67', '0x187bf39a', '0x6b93cc1b', '0x678387ac', '0x7c382795', '0xf28ebfa9', '0xfac7cfe', '0xf739ac19', '0xe86b300', '0xb538d96f', '0xf4871e27', '0xfc59ca7e', '0xee8bb97a', '0x706d6155', '0x68163c34', '0xf9e998b0', '0x9839cfda', '0xf3934b35', '0xc5aad0a0', '0x4fd54223', '0xd0c6dabf', '0xa8a5706e', '0xa56f2b73', '0x71f45ea3', '0xe5dd6cca', '0xfc6a81c', '0xb724596d', '0x2a307259', '0x301f17eb', '0xb2e6e69a', '0xd471eccc', '0x4e516668', '0x198b7a2c', '0x80a44654', '0x62329d04', '0x684c6ff0', '0x6651c31f', '0x8d1be409', '0xca93c44a', '0x56cffaa0', '0xb4eb175b', '0x88f0bc2a', '0x7d965a67', '0x7ae39306', '0x42812515', '0x6650f0ee', '0xb85bc4ff', '0x9761f371', '0xfd2657ac', '0xabc6d543', '0x9cb11273', '0x1d3806f8', '0xebe9a5ad'