In [5]:
# Resource: https://ieeexplore.ieee.org/document/6467863

import math
import time

In [6]:
def Improved_Folding_Barrett_Reduction(A, B, P):
    # Pre-computation
    k = math.ceil(math.log2(P))
    s = k // 2
    mu = 2**(3 * s + 3) // P  # Calculate mu using modular exponentiation
    P0 = 2**(3 * s) % P  # Calculate P0 using modular exponentiation

    # Split the numbers into halves
    A_L, A_H = A & ((1 << s) - 1), A >> s  # A = A_H * 2^s + A_L
    B_L, B_H = B & ((1 << s) - 1), B >> s  # B = B_H * 2^s + B_L

    # Barrett Reduction
    q1 = (A_H * B_H) >> s  # Step 1: q1 ← (A_H * B_H) // 2^s
    q2 = q1 * P0  # Step 2: q2 ← q1 * P0 mod P
    q3 = pow(2, 2 * s) * (A_H * B_H % (1 << s)) + pow(2, s) * ((A_H + A_L) * (B_H + B_L) - A_H * B_H - A_L * B_L) + A_L * B_L   # Step 3: q3 
    X00 = (q2 + q3)  # Step 4: X00 ← (q2 + q3) mod P
    q4 = X00 >> (2 * s - 2)  # Step 5: q4 ← X00 // 2^(2s+2)
    q5 = q4 * mu  # Step 6: q5 ← q4 * mu mod P
    q6 = q5 >> (s + 5)  # Step 7: q6 ← q5 // 2^(s+5)
    r1 = (X00 - (q6 * P)) % (1 << (2 * s + 1))  # Step 8: r1 ← X00 mod 2^(2s+1) - (q6 * P) mod 2^(2s+1)
    if (r1 > P):
        r = r1 - P  # Step 9: r2 ← (r1 - P) mod P
    r = r1
    # Step 10: Choose {r ∈ {r1, r2} | 0 ≤ r < P}
    return r

In [7]:
# Example usage
A = 156582880785126
B = 212210236166232
P = 281474976710129

result = Improved_Folding_Barrett_Reduction(A, B, P)
print("Result:", result)
print("Expected:", (A * B) % P)

Result: 229733405869021
Expected: 229733405869021


In [246]:
def Split3_Folding_Barrett_Reduction(A, B, P):
    # Pre-computation
    k = math.ceil(math.log2(P))
    s = k // 2
    # print("s",s)
    mu = 2**(3 * s + 3) // P  # Calculate mu
    print("mu0", mu)
    P0 = 2**(3 * s) % P  # Calculate P0
    print("mu1",P0)

    # Split the numbers into three parts
    mask = (1 << s) - 1
    A_L = A & mask
    A_H = A >> (s)  # A = A_H * 2^(2s) + A_M * 2^s + A_L

    B_L = B & mask
    B_H = B >> (s)  # B = B_H * 2^(2s) + B_M * 2^s + B_L

    # Barrett Reduction
    q1 = (A_H * B_H) >> s  # Step 1: q1 ← (A_H * B_H) // 2^s
    print("z2", A_H * B_H)
    print("q1",q1)
    print(len(bin(q1)[2:]))
    q2 = q1 * P0  # Step 2: q2 ← q1 * P0 mod P
    print("q2",q2)
    q3 = 2**(2*s) * (A_H * B_H & mask) + 2**s * ((A_H + A_L) * (B_H + B_L) - A_H * B_H - A_L * B_L) + A_L * B_L   # Step 3: q3 
    print("q3",q3)
    X00 = (q2 + q3)  # Step 4: X00 ← (q2 + q3) mod P
    q4 = X00 >> (2 * s - 2)  # Step 5: q4 ← X00 // 2^(3s-2)
    print("X00",X00)
    print("q4",q4)
    q5 = q4 * mu >> (s + 5)  # Step 6: q5 ← q4 * mu mod P
    print("q5",q5)
    # q6 = q5 >> (s + 5)  # Step 7: q6 ← q5 // 2^(s+5)
    print("q6 bit",len(bin(q5)[2:]))
    # print("q6",q6)
    print("r0",q5 * P)
    r1 = (X00 - (q5 * P)) & ((1 << (2 * s + 1)) - 1)  # Step 8: r1 ← X00 mod 2^(3s+1) - (q6 * P) mod 2^(3s+1)
    print("r1",r1)
    if r1 >= P:
        r = r1 - P  # Step 9: r2 ← (r1 - P) mod P
    else:
        r = r1

    # Step 10: Choose {r ∈ {r1, r2} | 0 ≤ r < P}
    return r

In [247]:
# Example usage
# A = 156582880785126
A = 303379748
# B = 212210236166232
B = 281473911971984
P = 281474976710129
# P = 140737488355201

result = Split3_Folding_Barrett_Reduction(A, B, P)
print("Result:", result)
print("Expected:", (A * B) % P)
if result == ((A * B) % P): 
    print(True)
else:
    if result > (A * B) % P:
        print("bigger")
    else:
        print("smaller")

mu0 134217728
mu1 8841592832
z2 301988736
q1 17
5
q2 150307078144
q3 5113254273850720347200
X00 5113254274001027425344
q4 72663713
q5 18165928
q6 bit 25
r0 5113254160717880284712
r1 113283147140632
Result: 113283147140632
Expected: 113283147140632
True


In [59]:
# Resource: https://ieeexplore.ieee.org/document/6467863
def mod_mul(A, B, q, mu0, mu1, BW=48):
    k = BW // 2

    print(len(bin(mu0)[2:]), "bits in mu0")
    print(len(bin(mu1)[2:]), "bits in mu1")

    # Split A and B
    low1 = A & ((1 << k) - 1)
    high1 = A >> k
    low2 = B & ((1 << k) - 1)
    high2 = B >> k

    # Partial products
    z0 = low1 * low2
    z2 = high1 * high2
    low1high1 = low1 + high1
    low2high2 = low2 + high2
    z1 = low1high1 * low2high2

    # Compute q1
    q1 = z2 >> k
    print(len(bin(z0)[2:]), "bits in z0", z0)
    print(len(bin(z1)[2:]), "bits in z1", z1)
    print(len(bin(z2)[2:]), "bits in z2", z2)
    print(len(bin(q1)[2:]), "bits in q1", q1)

    # Construct q3
    z2_24bits = z2 & ((1 << 24) - 1)
    q3 = (z2_24bits << (2 * k)) + ((z1 - z0 - z2) << k) + z0

    # Compute x = q1 * mu1 + q3
    q2 = q1 * mu1
    x = q2 + q3

    # q4 = x >> (2k - 2)
    q4 = x >> (2 * k - 2)

    # q5 = q4 * mu0
    q5 = q4 * mu0

    print(len(bin(q2)[2:]), "bits in q2", q2)
    print(len(bin(q3)[2:]), "bits in q3", q3)
    print(len(bin(q4)[2:]), "bits in q4", q4)
    print(len(bin(q5)[2:]), "bits in q5", q5)
    print(len(bin(x)[2:]), "bits in x",     x)

    # r0 = q5[54:29] * q
    q5_extract = (q5 >> 29) & ((1 << 26) - 1)
    r0 = q5_extract * q

    # r1 = x - r0
    r1 = x - r0

    print(len(bin(r0)[2:]), "bits in r0", r0)
    print(len(bin(r1)[2:]), "bits in r1", r1)

    # Final modular reduction
    M = r1 - q if r1 >= q else r1
    return M & ((1 << BW) - 1)




In [60]:
# Example usage
A = 281472921425416
B = 281473683445346
q = 281474960326657  # near 2^48
mu0 = 134217735
mu1 = 274877890166784

result = mod_mul(A, B, q, mu0, mu1)
result

28 bits in mu0
48 bits in mu1
47 bits in z0 127656893128464
50 bits in z1 806240414862596
48 bits in z2 281471604499834
24 bits in q1 16777015
72 bits in q2 4611630486496487669760
73 bits in q3 6665133010197395316496
28 bits in q4 160252447
55 bits in q5 21508720464547545
74 bits in x 11276763496693882986256
74 bits in r0 11276763142237376303241
49 bits in r1 354456506683015


72981546356358

In [58]:
274877890166784*16777015

4611630486496487669760

In [52]:
A*B%q

72981546356358

In [50]:
# Example usage
A = 303379748
B = 281473911971984
q = 281474960326657  # near 2^48
mu0 = 134217735
mu1 = 274877890166784

result = mod_mul(A, B, q, mu0, mu1)
result

28 bits in mu0
48 bits in mu1
44 bits in z0
46 bits in z1
29 bits in z2
5 bits in q1
53 bits in q2
73 bits in q3
27 bits in q4
54 bits in q5
73 bits in x


17128659760006

In [28]:
A*B%q

17128659760006

In [38]:
# Example usage
A = 303379748
B = 281473911971984
q = 281474976317441  # near 2^48
mu0 = 134217728
mu1 = 6597052989440

result = mod_mul(A, B, q, mu0, mu1)
result

29 bits in z2
5 bits in q1


232416282817432

In [33]:
A*B%q

232416282817432

In [217]:
281474673330381*B

79227777423338614072448045904

In [220]:
(79227777423338614072448045904)%P

168191829569497

In [221]:
(0-168191829569497)%P

113283147140632

In [7]:
def karatsuba30_py(a: int, b: int) -> int:
    """
    Emulates the Karatsuba multiplication for:
    - a: 28-bit integer
    - b: 30-bit integer
    """

    assert 0 <= a < (1 << 28), "a must be 28-bit"
    assert 0 <= b < (1 << 30), "b must be 30-bit"

    # Split `a` into two 14-bit halves
    low1 = a & ((1 << 14) - 1)
    high1 = a >> 14

    # Split `b` into two 15-bit halves
    low2 = b & ((1 << 15) - 1)
    high2 = b >> 15

    # Partial products
    z0 = low1 * low2
    z2 = high1 * high2
    z1 = (low1 + high1) * (low2 + high2)

    # Final result using Karatsuba formula
    result = (z2 << 28) + ((z1 - z2 - z0) << 14) + z0

    return result


In [8]:
a = 0x0FFFFFF  # 28-bit example
b = 0x3FFFFFFF # 30-bit example

print("Result:", karatsuba30_py(a, b))
print("Expected:", a * b)  # should match


Result: 9007473578983425
Expected: 18014397418962945
