## Simulation for Multi-Lane NTT/INTT

In [1]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
from sympy.ntheory.residue_ntheory import nthroot_mod
import os
import time
from multiprocessing import Pool

In [2]:
def egcd(a, b):
    """
    Extended Euclidean algorithm.

    input (a, b)
    a, b: integers

    Returns a tuple (g, x, y) such that a*x + b*y == g, where g is the
    last non-zero remainder of the Euclidean division chain (the gcd for
    non-negative inputs).

    The computation is iterative so that very large operands cannot
    exhaust the interpreter's recursion limit; the returned coefficients
    are the same ones the recursive formulation produces.
    """
    # Forward pass: run the Euclidean chain (a, b) -> (b % a, a) and
    # remember every quotient.
    quotients = []
    while a != 0:
        quotients.append(b // a)
        a, b = b % a, a

    # Base case of the recursion: egcd(0, b) == (b, 0, 1).
    g, x, y = b, 0, 1

    # Backward pass: unwind the chain, innermost step first.
    for quotient in reversed(quotients):
        x, y = y - quotient * x, x

    return (g, x, y)


In [3]:
def modinv(a, m):
    '''
    Modular inverse of a modulo m.

    input (a, m)
    a: value to invert
    m: Modulus

    Returns the inverse reduced into the range of `% m`.
    Raises Exception when egcd reports a gcd other than 1, i.e. when
    no inverse exists.
    '''
    gcd, coeff_a, _ = egcd(a, m)
    if gcd == 1:
        # coeff_a may be negative; the modulo brings it back into range.
        return coeff_a % m
    raise Exception('Modular inverse does not exist')

In [4]:
# Bit-Reverse integer
def bit_reverse(a, n):
    """
    Reverse the binary representation of `a`, zero-padded to `n` digits.

    a: integer to reverse
    n: minimum number of binary digits used for the padding

    Returns the integer whose binary digits are those of `a` read
    backwards.
    """
    spec = '0' + str(n) + 'b'   # e.g. '016b': zero-padded binary, width n
    digits = format(a, spec)
    return int(digits[::-1], 2)

def indexReverse(B, v):
    '''
    Permute a sequence by bit-reversing each index.

    B : NTT result in bit-reverse order (BO)
    v : number of bits used when reversing an index

    Returns a new list in which the element at position i of B has been
    moved to position bit_reverse(i, v). B itself is not modified.
    '''
    size = len(B)

    # Destination of every source position.
    targets = [bit_reverse(src, v) for src in range(size)]

    permuted = [0] * size
    for src in range(size):
        permuted[targets[src]] = B[src]

    return permuted

# # Example usage
# B = [0, 1, 2, 3]
# v = 2

# reversed_B = indexReverse(B, v)
# print(reversed_B)

In [5]:
def tfg(n, q):
    """
    Generate the table of powers of psi modulo q.

    psi is obtained from nthroot_mod(-1, n, q), i.e. a value whose n-th
    power is congruent to -1 modulo q.

    n: number of table entries (and the root order passed to nthroot_mod)
    q: modulus

    Returns a list Y_table of length n with Y_table[i] = psi**i mod q.

    Raises ValueError when nthroot_mod finds no such root. Previously this
    case printed a message and then failed with UnboundLocalError on the
    return statement.
    """
    root = nthroot_mod(-1, n, q)  # looked up once and reused
    if root is None:
        raise ValueError("nthroot_mod not found.")

    psi = int(root)

    Y_table = [0] * n
    Y_table[0] = 1  # psi^0

    # Each entry is the previous one multiplied by psi.
    for i in range(1, n):
        Y_table[i] = (psi * Y_table[i-1]) % q

    return Y_table

In [6]:
def tfg_table(psi, n, q):
    """
    Build the table of successive powers of psi modulo q.

    psi: base value
    n: number of entries to generate
    q: modulus

    Returns a list of length n whose entry i is psi**i reduced modulo q
    (entry 0 is stored as the literal 1).
    """
    powers = [0] * n
    powers[0] = 1  # psi^0

    # Fill forward: entry k+1 is entry k times psi.
    for k in range(n - 1):
        powers[k + 1] = (powers[k] * psi) % q

    return powers

In [7]:
# Cooley-Tukey Butterfly Structure
# A0,A1: input coefficients
# W: twiddle factor
# q: modulus
# B0,B1: output coefficients

# Optional JIT compilation of the butterfly (decorator currently disabled)
# @njit
def CT_Butterfly(A0,A1,W,q):
    r"""
    Cooley-Tukey butterfly: multiply first, then add/subtract.

    A0 -------\--|+|-- B0
               \/
               /\
    A1 --|x|--/--|-|-- B1

    A0, A1: input coefficients
    W: twiddle factor
    q: modulus

    Returns (B0, B1) = ((A0 + A1*W) mod q, (A0 - A1*W) mod q).
    """
    scaled = (A1 * W) % q
    return (A0 + scaled) % q, (A0 - scaled) % q

In [8]:
# Gentleman-Sande Butterfly Structure
# A0,A1: input coefficients
# W: twiddle factor
# q: modulus
# B0,B1: output coefficients
def GS_Butterfly(A0,A1,W,q):
    r"""
    Gentleman-Sande butterfly: add/subtract first, then multiply.

    A0 --\--|+|------- B0
          \/
          /\
    A1 --/--|-|--|x|-- B1

    A0, A1: input coefficients
    W: twiddle factor
    q: modulus

    Returns (B0, B1) = ((A0 + A1) mod q, ((A0 - A1) mod q) * W mod q).
    """
    total = (A0 + A1) % q
    delta = (A0 - A1) % q
    return total, (delta * W) % q

In [9]:
def DIV2(n,q):
    """
    Halve n, compensating with (q + 1) >> 1 when n is odd.

    n: value to halve
    q: modulus

    Even n is simply shifted right by one. For odd n the dropped bit is
    replaced by adding (q + 1) >> 1.
    NOTE(review): this equals n/2 modulo q only when q is odd -- confirm
    that all moduli used are odd.
    """
    if n % 2 == 0:
        return n >> 1
    # Odd n: shift, then add the modular adjustment.
    return (n >> 1) + ((q + 1) >> 1)

In [10]:
def GS_BU_DIV2(A0,A1,W,q):
    """
    Gentleman-Sande butterfly whose two outputs are each halved by DIV2.

    A0, A1: input coefficients
    W: twiddle factor
    q: modulus

    Returns the tuple (DIV2(B0, q), DIV2(B1, q)) where (B0, B1) is the
    result of GS_Butterfly(A0, A1, W, q).
    """
    return tuple(DIV2(out, q) for out in GS_Butterfly(A0,A1,W,q))

In [11]:
def parallel_GS_layer(coeff_pairs, twiddle_factors, q):
    '''
    Run one batch of 16 halved Gentleman-Sande butterflies.

    Models a single cycle of 16 butterfly units working side by side;
    in this simulation the units are evaluated one after another.

    Args:
        coeff_pairs: sequence whose first 16 entries are (a, b) coefficient pairs
        twiddle_factors: sequence whose first 16 entries are the twiddle factors
        q: modulus

    Returns:
        list of 32 values; entries 2*k and 2*k + 1 hold the two outputs
        of butterfly unit k, each converted to a plain Python scalar.
    '''
    lanes = 16
    outputs = [None] * (2 * lanes)

    for lane in range(lanes):
        pair = coeff_pairs[lane]
        upper = pair[0]
        lower = pair[1]
        twiddle = twiddle_factors[lane]

        # One butterfly unit: GS butterfly followed by halving.
        top, bottom = GS_BU_DIV2(upper, lower, twiddle, q)

        # .item() unwraps NumPy scalars / 0-d arrays into Python scalars.
        outputs[2 * lane] = np.asarray(top).item()
        outputs[2 * lane + 1] = np.asarray(bottom).item()

    return outputs

In [12]:
def INTT(hatA, Y_table, q):
    """
    Inverse NTT of a 65536-point input, processed 16 butterflies at a time.

    Simulates a hardware datapath with 16 parallel butterfly units: every
    stage is cut into chunks of 16 butterflies, each chunk is handed to
    parallel_GS_layer, and the results are written back in place.

    hatA: input coefficients; must provide .copy() and be indexable for
          positions 0..65535
    Y_table: twiddle table, looked up at bit_reverse(m + i, 16)
    q: modulus

    Returns the transformed copy of hatA; hatA itself is left untouched.
    No separate 1/n scaling is applied at the end because every butterfly
    already halves its outputs (via parallel_GS_layer -> GS_BU_DIV2) in
    each of the 16 stages.
    """
    LANES = 16
    n = 65536
    log_n = 16
    butterflies_per_stage = n // 2                       # 32768
    chunks_per_stage = butterflies_per_stage // LANES    # 2048

    a = hatA.copy()

    for stage in range(log_n):
        t = 1 << stage            # distance between the two butterfly inputs
        m = n >> (stage + 1)      # number of butterfly groups in this stage
        low_bits = t - 1

        for chunk_id in range(chunks_per_stage):
            first_op = chunk_id * LANES

            index_pairs = []
            operands = []
            twiddles = []

            # Gather the operands and twiddles for the 16 lanes of this chunk.
            for lane in range(LANES):
                op = first_op + lane
                group = op >> stage
                lo = (group << (stage + 1)) + (op & low_bits)
                hi = lo + t

                index_pairs.append((lo, hi))
                operands.append((a[lo], a[hi]))
                twiddles.append(Y_table[bit_reverse(m + group, 16)])

            outputs = parallel_GS_layer(operands, twiddles, q)

            # Scatter the results back to the positions they were read from.
            for lane, (lo, hi) in enumerate(index_pairs):
                a[lo] = outputs[2 * lane]
                a[hi] = outputs[2 * lane + 1]

    return a  # No final scaling — correct as-is

### INTT

In [13]:
# Load the inverse-psi table from CSV into a DataFrame.
# A later cell takes row 0 of it (inv_psi_table.iloc[0]) as the twiddle table.
inv_psi_table = pd.read_csv("psi_inv_partp_df.csv")

In [14]:
# Load the moduli from CSV.
qp = pd.read_csv("qp.csv")
# dtype=object keeps the entries as Python objects instead of fixed-width
# NumPy integers; flatten() collapses the table to one dimension.
qp = qp.to_numpy(dtype=object).flatten()  # 1-D array of moduli
# Entries 24..31 (at most eight moduli).
# NOTE(review): presumably the "partp" group matching the other *_partp
# inputs -- confirm the slice bounds against the data layout.
qp_partp = qp[24:32]

In [15]:
# Per-modulus constant floor(2**96 / q), computed elementwise over qp.
# NOTE(review): looks like a Barrett-reduction constant for 48-bit moduli -- confirm.
mu = (1 << (48*2)) // qp
# Re-wrap as an object-dtype array.
mu = np.array(mu, dtype=object)

In [16]:
# Load the input data (cTilda1, "partp" part) from CSV.
cTilda1_partp = pd.read_csv("cTilda1_partp.csv")
# Convert to an object-dtype NumPy array. Nothing is flattened here;
# a later cell indexes it by row (cTilda1_partp[0]).
cTilda1_partp = cTilda1_partp.to_numpy(dtype=object)

In [17]:
# # INTT in ModUp before BConv
# cTilda1_coef = [[] for i in range(len(cTilda1_partp))]
# ringDim = len(cTilda1_partp[0])


# for i in range(len(cTilda1_partp)):
#     a = cTilda1_partp[i] # EVAL form
#     qi = qp_partp[i]
#     psi_inv = inv_psi_table.iloc[i].tolist()  # Convert to list for compatibility with INTT
#     result = INTT(a, indexReverse(psi_inv, int(math.log2(len(psi_inv)))), qi)
#     cTilda1_coef[i] = result
    
# cTilda1_coef = np.array(cTilda1_coef, dtype=object)

In [18]:
# First row of the input data, used as the INTT input.
EVAL_a = cTilda1_partp[0] # EVAL form
# First row of the inverse-psi table as a plain Python list.
psi_inv = inv_psi_table.iloc[0].tolist()

In [19]:
# qi = qp_partp[0]
# COEF_a = INTT(EVAL_a, psi_inv, qi)

Simulation for 1 lane INTT

In [20]:
def INTT_16BU(hatA, Y_table, q):
    """
    Parallel INTT using 16 Butterfly Units (BUs)
    Simulates hardware implementation with 16 parallel butterfly units

    The transform size is fixed at n = 65536 (16 stages). Each stage's
    32768 butterflies are processed in 2048 chunks of 16; every chunk is
    evaluated by parallel_GS_layer and written back in place.

    hatA: input coefficients; must provide .copy() and be indexable for
          positions 0..65535
    Y_table: twiddle table, looked up at bit_reverse(m + i, 16)
    q: modulus

    Returns the transformed copy of hatA; hatA itself is not modified.
    No final 1/n scaling is needed because parallel_GS_layer halves both
    outputs of every butterfly (GS_BU_DIV2) in each of the 16 stages.
    """
    BU_COUNT = 16
    a = hatA.copy()
    n = 65536
    stages = 16
    total_ops = n // 2  # 32768
    num_chunks = total_ops // BU_COUNT  # 2048

    for stage in range(stages):
        t = 1 << stage            # distance between the two butterfly inputs
        m = n >> (stage + 1)      # number of butterfly groups in this stage
        t_mask = t - 1
        shift_amt = stage
        j_shift_amt = stage + 1

        for chunk_id in range(num_chunks):
            coeff_pairs = []
            twiddle_factors = []
            # (j, j + t) positions of this chunk, kept so the write-back
            # does not have to redo the index arithmetic.
            positions = []

            for bu_id in range(BU_COUNT):
                global_op_idx = (chunk_id << 4) + bu_id

                i = global_op_idx >> shift_amt        # butterfly group
                offset = global_op_idx & t_mask       # position inside group
                j = (i << j_shift_amt) + offset
                j_plus_t = j + t

                # Twiddles are fetched in bit-reversed order.
                twiddle_idx = bit_reverse(m + i, 16)

                positions.append((j, j_plus_t))
                coeff_pairs.append((a[j], a[j_plus_t]))
                twiddle_factors.append(Y_table[twiddle_idx])

            # Process 16 butterfly operations in parallel (simulating hardware)
            processed = parallel_GS_layer(coeff_pairs, twiddle_factors, q)

            # Write each BU's two outputs back to where its inputs came from.
            for bu_id, (j, j_plus_t) in enumerate(positions):
                a[j] = processed[bu_id * 2]
                a[j_plus_t] = processed[bu_id * 2 + 1]

    return a  # No final scaling — correct as-is

In [21]:
# Modulus paired with row 0 of the input data.
qi = qp_partp[0]
# Run the 16-BU INTT on the EVAL-form row. psi_inv is passed as loaded;
# INTT_16BU applies bit_reverse to the twiddle index itself.
COEF_a = INTT_16BU(EVAL_a, psi_inv, qi)
# Display the result as a DataFrame (notebook cell output).
pd.DataFrame(COEF_a)

Unnamed: 0,0
0,38405975504760
1,137191805260920
2,55793489809391
3,49582118286039
4,14812524024336
...,...
65531,242678592560382
65532,179247914006390
65533,97690931196823
65534,203964154682497
