In [None]:
from numba import cuda
import numpy as np
from typing import List
import itertools

In [None]:
# Sample value sets whose set partitions are enumerated further below.
inputs = np.asarray([1.0, 2.0], dtype=np.float32)
outputs = np.asarray([1.0, 2.0, 3.0], dtype=np.float32)

In [None]:
def generate_partitions_matrix(arr: np.ndarray) -> np.ndarray:
    """Enumerate every set partition of ``arr`` and pack them into a float32 matrix.

    Each row encodes one partition: the values of each block are written one
    after another, and every block (including the last one) is followed by a
    ``0.0`` separator. Rows are zero-padded on the right to a common width and
    the final column stores the number of blocks of that partition.

    Caveats:
        * A ``0.0`` contained in ``arr`` cannot be told apart from a separator
          or from padding in the encoded rows.
        * The number of rows equals the number of set partitions of
          ``len(arr)`` elements, which grows very quickly with the length.

    Args:
        arr: One-dimensional array-like of numbers (ndarray, list, tuple, ...).

    Returns:
        A float32 matrix with one row per partition and
        ``2 * len(arr) + 1`` columns.

    Raises:
        ValueError: If ``arr`` is not one-dimensional.
    """
    # asarray (instead of arr.astype) also accepts plain lists and tuples.
    values = np.asarray(arr, dtype=np.float32)
    if values.ndim != 1:
        raise ValueError(
            f"arr must be one-dimensional, got {values.ndim} dimension(s)"
        )

    def generate_index_partitions(n: int) -> List[List[List[int]]]:
        """Generate all set partitions of indices [0, ..., n-1]"""
        partitions: List[List[List[int]]] = [[]]
        for index in range(n):
            extended = []
            for partition in partitions:
                # Either the new index opens a block of its own ...
                extended.append(partition + [[index]])
                # ... or it joins each of the existing blocks in turn.
                for position in range(len(partition)):
                    grown = [block.copy() for block in partition]
                    grown[position].append(index)
                    extended.append(grown)
            partitions = extended
        return partitions

    index_partitions = generate_index_partitions(len(values))

    # Encode each partition as block values followed by a 0.0 separator.
    encoded_rows = []
    for partition in index_partitions:
        cells = []
        for block in partition:
            cells.extend(values[block])
            cells.append(0.0)  # Separator between blocks
        encoded_rows.append(cells)

    # Widest encoded row decides the padding; one extra column holds the count.
    max_len = max(len(cells) for cells in encoded_rows)
    matrix = np.zeros((len(encoded_rows), max_len + 1), dtype=np.float32)
    for i, (cells, partition) in enumerate(zip(encoded_rows, index_partitions)):
        matrix[i, :len(cells)] = cells
        matrix[i, -1] = len(partition)  # Block count in the last column

    return matrix

In [None]:
# Enumerate the set partitions of both value arrays (one encoded partition per matrix row).
input_partitions, output_partitions = (
    generate_partitions_matrix(values) for values in (inputs, outputs)
)

In [None]:
input_partitions

array([[1., 0., 2., 0., 2.],
       [1., 2., 0., 0., 1.]], dtype=float32)

In [None]:
# Dimensions of the 3-D matrix that stores the results of one processing chunk
DEPTH = len(inputs) * len(outputs)  # first axis (front to back)
SIZE = 100_000  # second axis (top to bottom)
WIDTH = 2 * (len(inputs) + len(outputs))  # third axis (side to side)

# Define the CUDA kernel (GPU function)
# inputs should be potential_mappings_inputs
# outputs should be potential_mappings_outputs
@cuda.jit
def gpu_function(inputs, outputs, result):
    """Placeholder kernel: fill one row of ``result`` per thread with 3.

    Every thread takes its 1-D grid index as a position along the second axis
    of ``result`` and writes the value 3 to ``result[d, row_idx, w]`` for all
    ``d`` and ``w``. Threads whose index lies beyond ``result.shape[1]`` do
    nothing.

    ``inputs`` and ``outputs`` are accepted but not read yet; the actual
    per-mapping logic still has to be implemented.
    """
    row_idx = cuda.grid(1)  # each thread processes one "row" (SIZE)

    depth = result.shape[0]   # DEPTH
    size = result.shape[1]    # SIZE
    width = result.shape[2]   # WIDTH

    # The grid is rounded up to whole blocks, so surplus threads must be skipped.
    if row_idx < size:
        # Thread logic for GPU
        # Write values through result[d, row_idx, w]
        for d in range(depth):
            for w in range(width):
                result[d, row_idx, w] = 3

In [None]:
import time

# Pair every input partition with every output partition that has the same
# number of blocks (stored in the last column of each row). Matching pairs are
# collected row by row in two buffers, so that equal row indices refer to one
# potential valid mapping, and are handed to the GPU in chunks of at most SIZE rows.
input_columns = input_partitions.shape[1]
output_columns = output_partitions.shape[1]

potential_mappings_inputs = np.zeros((SIZE, input_columns), dtype=np.float32)
potential_mappings_outputs = np.zeros((SIZE, output_columns), dtype=np.float32)

result = np.zeros((DEPTH, SIZE, WIDTH), dtype=np.float32)

THREADS_PER_BLOCK = 256


def process_mapping_chunk(mapping_inputs, mapping_outputs, result, num_rows):
    """Run ``gpu_function`` on the first ``num_rows`` buffered mappings.

    Only the filled leading rows are transferred and processed, so unused
    zero-padded rows never reach the kernel. The computed values are stored in
    ``result[:, :num_rows, :]``; the remaining rows of ``result`` are reset to
    zero so that no values of an earlier chunk linger there.

    Args:
        mapping_inputs: Host buffer with one input partition per row.
        mapping_outputs: Host buffer with the matching output partition per row.
        result: Host array of shape (depth, rows, width) receiving the values.
        num_rows: Number of valid leading rows in both buffers.
    """
    if num_rows <= 0:
        return  # nothing buffered, no kernel launch needed

    # Copy data to GPU (leading-row slices of these buffers are contiguous)
    d_inputs = cuda.to_device(mapping_inputs[:num_rows])
    d_outputs = cuda.to_device(mapping_outputs[:num_rows])
    d_result = cuda.device_array(
        (result.shape[0], num_rows, result.shape[2]), dtype=result.dtype
    )

    # Configure the blocks: round up so that every row gets a thread
    blocks_per_grid = (num_rows + THREADS_PER_BLOCK - 1) // THREADS_PER_BLOCK

    # Launch the kernel
    gpu_function[blocks_per_grid, THREADS_PER_BLOCK](d_inputs, d_outputs, d_result)

    # Copy result back to host
    result[:, :num_rows, :] = d_result.copy_to_host()
    result[:, num_rows:, :] = 0

    print("Fertig...")
    print(result[:, :num_rows, :])


counter = 0
for ip in input_partitions:
    for op in output_partitions:
        if ip[-1] != op[-1]:
            continue  # different number of blocks, cannot be a valid mapping

        potential_mappings_inputs[counter] = ip
        potential_mappings_outputs[counter] = op
        counter += 1

        if counter >= SIZE:  # Maximum array size reached
            process_mapping_chunk(
                potential_mappings_inputs, potential_mappings_outputs, result, counter
            )

            # Clear arrays
            potential_mappings_inputs.fill(0)
            potential_mappings_outputs.fill(0)

            # Reset counter
            counter = 0

# Process the last, partially filled chunk (skipped when nothing is left over)
if counter > 0:
    process_mapping_chunk(
        potential_mappings_inputs, potential_mappings_outputs, result, counter
    )

Fertig...
[[[3. 3. 3. ... 3. 3. 3.]
  [3. 3. 3. ... 3. 3. 3.]
  [3. 3. 3. ... 3. 3. 3.]
  ...
  [3. 3. 3. ... 3. 3. 3.]
  [3. 3. 3. ... 3. 3. 3.]
  [3. 3. 3. ... 3. 3. 3.]]

 [[3. 3. 3. ... 3. 3. 3.]
  [3. 3. 3. ... 3. 3. 3.]
  [3. 3. 3. ... 3. 3. 3.]
  ...
  [3. 3. 3. ... 3. 3. 3.]
  [3. 3. 3. ... 3. 3. 3.]
  [3. 3. 3. ... 3. 3. 3.]]

 [[3. 3. 3. ... 3. 3. 3.]
  [3. 3. 3. ... 3. 3. 3.]
  [3. 3. 3. ... 3. 3. 3.]
  ...
  [3. 3. 3. ... 3. 3. 3.]
  [3. 3. 3. ... 3. 3. 3.]
  [3. 3. 3. ... 3. 3. 3.]]

 [[3. 3. 3. ... 3. 3. 3.]
  [3. 3. 3. ... 3. 3. 3.]
  [3. 3. 3. ... 3. 3. 3.]
  ...
  [3. 3. 3. ... 3. 3. 3.]
  [3. 3. 3. ... 3. 3. 3.]
  [3. 3. 3. ... 3. 3. 3.]]]


KeyboardInterrupt: 