In [3]:
import numpy as np
from tqdm import tqdm
import math

### Let's get started with some data preparations

In [4]:
MAX_VALUE = 2**32

In [5]:
with open("output.bin", "wb") as file:
    big_ints = np.random.randint(
                    low=0,
                    high=MAX_VALUE,
                    size=5000,
                    dtype=np.uint32,
                )
    big_ints = big_ints.newbyteorder()
    assert (
        big_ints.dtype.byteorder == ">"
        ), "Result array somehow is not big endian"
    file.write(big_ints.tobytes())

In [6]:
with open("output.bin", "rb") as file:
        data = np.fromfile(file, dtype=">u4", count=5000)

In [7]:
def get_prime_factors_amount(number):
    count = 0

    # Check if 2 is a prime factor
    while number % 2 == 0:
        count += 1
        number //= 2

    # Check for other prime factors
    for i in range(3, int(math.sqrt(number)) + 1, 2):
        while number % i == 0:
            count += 1
            number //= i

    # If the given number is still greater than 1, it is a prime factor
    if number > 1:
        count += 1

    return count

### Simple sequentual algorithm

In [8]:
def dummy_calculate(data):
    return sum(get_prime_factors_amount(number) for number in tqdm(data))

In [9]:
%%timeit
print(f"Amount of prime numbers is: {dummy_calculate(data)}")

100%|██████████| 5000/5000 [00:07<00:00, 641.52it/s]


Amount of prime numbers is: 20486


100%|██████████| 5000/5000 [00:07<00:00, 642.74it/s]


Amount of prime numbers is: 20486


100%|██████████| 5000/5000 [00:07<00:00, 632.41it/s]


Amount of prime numbers is: 20486


100%|██████████| 5000/5000 [00:07<00:00, 643.33it/s]


Amount of prime numbers is: 20486


100%|██████████| 5000/5000 [00:07<00:00, 630.75it/s]


Amount of prime numbers is: 20486


100%|██████████| 5000/5000 [00:07<00:00, 641.36it/s]


Amount of prime numbers is: 20486


100%|██████████| 5000/5000 [00:07<00:00, 646.61it/s]


Amount of prime numbers is: 20486


100%|██████████| 5000/5000 [00:07<00:00, 643.93it/s]

Amount of prime numbers is: 20486
7.81 s ± 69.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)





### Multiprocessing algorithm

In [9]:
from multiprocessing import Pool
from multiprocessing import cpu_count

In [10]:
device_amount = cpu_count()
print(f"Cpu amount is {device_amount}")

Cpu amount is 10


In [15]:
def mp_calculate(data):
    with Pool(device_amount) as pool:
        results = tqdm(
            pool.imap_unordered(
                get_prime_factors_amount,
                data,
            ),
            total=len(data),
        )

        return sum(results)

In [None]:
%%timeit
print(f"Amount of prime numbers is: {mp_calculate(data)}")

### Implementation using PySpark

In [None]:
from pyspark.sql import SparkSession
from operator import add

spark = SparkSession.builder.getOrCreate()

In [None]:
rdd = spark.sparkContext.parallelize(data, device_amount)

In [None]:
def spark_calculate(rdd):
    whole_amount = rdd.map(get_prime_factors_amount).reduce(add)
    return whole_amount

In [None]:
%%timeit
print(f"Amount of prime numbers is: {spark_calculate(data)}")

### Go for go solution!

Source code may be found in sub-directory named `goroutines`

In [10]:
!time go run main.go -input_file output.bin

{"level":"info","time":"2023-06-07T21:45:24+03:00","message":"Goes with 10 cpus on board."}
{"level":"info","time":"2023-06-07T21:45:24+03:00","message":"Done! Amount of prime numbers for ints in output.bin: 20486"}
go run main.go -input_file output.bin  0.15s user 0.21s system 94% cpu 0.382 total
