<a href="https://colab.research.google.com/github/markste-in/collatz/blob/main/create_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install Ray together with its "default" extras; requesting the extras also
# installs the base `ray` package, so a single command is enough.
# `-q` keeps the log short but, unlike redirecting everything to /dev/null,
# still shows errors if the installation fails.
!pip install -q "ray[default]"

In [2]:
# Mount Google Drive at /content/gdrive; the save cell at the end of the
# notebook writes the generated parquet files below "gdrive/MyDrive/".
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
import pandas as pd
import numpy as np
import sympy
from sympy.ntheory import factorint
import ray
import psutil
import time
import os

In [4]:
# Show the CPU count reported by psutil for this runtime.
print(f"CPU Count: {psutil.cpu_count()}")

CPU Count: 2


In [5]:
# Start the local Ray runtime. `ignore_reinit_error=True` makes this cell safe
# to re-run in a live kernel: a second call no longer raises because Ray is
# already initialised.
ray.init(ignore_reinit_error=True)

2021-04-25 20:08:13,156	INFO services.py:1269 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


{'metrics_export_port': 59251,
 'node_id': 'de71d887aada7bda5f94715b5b477e5cf8c5cfe72a695685bf6c11f2',
 'node_ip_address': '172.28.0.2',
 'object_store_address': '/tmp/ray/session_2021-04-25_20-08-11_407437_2370/sockets/plasma_store',
 'raylet_ip_address': '172.28.0.2',
 'raylet_socket_name': '/tmp/ray/session_2021-04-25_20-08-11_407437_2370/sockets/raylet',
 'redis_address': '172.28.0.2:6379',
 'session_dir': '/tmp/ray/session_2021-04-25_20-08-11_407437_2370',
 'webui_url': '127.0.0.1:8265'}

In [6]:
def collatz_generator(n: int, as_sequence=True):
    """Lazily walk the Collatz trajectory that starts at ``n``.

    An even value ``v`` is followed by ``v // 2`` and an odd value by
    ``3 * v + 1``. The walk stops as soon as the value reaches 1, so the
    terminating 1 is never yielded as a visited value and any start
    ``n <= 1`` produces an empty sequence.

    Args:
        n: Starting value. It is converted with ``int()`` once up front so
            the whole walk uses Python's arbitrary-precision integers, even
            when the caller passes a NumPy integer scalar.
        as_sequence: If true (default) yield every visited value. Otherwise
            yield ``(value, successor)`` tuples.

    Yields:
        ``int`` values, or ``(int, int)`` tuples when ``as_sequence`` is false.
    """
    n = int(n)
    while n > 1:
        # Floor division keeps the value an exact integer. True division
        # (`/`) would switch to floats and silently lose precision once the
        # trajectory exceeds 2**53.
        successor = n // 2 if n % 2 == 0 else 3 * n + 1
        if as_sequence:
            yield n
        else:
            yield (n, successor)
        n = successor

In [7]:
# Column names for the values produced by all_fancy_calculations(), listed in
# the same order in which that function appends them.
header = [
    "Number",           # starting number of the series
    "Length",           # number of elements in the series
    "Max",              # largest value reached in the series
    "IsEven",           # whether the starting number is even
    "nPrimes",          # count of distinct prime factors
    "Smallest_Prime",   # smallest prime factor
    "nSmallest_Prim",   # multiplicity of the smallest prime factor (spelling kept as is)
    "Biggest_Prime",    # biggest prime factor
    "nBiggest_Prime",   # multiplicity of the biggest prime factor
    "isPrime",          # whether the starting number is prime
]

def all_fancy_calculations(serie):
    """Build the feature row describing one Collatz series.

    Args:
        serie: Non-empty sequence of integers; its first element is the
            starting number the features refer to.

    Returns:
        A list of ten values in the order of ``header``: starting number,
        series length, series maximum, even flag, number of distinct prime
        factors, smallest prime factor and its multiplicity, biggest prime
        factor and its multiplicity, and the primality flag.
    """
    start = serie[0]
    # factorint returns a dict keyed by prime factor; the values are read
    # below as "how often the prime appears".
    prime_factors = factorint(start)
    smallest_prime = min(prime_factors)
    biggest_prime = max(prime_factors)

    return [
        start,                          # Starting number
        len(serie),                     # Length of series
        max(serie),                     # Max of number series
        start % 2 == 0,                 # Is even?
        len(prime_factors),             # How many distinct primes
        smallest_prime,                 # Smallest prime
        prime_factors[smallest_prime],  # How often the smallest prime appears
        biggest_prime,                  # Biggest prime
        prime_factors[biggest_prime],   # How often the biggest prime appears
        sympy.isprime(start),           # Is prime?
    ]
    

In [8]:
# Full column list for one dataset row: the `header` columns three times over,
# first unprefixed, then prefixed with "pre" and finally with "post".
header_with_pre_and_post = [
    prefix + column
    for prefix in ("", "pre", "post")
    for column in header
]

# Intended lower bound for the sampled starting numbers. The series of the
# predecessor (start - 1) is computed as well, so the smallest number actually
# processed is one less than this. That number must not drop below 2, because
# collatz_generator yields nothing for values <= 1.
SMALLEST_NUMBER = 3
@ray.remote
def create_random_dataset(nums,highest_number):
    """Ray task: sample random start numbers and compute their feature rows.

    For every sampled number the Collatz series of the number itself, of its
    predecessor (number - 1) and of its successor (number + 1) are generated,
    and the features of the three series are concatenated into one row.

    Args:
        nums: How many start numbers to sample (one output row each).
        highest_number: Exclusive upper bound for the sampled start numbers.

    Returns:
        A ``pandas.DataFrame`` with ``nums`` rows and 30 unnamed columns (three
        blocks of ten features: number, predecessor, successor). The caller
        is expected to assign the column names.
    """
    print("called with", nums,highest_number)

    # Use the shared lower bound instead of a duplicated magic number, and
    # request int64 explicitly so large upper bounds do not depend on the
    # platform's default integer width.
    start_numbers = np.random.randint(
        SMALLEST_NUMBER, highest_number, nums, dtype=np.int64
    )

    dataset = []
    for start in start_numbers:
        start = int(start)  # plain Python int: exact arithmetic for neighbours
        row = []
        # Order matters: it must match the "", "pre", "post" column blocks.
        # Each series is built and consumed right away, so only one series
        # is held in memory at a time.
        for number in (start, start - 1, start + 1):
            serie = list(collatz_generator(number))
            row.extend(all_fancy_calculations(serie))
        dataset.append(row)

    return pd.DataFrame(dataset)

In [9]:
# Run parameters for the dataset generation below.
num_entries = 1_000        # start numbers sampled (rows produced) by each Ray task
highest_number = 10 ** 12  # upper bound handed to the random sampler
num_jobs = 100             # Ray tasks launched for every generated file
assert highest_number > num_entries

In [10]:
print("Generate dataset with", num_entries*num_jobs,"entries\n\n")

# Output folder; created once up front because it does not change per file.
# NOTE(review): the path is relative, so it resolves against the current
# working directory -- presumably /content on Colab, where Drive is mounted
# at /content/gdrive. Confirm before running elsewhere.
PATH = "gdrive/MyDrive/collatz_dbs/"
os.makedirs(PATH,exist_ok=True)

NUM_DATASETS = 3  # how many independent parquet files to generate

for dataset_index in range(NUM_DATASETS):
  # Launch the Ray tasks for this file. The comprehension variable has its
  # own name so it no longer reuses the outer loop variable.
  results = [create_random_dataset.remote(num_entries,highest_number) for _job in range(num_jobs)]

  # Build data frame. ignore_index=True gives the combined frame one
  # continuous index instead of repeating each task's own 0..n-1 index.
  dataframes = ray.get(results)
  df = pd.concat(dataframes, ignore_index=True)
  df.columns = header_with_pre_and_post

  # Save dataframe under a timestamped name
  FILENAME = 'collatz_db_' + time.strftime("%Y%m%d-%H%M%S") + '.parquet'
  file = os.path.join(PATH,FILENAME)
  df.to_parquet(file)

Generate dataset with 100000 entries


[2m[36m(pid=2549)[0m called with 1000 1000000000000
[2m[36m(pid=2548)[0m called with 1000 1000000000000
[2m[36m(pid=2549)[0m called with 1000 1000000000000
[2m[36m(pid=2548)[0m called with 1000 1000000000000
[2m[36m(pid=2548)[0m called with 1000 1000000000000
[2m[36m(pid=2549)[0m called with 1000 1000000000000
[2m[36m(pid=2548)[0m called with 1000 1000000000000
[2m[36m(pid=2549)[0m called with 1000 1000000000000
[2m[36m(pid=2548)[0m called with 1000 1000000000000
[2m[36m(pid=2549)[0m called with 1000 1000000000000
[2m[36m(pid=2548)[0m called with 1000 1000000000000
[2m[36m(pid=2549)[0m called with 1000 1000000000000
[2m[36m(pid=2548)[0m called with 1000 1000000000000
[2m[36m(pid=2549)[0m called with 1000 1000000000000
[2m[36m(pid=2548)[0m called with 1000 1000000000000
[2m[36m(pid=2549)[0m called with 1000 1000000000000
[2m[36m(pid=2548)[0m called with 1000 1000000000000
[2m[36m(pid=2549)[0m ca