# Imports

In [1]:
import concurrent.futures
import io
import os
import re
import shlex
import socket
import subprocess
from pathlib import Path

import numpy as np
import tqdm

import pandas as pd
import psutil

# Parameters

In [2]:
# Base name of this notebook; the output directory is derived from it.
NOTEBOOK_NAME = "generate_difficult_sudokus"

In [3]:
# Absolute path of the directory that receives this notebook's output files.
# `exist_ok=True` makes the cell safe to re-run once the directory exists.
NOTEBOOK_PATH = Path(NOTEBOOK_NAME).resolve()
NOTEBOOK_PATH.mkdir(exist_ok=True)
NOTEBOOK_PATH

PosixPath('/data/workspace/proteinsolver/notebooks/generate_difficult_sudokus')

In [4]:
# Pick the size of the worker pool based on the machine we are running on.
hostname = socket.gethostname()

if hostname == "strokach-cloudtop":
    NPROC = 16
elif hostname.startswith(("nia", "beluga")):
    # Pinned instead of auto-detected: the original note in this cell says
    # that, for some reason, only 20 cpus are seen on beluga.
    NPROC = 40
else:
    # Any other host: use the number of logical CPUs reported by psutil.
    NPROC = psutil.cpu_count(logical=True)

In [5]:
# Index of the first batch produced by this run: a base value taken from the
# START_BATCH_IDX environment variable (default 10_000), shifted by the
# SLURM_ARRAY_TASK_ID environment variable (default 2).
START_BATCH_IDX = int(os.environ.get("START_BATCH_IDX", 10_000))
offset = int(os.environ.get("SLURM_ARRAY_TASK_ID", 2))
START_BATCH_IDX = START_BATCH_IDX + offset
START_BATCH_IDX

10002

# Functions

In [6]:
def sudoku_is_solved(values) -> bool:
    """Tell whether ``values`` is a completely and correctly filled sudoku.

    Args:
        values: Array-like object providing ``reshape(9, 9)``, e.g. a NumPy
            array with 81 elements.

    Returns:
        True if every 3x3 box, every row and every column contains each of
        the digits 1 through 9 exactly once; False otherwise.
    """
    digits = np.arange(1, 10)
    grid = values.reshape(9, 9)

    def is_complete(cells) -> bool:
        # A unit is valid when its sorted, flattened cells are exactly 1..9.
        return bool((np.sort(cells, axis=None) == digits).all())

    boxes = (
        grid[top : top + 3, left : left + 3]
        for top in range(0, 9, 3)
        for left in range(0, 9, 3)
    )
    rows = (grid[r, :] for r in range(9))
    columns = (grid[:, c] for c in range(9))

    # Same checking order as before: boxes first, then rows, then columns.
    return (
        all(is_complete(box) for box in boxes)
        and all(is_complete(row) for row in rows)
        and all(is_complete(column) for column in columns)
    )


def decode_sugen_output(output):
    """Parse the text printed by ``sugen`` into a grid and a difficulty score.

    Args:
        output: Decoded stdout of a ``sugen`` call. Its first nine lines must
            each hold nine whitespace-separated cells, where ``_`` marks an
            empty cell and any other token is an integer.

    Returns:
        A ``(grid, difficulty)`` tuple. ``grid`` is a 9x9 integer array in
        which empty cells are stored as 0. ``difficulty`` is the integer that
        follows ``"Difficulty: "`` on the last non-blank line of ``output``,
        or ``None`` when that line has no such field.

    Raises:
        ValueError: If ``output`` has fewer than nine lines, a grid line does
            not contain exactly nine cells, or a cell is neither ``_`` nor an
            integer.
    """
    rows = output.split("\n")[:9]
    if len(rows) != 9:
        raise ValueError(f"Expected 9 grid rows in sugen output, got {len(rows)}.")

    # `np.int` no longer exists in current NumPy; the builtin `int` yields the
    # same dtype. Zero-initialise so that no cell is ever left undefined.
    grid = np.zeros((9, 9), dtype=int)
    for i, row in enumerate(rows):
        cells = row.split()
        if len(cells) != 9:
            raise ValueError(
                f"Expected 9 cells in grid row {i} of sugen output, got {len(cells)}."
            )
        for j, value in enumerate(cells):
            if value != "_":
                grid[i, j] = int(value)

    # The difficulty, when reported, is on the last non-blank line.
    info_string = output.strip().split("\n")[-1]
    match = re.search(r"Difficulty: (\d+)", info_string)
    difficulty = int(match.group(1)) if match else None
    return grid, difficulty


def generate_sudoku():
    """Generate one sudoku with ``sugen`` and solve it to obtain its solution.

    Runs ``sugen -i 5000 -t 10000 generate`` and feeds its stdout to
    ``sugen solve``; both outputs are parsed with ``decode_sugen_output``.

    Returns:
        A ``(puzzle, solution, difficulty)`` tuple: the grid decoded from the
        generator output, the grid decoded from the solver output, and the
        difficulty decoded from the solver output (``None`` when the solver
        output does not report one).

    Raises:
        subprocess.CalledProcessError: If either ``sugen`` call exits with a
            non-zero status.
        RuntimeError: If the grid returned by the solver is not a valid solved
            sudoku.
    """
    # NOTE(review): the meaning of the -i / -t values is defined by sugen and
    # is not visible in this notebook; they are passed through unchanged.
    generate_cmd = ["sugen", "-i", "5000", "-t", "10000", "generate"]
    # check=True: fail loudly instead of parsing the output of a failed run.
    generated = subprocess.run(generate_cmd, stdout=subprocess.PIPE, check=True)

    solved = subprocess.run(
        ["sugen", "solve"],
        input=generated.stdout,
        stdout=subprocess.PIPE,
        check=True,
    )

    puzzle, _ = decode_sugen_output(generated.stdout.decode())
    solution, difficulty = decode_sugen_output(solved.stdout.decode())

    # Explicit raise rather than `assert`, which is skipped under `python -O`.
    if not sudoku_is_solved(solution):
        raise RuntimeError("sugen returned a grid that is not a solved sudoku.")
    return puzzle, solution, difficulty

In [7]:
# Smoke test: generate one sudoku and display the (puzzle, solution, difficulty) tuple.
generate_sudoku()

(array([[0, 0, 0, 2, 0, 0, 4, 0, 0],
        [4, 0, 0, 0, 9, 0, 6, 0, 0],
        [0, 7, 0, 6, 1, 0, 0, 3, 5],
        [1, 0, 2, 0, 0, 0, 0, 0, 0],
        [0, 3, 0, 1, 0, 6, 0, 5, 0],
        [0, 0, 0, 0, 0, 0, 3, 0, 1],
        [3, 6, 0, 0, 4, 1, 0, 8, 0],
        [0, 0, 1, 0, 2, 0, 0, 0, 6],
        [0, 0, 5, 0, 0, 8, 0, 0, 0]]), array([[5, 9, 6, 2, 3, 7, 4, 1, 8],
        [4, 1, 3, 8, 9, 5, 6, 2, 7],
        [2, 7, 8, 6, 1, 4, 9, 3, 5],
        [1, 5, 2, 3, 7, 9, 8, 6, 4],
        [7, 3, 4, 1, 8, 6, 2, 5, 9],
        [6, 8, 9, 4, 5, 2, 3, 7, 1],
        [3, 6, 7, 9, 4, 1, 5, 8, 2],
        [8, 4, 1, 5, 2, 3, 7, 9, 6],
        [9, 2, 5, 7, 6, 8, 1, 4, 3]]), 1053)

# Generate sudokus

In [None]:
# Generate batch after batch of sudokus. Each batch runs 100_000 generation
# jobs on a pool of NPROC threads and is written to its own CSV file.
# NOTE(review): batch indices advance by 1 from START_BATCH_IDX, and
# START_BATCH_IDX is itself shifted by SLURM_ARRAY_TASK_ID, so neighbouring
# array tasks appear to produce overlapping batch indices (and hence the same
# file names) unless the START_BATCH_IDX environment variable differs per
# task. Confirm with how the jobs are launched.
for batch_idx in range(START_BATCH_IDX, START_BATCH_IDX + 1_000_000):
    print(f"Generating batch {batch_idx}...")
    results = []
    with concurrent.futures.ThreadPoolExecutor(NPROC) as pool:
        futures = [pool.submit(generate_sudoku) for _ in range(100_000)]
        # Consume jobs in completion order so the progress bar moves steadily.
        completed = concurrent.futures.as_completed(futures)
        for future in tqdm.tqdm_notebook(completed, total=len(futures)):
            puzzle, solution, difficulty = future.result()
            # Flatten each grid into a single string of its cell values.
            puzzle_str = "".join(map(str, puzzle.ravel()))
            solution_str = "".join(map(str, solution.ravel()))
            results.append((puzzle_str, solution_str, difficulty))
    df = pd.DataFrame(results, columns=["puzzle", "solution", "difficulty"])
    batch_file = NOTEBOOK_PATH / f"sodoku_{batch_idx}.csv"
    df.to_csv(batch_file, sep=",", index=False)

Generating batch 10002...


HBox(children=(IntProgress(value=0, max=100000), HTML(value='')))


Generating batch 10003...


HBox(children=(IntProgress(value=0, max=100000), HTML(value='')))