## Summary

## Imports

In [1]:
from pathlib import Path

import tqdm

import gcsfs
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

## Parameters

In [2]:
NOTEBOOK_PATH = Path("generate_difficult_sudokus").resolve()
NOTEBOOK_PATH.mkdir(exist_ok=True)
NOTEBOOK_PATH

PosixPath('/data/workspace/proteinsolver/notebooks/generate_difficult_sudokus')

In [3]:
USE_EXISTING_VALIDATION_DATASET = True

In [4]:
USE_LOCAL_DATA_DIR = True

In [5]:
project = 'deep-protein-gen'
bucket = 'deep-protein-gen'
if USE_LOCAL_DATA_DIR:
    bucket = Path(f"~/datapkg_data_dir/{bucket}").expanduser().resolve()
    bucket.joinpath("sudoku_difficult").mkdir(exist_ok=True)

project, bucket

('deep-protein-gen', PosixPath('/data/datapkg_data_dir/deep-protein-gen'))

In [6]:
if USE_LOCAL_DATA_DIR:
    fs = None
else:
    fs = gcsfs.GCSFileSystem(project=project)

## Workspace

In [7]:
!ls generate_difficult_sudokus/

beluga	niagara-1  old_951		strokach-cloudtop-2
new	niagara-2  strokach-cloudtop-1


### Read all generated sudokus

In [9]:
sudoku_files = (
    list(NOTEBOOK_PATH.joinpath("beluga").glob("*.csv")) + 
    list(NOTEBOOK_PATH.joinpath("niagara-1").glob("*.csv")) + 
    list(NOTEBOOK_PATH.joinpath("niagara-2").glob("*.csv")) + 
    list(NOTEBOOK_PATH.joinpath("strokach-cloudtop-1").glob("*.csv")) +
    list(NOTEBOOK_PATH.joinpath("strokach-cloudtop-2").glob("*.csv"))
)
print(len(sudoku_files))
print(sudoku_files[:3])

271
[PosixPath('/data/workspace/proteinsolver/notebooks/generate_difficult_sudokus/beluga/sodoku_20070_beluga.csv'), PosixPath('/data/workspace/proteinsolver/notebooks/generate_difficult_sudokus/beluga/sodoku_20121_beluga.csv'), PosixPath('/data/workspace/proteinsolver/notebooks/generate_difficult_sudokus/beluga/sodoku_20090_beluga.csv')]


In [10]:
dfs = []
for file in tqdm.tqdm_notebook(sudoku_files, total=len(sudoku_files)):
    df = pd.read_csv(file)
    df = df.drop_duplicates()
    df["is_951"] = (df["difficulty"] == 951).all()
    dfs.append(df)

HBox(children=(IntProgress(value=0, max=271), HTML(value='')))




In [11]:
df = pd.concat(dfs, ignore_index=True, sort=False)
print(len(df))  # 21482631

26776593


In [12]:
df = df.sort_values("is_951").drop_duplicates(subset=["puzzle"], keep="first")
print(len(df))  # 18566560

23860522


In [13]:
display(df.head(2))
print(len(df))

Unnamed: 0,puzzle,solution,difficulty,is_951
13388296,0300700500000100260045001000907000002003010080...,1326798545894137267645821938937654122573419686...,655,False
17827660,0900564004023000000000009000500280040306070508...,1982564734623795813758149627591286342316478598...,453,False


23860522


### Create a validation dataset

In [14]:
if not USE_EXISTING_VALIDATION_DATASET:
    validation_df = df.sample(n=1_000, random_state=42)
else:
    validation_df = pq.read_table(f"{bucket}/sudoku_difficult/valid_0.parquet").to_pandas()
validation_solutions = set(validation_df["solution"])
print(len(validation_solutions))

1000


### Puzzles that are not in validation are in training

In [15]:
training_df = df[~df["solution"].isin(validation_solutions)]

In [16]:
len(training_df)

23859522

In [17]:
assert len(training_df) <= len(df) - len(validation_df)

### Write final datasets

In [18]:
def write_table(df, outfile, filesystem, chunk_size=100_000):
    writer = None
    for i in range(0, len(df), chunk_size):
        chunk = df[i : i + chunk_size]
        table = pa.Table.from_pandas(chunk, preserve_index=False)
        if writer is None:
            writer = pq.ParquetWriter(
                outfile, schema=table.schema, filesystem=filesystem
            )
        writer.write_table(table)
    if writer is not None:
        writer.close()

In [19]:
num_chunks = 20

tmp_df = training_df.sample(frac=1, random_state=42)
chunk_size = len(tmp_df) // num_chunks + 1
print(f"Chunk size: {chunk_size}.")

for i in range(num_chunks):
    chunk_df = tmp_df.iloc[chunk_size * i:chunk_size * (i + 1)]
    print(f"Chunk {i} size: {len(chunk_df)}.")
    write_table(chunk_df, f"{bucket}/sudoku_difficult/train_{i}.parquet", filesystem=fs)

Chunk size: 1192977.
Chunk 0 size: 1192977.
Chunk 1 size: 1192977.
Chunk 2 size: 1192977.
Chunk 3 size: 1192977.
Chunk 4 size: 1192977.
Chunk 5 size: 1192977.
Chunk 6 size: 1192977.
Chunk 7 size: 1192977.
Chunk 8 size: 1192977.
Chunk 9 size: 1192977.
Chunk 10 size: 1192977.
Chunk 11 size: 1192977.
Chunk 12 size: 1192977.
Chunk 13 size: 1192977.
Chunk 14 size: 1192977.
Chunk 15 size: 1192977.
Chunk 16 size: 1192977.
Chunk 17 size: 1192977.
Chunk 18 size: 1192977.
Chunk 19 size: 1192959.


In [20]:
if not USE_EXISTING_VALIDATION_DATASET:
    write_table(validation_df, f"{bucket}/sudoku_difficult/valid_0.parquet", filesystem=fs)