## Summary

## Imports

In [36]:
from pathlib import Path

import tqdm

import gcsfs
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

## Parameters

In [4]:
NOTEBOOK_PATH = Path("generate_difficult_sudokus").resolve()
NOTEBOOK_PATH.mkdir(exist_ok=True)
NOTEBOOK_PATH

PosixPath('/data/workspace/proteinsolver/notebooks/generate_difficult_sudokus')

In [None]:
project = 'deep-protein-gen'
bucket = 'deep-protein-gen'

## Workspace

In [5]:
!ls generate_difficult_sudokus/

beluga	new  niagara  old_951  strokach-cloudtop


### Read all generated sudokus

In [9]:
sudoku_files = (
    list(NOTEBOOK_PATH.joinpath("beluga").glob("*.csv")) + 
    list(NOTEBOOK_PATH.joinpath("niagara").glob("*.csv")) + 
    list(NOTEBOOK_PATH.joinpath("strokach-cloudtop").glob("*.csv"))
)
print(len(sudoku_files))
print(sudoku_files[:3])

57
[PosixPath('/data/workspace/proteinsolver/notebooks/generate_difficult_sudokus/beluga/sodoku_20070_beluga.csv'), PosixPath('/data/workspace/proteinsolver/notebooks/generate_difficult_sudokus/beluga/sodoku_20121_beluga.csv'), PosixPath('/data/workspace/proteinsolver/notebooks/generate_difficult_sudokus/beluga/sodoku_20090_beluga.csv')]


In [23]:
dfs = []
for file in tqdm.tqdm_notebook(sudoku_files, total=len(sudoku_files)):
    df = pd.read_csv(file)
    df = df.drop_duplicates()
    df["is_951"] = (df["difficulty"] == 951).all()
    dfs.append(df)

HBox(children=(IntProgress(value=0, max=57), HTML(value='')))




In [27]:
df = pd.concat(dfs, ignore_index=True, sort=False)
print(len(df))

5383370


In [28]:
df = df.sort_values("is_951").drop_duplicates(subset=["puzzle"], keep="first")
print(len(df))

5049540


In [25]:
display(df.head(2))
print(len(df))

Unnamed: 0,puzzle,solution,difficulty,is_951
0,6008100000809740009400000000020589100080003000...,6278135495819746239436251783627589144582913671...,951,True
1,6000017000007000800028903010305000000600100900...,6834217599417536825728963412385791644653128971...,951,True


5383370


### Write final datasets

In [29]:
def write_table(df, outfile, filesystem, chunk_size=100_000):
    writer = None
    for i in range(0, len(df), chunk_size):
        chunk = df[i : i + chunk_size]
        table = pa.Table.from_pandas(chunk, preserve_index=False)
        if writer is None:
            writer = pq.ParquetWriter(
                outfile, schema=table.schema, filesystem=filesystem
            )
        writer.write_table(table)
    if writer is not None:
        writer.close()

In [34]:
fs = gcsfs.GCSFileSystem(project=project)

In [None]:
for rs in range(10):
    df_out = df.sample(frac=1, random_state=rs)
    write_table(df_out, f"{bucket}/sudoku_difficult/train_{rs}.parquet", filesystem=fs)