# Make sure you change the kernel to vacc_eff

In [None]:
% load_ext autoreload
% autoreload 2

In [7]:
import os
import random
from contextlib import ExitStack, contextmanager
from typing import IO, Iterator, List


@contextmanager
def open_multiple(paths: List[str], mode: str) -> Iterator[List[IO]]:
    """Open every path in ``paths`` with ``mode`` and yield the file objects.

    The yielded list has one file object per path, in the same order as
    ``paths``. Every file that was successfully opened is closed when the
    ``with`` block exits, including when opening a later path fails or the
    body of the ``with`` block raises.

    Args:
        paths: File paths to open.
        mode: Mode string passed unchanged to ``open`` for every path.

    Yields:
        The list of open file objects.
    """
    # ExitStack registers each file as soon as it is opened, so a failure
    # part-way through (in open() or in one file's close()) still closes
    # all of the other files instead of leaking their handles.
    with ExitStack() as stack:
        yield [stack.enter_context(open(path, mode)) for path in paths]


def gen_data(n_file: int = 20, n_line: int = 100000) -> List[str]:
    """Write ``n_file`` text files of random integers and return their names.

    The files are named ``source0.txt``, ``source1.txt``, ... and are opened
    with relative names, so they land in the current working directory. Each
    file receives ``n_line`` lines, and every line holds one integer drawn
    with ``random.randint(0, 100000000)``.

    Args:
        n_file: Number of files to write.
        n_line: Number of lines written to each file.

    Returns:
        The file names, in the order they were written.
    """
    names = []
    for file_no in range(n_file):
        name = f'source{file_no}.txt'
        with open(name, 'w') as out:
            # One random integer per line, newline-terminated.
            out.writelines(
                f'{random.randint(0, 100000000)}\n' for _ in range(n_line)
            )
        names.append(name)
    return names


def make_dir_if_needed(path):
    """Create the parent directory of ``path`` if it does not exist yet.

    Only the directory part of ``path`` is created (including any missing
    intermediate directories); the file itself is not touched.

    Args:
        path: Path of a file whose containing directory should exist.
    """
    parent = os.path.dirname(path)
    # A bare file name has an empty dirname; os.makedirs('') would raise
    # FileNotFoundError, and there is nothing to create in that case anyway.
    if parent:
        os.makedirs(parent, exist_ok=True)


def read_and_shard(path, target_folder, n_shard: int):
    """Split the lines of ``path`` into ``n_shard`` shard files.

    Shard ``i`` is written to ``{target_folder}/{i}/shard_{path}``; the
    directories are created as needed and the shard files are opened in
    ``'w'`` mode. Each line of ``path`` is parsed with ``int`` and copied
    unchanged to the shard whose index is that value modulo ``n_shard``.

    Args:
        path: Source file with one integer per line.
        target_folder: Folder under which the per-shard directories live.
        n_shard: Number of shard files to produce.
    """
    shard_paths = []
    for shard_id in range(n_shard):
        shard_path = f'{target_folder}/{shard_id}/shard_{path}'
        make_dir_if_needed(shard_path)
        shard_paths.append(shard_path)

    with open(path) as src, open_multiple(shard_paths, 'w') as sinks:
        for record in src:
            # The integer value of the line selects the destination shard.
            sinks[int(record) % n_shard].write(record)

In [8]:
# Generate 5 source files of 10000 lines each, then split the first one
# into 20 shards under the 'shard' folder.
source = gen_data(n_file=5, n_line=10000)
read_and_shard(source[0], 'shard', n_shard=20)