# Split a DataFrame by key and process the partitions independently

Partitioning by key makes it possible to join or group the data one partition at a time.

 1. Hash the key column(s) and take the hash modulo the number of partitions to assign each row to a partition.
 2. Write each partition to its own file.
 3. If several DataFrames have been partitioned the same way, they can be joined partition by partition (and in parallel).

In [1]:
import os
import pandas as pd
import numpy as np
import toolz
from pdpart import Partitioned

In [2]:
def make_test_data(n, filename=None):
    """Build a random two-column test frame and optionally save it as CSV.

    Parameters
    ----------
    n : int
        Number of rows to generate.
    filename : str, optional
        When given, the frame is also written to this path as CSV without
        the index column.

    Returns
    -------
    pandas.DataFrame
        Column ``key`` holds letters drawn from 'a'..'z' with
        ``np.random.choice``; column ``val`` holds ``np.random.rand`` floats.
    """
    alphabet = list(map(chr, range(ord('a'), ord('z') + 1)))
    # Keys are drawn before values; both use NumPy's global random state.
    keys = np.random.choice(alphabet, size=n)
    vals = np.random.rand(n)
    frame = pd.DataFrame({"key": keys, "val": vals})
    if filename is None:
        return frame
    frame.to_csv(filename, index=False)
    return frame

DATA_DIR = "../data"
# Create the data directory up front: DataFrame.to_csv does not create missing
# parent directories, so the write below would fail on a fresh checkout.
os.makedirs(DATA_DIR, exist_ok=True)
filename = os.path.join(DATA_DIR, "test.csv")
# 401 rows leaves a final partial chunk when the file is later read back
# with chunksize=100.
df = make_test_data(401, filename)

In [3]:
# Split the test CSV into partition files keyed on the "key" column.
# The partition directory is derived from DATA_DIR so it always sits next to
# the test CSV instead of repeating the "../data" literal.
parts = Partitioned(os.path.join(DATA_DIR, "parts"), by="key", n_partition=7, compression="gzip")
# NOTE(review): init_dir presumably prepares the partition directory before
# the first append -- confirm against pdpart.
parts.init_dir()

# Read the CSV in chunks of 100 rows and append each chunk to the partitions,
# so the whole file never has to be held in memory at once.
for _df in pd.read_csv(filename, chunksize=100):
    parts.append(_df)
    
# Per-partition work: total of the "val" column of a single partition file.
def do_sth(fn):
    """Return the sum of the ``val`` column of one gzip-compressed CSV file.

    ``fn`` is the path handed to ``pd.read_csv``.
    """
    partition = pd.read_csv(fn, compression="gzip")
    return partition["val"].sum()
    
# Sanity check: the per-partition totals must add up to the total over the
# full, unpartitioned frame.
partition_totals = toolz.map(do_sth, parts.partitions())
np.allclose(sum(partition_totals), df["val"].sum())