# Examples

In [None]:
import os
import pandas as pd
import numpy as np
from pdpart import Partitioned
import tempfile

def make_test_data(n, filename=None):
    """Build a random two-column frame and optionally save it as CSV.

    Column "key" holds ``n`` letters picked from 'a'..'z' and column "val"
    holds ``n`` values from ``np.random.rand``.  When ``filename`` is given
    the frame is also written to that path without the index column.
    The frame is returned in either case.
    """
    first, last = ord('a'), ord('z')
    alphabet = list(map(chr, range(first, last + 1)))
    # draw the keys before the values so the global random stream is
    # consumed in a fixed order
    keys = np.random.choice(alphabet, size=n)
    vals = np.random.rand(n)
    frame = pd.DataFrame({"key": keys, "val": vals})
    if filename is None:
        return frame
    frame.to_csv(filename, index=False)
    return frame


# scratch directory for all files written by the examples;
# it stays on disk until tmp.cleanup() is called
tmp = tempfile.TemporaryDirectory()
path = tmp.name

# write 401 random rows to a CSV file and keep the same frame in memory
# so that later results can be checked against it
filename = os.path.join(path, "test.csv")
data = make_test_data(401, filename)

In [None]:
# split a dataframe into partitions by column "key"
dirname = os.path.join(path, "parts")

# describe the target layout (per the argument names: 13 partitions,
# gzip compression); no filesystem changes at this stage
parts = Partitioned(dirname, by="key", n_partition=13, compression="gzip")

# this creates the directory and removes its content if it exists
parts.init_dir()

# write data frame to partitions, this can be done chunkwise:
# the CSV is read back 100 rows at a time and each chunk is appended
for df in pd.read_csv(filename, chunksize=100):
    parts.append(df)

In [None]:
def do_sth(fn, compression="gzip"):
    """do something on a single partition

    Reads the CSV file ``fn`` and returns the sum of its "val" column.

    ``compression`` is passed straight to ``pd.read_csv``.  It defaults to
    "gzip", which is how the partitions in these examples are written;
    pass ``None`` to read an uncompressed file.
    """
    df = pd.read_csv(fn, compression=compression)
    return df.val.sum()

# process the data one partition at a time;
# in practice this loop is a natural place for parallelization
partition_sums = [do_sth(fn) for fn in parts.partitions]
# the per-partition sums must add up to the sum over the original frame
assert np.allclose(sum(partition_sums), data.val.sum())

In [None]:
# apply transformations per partition and store in new directory

def transform(fn_in, fn_out, compression=None):
    """Read the CSV at fn_in, double its "val" column and save it to fn_out.

    The same ``compression`` setting is used for reading and for writing;
    the index is not written to the output file.
    """
    source = pd.read_csv(fn_in, compression=compression)
    doubled = source.assign(val=source["val"] * 2.)
    doubled.to_csv(fn_out, index=False, compression=compression)

# set up a second partitioned directory modelled on `parts`
dest_dir = os.path.join(path, "dest")
dest = Partitioned.new_like(dest_dir, parts).init_dir()

# transform every source partition into its counterpart in `dest`
for fn_in, fn_out in zip(parts.partitions, dest.partitions):
    transform(fn_in, fn_out, "gzip")

# check correct: every value was doubled, so the total must double too
doubled_total = sum(do_sth(fn) for fn in dest.partitions)
assert np.allclose(doubled_total, 2 * data.val.sum())

In [None]:
# remove the temporary directory created at the top, including all files written into it
tmp.cleanup()