# Examples

In [None]:
import os
from pathlib import Path
import pandas as pd
import numpy as np
from pdpart import Partitioned
import tempfile


def make_test_data(n):
    """Build a two-column test frame with ``n`` rows.

    Parameters
    ----------
    n : int
        Number of rows to generate.

    Returns
    -------
    pandas.DataFrame
        Column ``"key"`` holds lowercase letters drawn at random (with
        NumPy's global random state) from 'a'..'z'; column ``"value"`` holds
        the integers ``0 .. n-1`` in order.
    """
    first, last = ord('a'), ord('z')
    alphabet = list(map(chr, range(first, last + 1)))
    keys = np.random.choice(alphabet, size=n)
    values = np.arange(n)
    return pd.DataFrame({"key": keys, "value": values})


# 401 rows of random keys and running integer values to partition
data = make_test_data(401)

# everything is written below a temporary directory that is cleaned up at the end
tmp = tempfile.TemporaryDirectory()
path = Path(tmp.name)
dirname = path / 'parts'

# create a partitioned store keyed on the 'key' column
# (presumably split into 13 partition files -- see the pdpart documentation)
parts = Partitioned.create(dirname, by='key', n_partition=13)
parts.append(data)

# read every partition file back and stack them into a single data frame
df_parts = pd.concat([pd.read_csv(fn) for fn in parts.partitions], axis=0)

# the row order may differ after partitioning, so sort both frames the same
# way and drop the old index before comparing them element-wise
original_sorted, restored_sorted = (
    frame.sort_values(['key', 'value']).reset_index(drop=True)
    for frame in [data, df_parts]
)
assert np.all(np.equal(original_sorted, restored_sorted))

# further data can be appended at any time, e.g. when reading a large file
# chunk by chunk
new_data = make_test_data(200)
parts.append(new_data)

# 401 + 200 rows in total, still with the two columns 'key' and 'value'
combined = pd.concat([pd.read_csv(fn) for fn in parts.partitions], axis=0)
assert combined.shape == (601, 2)


# Work on the data by looping over the partitions.
# In practice, the per-partition calls could be run in parallel.
def do_sth(fn):
    """Process one partition: read it as CSV and sum its 'value' column.

    Parameters
    ----------
    fn : path or file-like
        Anything ``pandas.read_csv`` accepts, here a single partition file.

    Returns
    -------
    The sum of the ``'value'`` column of that partition.
    """
    values = pd.read_csv(fn)['value']
    return values.sum()

# the per-partition sums must add up to the sum over all appended data
partition_total = sum([do_sth(fn) for fn in parts.partitions])
expected_total = pd.concat([data, new_data], axis=0)['value'].sum()
assert np.allclose(partition_total, expected_total)

# remove the temporary directory and everything written beneath it
tmp.cleanup()