In [1]:
import os
import urllib.parse

import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.compute as pc
import pyarrow.dataset
import pandas as pd

In [2]:
data_dir = '/home/matthew/Documents/TSE/AppliedEconometrics/repo/data'
source_path = os.path.join(data_dir, 'debug/DISPATCHLOAD.parquet')
dest_dir = os.path.join(data_dir, 'debug/DISPATCHLOAD-pq-3/')


partition_keys = ['DUID']

In [28]:

parquet_writers = {}


with pq.ParquetFile(source_path) as f_in:
    for batch in f_in.iter_batches(use_threads = False, batch_size=10):
        table = pa.Table.from_batches([batch])
        duids = table.column('DUID')
        for duid in duids:
            expr = pc.field("DUID") <= duid
            filtered_table = table.filter(expr).drop('DUID')
            if str(duid) not in parquet_writers:
                encoded_duid = urllib.parse.quote_plus(str(duid) + 'space slash/')
                
                dest_subdir = os.path.join(dest_dir, f"DUID={encoded_duid}")
                dest_path = os.path.join(dest_subdir, 'data.parquet')
                print(f"New path is {dest_path}")
                if not os.path.exists(dest_subdir):
                    os.makedirs(dest_subdir)
                schema = batch.schema
                i = schema.get_field_index('DUID')
                dest_schema = schema.remove(i)
                assert dest_schema is not None
                assert isinstance(dest_schema, pa.Schema)
                parquet_writers[str(duid)] = pq.ParquetWriter(dest_path, dest_schema)
            parquet_writers[str(duid)].write(filtered_table)
        break

for (duid, writer) in parquet_writers.items():
    writer.close()
        
        # 
        # grouped_table = table.group_by(partition_keys)
        # for group_key, group_table in grouped_table:
        #     # Accessing the values of the group key (columns 'a' and 'b')
        #     a_value, b_value = group_key['a'], group_key['b']
        #     print(f"{a_value=} {b_value=} {type(group_table)}")

New path is /home/matthew/Documents/TSE/AppliedEconometrics/repo/data/debug/DISPATCHLOAD-pq-3/DUID=AGLHALspace+slash%2F/data.parquet
New path is /home/matthew/Documents/TSE/AppliedEconometrics/repo/data/debug/DISPATCHLOAD-pq-3/DUID=AGLSOMspace+slash%2F/data.parquet
New path is /home/matthew/Documents/TSE/AppliedEconometrics/repo/data/debug/DISPATCHLOAD-pq-3/DUID=ANGAST1space+slash%2F/data.parquet
New path is /home/matthew/Documents/TSE/AppliedEconometrics/repo/data/debug/DISPATCHLOAD-pq-3/DUID=APD01space+slash%2F/data.parquet
New path is /home/matthew/Documents/TSE/AppliedEconometrics/repo/data/debug/DISPATCHLOAD-pq-3/DUID=ARWF1space+slash%2F/data.parquet
New path is /home/matthew/Documents/TSE/AppliedEconometrics/repo/data/debug/DISPATCHLOAD-pq-3/DUID=ASNAES1space+slash%2F/data.parquet
New path is /home/matthew/Documents/TSE/AppliedEconometrics/repo/data/debug/DISPATCHLOAD-pq-3/DUID=ASNENC1space+slash%2F/data.parquet
New path is /home/matthew/Documents/TSE/AppliedEconometrics/repo/dat

In [30]:
ds = pyarrow.dataset.dataset(dest_dir, partitioning=pyarrow.dataset.HivePartitioning.discover())
ds.to_table().to_pandas()['DUID'].unique()

array(['AGLHAL', 'AGLHALspace slash\\', 'AGLHALspace slash',
       'AGLHALspace+slash/', 'AGLSOM', 'AGLSOMspace slash\\',
       'AGLSOMspace slash', 'AGLSOMspace+slash/', 'ANGAST1',
       'ANGAST1space slash\\', 'ANGAST1space slash',
       'ANGAST1space+slash/', 'APD01', 'APD01space slash\\',
       'APD01space slash', 'APD01space+slash/', 'ARWF1',
       'ARWF1space slash\\', 'ARWF1space slash', 'ARWF1space+slash/',
       'ASNAES1', 'ASNAES1space slash\\', 'ASNAES1space slash',
       'ASNAES1space+slash/', 'ASNENC1', 'ASNENC1space slash\\',
       'ASNENC1space slash', 'ASNENC1space+slash/', 'ASQENC1',
       'ASQENC1space slash\\', 'ASQENC1space slash',
       'ASQENC1space+slash/', 'ASSEL1', 'ASSEL1space slash\\',
       'ASSEL1space slash', 'ASSEL1space+slash/', 'ASSENC1',
       'ASSENC1space slash\\', 'ASSENC1space slash',
       'ASSENC1space+slash/'], dtype=object)

In [21]:
dest_dir

'/home/matthew/Documents/TSE/AppliedEconometrics/repo/data/debug/DISPATCHLOAD-pq-3/'

In [17]:
dest_dir

'/home/matthew/Documents/TSE/AppliedEconometrics/repo/data/debug/DISPATCHLOAD-pq-3/'