In [4]:
import pyarrow.parquet as pq
import os
import pyarrow as pa

In [5]:
def split_parquet_10pieces(src_path, dst_dir, columns=None, batch_size=10_000):
    """
    - Parquet 파일을 10개의 조각으로 분할하여 저장
    - pandas 변환 없이 Arrow 배치만 처리 → 메모리 안전
    - columns로 필요한 컬럼만 추려 메모리 추가 절약 가능
    """
    pf = pq.ParquetFile(src_path)
    total = pf.metadata.num_rows
    piece_size = total // 10  # 각 조각의 크기
    
    os.makedirs(dst_dir, exist_ok=True)
    
    for piece in range(10):
        writer = None
        remaining = piece_size
        written = 0
        start_idx = piece * piece_size
        
        # 마지막 조각은 나머지 모든 행을 포함
        if piece == 9:
            remaining = total - (piece_size * 9)
            
        piece_num = piece + 1
        dst_path = os.path.join(dst_dir, f"part_{piece_num:02d}.parquet")
        
        for batch in pf.iter_batches(batch_size=batch_size, columns=columns):
            if start_idx > 0:
                start_idx -= len(batch)
                continue
                
            if remaining <= 0:
                break
                
            n = len(batch)
            if n > remaining:
                batch = batch.slice(0, remaining)
                n = remaining
                
            if writer is None:
                writer = pq.ParquetWriter(dst_path, batch.schema, compression="zstd", use_dictionary=True)
                
            writer.write_table(pa.Table.from_batches([batch]))
            remaining -= n
            written += n

        if writer is not None:
            writer.close()
            
        print(f"Piece {piece_num}: wrote_rows={written:,} ({written/total*100:.2f}%) -> {dst_path}")


In [6]:
src_path = "../../data/raw/train.parquet"
dst_dir = "../../data/processed/split_data"
split_parquet_10pieces(src_path, dst_dir)

Piece 1: wrote_rows=1,070,417 (10.00%) -> ../../data/processed/split_data\part_01.parquet
Piece 2: wrote_rows=1,070,417 (10.00%) -> ../../data/processed/split_data\part_02.parquet
Piece 3: wrote_rows=1,070,417 (10.00%) -> ../../data/processed/split_data\part_03.parquet
Piece 4: wrote_rows=1,070,417 (10.00%) -> ../../data/processed/split_data\part_04.parquet
Piece 5: wrote_rows=1,070,417 (10.00%) -> ../../data/processed/split_data\part_05.parquet
Piece 6: wrote_rows=1,070,417 (10.00%) -> ../../data/processed/split_data\part_06.parquet
Piece 7: wrote_rows=1,070,417 (10.00%) -> ../../data/processed/split_data\part_07.parquet
Piece 8: wrote_rows=1,070,417 (10.00%) -> ../../data/processed/split_data\part_08.parquet
Piece 9: wrote_rows=1,070,417 (10.00%) -> ../../data/processed/split_data\part_09.parquet
Piece 10: wrote_rows=1,064,179 (9.94%) -> ../../data/processed/split_data\part_10.parquet
