In [1]:
import pathlib
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
from typing import List

In [2]:
def filter_kab(
    path: pathlib.Path,
    output_dir: pathlib.Path,
    kab_code: int = 3404,
) -> None:
    """Write a copy of an Excel workbook keeping only the rows for one ``kab`` code.

    The first two sheets of ``path`` are read, each is reduced to the rows whose
    ``kab`` column equals ``kab_code``, and the result is written to
    ``output_dir / <name of path's parent folder> / <file name>`` with the sheets
    named ``Kecamatan`` (from sheet 0) and ``Desa`` (from sheet 1).

    Args:
        path: Workbook to read; it must contain at least two sheets, each with
            a ``kab`` column.
        output_dir: Root of the output tree. Missing folders are created.
        kab_code: Value the ``kab`` column must equal for a row to be kept.
            Defaults to 3404, the value that used to be hard-coded.

    Returns:
        None. Any failure is reported with ``print`` instead of being raised,
        so one bad workbook does not abort a batch run.
    """
    try:
        # A single read_excel call returns both sheets as a dict keyed by position.
        sheets = pd.read_excel(path, sheet_name=[0, 1])

        filtered = []
        for position in (0, 1):
            frame = sheets[position]
            # Without this check a missing column surfaces as the bare KeyError
            # text "'kab'", which does not say which sheet is at fault.
            if 'kab' not in frame.columns:
                raise ValueError(f"sheet {position} has no 'kab' column")
            filtered.append(frame[frame['kab'] == kab_code])
        kecamatan_rows, desa_rows = filtered

        # Mirror the input's immediate parent folder underneath output_dir.
        output_path = output_dir / path.parent.name / path.name
        output_path.parent.mkdir(parents=True, exist_ok=True)

        with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
            kecamatan_rows.to_excel(writer, sheet_name='Kecamatan', index=False)
            desa_rows.to_excel(writer, sheet_name='Desa', index=False)
        print(f'Finished processing {path}')
    except Exception as e:
        # Deliberately broad: this is best-effort batch work, so the error is
        # logged and the caller carries on with the remaining files.
        print(f'Error processing {path}: {e}')

In [6]:
def process_files(paths: List[pathlib.Path], output_dir: pathlib.Path, max_workers: int = 8) -> None:
    """Run ``filter_kab`` on every path using a thread pool and wait for all of them.

    Args:
        paths: Workbooks to process; each one becomes a separate pool task.
        output_dir: Passed unchanged to ``filter_kab`` as its output root.
        max_workers: Size of the thread pool.

    Returns:
        None, once every submitted task has finished.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Queue every workbook before waiting on any of them.
        pending = []
        for workbook in paths:
            pending.append(pool.submit(filter_kab, workbook, output_dir))

        # result() blocks until the task is done and re-raises anything it raised.
        for task in pending:
            task.result()

if __name__ == "__main__":
    # Workbooks are read from in/<folder>/; results go under out/.
    data_dir = pathlib.Path('in')
    output_dir = pathlib.Path('out')

    # Folder names 'Bab 1' through 'Bab 10'.
    bab_folders = [f'Bab {number}' for number in range(1, 11)]

    for bab in bab_folders:
        folder_path = data_dir / bab
        # Only .xlsx files directly inside the folder are picked up (no recursion).
        files = [*folder_path.glob('*.xlsx')]
        # Each folder is handled by its own pool and finishes before the next starts.
        process_files(files, output_dir, max_workers=16)

Finished processing in\Bab 1\Nasional_Tabulasi_UTP_BAB_0_tabel_0_H1.xlsx
