In [17]:
import pathlib
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
from typing import List

In [18]:
# Kabupaten (regency) code to filter on: filter_kab keeps only the rows whose
# 'kab' column equals this value.
kabupaten = 3404

In [19]:
# Helpers and entry point for filtering one workbook down to a single kabupaten.
def _find_sheet(sheet_names, keyword: str) -> str:
    """Return the first sheet name that contains ``keyword`` (case-insensitive).

    Raises:
        ValueError: if no sheet name contains ``keyword``. The message lists
            the available sheets so the log line is actionable.
    """
    for sheet in sheet_names:
        if keyword in sheet.lower():
            return sheet
    raise ValueError(
        f"no sheet name contains '{keyword}' (available sheets: {list(sheet_names)})"
    )


def _keep_kabupaten_rows(frame: pd.DataFrame, sheet_name: str) -> pd.DataFrame:
    """Return the rows of ``frame`` whose 'kab' column equals ``kabupaten``.

    Raises:
        ValueError: if ``frame`` has no 'kab' column.
    """
    if 'kab' not in frame.columns:
        raise ValueError(f"sheet '{sheet_name}' has no 'kab' column")
    # `kabupaten` is the module-level setting defined in the configuration cell.
    return frame[frame['kab'] == kabupaten]


def filter_kab(path: pathlib.Path, output_dir: pathlib.Path) -> None:
    """Write a copy of one workbook that contains only the target kabupaten.

    The workbook at ``path`` is expected to have one sheet whose name contains
    'kec' and one whose name contains 'desa' (case-insensitive; the first
    match wins). Both sheets are reduced to the rows whose 'kab' column equals
    the module-level ``kabupaten`` value and are written, as sheets
    'Kecamatan' and 'Desa', to
    ``output_dir / <name of path's parent folder> / <file name of path>``.

    Every failure is reported with ``print`` and swallowed, so one bad file
    does not abort a batch run.

    Args:
        path: Source ``.xlsx`` workbook.
        output_dir: Root directory for the filtered copies; sub-folders are
            created as needed.

    Returns:
        None. The result is the file written to disk.
    """
    # Stage 1: read both sheets. The context manager closes the workbook
    # handle even when a sheet is missing or a read fails.
    try:
        with pd.ExcelFile(path) as excel_file:
            sheet_names = excel_file.sheet_names
            nama_sheet_kec = _find_sheet(sheet_names, 'kec')
            nama_sheet_desa = _find_sheet(sheet_names, 'desa')

            kec_df = pd.read_excel(excel_file, sheet_name=nama_sheet_kec)
            desa_df = pd.read_excel(excel_file, sheet_name=nama_sheet_desa)
    except Exception as e:
        print(f'Error reading {path}: {e}')
        return

    # Stage 2: filter and write the output workbook.
    try:
        kec_df = _keep_kabupaten_rows(kec_df, nama_sheet_kec)
        desa_df = _keep_kabupaten_rows(desa_df, nama_sheet_desa)

        # Mirror the source's immediate parent folder under output_dir.
        output_path = output_dir / path.parent.name / path.name
        output_path.parent.mkdir(parents=True, exist_ok=True)

        with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
            kec_df.to_excel(writer, sheet_name='Kecamatan', index=False)
            desa_df.to_excel(writer, sheet_name='Desa', index=False)
        print(f'Finished processing {path}')
    except Exception as e:
        print(f'Error processing {path}: {e}')

In [20]:
# Run filter_kab over a batch of workbooks using a thread pool.
def process_files(paths: List[pathlib.Path], output_dir: pathlib.Path, max_workers: int = 8) -> None:
    """Apply ``filter_kab`` to every path in ``paths`` concurrently.

    Args:
        paths: Workbooks to process; each is passed to ``filter_kab``.
        output_dir: Output root forwarded unchanged to ``filter_kab``.
        max_workers: Size of the thread pool.

    Returns:
        None, once every submitted job has finished.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Queue every job first so the pool can run them in parallel.
        pending = []
        for source_path in paths:
            pending.append(pool.submit(filter_kab, source_path, output_dir))

        # Wait in submission order; result() re-raises anything a job raised.
        for job in pending:
            job.result()

# Entry point: filter every workbook found in each chapter folder.
if __name__ == "__main__":
    # Chapter folders 'Bab 1' through 'Bab 10', in that order.
    bab_folders = [f'Bab {number}' for number in range(1, 11)]
    data_dir, output_dir = pathlib.Path('in'), pathlib.Path('out')

    # One batch (and one thread pool) per chapter folder.
    for bab in bab_folders:
        folder_path = data_dir.joinpath(bab)
        files = [*folder_path.glob('*.xlsx')]
        process_files(files, output_dir, max_workers=16)

Finished processing in\Bab 3\Nasional_Tabulasi_UTP_BAB_1_tabel_1_25.xlsx
Finished processing in\Bab 3\Nasional_Tabulasi_UTP_BAB_1_tabel_1_24.xlsx
Finished processing in\Bab 3\Nasional_Tabulasi_UTP_BAB_1_tabel_1_2.xlsx
Finished processing in\Bab 3\Nasional_Tabulasi_UTP_BAB_1_tabel_1_1.xlsx
Finished processing in\Bab 3\Nasional_Tabulasi_UTP_BAB_1_tabel_1_14.xlsx
Finished processing in\Bab 3\Nasional_Tabulasi_UTP_BAB_1_tabel_1_21.xlsx
Finished processing in\Bab 3\Nasional_Tabulasi_UTP_BAB_1_tabel_1_22.xlsx
Finished processing in\Bab 3\Nasional_Tabulasi_UTP_BAB_1_tabel_1_15.xlsx
Finished processing in\Bab 3\Nasional_Tabulasi_UTP_BAB_1_tabel_1_17.xlsx
Finished processing in\Bab 3\Nasional_Tabulasi_UTP_BAB_1_tabel_1_18.xlsx
Finished processing in\Bab 3\Nasional_Tabulasi_UTP_BAB_1_tabel_1_12.xlsx
Finished processing in\Bab 3\Nasional_Tabulasi_UTP_BAB_1_tabel_1_10.xlsx
Finished processing in\Bab 3\Nasional_Tabulasi_UTP_BAB_1_tabel_1_16.xlsx
Finished processing in\Bab 3\Nasional_Tabulasi_UTP_BA