In [None]:
import os
from datetime import datetime
from dbfread import DBF, DBFNotFound
import pandas as pd

from tqdm import tqdm


In [None]:

def read_dbf_in_chunks(file_path, chunk_size):
    """
    Stream a .dbf table as a sequence of pandas DataFrames.

    This is a generator: the table is opened on first iteration (decoded as
    cp850, tolerating a missing memo file) and its records are grouped into
    frames of exactly ``chunk_size`` rows. Leftover records are emitted as a
    final, smaller frame. If ``chunk_size`` is never matched by the buffer
    length (e.g. zero or negative), every record ends up in that final frame.

    When the table cannot be found (``DBFNotFound``) a message is printed and
    the generator finishes without yielding anything further.
    """
    try:
        records = DBF(file_path, encoding='cp850', ignore_missing_memofile=True)
        buffer = []
        for row in records:
            buffer.append(row)
            if len(buffer) != chunk_size:
                continue
            # Buffer is full: hand it out and start a fresh one.
            yield pd.DataFrame(buffer)
            buffer = []
        if len(buffer) > 0:
            # Trailing partial chunk.
            yield pd.DataFrame(buffer)
    except DBFNotFound:
        print(f"DBF file {file_path} not found.")
        return

In [None]:

def write_csv_in_chunks(chunks, file_name, append=False):
    """
    Persist an iterable of DataFrames to one CSV file, chunk by chunk.

    The first chunk of a non-append run truncates ``file_name`` and writes the
    header row; every other chunk (and every chunk when ``append`` is truthy)
    is appended without a header. Output is ``;``-separated, cp850-encoded and
    written without the index. A progress line is printed per chunk.
    """
    for index, frame in enumerate(chunks):
        # Only the very first write of a fresh file truncates and emits the header.
        starts_new_file = index == 0 and not append
        if starts_new_file:
            write_mode = 'w'
        else:
            write_mode = 'a'
        frame.to_csv(
            file_name,
            mode=write_mode,
            header=starts_new_file,
            index=False,
            sep=';',
            encoding='cp850',
        )
        print(f"Chunk {index + 1} written to {file_name}")

In [None]:

def etl_flow(chunk_size=1000, lands=None, files=None):
    """
    ETL flow for processing .dbf files and saving them as .csv.

    For every (land, file) combination the table
    ``/Volumes/DATA/<land>/<file>.dbf`` is read in chunks and written to
    ``/Volumes/MARAL/CSV/<land>/<file>.csv``. A target CSV whose modification
    date is today is reported as up-to-date and not rewritten.

    Parameters
    ----------
    chunk_size : int, default 1000
        Number of records per chunk passed to ``read_dbf_in_chunks``.
    lands : iterable of str, optional
        Land folder names to process. Defaults to ``F01`` .. ``F04``.
    files : iterable of str, optional
        Table names without extension. Defaults to ``['V2SC1010']``, the
        selection that was hard-coded before this parameter existed.
    """
    lands = ['F01', 'F02', 'F03', 'F04'] if lands is None else list(lands)

    # Full table set, kept for reference (it used to be assigned here and then
    # immediately overridden by the single-table selection below). Pass it via
    # ``files`` to export everything:
    #   'V2AD1001', 'V2AD1056', 'V2AD1096', 'V2AD1156', 'V2AD1004', 'V2AD1005',
    #   'V2AR1001', 'V2AR1002', 'V2AR1004', 'V2AR1005', 'V2AR1007',
    #   'V2LA1001', 'V2LA1002', 'V2LA1003', 'V2LA1005', 'V2LA1006', 'V2LA1008',
    #   'V4AR1009', 'V2AD1009', 'V4LA1009'
    # Advertising history ==> 'V4AD1023'
    # Demand table        ==> 'V2SC1010'
    # Customer migration  ==> 'V2AD2000'
    files = ['V2SC1010'] if files is None else list(files)

    total_files = len(lands) * len(files)
    processed_files = 0

    # The context manager closes the bar even if a file raises midway.
    with tqdm(total=total_files, desc="Processing files", ncols=100, dynamic_ncols=True) as progress_bar:
        for land in lands:
            for file_name in files:
                processed_files += 1
                dbf_file_path = fr'/Volumes/DATA/{land}/{file_name}.dbf'
                csv_file_path = fr'/Volumes/MARAL/CSV/{land}/{file_name}.csv'

                print(f"\n[{datetime.now()}] Processing file {processed_files}/{total_files}: {file_name} ({land})")

                # A CSV last modified today counts as already exported.
                # NOTE(review): a partially written CSV from an aborted run
                # earlier today would also pass this check.
                written_today = (
                    os.path.exists(csv_file_path)
                    and datetime.fromtimestamp(os.path.getmtime(csv_file_path)).date()
                    == datetime.today().date()
                )

                if written_today:
                    print(f"File {csv_file_path} is up-to-date.")
                else:
                    chunks = read_dbf_in_chunks(dbf_file_path, chunk_size)
                    write_csv_in_chunks(chunks, csv_file_path)

                # Advance for skipped files too; previously the `continue` on
                # the up-to-date path bypassed this and the bar never reached
                # its total.
                progress_bar.update(1)




In [None]:
# Entry point: run the export with 5000 records per chunk
# (overriding etl_flow's default chunk_size of 1000).
if __name__ == "__main__":
    etl_flow(chunk_size=5000)