In [None]:
import logging
import multiprocessing
import os
import zipfile
from multiprocessing.pool import ApplyResult

import pandas as pd


In [None]:
# Column labels passed as `names=` when a worksheet is read; order matters
# because labels are assigned to worksheet columns by position.
# Entries are grouped below by shared name prefix for readability only.
column_name = [
    "line",
    "date_dd", "date_mon", "date_yyyy",
    "caseid_type", "caseid_no",
    "filed_dd", "filed_mon", "filed_yyyy",
    "original_court", "original_code", "original_number", "original_year",
    "case_type",
    # judge_1 .. judge_7
    *[f"judge_{slot}" for slot in range(1, 8)],
    "comingfor", "outcome", "reason_adj",
    "next_dd", "next_mon", "next_yyyy",
    "male_applicant", "female_applicant", "organization_applicant",
    "male_defendant", "female_defendant", "organization_defendant",
    "legalrep",
    "applicant_witness", "defendant_witness",
    "custody", "other_details",
]

In [None]:
def generate_file_paths(root_folder, start_year, end_year):
    """Collect the Excel workbooks under ``root_folder`` whose year folder is in range.

    The year of a workbook is read from the name of the *parent of the
    directory that contains the file* (i.e. ``.../<year>/<subfolder>/<file>``).

    Args:
        root_folder: Directory tree to walk.
        start_year: First year (inclusive) to keep.
        end_year: Last year (inclusive) to keep.

    Returns:
        A sorted list of matching ``.xlsx`` paths (extension matched
        case-insensitively). Excel lock files (``~$name.xlsx``) are ignored.
        Workbooks whose year folder name is not an integer are logged at
        ERROR level and skipped. A missing ``root_folder`` yields ``[]``.
    """
    logging.info("Entering generate_file_paths")
    file_paths = []
    for root, _, files in os.walk(root_folder):
        workbooks = [
            name for name in files
            # "~$..." files are transient lock files Excel writes next to an
            # open workbook; they are never readable workbooks themselves.
            if name.lower().endswith(".xlsx") and not name.startswith("~$")
        ]
        if not workbooks:
            continue

        # The year depends only on the directory, so parse it once per
        # directory instead of once per file.
        try:
            year = int(os.path.basename(os.path.dirname(root)))
        except ValueError as ve:
            for name in workbooks:
                file_path = os.path.join(root, name)
                logging.error(f"Error processing file path {file_path}: {ve}")
            continue

        if start_year <= year <= end_year:
            file_paths.extend(os.path.join(root, name) for name in workbooks)

    # os.walk order is filesystem-dependent; sort so that repeated runs
    # produce the same list (and therefore the same merged row order).
    file_paths.sort()
    logging.info("Exiting generate_file_paths successfully")
    return file_paths


In [None]:
def process_file(file_path):
    """Read one Excel workbook and tag its rows with the court name from the path.

    The court name is the fourth-from-last component of the normalised path
    (``<court>/<x>/<y>/<file>``). The sheet is read with ``header=4`` and the
    columns are relabelled with the module-level ``column_name`` list.

    Args:
        file_path: Path to the workbook.

    Returns:
        The worksheet as a DataFrame with an extra ``court_name`` column, or
        an empty DataFrame if the file could not be processed. Failures are
        logged at ERROR level rather than raised, so that one bad workbook
        does not abort a whole batch run.
    """
    logging.info(f"Entering process_file for file {file_path}")
    try:
        path_components = os.path.normpath(file_path).split(os.sep)
        court_name = path_components[-4]  # IndexError if the path is too shallow
        df = pd.read_excel(file_path, header=4, names=column_name)
        df = df.assign(court_name=court_name)
    except (
        ValueError,                 # e.g. column count does not match column_name
        IndexError,                 # path has fewer than four components
        OSError,                    # missing / unreadable file
        zipfile.BadZipFile,         # truncated or corrupt .xlsx container
        pd.errors.EmptyDataError,
        pd.errors.ParserError,
    ) as e:
        logging.error(f"Error processing file {file_path}: {e}")
        return pd.DataFrame()

    logging.info(f"Exiting process_file successfully for file {file_path}")
    return df

In [None]:
def process_files(file_paths):
    """Read many Excel workbooks in parallel and stack them into one DataFrame.

    Each path is handed to ``process_file`` in a ``multiprocessing.Pool``
    worker. Frames that come back empty (``process_file`` returns an empty
    frame on failure) are left out of the concatenation.

    Args:
        file_paths: Iterable of workbook paths.

    Returns:
        A single DataFrame with a fresh integer index, or an empty DataFrame
        when ``file_paths`` is empty or no workbook produced any rows.

    Raises:
        multiprocessing.TimeoutError: if a worker result is not available
            within 60 seconds of being waited on.
    """
    logging.info("Entering process_files")
    file_paths = list(file_paths)

    # pd.concat raises ValueError on an empty list, so short-circuit here
    # (and avoid spinning up a worker pool for nothing).
    if not file_paths:
        logging.info("Exiting process_files successfully")
        return pd.DataFrame()

    with multiprocessing.Pool() as pool:
        pending = [pool.apply_async(process_file, args=(path,)) for path in file_paths]
        # Results must be collected before the pool context exits, because
        # leaving the `with` block terminates the workers.
        processed_dfs = [result.get(timeout=60) for result in pending]

    usable = [df for df in processed_dfs if df is not None and not df.empty]
    if usable:
        combined_df = pd.concat(usable, ignore_index=True)
    else:
        combined_df = pd.DataFrame()

    logging.info("Exiting process_files successfully")
    return combined_df


In [None]:
def save_to_csv(data_frame, output_file):
    """Write ``data_frame`` to ``output_file`` as CSV without the index column.

    Args:
        data_frame: Object exposing ``to_csv`` (a pandas DataFrame).
        output_file: Destination path handed to ``to_csv``.

    Returns:
        None. A failed write is logged at ERROR level and not re-raised; the
        success messages are emitted only when the write actually completed.
    """
    logging.info(f"Entering save_to_csv for file {output_file}")
    try:
        data_frame.to_csv(output_file, index=False)
    except Exception as e:
        logging.error(f"Error saving data to CSV file {output_file}: {e}")
    else:
        # Only claim success when to_csv returned without raising.
        logging.info(f"Processed data saved to {output_file}")
        logging.info(f"Exiting save_to_csv successfully for file {output_file}")


In [None]:
# The root logger defaults to WARNING, which silently drops every
# logging.info progress message emitted by the functions in this notebook.
# Configure it before the pipeline runs so those messages are visible.
logging.basicConfig(level=logging.INFO)

# Collect the workbooks for the 2023-2024 year folders and merge them.
root_folder = "/home/arch/devel/data/TEMPLATE 3"
file_paths = generate_file_paths(root_folder, start_year=2023, end_year=2024)
raw_df = process_files(file_paths)

In [None]:
# Persist the merged frame produced by the previous cell.
save_to_csv(
    data_frame=raw_df,
    output_file="/home/arch/devel/data/merged.csv",
)