In [28]:
import pandas as pd
import os
import argparse
import logging
from typing import List, Optional, Dict
from multiprocessing import Pool

# Configure the root logger once: INFO and above, each record prefixed with
# a timestamp and its level name.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Module-level logger used by the functions in this file.
logger = logging.getLogger(__name__)

In [30]:
# Column labels applied, in this order, to every sheet loaded by
# read_excel_file. The first label ("line") is dropped again right after
# loading, so it never reaches the combined output.
column_names = [
    "line",
    "date_dd", "date_mon", "date_yyyy",
    "caseid_type", "caseid_no",
    "filed_dd", "filed_mon", "filed_yyyy",
    "original_court", "original_code", "original_number", "original_year",
    "case_type",
    "judge_1", "judge_2", "judge_3", "judge_4", "judge_5", "judge_6", "judge_7",
    "comingfor", "outcome", "reason_adj",
    "next_dd", "next_mon", "next_yyyy",
    "male_applicant", "female_applicant", "organization_applicant",
    "male_defendant", "female_defendant", "organization_defendant",
    "legalrep", "applicant_witness", "defendant_witness",
    "custody", "other_details",
]

In [29]:
def read_excel_file(file_path: str) -> Optional[pd.DataFrame]:
    """Load one Excel workbook and tag every row with the court it belongs to.

    The court name is the part of the file's base name before the first "-".
    The sheet is read with ``header=4`` and its columns are relabelled with
    the module-level ``column_names``; the first of those columns is then
    dropped and a ``court_name`` column is placed in front of the rest.

    Args:
        file_path: Path of the workbook to read.

    Returns:
        The loaded frame with ``court_name`` as its first column, or ``None``
        when anything goes wrong (the failure is logged, not raised).
    """
    try:
        court_name = os.path.basename(file_path).split("-")[0]

        sheet = pd.read_excel(file_path, header=4, names=column_names)
        sheet['court_name'] = court_name

        # Discard the leading column, then rotate court_name (currently the
        # last column) to the front.
        leading_label = sheet.columns[0]
        sheet = sheet.drop(leading_label, axis=1)
        everything_but_court = list(sheet.columns[:-1])
        sheet = sheet[['court_name'] + everything_but_court]

        logger.info(f"Successfully processed file: {file_path}")
        return sheet
    except Exception as e:
        # Best effort: callers filter out None so one bad file does not abort
        # the whole batch.
        logger.error(f"Error reading file {file_path}: {e}")
        return None

In [31]:
def process_dataframe(df: pd.DataFrame, name_map: Optional[Dict[str, str]] = None) -> pd.DataFrame:
    """Fill missing values and expose the court label as a leading ``court`` column.

    Every NaN in the frame is replaced with 0. The ``court_name`` column is
    then turned into ``court``:

    * with a ``name_map``, each key is applied to ``court_name`` as a regular
      expression and replaced by its value, after which only the first
      whitespace-delimited word of the result is kept;
    * without one, ``court_name`` is carried over unchanged.

    The input frame is not modified; a new frame is returned.

    Args:
        df: Frame containing a ``court_name`` column.
        name_map: Optional mapping of regex pattern -> replacement text.

    Returns:
        A new frame whose first column is ``court``, followed by every other
        input column except ``court_name``.

    Raises:
        KeyError: If ``df`` has no ``court_name`` column.
    """
    # fillna without inplace=True returns a new frame, so the caller's data
    # is left untouched and the function can safely be called again.
    result = df.fillna(0)

    court = result['court_name']
    if name_map:
        # Series.replace (not Series.str.replace) is the method that accepts a
        # ``to_replace`` mapping; with regex=True each key is used as a regex.
        court = court.replace(to_replace=name_map, regex=True)
        # Keep only the first word. split() with no separator already discards
        # surrounding whitespace, so no separate strip is needed.
        court = court.str.split().str.get(0)

    # Exclude both the source column and any pre-existing 'court' column so
    # that 'court' appears exactly once in the output.
    other_columns = [col for col in result.columns if col not in ('court_name', 'court')]
    result['court'] = court
    return result[['court'] + other_columns]


In [None]:
def process_folder(folder_path: str) -> pd.DataFrame:
    """Read every Excel file directly inside a folder and combine them.

    Files whose names end in ``.xls`` or ``.xlsx`` (case-insensitive) are read
    in parallel with ``read_excel_file``; files that fail to load are skipped.
    The remaining frames are concatenated with a fresh index and passed
    through ``process_dataframe``.

    Args:
        folder_path: Directory to scan (not searched recursively).

    Returns:
        The combined, processed frame.

    Raises:
        ValueError: If the folder contains no Excel files, or none of them
            could be read.
        OSError: Propagated from ``os.listdir`` when the folder cannot be listed.
    """
    excel_suffixes = (".xls", ".xlsx")

    # os.listdir returns names in arbitrary order; sorting keeps the combined
    # row order identical from run to run. Lower-casing the name lets files
    # saved as ".XLS"/".XLSX" be picked up as well.
    file_paths = sorted(
        os.path.join(folder_path, filename)
        for filename in os.listdir(folder_path)
        if filename.lower().endswith(excel_suffixes)
    )

    if not file_paths:
        raise ValueError("No Excel files found in the specified folder.")

    # pool.map preserves input order, so results line up with file_paths.
    with Pool() as pool:
        loaded = pool.map(read_excel_file, file_paths)

    # read_excel_file signals a failed load with None.
    data_frames = [frame for frame in loaded if frame is not None]

    if not data_frames:
        raise ValueError("Unable to read any Excel files.")

    combined_df = pd.concat(data_frames, ignore_index=True)
    return process_dataframe(combined_df)


In [None]:
def process_api_data(folder_path: str) -> pd.DataFrame:
    """Load the Excel files in a folder and normalise their court names.

    The raw frames are loaded and concatenated, then handed to
    ``process_dataframe`` together with a mapping that blanks out the
    ``_High Court Div`` / ``_High Court Civil`` / ``_High Court Criminal``
    suffixes. Finally any literal ``High Court_High Court`` text is removed
    from the resulting ``court`` column (case-insensitively).

    Args:
        folder_path: Directory containing the Excel files.

    Returns:
        The processed frame with a ``court`` column.

    Raises:
        ValueError: If the folder contains no Excel files, or none of them
            could be read.
    """
    # The previous call target, read_excel_files, is not defined in this file,
    # so loading is done by the private helper below instead.
    combined_df = _load_raw_court_frames(folder_path)

    name_map = {'_High Court Div': '', '_High Court Civil': '', '_High Court Criminal': ''}
    court_prefix_to_remove = "High Court_High Court"

    processed_df = process_dataframe(combined_df, name_map)
    # NOTE(review): process_dataframe keeps only the first whitespace-delimited
    # word when a name_map is given, so this space-containing prefix may never
    # match at this point - confirm the intended order of these two steps.
    processed_df['court'] = processed_df['court'].str.replace(court_prefix_to_remove, "", case=False, regex=False)
    return processed_df


def _load_raw_court_frames(folder_path: str) -> pd.DataFrame:
    """Read each Excel file directly inside folder_path and stack the raw frames."""
    # NOTE(review): flat (non-recursive) scan, mirroring process_folder; confirm
    # that the removed read_excel_files did not walk sub-directories.
    file_paths = sorted(
        os.path.join(folder_path, filename)
        for filename in os.listdir(folder_path)
        if filename.endswith((".xls", ".xlsx"))
    )
    if not file_paths:
        raise ValueError("No Excel files found in the specified folder.")

    # read_excel_file returns None for a file it could not load; skip those.
    frames = [frame for frame in map(read_excel_file, file_paths) if frame is not None]
    if not frames:
        raise ValueError("Unable to read any Excel files.")

    return pd.concat(frames, ignore_index=True)



In [None]:
def main(folder_path: str, is_api: bool = False) -> pd.DataFrame:
    """Run the API or the plain-folder pipeline and return its result.

    Args:
        folder_path: Directory handed to the selected pipeline.
        is_api: When truthy, use ``process_api_data``; otherwise use
            ``process_folder``.

    Returns:
        The frame produced by the selected pipeline.

    Raises:
        Exception: Any error from the pipeline is logged and then re-raised
            unchanged.
    """
    try:
        # Choose the pipeline and its matching success message up front.
        if is_api:
            handler = process_api_data
            done_message = "API data processing completed successfully."
        else:
            handler = process_folder
            done_message = "Non-API data processing completed successfully."

        processed_df = handler(folder_path)
        logger.info(done_message)
        return processed_df
    except Exception as e:
        logger.error(f"An error occurred during data processing: {e}")
        raise


if __name__ == "__main__":
    # Command-line entry point: one positional folder plus an optional flag
    # that switches to the API pipeline.
    parser = argparse.ArgumentParser(description="Process Excel files for court data.")
    parser.add_argument(
        "folder_path",
        help="Path to the folder or root directory containing Excel files.",
    )
    parser.add_argument(
        "--is_api",
        action="store_true",
        help="Specify if processing API data.",
    )
    args = parser.parse_args()

    # Run the pipeline and show a short preview. Failures are logged rather
    # than re-raised, so the final message is only emitted on success.
    try:
        result_df = main(args.folder_path, args.is_api)
        print(result_df.head())
        print(f"Total rows: {len(result_df)}")
        logger.info("Data processing completed successfully.")
    except Exception as e:
        logger.error(f"Processing failed: {e}")
    else:
        logger.info("Script finished successfully.")