In [2]:
import pandas as pd
import os
from multiprocessing import Pool
from typing import List, Optional
import logging

In [3]:
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)

In [38]:
# Root data folder; the combined CSV is written here at the end of the notebook.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
raw_data_path = '/home/stanoo/Projects/data/coa/'
# Folder holding the source Excel workbooks. Despite the name, this is a
# directory rather than a single file: it is passed to read_excel_files as
# its folder_path argument.
file_path = '/home/stanoo/Projects/data/coa/cts'

In [5]:
# Column labels handed to pd.read_excel via `names=` for every workbook.
# They are applied by position, so the order here must follow the sheet's
# column order. Entries are grouped by prefix purely for readability.
column_names = [
    "line",
    "date_dd", "date_mon", "date_yyyy",
    "caseid_type", "caseid_no",
    "filed_dd", "filed_mon", "filed_yyyy",
    "original_court", "original_code", "original_number", "original_year",
    "case_type",
    *[f"judge_{i}" for i in range(1, 8)],  # judge_1 ... judge_7
    "comingfor", "outcome", "reason_adj",
    "next_dd", "next_mon", "next_yyyy",
    "male_applicant", "female_applicant", "organization_applicant",
    "male_defendant", "female_defendant", "organization_defendant",
    "legalrep",
    "applicant_witness", "defendant_witness",
    "custody", "other_details",
]

In [6]:
def read_excel_file(file_path: str, column_names: List[str]) -> Optional[pd.DataFrame]:
    """
    Load a single Excel workbook and tag every row with the court it came from.

    The court name is the part of the file's base name before the first "-".
    The sheet is read with ``header=4`` and the supplied ``column_names``;
    the first of those columns is then dropped and a ``court_name`` column is
    placed at the front of the frame.

    Args:
        file_path (str): Location of the Excel workbook.
        column_names (List[str]): Labels to apply to the sheet's columns.

    Returns:
        Optional[pd.DataFrame]: The tagged frame, or None when anything goes
        wrong while reading or reshaping (the error is logged, not raised).
    """
    try:
        # "<court>-<rest>.xlsx" -> "<court>"
        court = os.path.basename(file_path).split("-")[0]

        sheet = pd.read_excel(file_path, header=4, names=column_names)

        # Drop the leading column and append the court label ...
        leading_column = sheet.columns[0]
        tagged = sheet.assign(court_name=court).drop(leading_column, axis=1)

        # ... then rotate the last column (the court label) to the front.
        front_first = [tagged.columns[-1], *tagged.columns[:-1]]
        return tagged[front_first]
    except Exception as e:
        # Best effort: a file that cannot be read is reported and skipped by the caller.
        logging.error(f"Error reading file {file_path}: {e}")
        return None

In [7]:
def read_excel_files(folder_path: str, column_names: List[str]) -> pd.DataFrame:
    """
    Read every Excel workbook in a folder in parallel and stack the results.

    Each ``.xls``/``.xlsx`` file directly inside ``folder_path`` is parsed by
    ``read_excel_file`` in a worker process. Files for which the worker
    returns ``None`` (read error) or an empty frame are skipped.

    Args:
        folder_path (str): Folder to scan; sub-folders are not searched.
        column_names (List[str]): Column names forwarded to ``read_excel_file``.

    Returns:
        pd.DataFrame: All readable files concatenated, with a fresh 0..n-1 index.

    Raises:
        ValueError: If the folder contains no Excel files, or if none of them
            could be read.
    """
    # Sorted so the row order of the combined frame is reproducible instead of
    # following the arbitrary order in which os.listdir returns entries.
    file_paths = sorted(
        os.path.join(folder_path, filename)
        for filename in os.listdir(folder_path)
        if filename.endswith((".xls", ".xlsx"))
    )

    if not file_paths:
        raise ValueError("No Excel files found in the specified folder.")

    # The context manager tears the pool down even when starmap raises, so
    # worker processes are never left behind.
    with Pool() as pool:
        data_frames = pool.starmap(
            read_excel_file, [(path, column_names) for path in file_paths]
        )

    # Keep only the files that were read successfully and contain rows.
    data_frames = [df for df in data_frames if df is not None and not df.empty]

    if not data_frames:
        raise ValueError("Unable to read any Excel files.")

    return pd.concat(data_frames, ignore_index=True)

In [8]:
# Load every workbook found under file_path (a folder) into a single DataFrame.
combined_df = read_excel_files(folder_path=file_path, column_names=column_names)

In [9]:
# Convert every float64 column to integers, treating missing values as 0.
# NOTE(review): this cannot distinguish "missing" from a real 0 afterwards, and
# astype(int) truncates any fractional values - confirm these columns only hold
# whole-number counts.
float_columns = combined_df.select_dtypes(include=['float64']).columns
combined_df[float_columns] = (
    combined_df[float_columns]
    .fillna(0)
    .astype(int)
)

In [33]:
# Case types that keep their own label. Series.map with a dict leaves every
# case_type that is not a key here as NaN.
#
# NOTE(review): the key 'Civil Appeal' used to appear twice in this literal
# (once with the value 'Court of Appeal Election Petition Appeal', once with
# 'Civil Appeal'). Python keeps only the last value for a repeated key, so the
# first entry never had any effect and has been removed; the mapping below is
# exactly what was being applied. If 'Court of Appeal Election Petition Appeal'
# was meant to be a key of its own, add it explicitly - TODO confirm intent.
case_type_mapping = {
    # The double space in 'COA  Criminal Appeal' is kept as-is; presumably it
    # matches the raw data - verify before normalising.
    'COA  Criminal Appeal': 'COA  Criminal Appeal',
    'Civil Appeal': 'Civil Appeal',
    'Criminal Applications': 'Criminal Applications',
}
combined_df['broad_case_type'] = combined_df['case_type'].map(case_type_mapping)


In [35]:
# Rows whose broad_case_type is still missing (no entry in case_type_mapping)
# are labelled 'Civil Applications'; existing labels are left untouched.
combined_df['broad_case_type'] = combined_df['broad_case_type'].where(
    combined_df['broad_case_type'].notna(), 'Civil Applications'
)

In [37]:
# Exchange the contents of 'case_type' and 'broad_case_type' (column positions
# stay the same; only the values trade places).
# NOTE(review): this cell is not idempotent - running it a second time swaps
# the columns back. After one run 'case_type' holds the mapped labels and
# 'broad_case_type' holds the original values; confirm that naming is intended.
original_case_type = combined_df['case_type'].copy()
combined_df['case_type'] = combined_df['broad_case_type']
combined_df['broad_case_type'] = original_case_type

In [39]:
# Write the combined table into raw_data_path without the index column.
# os.path.join is used instead of an f-string so a trailing slash on
# raw_data_path does not produce a doubled '/' in the output path.
combined_df.to_csv(os.path.join(raw_data_path, 'q2-24-25-coa-data.csv'), index=False)