In [16]:
import os
import pandas as pd
import logging
from multiprocessing import Pool
from typing import List, Optional, Dict
from functools import partial

In [17]:
# Set up logging (optional for notebook)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

In [18]:
# Function to generate file paths
def generate_file_paths(root_folder: str, start_year: Optional[int] = None, end_year: Optional[int] = None, recursive: bool = True) -> List[str]:
    logging.info(f"Generating file paths from {root_folder}, recursive={recursive}")
    file_paths = []
    
    if recursive:
        if start_year is None or end_year is None:
            raise ValueError("start_year and end_year must be provided when recursive is True.")
        
        for root, _, files in os.walk(root_folder):
            for file in files:
                if file.endswith((".xlsx", ".xls")):
                    file_path = os.path.join(root, file)
                    try:
                        year = int(os.path.basename(os.path.dirname(root)))
                        if start_year <= year <= end_year:
                            file_paths.append(file_path)
                    except ValueError as ve:
                        logging.error(f"Error processing file path {file_path}: {ve}")
    else:
        for file in os.listdir(root_folder):
            if file.endswith((".xlsx", ".xls")):
                file_path = os.path.join(root_folder, file)
                file_paths.append(file_path)
    
    logging.info(f"Generated {len(file_paths)} file paths")
    return file_paths

In [19]:
# # Function to read Excel files
# def read_excel_file(file_path: str, column_names: List[str]) -> Optional[pd.DataFrame]:
#     try:
#         file_name = os.path.basename(file_path)
#         court_name = file_name.split("-")[0]  # Assuming court name is part of the file name
#         df = pd.read_excel(file_path, header=4, names=column_names)
#         df = df.assign(court_name=court_name)
#         return df
#     except Exception as e:
#         logger.error(f"Error reading file {file_path}: {e}")
#         return None



In [20]:
# Function to read Excel files
def read_excel_file(file_path: str, column_names: List[str]) -> Optional[pd.DataFrame]:
    try:
        file_name = os.path.basename(file_path)
        court_name = file_name.split("-")[0]  # Assuming court name is part of the file name
        df = pd.read_excel(file_path, header=4, names=column_names)
        df = df.assign(court_name=court_name)
        return df
    except Exception as e:
        logger.error(f"Error reading file {file_path}: {e}")
        return None

In [21]:
# Function to process the files
def process_files(file_paths: List[str], column_names: List[str]) -> pd.DataFrame:
    logging.info(f"Processing {len(file_paths)} files")
    
    with Pool() as pool:
        read_func = partial(read_excel_file, column_names=column_names)
        data_frames = pool.map(read_func, file_paths)

    data_frames = [df for df in data_frames if df is not None and not df.empty]
    
    if not data_frames:
        raise ValueError("No valid Excel files found or processed.")

    return pd.concat(data_frames, ignore_index=True)


In [22]:
# Function to process DataFrame
def process_dataframe(df: pd.DataFrame, name_map: Optional[Dict[str, str]] = None, court_prefix_to_remove: Optional[str] = None) -> pd.DataFrame:
    float_columns = df.select_dtypes(include=['float64']).columns
    df[float_columns] = df[float_columns].fillna(0).astype(int)

    if name_map:
        def map_names(name: str) -> str:
            for key, value in name_map.items():
                name = name.replace(key, value)
            return name.split()[0]

        df['court'] = df['court_name'].apply(map_names)
        
        if court_prefix_to_remove:
            df['court'] = df['court'].str.replace(court_prefix_to_remove, "", case=False, regex=False)
        
        df['court'] = df['court'].str.replace(r'\s+', ' ', regex=True)
        df = df[['court'] + [col for col in df.columns if col != 'court' and col != 'court_name']]
    else:
        df = df.rename(columns={'court_name': 'court'})
        
        if court_prefix_to_remove:
            df['court'] = df['court'].str.replace(court_prefix_to_remove, "", case=False, regex=False)
        
        df = df[['court'] + [col for col in df.columns if col != 'court']]

    return df

In [23]:
# Function to save to CSV
def save_to_csv(data_frame: pd.DataFrame, output_file: str):
    try:
        data_frame.to_csv(output_file, index=False)
        logging.info(f"Data saved to {output_file}")
    except Exception as e:
        logging.error(f"Error saving data to CSV: {e}")

In [24]:
# Example column names used for the Excel files
column_names = [
    "line", "date_dd", "date_mon", "date_yyyy", "caseid_type", "caseid_no", "filed_dd",
    "filed_mon", "filed_yyyy", "original_court", "original_code", "original_number",
    "original_year", "case_type", "judge_1", "judge_2", "judge_3", "judge_4", "judge_5",
    "judge_6", "judge_7", "comingfor", "outcome", "reason_adj", "next_dd", "next_mon",
    "next_yyyy", "male_applicant", "female_applicant", "organization_applicant",
    "male_defendant", "female_defendant", "organization_defendant", "legalrep",
    "applicant_witness", "defendant_witness", "custody", "other_details"
]

In [25]:
# Test the script in the notebook (variables defined manually)
dcrt_folder = '/home/stanoo/dcrt/data/INPUT'
api_folder = '/home/stanoo/dcrt/data/HC/HcNew'
# Define if using dcrt-template/non-api-data
#start_year = 2020
# Define if using dcrt-template/non-api-data
#end_year = 2021
# Set to False for non-api-data mode
recurse = True

In [26]:
# Run the file generation, processing, and saving steps
output_file = f'dcrt_combined_data.csv'

In [27]:

try:
    file_paths = generate_file_paths(dcrt_folder, 2020, 2021, recursive=recurse)
    combined_df = process_files(file_paths, column_names)
    
    # Process the DataFrame (modify name_map and court_prefix_to_remove as needed)
    name_map = {'_High Court Div': '', '_High Court Civil': '', '_High Court Criminal': ''}
    court_prefix_to_remove = "High Court_High Court"
    
    processed_df = process_dataframe(combined_df, name_map, court_prefix_to_remove)
    save_to_csv(processed_df, output_file)
    logging.info("Data processing and saving completed successfully.")
except Exception as e:
    logger.error(f"An error occurred during execution: {e}")

2024-10-09 16:18:52,239 - INFO - Generating file paths from /home/stanoo/dcrt/data/INPUT, recursive=True
2024-10-09 16:18:52,240 - ERROR - Error processing file path /home/stanoo/dcrt/data/INPUT/CODES/DCRT COURT_CODES.xlsx: invalid literal for int() with base 10: 'INPUT'
2024-10-09 16:18:52,265 - INFO - Generated 782 file paths
2024-10-09 16:18:52,266 - INFO - Processing 782 files
2024-10-09 16:18:53,473 - ERROR - Error reading file /home/stanoo/dcrt/data/INPUT/TEMPLATE 3/Milimani Family Division/2020/04/473011.xlsx: Number of passed names did not match number of header fields in the file (sheet: 0)
2024-10-09 16:19:12,807 - ERROR - Error reading file /home/stanoo/dcrt/data/INPUT/TEMPLATE 3/Machakos High Court/2020/02/~$163013.xlsx: Excel file format cannot be determined, you must specify an engine manually.
2024-10-09 16:19:22,465 - ERROR - Error reading file /home/stanoo/dcrt/data/INPUT/TEMPLATE 3/Malindi High Court/2020/03/~$33020.xlsx: Excel file format cannot be determined, you mu

In [28]:
processed_df.groupby('court')['court'].value_counts().sort_values(ascending=False)

court
473029.xlsx                43525
473011.xlsx                28017
473019.xlsx                21550
123014.xlsx                17678
323027.xlsx                16939
273016.xlsx                14018
13021.xlsx                 12916
373003.xlsx                12815
163013.xlsx                12219
223005.xlsx                11743
193008.xlsx                10348
423004.xlsx                 9060
453031.xlsx                 7608
473006.xlsx                 7268
33020.xlsx                  7259
473002.xlsx                 7120
393015.xlsx                 6361
143010.xlsx                 6192
203012.xlsx                 6080
323026.xlsx                 5411
303049.xlsx                 5353
263018.xlsx                 4689
153041.xlsx                 4369
473023.xlsx                 4337
433030.xlsx                 4030
173060.xlsx                 3909
213007.xlsx                 3476
353024.xlsx                 3413
403001.xlsx                 3243
443032.xlsx                 3110
3330