In [39]:
import pandas as pd
import os
from multiprocessing import Pool
import logging
from typing import List, Optional

In [40]:
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
pd.set_option('display.max_rows', None)

In [41]:
hc_file_path = '/home/stanoo/dcrt/data/API/CoA/Q1-2024-25/' 
coa_file_path = '/home/stanoo/dcrt/data/API/CoA/Q1-2024-25/'

In [42]:
column_names = [
    "line", "date_dd", "date_mon", "date_yyyy", "caseid_type", "caseid_no", "filed_dd",
    "filed_mon", "filed_yyyy", "original_court", "original_code", "original_number",
    "original_year", "case_type", "judge_1", "judge_2", "judge_3", "judge_4", "judge_5",
    "judge_6", "judge_7", "comingfor", "outcome", "reason_adj", "next_dd", "next_mon",
    "next_yyyy", "male_applicant", "female_applicant", "organization_applicant",
    "male_defendant", "female_defendant", "organization_defendant", "legalrep",
    "applicant_witness", "defendant_witness", "custody", "other_details"
]

In [43]:
def read_excel_file(file_path: str, column_names: List[str]) -> Optional[pd.DataFrame]:
    """
    Read an Excel file and return a DataFrame with specified column names and an added court name column.
    
    Args:
        file_path (str): Path to the Excel file.
        column_names (List[str]): List of column names to use for the DataFrame.
        
    Returns:
        Optional[pd.DataFrame]: DataFrame with the specified columns and court name, or None if an error occurs.
    """
    try:
        file_name = os.path.basename(file_path)
        court_name = file_name.split("-")[0]
        df = pd.read_excel(file_path, header=4, names=column_names)
        df = df.assign(court_name=court_name).drop(df.columns[0], axis=1)
        df = df[[df.columns[-1]] + list(df.columns[:-1])]
        return df
    except Exception as e:
        logging.error(f"Error reading file {file_path}: {e}")
        return None


In [44]:
def read_excel_files(folder_path, column_names):

    file_paths = [os.path.join(folder_path, filename) for filename in os.listdir(folder_path)
                  if filename.endswith(".xls") or filename.endswith(".xlsx")]

    if not file_paths:
        raise ValueError("No Excel files found in the specified folder.")

    pool = Pool()
    data_frames = pool.starmap(read_excel_file, [(file_path, column_names) for file_path in file_paths])
    pool.close()
    pool.join()

    data_frames = [df for df in data_frames if df is not None and not df.empty]
    
    if not data_frames:
        raise ValueError("Unable to read any Excel files.")
    
    combined_df = pd.concat(data_frames, ignore_index=True)
    return combined_df

In [45]:
combined_df = read_excel_files(coa_file_path, column_names)

In [46]:
# Fill NaN values in float columns with 0 and convert them to int
float_columns = combined_df.select_dtypes(include=['float64']).columns
combined_df[float_columns] = combined_df[float_columns].fillna(0).astype(int)

In [32]:
# Create a new column with mapped names
name_map = {'_High Court Div': '', '_High Court Civil': '', '_High Court Criminal': ''}

def map_names(name):
    for key, value in name_map.items():
        name = name.replace(key, value)
    return name.split()[0] 

In [33]:
combined_df['court'] = combined_df['court_name'].apply(map_names)

In [34]:
combined_df.loc[combined_df['court'] == 'Milimani', 'court'] = combined_df['court_name']

In [35]:
combined_df['court'] = combined_df['court'].str.replace("High Court_High Court", "", case=False, regex=False)
combined_df['court'] = combined_df['court'].str.replace(r'\s+', ' ', regex=True)
# move court name to the first column
combined_df.drop(columns=['court_name'], inplace=True)
combined_df = combined_df[[combined_df.columns[-1]] + list(combined_df.columns[:-1])]


In [36]:
combined_df = combined_df[~combined_df['caseid_type'].str.contains("MED")]

In [47]:
# save the dataframe to a new csv file
combined_df.to_csv(f'{coa_file_path}/coa_q1_data.csv', index=False)

In [49]:
combined_df.groupby('court_name').size().reset_index(name='count')

Unnamed: 0,court_name,count
0,Eldoret Court of Appeal_Court of Appeal,647
1,Kisumu Court of Appeal_Court of Appeal,3432
2,Malindi Court of Appeal_Court of Appeal,913
3,Mombasa Court of Appeal_Court of Appeal,2262
4,Nairobi Court of Appeal_Civil Division,7095
5,Nairobi Court of Appeal_Criminal Division,766
6,Nakuru Court of Appeal_Court of Appeal,2251
7,Nyeri Court of Appeal_Court of Appeal,2941
