In [1]:
import pandas as pd
import os
from multiprocessing import Pool
import logging
from typing import List, Optional

In [2]:
# Emit INFO-and-above records as "<timestamp> - <level> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# None removes the row limit when pandas displays a frame.
pd.options.display.max_rows = None

In [3]:
# Alternative data locations (disabled):
# hc_file_path = '/home/stanoo/dcrt/data/API/CoA/Q1-2024-25/' 
# coa_file_path = '/home/stanoo/dcrt/data/API/CoA/Q1-2024-25/'

# Folder that holds the input workbooks and receives the exported CSV files.
# Set the HC_FILE_PATH environment variable to run the notebook on another
# machine; when it is unset the original location is used.
hc_file_path = os.environ.get('HC_FILE_PATH', '/home/stanoo/Projects/data/tribunal/')

In [4]:
# Labels assigned, in this order, to the columns of every workbook that is read.
column_names = [
    "line",
    "date_dd", "date_mon", "date_yyyy",
    "caseid_type", "caseid_no",
    "filed_dd", "filed_mon", "filed_yyyy",
    "original_court", "original_code", "original_number", "original_year",
    "case_type",
    # judge_1 ... judge_7
    *(f"judge_{slot}" for slot in range(1, 8)),
    "comingfor", "outcome", "reason_adj",
    "next_dd", "next_mon", "next_yyyy",
    "male_applicant", "female_applicant", "organization_applicant",
    "male_defendant", "female_defendant", "organization_defendant",
    "legalrep", "applicant_witness", "defendant_witness", "custody", "other_details",
]

In [5]:
def read_excel_file(file_path: str, column_names: List[str]) -> Optional[pd.DataFrame]:
    """
    Read one Excel workbook and return it with a leading ``court_name`` column.

    The court name is the part of the file's base name that precedes the first
    "-". The first column of the sheet (the first entry of ``column_names``)
    is dropped, and ``court_name`` is placed in front of the remaining columns.

    Args:
        file_path (str): Path to the Excel file.
        column_names (List[str]): Labels to assign to the sheet's columns, in order.

    Returns:
        Optional[pd.DataFrame]: The labelled frame with ``court_name`` first, or
        None if anything goes wrong while reading or reshaping (the error is logged).
    """
    try:
        # Base name only, so "-" characters in parent folders are ignored.
        file_name = os.path.basename(file_path)
        court_name = file_name.split("-")[0]

        # header=4 marks the fifth sheet row as the header row; the labels
        # found there are replaced by column_names.
        df = pd.read_excel(file_path, header=4, names=column_names)

        # Drop the first column, then tag every row with the court name.
        df = df.drop(columns=df.columns[0]).assign(court_name=court_name)

        # Put court_name first and keep the other columns in their order.
        ordered = ["court_name"] + [col for col in df.columns if col != "court_name"]
        return df[ordered]
    except Exception as e:
        # Best effort: a single unreadable workbook must not abort a batch load.
        logging.error(f"Error reading file {file_path}: {e}")
        return None


In [6]:
def read_excel_files(folder_path, column_names):
    """
    Read every ``.xls`` / ``.xlsx`` file in a folder and stack them into one frame.

    Each file is loaded with ``read_excel_file`` in a pool of worker processes.
    Files that could not be read (``None``) or that produced an empty frame
    are skipped.

    Args:
        folder_path (str): Folder to scan (not recursive).
        column_names (List[str]): Column labels passed on to ``read_excel_file``.

    Returns:
        pd.DataFrame: All readable files concatenated with a fresh 0..n-1 index.

    Raises:
        ValueError: If the folder contains no Excel files, or if none of them
            yields a non-empty DataFrame.
    """
    # Sorted because os.listdir returns entries in arbitrary order; sorting
    # keeps the row order of the combined frame the same from run to run.
    file_paths = sorted(
        os.path.join(folder_path, filename)
        for filename in os.listdir(folder_path)
        if filename.endswith((".xls", ".xlsx"))
    )

    if not file_paths:
        raise ValueError("No Excel files found in the specified folder.")

    # The context manager releases the worker processes even if starmap raises.
    with Pool() as pool:
        data_frames = pool.starmap(
            read_excel_file, [(file_path, column_names) for file_path in file_paths]
        )

    data_frames = [df for df in data_frames if df is not None and not df.empty]

    if not data_frames:
        raise ValueError("Unable to read any Excel files.")

    return pd.concat(data_frames, ignore_index=True)

In [7]:
# Load every Excel workbook found in hc_file_path into one combined DataFrame.
combined_df = read_excel_files(hc_file_path, column_names)

In [8]:
# In every float64 column, replace missing values with 0 and store the column as int.
float_columns = combined_df.select_dtypes(include='float64').columns
combined_df[float_columns] = (
    combined_df.loc[:, float_columns]
    .fillna(value=0)
    .astype(int)
)

In [11]:
# Substitutions applied to a raw court label (substring -> replacement).
# NOTE(review): another cell of this notebook also defines `name_map` and
# `map_names`; whichever of the two cells runs last is the one in effect.
name_map = {'_Tribunal': ''}

def map_names(name):
    """
    Apply every `name_map` substitution to `name` and return the first
    whitespace-separated word of the result.

    Raises IndexError if nothing but whitespace is left after the substitutions.
    """
    cleaned = name
    for pattern in name_map:
        cleaned = cleaned.replace(pattern, name_map[pattern])
    return cleaned.split(maxsplit=1)[0]

In [10]:
# Substitutions applied to a raw court label (substring -> replacement).
# NOTE(review): another cell of this notebook also defines `name_map` and
# `map_names`; whichever of the two cells runs last is the one in effect.
name_map = {'_High Court Div': '', '_High Court Civil': '', '_High Court Criminal': ''}

def map_names(name):
    """
    Apply every `name_map` substitution to `name` and return the first
    whitespace-separated word of the result.

    Raises IndexError if nothing but whitespace is left after the substitutions.
    """
    cleaned = name
    for pattern in name_map:
        cleaned = cleaned.replace(pattern, name_map[pattern])
    return cleaned.split(maxsplit=1)[0]

In [27]:
# Keep only the text that precedes the first 'Tribunal' in each court label.
# No visible cell creates the 'court' column, so on a fresh kernel fall back to
# the 'court_name' column that read_excel_file adds to every frame.
# .str[0] (instead of expand=True)[0]) also works when the frame has no rows.
combined_df['court'] = (
    combined_df['court'] if 'court' in combined_df.columns else combined_df['court_name']
).str.split('Tribunal', n=1).str[0]

In [30]:
def apply_title_case(text, column):
    """
    Convert text to title case. Handle non-string values and nulls.

    Args:
        text: The value from the DataFrame to be processed.
        column (str): The name of the column (for logging purposes).

    Returns:
        NaN if ``text`` is missing according to ``pd.isna``; ``str(text)`` (after
        logging a warning) if it is a non-missing non-string; otherwise ``text``
        in title case.
    """
    if pd.isna(text):
        # Built-in float NaN: same value pandas uses for missing data, and it
        # needs no numpy import (the notebook never imports numpy as `np`).
        return float("nan")
    if not isinstance(text, str):
        logger.warning(f"Non-string value encountered in '{column}': {text}")
        return str(text)
    return text.title()

def convert_to_title_case(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """
    Title-case one column of a DataFrame using ``apply_title_case``.

    The column is overwritten on ``df`` itself; the returned frame is the same
    object that was passed in, not a copy.

    Args:
        df (pd.DataFrame): The input DataFrame (modified in place).
        column (str): The name of the column to process.

    Returns:
        pd.DataFrame: ``df`` with the column processed, or ``df`` unchanged
        (after logging an error) if the column does not exist.
    """
    if column not in df.columns:
        logger.error(f"'{column}' column not found in the DataFrame")
        return df

    # Apply the title case function to every value of the column.
    df[column] = df[column].apply(lambda x: apply_title_case(x, column))

    return df

In [31]:
# Title-case the 'court' column. convert_to_title_case writes the result onto
# the frame it receives and returns that same object, so `df` is a second name
# for combined_df rather than an independent copy.
df = convert_to_title_case(combined_df, 'court')

In [32]:
# Number of rows per distinct 'court' value, shown as a (court, count) table.
combined_df.groupby('court').size().reset_index(name='count')

Unnamed: 0,court,count
0,Business Premises Rent,27788
1,Capital Markets,127
2,Co,8788
3,Communications And Multimedia Appeals,21
4,Competition,82
5,Copyright,9
6,Education Appeals,1
7,Energy And Petroleum Tribunal_Energy And Petro...,213
8,Hiv,939
9,Industrial Property,101


In [12]:
# For rows whose court is exactly 'Milimani', replace court with that row's
# court_name value (values are matched to rows by index label).
# NOTE(review): requires both 'court' and 'court_name' to exist at this point.
combined_df.loc[combined_df['court'] == 'Milimani', 'court'] = combined_df['court_name']

In [13]:
# Remove the literal text "High Court_High Court" (any letter case) from court.
combined_df['court'] = combined_df['court'].str.replace("High Court_High Court", "", case=False, regex=False)
# Collapse every run of whitespace in court into a single space.
combined_df['court'] = combined_df['court'].str.replace(r'\s+', ' ', regex=True)
# Drop court_name, then move whichever column is now last to the first position.
# NOTE(review): presumably the last column is 'court' - confirm. The drop is in
# place, so it also affects any other name bound to this frame, and re-running
# the cell raises KeyError because court_name is already gone.
combined_df.drop(columns=['court_name'], inplace=True)
combined_df = combined_df[[combined_df.columns[-1]] + list(combined_df.columns[:-1])]


In [14]:
# Drop every row whose caseid_type contains the literal text "MED".
# na=False counts a missing caseid_type as "no match", which keeps the mask
# purely boolean; without it a NaN in the mask makes the `~` negation fail.
# regex=False matches "MED" as plain text.
combined_df = combined_df[~combined_df['caseid_type'].str.contains("MED", na=False, regex=False)]

In [17]:
# Save the dataframe to a new CSV file inside hc_file_path (row index omitted).
# os.path.join avoids the doubled separator the f-string produced when
# hc_file_path already ends with "/".
# NOTE(review): the file name says Vihiga High Court while hc_file_path points
# at the tribunal folder - confirm this is the intended output name.
combined_df.to_csv(os.path.join(hc_file_path, 'Vihiga_High_Court.csv'), index=False)

In [16]:
# Number of rows per distinct 'court' value, shown as a (court, count) table.
combined_df.groupby('court').size().reset_index(name='count')

Unnamed: 0,court,count
0,Vihiga_High_Court_,2532


In [33]:
# Write `df` (the frame returned by convert_to_title_case) to tribunal.csv
# inside hc_file_path, without the row index. os.path.join avoids the doubled
# separator the f-string produced when hc_file_path already ends with "/".
df.to_csv(os.path.join(hc_file_path, 'tribunal.csv'), index=False)