In [None]:
import pandas as pd
import numpy as np
from typing import Dict, Any, List, Union, Optional
import logging

In [None]:
# Base directory for this analysis; the report output paths further down are built from it.
# NOTE(review): absolute, machine-specific path — adjust before running on another machine.
file_path = '/home/fiend/Documents/coa/'

In [None]:
# Configure the root logger: INFO and above, each record prefixed with timestamp and level.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger used by several of the helper functions defined below.
logger = logging.getLogger(__name__)
# pd.set_option('display.max_rows', None)
# Show up to 45 columns when a wide DataFrame is displayed.
pd.set_option('display.max_columns', 45)

# Data Cleaning

In [None]:
def drop_nan_columns(df: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
    """
    Return a copy of ``df`` without the rows that have NaN in any of ``columns``.

    A per-column NaN count is logged for every affected column, followed by the
    total number of rows removed. The input frame itself is left untouched.

    Args:
        df (pd.DataFrame): Frame to filter.
        columns (List[str]): Columns that must be non-null for a row to be kept.

    Returns:
        pd.DataFrame: The rows of ``df`` with no NaN in ``columns``.

    Raises:
        ValueError: If one or more of ``columns`` is not a column of ``df``.
    """
    # Fail early on unknown column names.
    absent = set(columns) - set(df.columns)
    if absent:
        raise ValueError(f"Columns not found in DataFrame: {', '.join(absent)}")

    # Which of the requested columns actually contain NaN, in the order requested.
    has_nan = df[columns].isna().any()
    affected = has_nan[has_nan].index.tolist()

    if affected:
        nan_count = df[columns].isna().sum()
        logger.info("Dropping rows with NaN values:")
        for col in affected:
            logger.info(f"  {col}: {nan_count[col]} rows")

    rows_before = len(df)
    df_cleaned = df.dropna(subset=columns)
    rows_removed = rows_before - len(df_cleaned)

    if rows_removed > 0:
        logger.info(f"Total rows dropped: {rows_removed}")
    else:
        logger.info("No rows were dropped.")

    return df_cleaned

In [None]:
def remove_duplicates(data: pd.DataFrame) -> pd.DataFrame:
    """
    De-duplicate a case-activity frame in two passes.

    Pass 1 removes rows that are identical in every column (the first copy is
    kept and the index is reset). Pass 2 looks at the activity key columns only
    and removes every row that shares its key with another row.

    Args:
        data (pd.DataFrame): Input frame; must contain the key columns listed below.

    Returns:
        pd.DataFrame: The frame after both passes.
    """
    key_columns = ['court', 'date_dd', 'date_mon', 'date_yyyy', 'caseid_type', 'caseid_no',
                   'filed_dd', 'filed_mon', 'filed_yyyy', 'comingfor', 'outcome']
    # Renders as 'court', 'date_dd', ... for the log messages.
    key_label = ", ".join(f"'{name}'" for name in key_columns)

    # Pass 1: exact duplicates across all columns.
    exact_count = data.duplicated().sum()
    if exact_count > 0:
        logging.info(f"{exact_count} duplicates found.")
        data = data.drop_duplicates(keep="first").reset_index(drop=True)
        logging.info(f"{exact_count} duplicates removed.")
    else:
        logging.info("No duplicates found.")

    # Pass 2: duplicates on the key columns.
    # NOTE(review): keep=False removes every row of a duplicated group, including
    # its first occurrence, and the index is not reset afterwards — confirm that
    # dropping all copies (rather than keep='first') is intended.
    key_count = data.duplicated(subset=key_columns, keep=False).sum()
    if key_count > 0:
        logging.info(f"{key_count} duplicates found on {key_label}.")
        data = data.drop_duplicates(subset=key_columns, keep=False)
        logging.info(f"{key_count} duplicates removed on {key_label}.")
    else:
        logging.info(f"No duplicates found on {key_label}.")

    return data

In [None]:
def create_date_column(df: pd.DataFrame, column_names: List[str], new_col: str) -> pd.DataFrame:
    """
    Build a datetime column from three date-part columns.

    The three parts are joined with '-' in the order given and the joined text
    is parsed with ``pd.to_datetime(..., errors='coerce')``. Any ordering pandas
    can parse therefore works, e.g. ``[year, month, day]`` or
    ``[day, month, year]``. The first and third columns are cast to whole
    numbers (so ``2023.0`` becomes ``2023``); the middle column is used as-is.

    Rows where any of the three parts is missing get ``NaT`` in the new column
    instead of aborting the whole conversion.

    Args:
        df (pd.DataFrame): The DataFrame containing the date-part columns.
        column_names (List[str]): Exactly three column names, in the order in
            which they should be joined.
        new_col (str): The name of the new date column to be created.

    Returns:
        pd.DataFrame: A copy of ``df`` with ``new_col`` added and the first and
        third part columns converted to integers (nullable ``Int64`` when the
        column contains missing values).

    Raises:
        ValueError: If ``column_names`` does not contain exactly three names, if
            any of them is missing from ``df``, or if a first/third part value
            cannot be converted to a number.
    """
    if len(column_names) != 3:
        raise ValueError("column_names must contain exactly three elements: [year, month, day]")

    first_col, middle_col, last_col = column_names

    # Check if all required columns exist in the DataFrame
    missing_columns = set(column_names) - set(df.columns)
    if missing_columns:
        raise ValueError(f"Missing columns in DataFrame: {', '.join(sorted(missing_columns))}")

    # Work on a copy so the caller's frame is never modified
    df = df.copy()

    def _to_whole_number(values: pd.Series) -> pd.Series:
        """Cast a numeric-like column to integers, keeping missing values missing."""
        as_float = values.astype(float)
        if as_float.isna().any():
            # Plain int64 cannot hold NaN; the nullable Int64 dtype can.
            return np.trunc(as_float).astype('Int64')
        return as_float.astype(int)

    try:
        df[first_col] = _to_whole_number(df[first_col])
        df[last_col] = _to_whole_number(df[last_col])

        # Rows with any missing part cannot form a date
        incomplete = df[[first_col, middle_col, last_col]].isna().any(axis=1)

        date_text = (df[first_col].astype(str) + '-' +
                     df[middle_col].astype(str) + '-' +
                     df[last_col].astype(str))

        # Unparseable text becomes NaT rather than raising
        df[new_col] = pd.to_datetime(date_text, errors='coerce')
        df.loc[incomplete, new_col] = pd.NaT

        # Log information about the conversion
        valid_dates = df[new_col].notna().sum()
        logger.info(f"Created new date column '{new_col}'. Valid dates: {valid_dates}/{len(df)}")

    except Exception as e:
        logger.error(f"Error creating date column: {str(e)}")
        raise

    return df

### Remove leading and trailing spaces, then apply title case to all words

In [None]:
def apply_title_case(text):
    """
    Title-case a single value.

    Args:
        text: The value to convert; usually a string.

    Returns:
        ``text.title()`` for strings, ``np.nan`` for missing values, and
        ``str(text)`` (after logging a warning) for any other value.
    """
    # Strings are the common case, so handle them first.
    if isinstance(text, str):
        return text.title()
    # Missing values stay missing.
    if pd.isna(text):
        return np.nan
    # Anything else is converted to text as-is (no title-casing) and reported.
    logger.warning(f"Non-string value encountered: {text}")
    return str(text)

In [None]:
def process_outcome_column(df: pd.DataFrame) -> pd.DataFrame:
    """
    Title-case every value of the 'outcome' column, in place.

    If the frame has no 'outcome' column an error is logged and the frame is
    returned unchanged. After the conversion two sanity checks are logged as
    warnings: an increase in null values, and any remaining non-string values.

    Args:
        df (pd.DataFrame): Frame whose 'outcome' column is rewritten.

    Returns:
        pd.DataFrame: The same frame object that was passed in.
    """
    if 'outcome' not in df.columns:
        logger.error("'outcome' column not found in the DataFrame")
        return df

    nulls_before = df['outcome'].isnull().sum()
    df['outcome'] = df['outcome'].apply(apply_title_case)
    nulls_after = df['outcome'].isnull().sum()

    if nulls_after > nulls_before:
        logger.warning(f"Number of null values in 'outcome' increased from {nulls_before} to {nulls_after}")

    # Count values that are neither missing nor strings.
    def _is_stray(value):
        return not isinstance(value, str) if pd.notna(value) else False

    stray_count = df['outcome'].apply(_is_stray).sum()
    if stray_count > 0:
        logger.warning(f"Found {stray_count} non-string values in 'outcome' after processing")

    return df

In [None]:
def drop_null_values(df: pd.DataFrame, column_name: str = 'outcome') -> pd.DataFrame:
    """
    Return ``df`` without the rows whose ``column_name`` value is null.

    The number of removed rows is logged at INFO level.

    Args:
        df (pd.DataFrame): Frame to filter; it is not modified.
        column_name (str): Column that must be non-null. Defaults to 'outcome'.

    Returns:
        pd.DataFrame: The rows of ``df`` where ``column_name`` is not null.
    """
    cleaned_df = df.dropna(subset=[column_name])
    removed = df.shape[0] - cleaned_df.shape[0]

    logging.info(f"Total dropped rows with null values in '{column_name}': {removed}")

    return cleaned_df

# Data Analysis

In [None]:
def categorize_case(case_type: str, criminal_cases: Optional[List[str]]) -> str:
    """
    Label a single case type as 'Criminal' or 'Civil'.

    Args:
        case_type (str): The case type to classify.
        criminal_cases (Optional[List[str]]): Case types that count as criminal.
            When None, every case type is labelled 'Criminal'.

    Returns:
        str: 'Civil' only when a list is given and ``case_type`` is not in it;
        'Criminal' otherwise.
    """
    if criminal_cases is not None and case_type not in criminal_cases:
        return 'Civil'
    return 'Criminal'

In [None]:

def categorize_cases(df: pd.DataFrame, criminal_cases: Optional[List[str]] = None) -> pd.DataFrame:
    """
    Add a 'nature' column labelling every row as 'Criminal' or 'Civil'.

    The label is derived from the 'case_type' column via ``categorize_case``.
    The column is written onto the frame that was passed in. A warning is
    logged for each of the two labels that does not occur in the result.

    Args:
        df (pd.DataFrame): Frame with a 'case_type' column.
        criminal_cases (Optional[List[str]]): Case types that count as criminal.
            If None, all cases are labelled 'Criminal'.

    Returns:
        pd.DataFrame: The same frame, now carrying the 'nature' column.
    """
    def _nature_of(case_type):
        return categorize_case(case_type, criminal_cases)

    df['nature'] = df['case_type'].apply(_nature_of)

    # Warn about whichever label is absent from the result.
    absence_warnings = (
        ('Criminal', "No criminal cases found in the DataFrame."),
        ('Civil', "No civil cases found in the DataFrame."),
    )
    for label, message in absence_warnings:
        if label not in df['nature'].values:
            logging.warning(message)

    return df

### Create case types using the long format

In [None]:
def apply_dict(value: Any, dictionary: Dict[str, Union[str, List[Any]]]) -> Union[str, List[str], None]:
    """
    Reverse-look-up ``value`` in a dictionary of categories.

    A key matches when its dictionary value is a string equal to ``value`` or a
    list that contains ``value``. Dictionary values of any other type never match.

    Args:
        value: The value to search for.
        dictionary: Mapping of category key to a single string or a list of members.

    Returns:
        None when no key matches, the key itself when exactly one key matches,
        and a list of keys (in dictionary order) when several keys match.
    """
    matching_keys = [
        key
        for key, members in dictionary.items()
        if (isinstance(members, str) and members == value)
        or (isinstance(members, list) and value in members)
    ]

    if not matching_keys:
        return None
    if len(matching_keys) == 1:
        # A single hit is unwrapped so the common case yields a plain string.
        return matching_keys[0]
    return matching_keys

## Add Case number as a unique identifier

In [None]:
def generate_case_num(df: pd.DataFrame, court_col: str, caseid_type_col: str, caseid_no_col: str, filed_date, new_col='case_number') -> pd.DataFrame:
    """
    Add a case-number column built as ``court/caseid_type/caseid_no/filed_date``.

    Rows with a null in any of the four source columns cannot form a case
    number; they are dropped (with a warning) before the column is built.
    Each part is converted to text with ``astype(str)``, so numeric case ids
    are accepted as well as strings.

    The frame is modified in place (rows dropped, column added) and also returned.

    Args:
        df (pd.DataFrame): The dataframe containing the necessary columns.
        court_col (str): The name of the column containing court information.
        caseid_type_col (str): The name of the column containing case ID type.
        caseid_no_col (str): The name of the column containing case ID number.
        filed_date (str): The name of the column containing the filing date.
        new_col (str): The name of the new column to be created for the case
            number. Default is 'case_number'.

    Returns:
        pd.DataFrame: DataFrame with the new case number column.
    """
    part_columns = [court_col, caseid_type_col, caseid_no_col, filed_date]

    # Check the column VALUES for nulls (the column names themselves are never null).
    if df[part_columns].isna().any(axis=1).any():
        logging.warning("One or more columns are null and will be dropped.")
        df.dropna(subset=part_columns, inplace=True)

    df[new_col] = (
        df[court_col].astype(str) + '/'
        + df[caseid_type_col].astype(str) + '/'
        + df[caseid_no_col].astype(str) + '/'
        + df[filed_date].astype(str)
    )
    return df

#### Filed cases 

In [None]:

def is_concluded(outcome: str, resolved_outcomes: List[str]) -> int:
    """
    Flag whether an outcome counts as resolved.

    Args:
        outcome (str): The outcome of the case.
        resolved_outcomes (List[str]): Outcomes that count as resolved.

    Returns:
        int: 1 when ``outcome`` is one of ``resolved_outcomes``, else 0.
    """
    return int(outcome in resolved_outcomes)

In [None]:
def is_case_registered(outcome: str, activity_date: Union[pd.Timestamp, str], filed_date: Union[pd.Timestamp, str]) -> bool:
    """
    Decide whether an activity row is the registration of its case.

    A row counts as a registration when its outcome contains
    'case registered/filed' (ignoring case and surrounding whitespace) and its
    activity date equals its filing date. String dates are parsed first;
    missing or unparseable dates never match.

    Any exception raised along the way (for example a non-string outcome) is
    logged as an error and treated as "not registered".

    Args:
        outcome (str): The outcome of the case.
        activity_date (Union[pd.Timestamp, str]): The date of the activity.
        filed_date (Union[pd.Timestamp, str]): The date the case was filed.

    Returns:
        bool: True if the row is a registration, False otherwise.
    """
    try:
        outcome_matches = 'case registered/filed' in outcome.strip().lower()

        # Parse string dates; values of any other type are used as given.
        activity_date, filed_date = (
            pd.to_datetime(value, errors='coerce') if isinstance(value, str) else value
            for value in (activity_date, filed_date)
        )

        same_day = pd.notna(activity_date) and pd.notna(filed_date) and activity_date == filed_date

        verdict = outcome_matches and same_day
        if verdict:
            logger.debug(f"Case registered: outcome='{outcome}', activity_date={activity_date}, filed_date={filed_date}")

        return verdict

    except Exception as e:
        logger.error(f"Error in is_case_registered: {e}")
        return False


In [None]:
# Function to apply is_case_registered to the DataFrame
def process_case_status(df: pd.DataFrame, resolved_outcome: List[str]) -> pd.DataFrame:
    """
    Add the 'concluded' and 'registered' flag columns to a case-activity frame.

    'concluded' comes from ``is_concluded`` applied to each outcome;
    'registered' comes from ``is_case_registered`` applied to each row's
    outcome, activity date and filing date. Both columns are written onto the
    frame that was passed in, and summary counts are logged.

    Args:
        df (pd.DataFrame): Frame with 'outcome', 'activity_date' and 'filed_date' columns.
        resolved_outcome (List[str]): Outcomes that count as resolved.

    Returns:
        pd.DataFrame: The same frame with the two flag columns added.

    Raises:
        ValueError: If any of the three required columns is missing.
    """
    absent = set(('outcome', 'activity_date', 'filed_date')) - set(df.columns)
    if absent:
        raise ValueError(f"Missing required columns: {', '.join(absent)}")

    def _concluded_flag(outcome):
        return is_concluded(outcome, resolved_outcome)

    def _registered_flag(row):
        return is_case_registered(row['outcome'], row['activity_date'], row['filed_date'])

    df['concluded'] = df['outcome'].apply(_concluded_flag)
    df['registered'] = df.apply(_registered_flag, axis=1)

    logger.info(f"Processed {len(df)} cases")
    logger.info(f"Concluded cases: {df['concluded'].sum()}")
    logger.info(f"Registered cases: {df['registered'].sum()}")

    return df

#### Total filed/resolved cases per court by case_type

In [None]:
def analyze_court_outcomes(df: pd.DataFrame, start_date: str, end_date: str, outcome: str) -> pd.DataFrame:
    """
    Tabulate flagged cases per court and case type inside a date window.

    Rows are kept when their 'activity_date' lies between ``start_date`` and
    ``end_date`` (both inclusive) and the ``outcome`` flag column equals 1.
    The kept rows are counted per (court, case_type) and pivoted into a
    court-by-case-type table, with 0 where a combination does not occur.

    Any error is logged and then re-raised.

    Args:
        df (pd.DataFrame): Frame with 'court', 'case_type', 'activity_date' and the flag column.
        start_date (str): Start of the window; anything ``pd.to_datetime`` accepts.
        end_date (str): End of the window; anything ``pd.to_datetime`` accepts.
        outcome (str): Name of the 0/1 flag column to count.

    Returns:
        pd.DataFrame: Counts indexed by court with one column per case type.

    Raises:
        ValueError: If ``start_date`` is later than ``end_date``.
        KeyError: If a required column is missing from ``df``.
    """
    try:
        window_open = pd.to_datetime(start_date)
        window_close = pd.to_datetime(end_date)
        if window_open > window_close:
            raise ValueError("start_date must be earlier than end_date")

        absent = {'court', 'case_type', 'activity_date', outcome} - set(df.columns)
        if absent:
            raise KeyError(f"Missing required columns: {absent}")

        in_window = (df['activity_date'] >= window_open) & (df['activity_date'] <= window_close)
        flagged = df[outcome] == 1
        selected = df[in_window & flagged]

        if selected.empty:
            logging.warning("No cases found for the given date range and outcome.")

        counts = selected.groupby(['court', 'case_type']).size().reset_index(name='num_cases')

        # Each (court, case_type) pair occurs once in `counts`, so the pivot's
        # default aggregation simply carries the count through.
        table = counts.pivot_table(
            index='court',
            columns='case_type',
            values='num_cases',
            fill_value=0
        )

        logging.info("Successfully calculated case outcomes per court.")
        return table

    except Exception as e:
        logging.error(f"An error occurred: {e}")
        raise

In [None]:
def check_time_limit(age, case_category, concluded, time_lines):
    """
    Flag a case that was concluded within the time limit of its category.

    Categories that are not present in ``time_lines`` get a limit of 0 days.

    Parameters:
        age (int): The age of the case in days.
        case_category (str): The category of the case.
        concluded (int): 1 when the case is concluded, anything else otherwise.
        time_lines (dict): Time limit in days per case category.

    Returns:
        int: 1 if ``age`` does not exceed the limit and the case is concluded, otherwise 0.
    """
    limit_days = time_lines.get(case_category, 0)
    return int(bool(age <= limit_days and concluded == 1))


In [None]:
# Lookup tables used by the pipeline below.
#
# Outcome labels are compared against the 'outcome' column AFTER it has been
# title-cased (str.title) and stripped, so every label that is meant to match
# must be spelled the way it looks after that normalisation.

# Outcomes that decide a case on its merits.
MERIT_OUTCOMES = [
    'Ruling Delivered- Case Closed',
    'Judgment Delivered- Case Closed',
    'Judgment Delivered',
    'Judgment Delivered- Acquittal',
    'Judgment Delivered- Convicted',
    'Grant Revoked',
    'Ruling Delivered- Accused Discharged',
    'Retrial'
]

# Productivity category -> outcomes that belong to it (reverse-looked-up with apply_dict).
MERIT_CATEGORY = {
    'Judgment Delivered': [
        'Judgment Delivered- Case Closed',
        'Judgment Delivered',
        'Judgment Delivered- Acquittal',
        'Judgment Delivered- Convicted',
        'Grant Revoked',
        'Retrial'
        ],
    'Ruling Case Closed': [
        'Ruling Delivered- Case Closed',
        'Ruling Delivered- Accused Discharged',
        ],
    'Final Grant': [
        'Grant Confirmed',
        'Limited Grant Issued',
        ],
    'Case Withdrawn': [
        'Matter Withdrawn',
        'Application Withdrawn - Case Closed',
        ],
   'Out Of Court Settlement': [
        'Consent Recorded - Case Closed',
        'Matter Settled Through Mediation',
        'Out Of Court Settlement Reached',
    ],
    'Dismissed':[
        'Dismissed For Want Of Prosecution - Case Closed',
        'Dismissed',
        'Appeal Dismissed',
        'Terminated',
        # Long form of 'Terminated', as it appears once whitespace is stripped.
        'Terminated/ Struck Out/ Dismissed/Case Closed',
    ],
    'Case Transfered': [
        'File Transfered -case Closed',
        # str.title() capitalises the letter after '-', so this is the form
        # the title-cased outcome column actually contains.
        'File Transfered -Case Closed',
        'File Transferred',
    ],
    'Case Closed': [
        'Struck Out',
        'Application Dismissed - Case Closed',
        'Application Allowed - Case Closed',
        'Matter Settled- Case Closed',
        'Ruling Delivered- Application Closed',
        'Consolidated- Case Closed',
        'Abated',
        'Placed In Probation',
        'Revision Declined',
        'Probation Orders Issued',
        'Appeal Rejected',
        'Interlocutory Judgement Entered',
    ],
}

# Case types treated as criminal; everything else is labelled civil.
CRIMINAL_CASES = [
    'Murder Case',
    'Criminal Revision',
    'Criminal Appeal',
    'Murder - Gender Justice Criminal Case',
    'Criminal Court Martial Appeal',
    'Anti-Corruption and Economic Crimes Revision',
    'Criminal Miscellaneous Application',
    'Criminal Applications',
    'COA Criminal Appeal',
    # The time-line table and the Kisumu filter in this notebook spell this
    # case type with two spaces; without this entry those rows are labelled Civil.
    'COA  Criminal Appeal',
]

# Outcomes that mark a case as concluded.
RESOLVED_OUTCOMES = [
    'Ruling Delivered- Case Closed', 'Grant Confirmed', 'Matter Withdrawn',
    'Dismissed For Want Of Prosecution - Case Closed', 'Dismissed',
    'Terminated', 'Judgment Delivered- Case Closed',
    'Application Allowed - Case Closed',
    'Matter Settled- Case Closed', 'Consent Recorded - Case Closed',
    'Judgment Delivered', 'Judgment Delivered- Acquittal',
    'Judgment Delivered- Convicted', 'Application Withdrawn - Case Closed',
    'Struck Out', 'Application Dismissed - Case Closed',
    'Out Of Court Settlement Reached',
    'Ruling Delivered- Application Closed', 'Consolidated- Case Closed',
    'Interlocutory Judgement Entered', 'Abated', 'Limited Grant Issued',
    'Grant Revoked', 'Placed In Probation', 'Ruling Delivered- Accused Discharged',
    'Revision Declined', 'Retrial', 'Probation Orders Issued',
    'Matter Settled Through Mediation', 'Appeal Dismissed', 'Appeal Rejected',
    'Order Issued - Case Closed',
    # Kept in both spellings: with the trailing space as originally listed, and
    # without it, which is what remains after the outcome column is stripped.
    'Terminated/ Struck Out/ Dismissed/Case Closed ',
    'Terminated/ Struck Out/ Dismissed/Case Closed',
]

# Time limit in days per case type, used by check_time_limit.
COA_PMMU_TIME_LINES = {
    'COA  Criminal Appeal': 360,
    'Civil Appeal': 360,
    'Civil Applications': 90,
    'Criminal Applications': 90,
}

# 'comingfor' values that are never counted as adjournable events.
NON_ADJOURNABLE = [
    'Taxation and Issuance of Certificates',
    'Orders',
    'Appointments of  Mediator',
    'Screening of files for Mediation',
    'Post-judgment',
    'Re-activation',
    'Reactivation',
    'Notice of Taxation',
    'Entering Interlocutory Judgments',
    'Approval by DR',
    'Registration/Filing-Application',
    'Registration/Filing',
 ]

# Outcomes that record a file transfer (both spellings of the first, see MERIT_CATEGORY).
TRANSFERED_OUTCOMES = ['File Transfered -case Closed', 'File Transfered -Case Closed', 'File Transferred']

In [None]:
# Load the raw case-activity export from the CoA folder under the base directory.
df = pd.read_csv(f'{file_path}CoA/coa_data-23-24.csv')

In [None]:
# Rows recorded under the Malindi court are reported under the Mombasa court.
court_aliases = {'Malindi Court of Appeal_Court of Appeal': 'Mombasa Court of Appeal_Court of Appeal'}
df['court_name'] = df['court_name'].replace(court_aliases)
# The rest of the notebook refers to this column as 'court'.
df = df.rename(columns={'court_name': 'court'})

In [None]:
# df.to_csv(f'{file_path}/q2-24-25-coa.csv', index=False)

In [None]:
# Build the activity and filing dates from their separate date-part columns.
# create_date_column joins the parts with '-' in the order given and returns a
# new frame, so no extra copy is needed here.
df = create_date_column(df, ['date_dd', 'date_mon', 'date_yyyy'], 'activity_date')
df = create_date_column(df, ['filed_dd', 'filed_mon', 'filed_yyyy'], 'filed_date')

# Reporting window.
start_date = pd.to_datetime('2023-07-01')
cutoff_date = pd.to_datetime('2024-06-30')

# Keep activity up to the cutoff; rows whose activity_date could not be parsed
# (NaT) fail this comparison and are dropped as well.
df = df[df['activity_date'] <= cutoff_date]

df = drop_nan_columns(df, ['date_dd', 'date_mon', 'date_yyyy', 'caseid_type', 'caseid_no',
       'filed_dd', 'filed_mon', 'filed_yyyy', 'case_type', 'comingfor'])

df = remove_duplicates(df)

# Apply title case to the outcomes column
df = process_outcome_column(df)

# Drop rows with a missing outcome
df = drop_null_values(df)

# Add broad case category of civil and criminal
df = categorize_cases(df, CRIMINAL_CASES)

# Add case number to the data
df = generate_case_num(df, 'court', 'caseid_type', 'caseid_no', 'filed_date')
df = df.sort_values(by=['activity_date', 'case_number'])

# Remove surrounding whitespace, then fold the long "terminated" label into 'Terminated'.
# The label is matched WITHOUT its trailing space: after .str.strip() no value
# can end in a space, so matching the unstripped spelling would never rename anything.
df['outcome'] = df['outcome'].str.strip()
df['outcome'] = df['outcome'].replace('Terminated/ Struck Out/ Dismissed/Case Closed', 'Terminated')

# Add filed and resolved outcomes
df = process_case_status(df, RESOLVED_OUTCOMES)

# Map each outcome to its productivity category (None when it belongs to none)
df['productivity'] = df['outcome'].apply(lambda x: apply_dict(x, MERIT_CATEGORY))

# Age of the case in days at the time of the activity
df['age'] = (df['activity_date'] - df['filed_date']).dt.days

# 1 when the case was concluded within the time limit of its case type
df['time_lines'] = df.apply(lambda row: check_time_limit(row['age'], row['case_type'], row['concluded'], COA_PMMU_TIME_LINES), axis=1)


# remove leading and trailing spaces on comingfor
df['comingfor'] = df['comingfor'].str.strip()

# An event is adjournable when its 'comingfor' value is not in NON_ADJOURNABLE.
is_adjournable = df['comingfor'].apply(lambda x: x not in NON_ADJOURNABLE)
# An event was adjourned when it is adjournable and an adjournment reason is recorded.
df['adjourned'] = (df['reason_adj'].notnull() & is_adjournable).astype(int)
df['adjournable'] = is_adjournable.astype(int)
# Extract the quarter from the activity_date column
df['quarter'] = df['activity_date'].dt.to_period('Q')

In [None]:
# Create data for output

# Registered / concluded cases per court and case type inside the reporting window.
filed_cases = analyze_court_outcomes(df, start_date, cutoff_date, 'registered')
resolved_cases = analyze_court_outcomes(df, start_date, cutoff_date, 'concluded')

# Monthly totals per court (grouped on the raw month column).
monthly_filed_cases = df.groupby(['court','date_mon']).agg({'registered':'sum'}).reset_index()
monthly_concluded_cases = df.groupby(['court','date_mon']).agg({'concluded':'sum'}).reset_index()

# Mean age in days of concluded cases, per court and case nature.
average_time_to_conclude = df.loc[df['concluded'] == 1].pivot_table(index='court', columns='nature', values='age', aggfunc='mean', fill_value=0).round(2)


# Share of concluded cases that met the time limit, per court and case type.
pmmu_timelines = df[df['time_lines'] == 1].pivot_table(index='court', columns='case_type', values='time_lines', aggfunc='count', fill_value=0)
total_concluded_per_court = df[df['concluded'] == 1].pivot_table(index='court', columns='case_type', values='time_lines', aggfunc='count', fill_value=0)
# Give the numerator the same courts and case types as the denominator. A court
# or case type with concluded cases but none inside the limit is otherwise
# absent from the numerator table and would come out as NaN instead of 0.
pmmu_timelines = pmmu_timelines.reindex(
    index=total_concluded_per_court.index,
    columns=total_concluded_per_court.columns,
    fill_value=0,
)
resolved_within_pmmu_timeline = pmmu_timelines / total_concluded_per_court

# Number of rows per court and productivity category.
court_productivity = df.pivot_table(index='court', columns='productivity', values='concluded', aggfunc='count', fill_value=0)
court_productivity = court_productivity.rename_axis(columns=None)
# judge_productivity = judge_df.pivot_table(index='judge_1', columns='productivity', values='concluded', aggfunc='sum', fill_value=0)
# judge_productivity = judge_productivity.rename_axis(columns=None)

# Adjournments per court and reason, and per-court totals.
adjourned_per_court = df.groupby(['court', 'reason_adj'])['adjourned'].sum().reset_index(name='count')
adjourned = df.groupby('court')['adjourned'].sum().reset_index(name='total_adjourned')
adjournable = df.groupby('court')['adjournable'].sum().reset_index(name='total_adjournable')
# Determine the rate of adjournments (adjourned as a percentage of adjournable events)
adjourn_proportion = pd.merge(adjourned, adjournable, on=['court'])
adjourn_proportion['adjourn_proportion'] = (adjourn_proportion['total_adjourned']/adjourn_proportion['total_adjournable'])*100

In [None]:
# Persist the fully processed frame; the index is written too because `index` is left at its default.
# NOTE(review): absolute, machine-specific path.
df.to_csv('/home/fiend/Documents/coa/CoA/processed-coa-23-24.csv')

In [None]:
# Quarterly number of Kisumu rows whose outcome is "Case Registered/Filed".
kisumu_filings = (df['court'] == 'Kisumu Court of Appeal_Court of Appeal') & (df['outcome'] == "Case Registered/Filed")
df.loc[kisumu_filings].resample('QE', on='activity_date')['case_type'].size()

In [None]:
# Concluded criminal appeals handled by the Kisumu court.
in_kisumu = df['court'] == 'Kisumu Court of Appeal_Court of Appeal'
is_criminal_appeal = df['case_type'] =='COA  Criminal Appeal'
was_concluded = df['concluded'] == 1
kisumu_appeals = df.loc[in_kisumu & is_criminal_appeal & was_concluded]
#.resample('QE', on='activity_date')['case_type'].size()

In [None]:
# Row count per case type among the selected Kisumu appeals.
kisumu_appeals.groupby('case_type')['case_type'].size()

In [None]:
# Export the selected Kisumu appeals (index included).
# NOTE(review): absolute, machine-specific path.
kisumu_appeals.to_csv('/home/fiend/Documents/coa/CoA/kisumu-coa-appeals.csv')

In [None]:
# All concluded Kisumu rows, regardless of case type.
kisumu_resolved_appeals = df[(df['court'] == 'Kisumu Court of Appeal_Court of Appeal') & (df['concluded'] == 1)]

In [None]:
# Concluded rows per court, largest first.
df.groupby('court')['concluded'].sum().sort_values(ascending=False).reset_index(name='count')

In [None]:
# Sum the adjourned flag per quarter of activity_date and label the two
# resulting columns for readability.
total_adjournments_per_quarter = (
    df.resample('QE', on='activity_date')['adjourned']
    .sum()
    .reset_index()
    .rename(columns={'activity_date': 'quarter', 'adjourned': 'total_adjournments'})
)

print(total_adjournments_per_quarter)

In [None]:
# Adjournable events per quarter of activity_date.
df.resample('QE', on='activity_date')['adjournable'].sum().reset_index()

In [None]:
# Adjourned events per quarter of activity_date (same computation as
# total_adjournments_per_quarter, shown without renamed columns).
df.resample('QE', on='activity_date')['adjourned'].sum().reset_index()

In [None]:
# Subset of rows for the Mombasa court.
mombasa = df[df['court'] == 'Mombasa Court of Appeal_Court of Appeal']

In [None]:
# Concluded Mombasa rows per case type, largest first.
mombasa.groupby('case_type')['concluded'].sum().sort_values(ascending=False).reset_index(name='count')

In [None]:
# Concluded Mombasa rows per case-id type and broad case type, largest first.
# NOTE(review): 'broad_case_type' is not created anywhere in this notebook —
# presumably it comes from the raw CSV; confirm the column exists.
mombasa.groupby(['caseid_type','broad_case_type'])['concluded'].sum().sort_values(ascending=False).reset_index(name='count')

In [None]:
# `mombasa` was obtained by filtering `df`; take an explicit copy before adding
# a column so the assignment does not write to a slice of another frame
# (pandas SettingWithCopyWarning).
mombasa = mombasa.copy()
# 1 for every row of a case that has at least one concluded row, else 0.
mombasa['max_resolved'] = mombasa.groupby('case_number')['concluded'].transform('max')

In [None]:
# All rows belonging to Mombasa cases that were concluded at some point.
mombasa_resolved = mombasa[mombasa['max_resolved'] == 1]

In [None]:
# Concluded rows per case number and outcome for the resolved Mombasa cases;
# counts above 1 reveal cases that were recorded as concluded more than once.
# Stored under its own name so that it does not overwrite `resolved_cases`,
# the court-level table that is later written to resolved_cases.csv.
mombasa_resolved_counts = mombasa_resolved.groupby(['case_number', 'outcome'])['concluded'].sum().sort_values(ascending=False).reset_index(name='count')

In [None]:
# Order the Mombasa rows chronologically, then by case number.
mombasa = mombasa.sort_values(by=['activity_date', 'case_number'])

In [None]:
#mombasa[mombasa['case_number']=='Mombasa Court of Appeal_Court of Appeal/COACAPPL/55/2020-10-20']

In [None]:
# Inspect the full activity history of one specific Mombasa case.
mombasa[mombasa['case_number']=='Mombasa Court of Appeal_Court of Appeal/COAEPA/E002/2023-03-28']

In [None]:
# Concluded Mombasa rows per outcome, largest first.
mombasa.groupby('outcome')['concluded'].sum().sort_values(ascending=False).reset_index(name='count')

In [None]:
# Concluded Mombasa rows per quarter of activity_date.
mombasa.groupby(pd.Grouper(key='activity_date', freq='QE'))['concluded'].sum()


In [None]:
# Adjournable events per court and month. The per-row flag on `df` is named
# 'adjournable'; 'total_adjournable' only exists in the per-court summary frame,
# so selecting it from `df` would raise a KeyError.
adjourned_court = df.groupby(['court', 'date_mon'])['adjournable'].sum().reset_index(name='count')

In [None]:
# Directory for the report CSVs, with a trailing separator so file names can be
# appended directly. Forward slashes are used because the base path is a POSIX
# path; backslashes would become part of the file name there instead of
# separating directories.
output_path = f"{file_path.rstrip('/')}/reports/"

In [None]:
# save data
# NOTE(review): column_order is defined but not applied to any of the tables below.
column_order = ['COA  Criminal Appeal', 'Criminal Applications', 'Civil Appeal', 'Civil Applications']

# Pivoted tables keep their index (the court); flat tables are written without it.
# output_path already ends with a separator, so file names are appended directly.
filed_cases.to_csv(f'{output_path}filed_cases.csv', index=True)
resolved_cases.to_csv(f'{output_path}resolved_cases.csv', index=True)
monthly_filed_cases.to_csv(f'{output_path}filed_per_month.csv', index=False)
monthly_concluded_cases.to_csv(f'{output_path}monthly_concluded.csv', index=False)
court_productivity.to_csv(f'{output_path}court_productivity.csv', index=True)
average_time_to_conclude.to_csv(f'{output_path}average_time_to_conclude.csv', index=True)
resolved_within_pmmu_timeline.to_csv(f'{output_path}resolved_within_pmmu_timeline.csv', index=True)
#judge_productivity.to_csv(f'{file_path}/reports/judge_productivity.csv', index=False)
adjourned_per_court.to_csv(f'{output_path}adjourn_per_court.csv', index=False)
adjourn_proportion.to_csv(f'{output_path}adjourn_proportion.csv', index=False)

In [None]:
# Write the processed row-level data next to the report tables, without the index.
df.to_csv(f'{file_path}/reports/coa_raw.csv', index=False)

### Determine Productivity per judge (Merit/Non Merit)

In [None]:
# df.groupby('judge_1')['concluded'].sum().sort_values(ascending=False).reset_index(name='count').to_csv(f'{file_path}/reports/judge_productivity.csv', index=False)

In [None]:
#judges = df['judge_1'].unique().tolist()
#judge_productivity = df.groupby('judge_1')['concluded'].sum().sort_values(ascending=False).reset_index(name='count')
#judge_productivity.to_csv(f'{output_path}/judge_productivity.csv', index=False)
#judge_df.groupby(['judge_1', 'court'])['court'].count()

In [None]:
# # productivity per court
# court_productivity = df.pivot_table(index='court', columns='productivity', values='concluded', aggfunc='sum', fill_value=0)
# court_pivot = court_productivity.rename_axis(columns=None)
# # matters handled by judge per court
# matters_handled = judge_df.groupby(['court','judge_1'])['court'].count().reset_index(name='count')
# # Resolved cases by judge per court
# judge_court_productivity = judge_df.groupby(['judge_1', 'court'])['concluded'].sum().reset_index(name='count')

## Pending Cases Analysis

In [None]:
unique_df_cases = merged_df[merged_df['resolved'] != 1]

In [None]:
unique_df = unique_df_cases.drop_duplicates(subset='case_number', keep='first')

In [None]:
unique_df = unique_df.reset_index(drop=True)

In [None]:
# Filter to get only the rows that are in pending_baseline but not in df
only_in_baseline = unique_df[unique_df['_merge'] == 'left_only']

In [None]:
only_in_baseline.groupby('court').size()

In [None]:
# NOTE(review): disabled code. Within this part of the notebook this is the only place
# `processed_df` is assigned, yet later cells still read it — confirm where it comes from.
# def preprocess_dataframe(df):
#     # Sort by 'court' and 'year_filed', both descending
#     df_sorted = df.sort_values(by=['court', 'year_filed'], ascending=False)
#
#     # Keep the first row per 'case_number' in the sorted order
#     df_unique_cases = df_sorted.drop_duplicates(subset='case_number', keep='first')
#
#     # Keep only rows whose 'resolved' flag is 0
#     unique_unresolved_cases = df_unique_cases[df_unique_cases['resolved'] == 0]
#
#     # Reset index
#     unique_unresolved_cases = unique_unresolved_cases.reset_index(drop=True)
#
#     return unique_unresolved_cases
# processed_df = preprocess_dataframe(merged_df)

In [None]:
unique_df.groupby('court')['court'].value_counts().sort_values(ascending=False)

In [None]:
processed_df.groupby('court')['court'].value_counts().sort_values(ascending=False)

In [None]:
# df = ['court', 'comingfor', 'outcome', 'activity_date', 'filed_date', 'activity_date_year', 'activity_date_month', 'nature', 'case_category', 'case_number']

In [None]:
# Unique case numbers on each side of the comparison.
pending_baseline_cases = set(pending_baseline['case_number'].unique())
df_cases = set(df['case_number'].unique())

# Cases that appear in df but are absent from pending_baseline.
cases_not_in_pending = df_cases - pending_baseline_cases

# Report how many there are. The original printed only a heading (naming a
# non-existent `df_a`) with nothing after it.
print(f"Cases present in df but not in pending_baseline: {len(cases_not_in_pending)}")

In [None]:
cases_not_in_pending = pd.DataFrame(cases_not_in_pending)

In [None]:
combined_df = pd.merge(df, pending_baseline, on='case_number', how='left', indicator=True)

In [None]:
combined_df = combined_df[combined_df['_merge'] == 'left_only'].drop(columns='_merge')

In [None]:
combined_df = combined_df.sort_values(by=['court', 'activity_date', 'case_number'], ascending=True)

In [None]:
# Convert df_a['case_number'] to a set
df_a_cases = set(pending_baseline['case_number'].unique())

In [None]:
filtered_df_closed = df[(df['concluded'] == 1) & (df['case_number'].isin(df_a_cases))]

In [None]:
filtered_df_closed

In [None]:
# Rows of df with concluded == 1 whose case_number is NOT in df_a_cases.
# .copy() makes the result an independent frame, so adding columns to it later
# does not raise pandas' SettingWithCopyWarning.
filtered_df = df[(df['concluded'] == 1) & ~(df['case_number'].isin(df_a_cases))].copy()

In [None]:
filtered_df.groupby('court')['case_number'].count().sort_values(ascending=False).reset_index(name='count')

In [None]:
# Add 'resolved' = the maximum of 'concluded' within each case_number group.
# assign() returns a new frame instead of writing a column into a filtered slice of df.
filtered_df = filtered_df.assign(
    resolved=filtered_df.groupby('case_number')['concluded'].transform('max')
)

In [None]:
  # Keep one row per 'case_number' — the first occurrence in the current row order
filtered_df = filtered_df.drop_duplicates(subset='case_number', keep='first')
    

In [None]:
# NOTE(review): the variable is named `nakuru`, but the filter selects the court
# 'Milimani Anti Corruption and Economic Crimes' — confirm which court is intended.
nakuru = filtered_df[filtered_df['court'] == 'Milimani Anti Corruption and Economic Crimes']

In [None]:
nakuru.to_csv(f'{output_path}/nakuru.csv', index=False)

In [None]:
pending_baseline.to_csv(f'{output_path}/pending_baseline_2.csv', index=False)

In [None]:
filtered_df.groupby('court')['case_number'].count().sort_values(ascending=False).reset_index(name='count')

In [None]:
pending_baseline[pending_baseline['case_number'] == 'Busia/HCCCMISC/E026/2023']

#### Backlog determination

In [None]:
#import pandas as pd
# raw data of pending cases(next period baseline)
#df = pd.read_csv(f'{output_path}/raw_pending_cases.csv')

In [None]:
# The date for computation of backlog ought to be the end of the quarter
quarter_end = pd.to_datetime('2024-03-31')
df['end_date'] = quarter_end

In [None]:

# Define a function to categorize ages
def categorize_age(age):
    """
    Map an age in days to a backlog bracket label.

    Args:
        age: Age in days (a number).

    Returns:
        str: '0-1 years' for age <= 365, '1-3 years' for age <= 3 * 365,
        otherwise 'Over 3 years'.
    """
    # Upper bounds are inclusive and checked in ascending order.
    brackets = (
        (365, '0-1 years'),
        (3 * 365, '1-3 years'),
    )
    for upper_bound, label in brackets:
        if age <= upper_bound:
            return label
    return 'Over 3 years'


In [None]:
# Age of each pending case in days as at the quarter end. `quarter_end` is used
# directly because the 'end_date' column is added to `df`, not to `processed_df`.
# pd.to_datetime leaves datetime columns unchanged and parses date strings.
filed_dates = pd.to_datetime(processed_df['filed_date'])
processed_df = processed_df.assign(pending_age=(quarter_end - filed_dates).dt.days)
# Keep only non-negative ages (rows with a missing age are dropped by the comparison);
# .copy() detaches the result so later column assignments do not hit a filtered slice.
processed_df = processed_df[processed_df['pending_age'] >= 0].copy()


In [None]:
# Label each pending case with its age bracket.
# assign() builds a new frame rather than writing a column into a filtered slice.
processed_df = processed_df.assign(age_group=processed_df['pending_age'].apply(categorize_age))
# Count cases per court (rows) and age bracket (columns); empty combinations become 0.
# NOTE(review): the count is taken over 'unique_number' — confirm that column exists in processed_df.
backlog_category = processed_df.pivot_table(index='court', columns='age_group', values='unique_number', aggfunc='count', fill_value=0)


# Save the data to csv

In [None]:
# Column selection/order passed as `columns=` to the filed and concluded CSV exports below.
# NOTE(review): the last label, 'Probate Administratio', looks truncated — confirm it
# matches the real column name, otherwise those exports will not find the column.
column_order = ['Murder', 'Criminal Appeal', 'Criminal Application', 
                'Criminal Revision', 'Civil Suit', 'Civil Appeal', 'Miscellaneous Application', 
                'Constitution Petition', 'Judicial Review',
                  'Bankruptcy and Insolvency', 'Tax Appeal', 
                  'Adoption', 'Divorce', 'Probate Administratio']
# Total missing per month
missing_per_month.to_csv(f'{output_path}/missing_outcomes_per_month.csv', index=False)

# Total filed cases
filed_cases.to_csv(f'{output_path}/filed_cases.csv', columns=column_order, index=True)

# Total concluded cases
concluded_cases.to_csv(f'{output_path}/concluded_cases.csv', columns=column_order, index=True)

# Total filed, concluded and CCR per month
monthly_cases.to_csv(f'{output_path}/monthly_cases.csv', index=False)

# Productivity per court
df_pivot.to_csv(f'{output_path}/productivity.csv', index=True)

# Average time to conclude
average_time_to_conclude.to_csv(f'{output_path}/average_time_to_conclude.csv', index=True)

# Time lines
proportion_resolved_within_timeline_per_court.to_csv(f'{output_path}/proportion_resolved_within_timeline.csv', index=True)

judge_pivot.to_csv(f'{output_path}/judge_productivity.csv', index=True)

judge_court_productivity.to_csv(f'{output_path}/judge_court_productivity.csv', index=False)

court_pivot.to_csv(f'{output_path}/court_productivity.csv', index=True)

matters_handled.to_csv(f'{output_path}/judge_matters_handled.csv', index=False)

adjourned_per_court.to_csv(f'{output_path}/adjourned_per_court.csv', index=False)

adjourn_proportion.to_csv(f'{output_path}/adjourn_proportion.csv', index=False)

pending_cases.to_csv(f'{output_path}/hc_pending_cases.csv', index=True)

backlog_category.to_csv(f'{output_path}/hc_backlog.csv', index=True)


# raw data of pending cases(next period baseline)
processed_df.to_csv(f'{output_path}/raw_pending_cases.csv', index=False)


In [None]:
proportion_resolved_within_timeline_per_court.to_csv(f'{output_path}/proportion_resolved_within_timeline.csv', index=True)

In [None]:
### COA Saving
# NOTE(review): this rebinds `output_path` to a hard-coded absolute directory with no
# trailing separator, so every write after this line goes there — confirm this is intended.
output_path = '/home/arch/devel/data/Report'
'''
column_order = ['murder', 'criminal_appeal', 'criminal_application', 
                'revision', 'suit', 'civil_appeal', 'misc_application', 
                'constitutional_petition', 'judicial_review',
                  'bankruptcy_and_insolvency', 'tax_appeal', 
                  'adoption', 'divorce', 'probate_and_admin']

'''
column_order  = ['COA Criminal Appeal', 'Criminal Applications', 'Civil Appeal', 'Civil Applications']
# Total missing per month
missing_per_month.to_csv(f'{output_path}/missing_outcomes_per_month.csv', index=False)

# Total filed cases
filed_cases.to_csv(f'{output_path}/filed_cases.csv', index=True)

# Total concluded cases
concluded_cases.to_csv(f'{output_path}/concluded_cases.csv', index=True)

# Total filed, concluded and CCR per month
monthly_cases.to_csv(f'{output_path}/monthly_cases.csv', index=False)

# Productivity per court
df_pivot.to_csv(f'{output_path}/productivity.csv', index=True)

# Average time to conclude
average_time_to_conclude.to_csv(f'{output_path}/average_time_to_conclude.csv', index=True)

# Time lines
proportion_resolved_within_timeline_per_court.to_csv(f'{output_path}/proportion_resolved_within_timeline.csv', index=True)


court_pivot.to_csv(f'{output_path}/court_productivity.csv', index=True)


adjourned_per_court.to_csv(f'{output_path}/adjourned_per_court.csv', index=False)

adjourn_proportion.to_csv(f'{output_path}/adjourn_proportion.csv', index=False)


In [None]:
proportion_resolved_within_timeline_per_court.to_csv(f'{output_path}/proportion_resolved_within_timeline.csv', index=True)

In [None]:
df.to_csv(f'{output_path}/coa_cases.csv', index=False)

### Exploratory analysis

In [None]:
def get_cases_per_quarter(df, column):
    # Group by quarters and count cases
    cases_per_quarter = df.groupby(pd.Grouper(key='activity_date', freq='QE'))[column].sum()

    # Reset index to make the quarters a column
    cases_per_quarter = cases_per_quarter.reset_index()

    # Rename the columns
    cases_per_quarter.columns = ['quarter', f'cases_{column}']

    return cases_per_quarter

In [None]:
def get_case_nature_per_quarter(df: pd.DataFrame, column: str, nature: str):
    """
    Count rows per quarter where `column` equals 1 and 'nature' equals `nature`.

    Args:
        df (pd.DataFrame): Frame with 'activity_date' (datetime), 'nature' and `column`.
        column (str): Name of the flag column that must equal 1.
        nature (str): Value of the 'nature' column to keep.

    Returns:
        pd.Series: Row counts indexed by the quarterly period of 'activity_date'.
    """
    # Quarterly period of every row, computed on the full frame.
    quarter_of_row = df['activity_date'].dt.to_period('Q')

    # Keep only flagged rows of the requested nature.
    is_flagged = df[column] == 1
    has_nature = df['nature'] == nature
    selected = df[is_flagged & has_nature]

    # The grouping key is aligned to the selected rows by index.
    return selected.groupby(quarter_of_row).size()

In [None]:
# drop if activity_date is null
df = df.dropna(subset=['activity_date'])

In [None]:
quarterly_adjourned =  get_cases_per_quarter(df, 'adjourned')

In [None]:
quarterly_adjournable =  get_cases_per_quarter(df, 'adjournable')

In [None]:
quarterly_adjournable

In [None]:
quarterly_concluded =  get_cases_per_quarter(df, 'concluded')

In [None]:
quarterly_concluded =  get_cases_per_quarter(df, 'concluded')
quarterly_registered =  get_cases_per_quarter(df, 'registered')


In [None]:
# merge quarterly_concluded and quarterly_registered on quarter column
merged_quarterly = pd.merge(quarterly_concluded, quarterly_registered, on='quarter')

In [None]:
# Keep `df` intact: dropping 'court' in place would break the later cells that still
# group and filter on that column. The reduced frame gets its own name instead.
df_without_court = df.drop(columns='court')

In [None]:
df.groupby(['court', 'month'])['concluded'].sum().reset_index(name='count')

In [None]:
quarterly_concluded_civil = get_case_nature_per_quarter(df, 'concluded', 'Civil')
quarterly_registered_civil = get_case_nature_per_quarter(df, 'registered', 'Civil')

quarterly_concluded_criminal = get_case_nature_per_quarter(df, 'concluded', 'Criminal')
quarterly_registered_criminal = get_case_nature_per_quarter(df, 'registered', 'Criminal')

In [None]:
# Combine the four quarterly count Series into one table, one row per quarter.
# The inputs are unnamed Series indexed by quarter, so they have no 'quarter' column to
# merge on; they are aligned on their index instead. join='inner' keeps only quarters
# present in all four, matching the default behaviour of pd.merge.
merged_civil_criminal = (
    pd.concat(
        {
            'concluded_civil': quarterly_concluded_civil,
            'registered_civil': quarterly_registered_civil,
            'concluded_criminal': quarterly_concluded_criminal,
            'registered_criminal': quarterly_registered_criminal,
        },
        axis=1,
        join='inner',
    )
    .rename_axis(index='quarter')
    .reset_index()
)

In [None]:
merged_civil_criminal.to_csv(f'{output_path}/quarterly_case_nature.csv', index=True)

TODO
### adjourn by event
### adjourn by case type

### backlog by case type
### check if there are courts that resolved more cases than pending

In [None]:
#df[(df['court'] == 'Milimani Commercial and Tax') & (df['activity_date_year'] == 2024)].groupby('judge_1').size().reset_index(name='count')
# group by case_type if outcome == 'Ruling Delivered- Case Closed' 
#df[df['productivity'] == 'judgment'].groupby('case_category').size().reset_index(name='count')
#court_productivity = df[df['productivity'] == 'ruling']
#court_productivity_pivot = court_productivity.pivot_table(index='court', columns='case_category', values='concluded', aggfunc='sum', fill_value=0)
#productivity_pivot = court_productivity_pivot.rename_axis(columns=None)
#df[df['productivity'] == 'ruling'].groupby('case_category').size().reset_index(name='count')
#df[df['outcome'] == 'Ruling Delivered- Accused Discharged'].groupby(['court', 'case_category'])['court'].size().reset_index(name='count')


In [None]:
pending_count = pending_df.groupby(['court', 'case_category']).size().reset_index(name='pending_count')

In [None]:
pending_pivot = pending_count.pivot_table(index='court', columns='case_category', values='pending_count', fill_value=0)

In [None]:
pending_pivot.to_csv('/home/arch/devel/data/pending_case_types.csv', index=True)

In [None]:
def categorize_concluded_cases(df):
    """
    Attach an age bracket to every case that has a concluded row.

    Rows with ``concluded == 1`` are bucketed by ``age / 365`` (``age`` is taken to be
    in days — TODO confirm). The first concluded row of each ``case_number`` decides the
    label, which is then joined onto every row of that case.

    Args:
        df (pandas.DataFrame): Must contain 'case_number', 'concluded' and 'age'.

    Returns:
        pandas.DataFrame: A new frame with the same rows as ``df`` plus 'age_category',
        one of 'under 1 year', '1-3 year' or 'over 3 year'; missing for cases with no
        concluded row or no age. The input frame is not modified.
    """
    def _bracket(age_years):
        # A missing age gets no label rather than falling into the last bracket.
        if pd.isna(age_years):
            return None
        # Ages below one year used to be reported as 'over 3 year'.
        if age_years < 1:
            return 'under 1 year'
        if age_years <= 3:
            return '1-3 year'
        return 'over 3 year'

    # One label per case: without this, a case with several concluded rows would
    # multiply the rows of the result when merged back.
    concluded = df.loc[df['concluded'] == 1, ['case_number', 'age']]
    per_case = concluded.drop_duplicates(subset='case_number', keep='first')
    labels = pd.DataFrame({
        'case_number': per_case['case_number'],
        'age_category': (per_case['age'] / 365).map(_bracket),
    })

    # Drop a label column left by an earlier run so re-running the cell does not
    # produce 'age_category_x' / 'age_category_y'.
    base = df.drop(columns='age_category', errors='ignore')
    return pd.merge(base, labels, on='case_number', how='left', validate='many_to_one')


In [None]:
df = categorize_concluded_cases(df)

In [None]:
# Count rows per judge_1 for the court 'Meru'.
# Earlier variant for 'Milimani Commercial and Tax', left disabled:
#df(df['court'] == 'Milimani Commercial and Tax').groupby([ 'judge_1',])['outcome'].size().reset_index(name='count')
df[df['court'] == 'Meru'].groupby(['judge_1'])['outcome'].size().reset_index(name='count')

In [None]:
# Count, per court, the rows where case_type is 'Criminal Appeal' and outcome is 'File Transferred'
df[(df['case_type'] == 'Criminal Appeal') & (df['outcome'] == 'File Transferred')].groupby('court').size().reset_index(name='count')

In [None]:
['comingfor', 'outcome',  'male_applicant', 'female_applicant',
       'organization_applicant', 'male_defendant', 'female_defendant',
       'organization_defendant', 'legalrep', 'court',
       'activity_date', 'filed_date', 'activity_date_year',
       'activity_date_month', 'nature', 'case_category', 'case_number',
       'registered', 'concluded', 'productivity', 'age', 'time_lines',
       'adjourned', 'adjournable']

In [None]:
   # Keep one row per 'case_number' — the last occurrence in df's current row order
df_unique_cases = df.drop_duplicates(subset='case_number', keep='last')
    

In [None]:
df_unique_cases.shape

In [None]:
filed_df = df[df['registered'] == 1]

In [None]:
male_applicants = filed_df[(filed_df['male_applicant'] == 1) & (filed_df['case_category'] != 'murder')].groupby('case_category').size().reset_index(name='count')

In [None]:
female_applicants = filed_df[(filed_df['female_applicant'] == 1) & (filed_df['case_category'] != 'murder')].groupby('case_category').size().reset_index(name='count')

In [None]:
male_accused = filed_df[(filed_df['male_defendant'] == 1) & (filed_df['case_category'] == 'murder')].groupby('case_category').size().reset_index(name='count')

In [None]:
male_murder_defendants = filed_df[(filed_df['male_defendant'] == 1) & (filed_df['case_category'] == 'murder')].groupby('case_category').size().reset_index(name='count')

In [None]:
female_murder_defendants = filed_df[(filed_df['female_defendant'] == 1) & (filed_df['case_category'] == 'murder')].groupby('case_category').size().reset_index(name='count')

In [None]:
applicants = pd.merge(female_applicants, male_applicants, on='case_category', how='outer', suffixes=('_female', '_male'))

In [None]:
applicants.to_csv(f'{output_path}/applicants_gender.csv', index=False)

In [None]:
def _label_boxplot_values(ax, bplot, vertical, fmt, fontsize, label_whiskers=False):
    """
    Write the data value next to the medians, outliers and (optionally) whisker ends.

    The data axis is y for a vertical box plot and x for a horizontal one, so the
    printed number is read from the matching coordinate of each plotted point.

    Args:
        ax: Axes the box plot was drawn on.
        bplot: Dictionary returned by ``ax.boxplot``.
        vertical (bool): True when the box plot was drawn with ``vert=True``.
        fmt (str): Format spec for the printed value, e.g. '.1f'.
        fontsize: Font size of the labels.
        label_whiskers (bool): Also label both end points of every whisker line.
    """
    def _annotate(x, y, color):
        value = y if vertical else x
        ax.text(x, y, format(value, fmt), ha='center', va='bottom', color=color, fontsize=fontsize)

    # Medians: label at the first end point of the median line.
    for median in bplot['medians']:
        _annotate(median.get_xdata()[0], median.get_ydata()[0], 'red')

    # Whiskers: label both end points of each whisker line.
    if label_whiskers:
        for whisker in bplot['whiskers']:
            for x, y in zip(whisker.get_xdata(), whisker.get_ydata()):
                _annotate(x, y, 'green')

    # Outliers: label every point at its own position.
    for fliers in bplot['fliers']:
        for x, y in zip(fliers.get_xdata(), fliers.get_ydata()):
            _annotate(x, y, 'blue')


# --- Horizontal box plot of total_events ---
fig, ax = plt.subplots(figsize=(8, 4))

bplot = ax.boxplot(pending_baseline['total_events'],
                   vert=False,  # vert=False draws the box plot horizontally
                   patch_artist=True)

_label_boxplot_values(ax, bplot, vertical=False, fmt='.1f', fontsize=10, label_whiskers=True)

ax.set_title('Workload Analysis')
# NOTE(review): the plotted column is 'total_events' — confirm 'Cases per Judge' is the right label.
ax.set_xlabel('Cases per Judge')

# Set face color for the box plot
colors = ['lightgreen']
for patch, color in zip(bplot['boxes'], colors):
    patch.set_facecolor(color)

plt.show()

# --- Vertical box plot of the same data ---
data = [pending_baseline['total_events']]

fig, ax = plt.subplots()
bplot = ax.boxplot(data, vert=True, patch_artist=True)

# In a vertical plot the data values are on the y axis. The previous code printed the
# x coordinate (the box position) for the median and for every outlier.
_label_boxplot_values(ax, bplot, vertical=True, fmt='.2f', fontsize=8)

# Set labels and title
ax.set_xlabel('Workload')
ax.set_ylabel('Values')
ax.set_title('Workload Analysis')

plt.show()