In [1]:
import pandas as pd
import numpy as np
from typing import Dict, Any, List, Union, Optional
import logging

In [2]:
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Data Cleaning

In [3]:
def drop_null_values(df: pd.DataFrame, column_name: str = 'outcome') -> pd.DataFrame:
    """
    Drop rows from the DataFrame where the specified column contains null values.

    Args:
        df (pd.DataFrame): The DataFrame from which to drop rows.
        column_name (str): The name of the column to check for null values. Default is 'outcome'.

    Returns:
        pd.DataFrame: The DataFrame with rows containing null values in the specified column dropped.
    """
    initial_row_count: int = df.shape[0]
    cleaned_df: pd.DataFrame = df.dropna(subset=[column_name])
    final_row_count: int = cleaned_df.shape[0]
    dropped_row_count: int = initial_row_count - final_row_count
    
    if dropped_row_count > 0:
        logger.info(f"Total dropped rows with null values in '{column_name}': {dropped_row_count}")
    else:
        logger.info(f"No rows dropped with null values in '{column_name}'")
    return cleaned_df

In [4]:
def drop_nan_columns(df: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
    """
    Drop rows containing NaN values from the specified columns of a DataFrame.

    Args:
        df (pd.DataFrame): The input DataFrame to process.
        columns (List[str]): A list of column names to check for NaN values.

    Returns:
        pd.DataFrame: The updated DataFrame with NaN-containing rows dropped.

    Raises:
        ValueError: If any of the specified columns are not present in the DataFrame.
    """
    # Validate that all specified columns exist in the DataFrame
    missing_columns = set(columns) - set(df.columns)
    if missing_columns:
        raise ValueError(f"Columns not found in DataFrame: {', '.join(missing_columns)}")

    # Identify columns with NaN values
    nan_columns = df[columns].columns[df[columns].isna().any()].tolist()

    # Log dropped rows if any
    if nan_columns:
        nan_count = df[columns].isna().sum()
        logger.info("Dropping rows with NaN values:")
        for col in nan_columns:
            logger.info(f"  {col}: {nan_count[col]} rows")

    # Drop rows with NaN values in specified columns
    original_row_count = len(df)
    df_cleaned = df.dropna(subset=columns)
    dropped_row_count = original_row_count - len(df_cleaned)

    if dropped_row_count > 0:
        logger.info(f"Total rows dropped: {dropped_row_count}")
    else:
        logger.info("No rows were dropped.")

    return df_cleaned

In [5]:
def remove_duplicates(data: pd.DataFrame) -> pd.DataFrame:
    """
    Remove duplicates from a DataFrame.
    
    Args:
        data (pd.DataFrame): Input DataFrame.
        
    Returns:
        pd.DataFrame: DataFrame with duplicates removed.
    """
    num_duplicates = data.duplicated().sum()
    
    if num_duplicates > 0:
        logging.info(f"{num_duplicates} duplicates found.")
        data = data.drop_duplicates(keep="first").reset_index(drop=True)
        logging.info(f"{num_duplicates} duplicates removed.")
    else:
        logging.info("No duplicates found.")
    
    return data

In [6]:
def create_date_column(df: pd.DataFrame, column_names: List[str], new_col: str) -> pd.DataFrame:
    """
    Creates a new date column in the DataFrame by concatenating the values of three specified columns.

    Args:
        df (pd.DataFrame): The DataFrame containing the data.
        column_names (List[str]): A list of three column names to be concatenated [year, month, day].
        new_col (str): The import commandsname of the new date column to be created.

    Returns:
        pd.DataFrame: The DataFrame with the new date column added.

    Raises:
        ValueError: If the input list doesn't contain exactly three column names or if columns are missing.
    """
    if len(column_names) != 3:
        raise ValueError("column_names must contain exactly three elements: [year, month, day]")

    year_col, month_col, day_col = column_names

    # Check if all required columns exist in the DataFrame
    missing_columns = set(column_names) - set(df.columns)
    if missing_columns:
        raise ValueError(f"Missing columns in DataFrame: {', '.join(missing_columns)}")

    # Create copies to avoid SettingWithCopyWarning
    df = df.copy()

    try:
        # Convert year and day columns to integers
        df[year_col] = df[year_col].astype(float).astype(int)
        df[day_col] = df[day_col].astype(float).astype(int)

        # Concatenate the columns to create a date string
        df[new_col] = (df[year_col].astype(str) + '-' + 
                       df[month_col].astype(str) + '-' + 
                       df[day_col].astype(str))

        # Convert to datetime
        df[new_col] = pd.to_datetime(df[new_col], errors='coerce')

        # Log information about the conversion
        valid_dates = df[new_col].notna().sum()
        logger.info(f"Created new date column '{new_col}'. Valid dates: {valid_dates}/{len(df)}")

    except Exception as e:
        logger.error(f"Error creating date column: {str(e)}")
        raise

    return df

In [7]:
def generate_case_num(df: pd.DataFrame, court_col: str, caseid_type_col: str, caseid_no_col: str, filed_yyyy_col: str, new_col='case_number') -> pd.DataFrame:
    """
    Generates a case number by concatenating court, caseid_type, caseid_no, and filed_yyyy columns.

    Args:
        df (pd.DataFrame): The dataframe containing the necessary columns.
        court_col (str): The name of the column containing court information.
        caseid_type_col (str): The name of the column containing case ID type.
        caseid_no_col (str): The name of the column containing case ID number.
        filed_yyyy_col (str): The name of the column containing the year the case was filed.
        new_col (str): The name of the new column to be created for the case number. Default is 'case_num'.

    Returns:
        pd.DataFrame: DataFrame with the new case number column.
    """
    df[new_col] = df[court_col] + '/' + df[caseid_type_col] + '/' + df[caseid_no_col] + '/' + df[filed_yyyy_col].astype(str)
    return df

In [8]:
def apply_title_case(text):
    """
    Apply title case to a given string.
    
    Args:
        text: The input string to process.
    
    Returns:
        str: The processed string in title case.
    """
    if pd.isna(text):
        return np.nan
    if not isinstance(text, str):
        logger.warning(f"Non-string value encountered: {text}")
        return str(text)
    return text.title()

In [9]:
def process_outcome_column(df: pd.DataFrame) -> pd.DataFrame:
    """
    Process the 'outcome' column of the DataFrame by applying title case.
    
    Args:
        df (pd.DataFrame): The input DataFrame containing the 'outcome' column.
    
    Returns:
        pd.DataFrame: The DataFrame with the processed 'outcome' column.
    """
    if 'outcome' not in df.columns:
        logger.error("'outcome' column not found in the DataFrame")
        return df

    original_null_count = df['outcome'].isnull().sum()
    
    df['outcome'] = df['outcome'].apply(apply_title_case)
    
    new_null_count = df['outcome'].isnull().sum()
    if new_null_count > original_null_count:
        logger.warning(f"Number of null values in 'outcome' increased from {original_null_count} to {new_null_count}")
    
    non_string_count = df['outcome'].apply(lambda x: not isinstance(x, str) if pd.notna(x) else False).sum()
    if non_string_count > 0:
        logger.warning(f"Found {non_string_count} non-string values in 'outcome' after processing")

    return df

In [10]:
def categorize_case(case_type: str, criminal_cases: Optional[List[str]]) -> str:
    """
    Categorize a case as 'Criminal' or 'Civil' based on its type.
    
    Args:
        case_type (str): The type of the case.
        criminal_cases (Optional[List[str]]): List of case types considered as criminal.
        
    Returns:
        str: 'Criminal' if the case type is in the criminal cases list or if criminal_cases is None, 'Civil' otherwise.
    """
    if criminal_cases is None:
        return 'Criminal'
    else:
        return 'Criminal' if case_type in criminal_cases else 'Civil'

In [11]:

def categorize_cases(df: pd.DataFrame, criminal_cases: Optional[List[str]] = None) -> pd.DataFrame:
    """
    Categorize all cases in the DataFrame as 'Criminal' or 'Civil'.
    
    Args:
        df (pd.DataFrame): The input DataFrame containing case data.
        criminal_cases (Optional[List[str]]): List of case types considered as criminal.
            If None, all cases are categorized as 'Criminal'.
        
    Returns:
        pd.DataFrame: DataFrame with an added 'nature' column indicating case nature.
    """
    df['nature'] = df['case_type'].apply(lambda x: categorize_case(x, criminal_cases))

    # Check for presence of both case types
    if 'Criminal' not in df['nature'].values:
        logging.warning("No criminal cases found in the DataFrame.")
    if 'Civil' not in df['nature'].values:
        logging.warning("No civil cases found in the DataFrame.")
    
    return df

In [12]:
def apply_dict(value: Any, dictionary: Dict[str, Union[str, List[Any]]]) -> Union[str, None]:
    """
    Find all keys in a dictionary where the given value matches.

    Args:
        value: The value to search for.
        dictionary: The dictionary to search in.

    Returns:
        A list of keys where the value matches, or None if no matches are found.
    """
    matching_keys = []
    for key, dict_value in dictionary.items():
        if isinstance(dict_value, str) and dict_value == value:
            matching_keys.append(key)
        elif isinstance(dict_value, list) and value in dict_value:
            matching_keys.append(key)
    
    if not matching_keys:
        return None
    elif len(matching_keys) == 1:
        return matching_keys[0]
    else:
        return matching_keys

In [13]:

def _convert_to_date(date: Union[str, pd.Timestamp]) -> pd.Timestamp:
    """
    Safely convert a string to a pd.Timestamp object.
    """
    if isinstance(date, pd.Timestamp):
        return date
    try:
        return pd.to_datetime(date)
    except (ValueError, TypeError):
        return pd.NaT

In [14]:
def is_concluded(outcome: str, resolved_outcomes: List[str]) -> bool:
    """
    Determine if a case outcome is resolved.

    Args:
        outcome (str): The outcome of the case.
        resolved_outcomes (List[str]): List of outcomes that indicate resolution.

    Returns:
        bool: True if the outcome is considered resolved, otherwise False.
    """
    return outcome.lower() in (resolved.lower() for resolved in resolved_outcomes)


In [15]:
def is_case_registered(outcome: str, activity_date: Union[str, pd.Timestamp], filed_date: Union[str, pd.Timestamp]) -> bool:
    """
    Check if a case is registered based on outcome and whether activity and filed dates match.

    Args:
        outcome (str): The outcome of the case.
        activity_date (Union[str, pd.Timestamp]): The date of the activity.
        filed_date (Union[str, pd.Timestamp]): The date the case was filed.

    Returns:
        bool: True if the outcome implies registration and activity_date matches filed_date, otherwise False.
    """
    outcome = outcome.strip().lower()
    
    if 'registered' not in outcome and 'filed' not in outcome:
        return False

    activity_date = _convert_to_date(activity_date)
    filed_date = _convert_to_date(filed_date)

    return pd.notna(activity_date) and pd.notna(filed_date) and activity_date == filed_date


In [16]:
def validate_columns(df: pd.DataFrame, required_columns: Union[str, List[str]]) -> None:
    """
    Validate that the DataFrame contains the required columns.

    Args:
        df (pd.DataFrame): The input DataFrame.
        required_columns (Union[str, List[str]]): A single column name (str) or a list of column names (List[str]) that must be present.

    Raises:
        ValueError: If any required columns are missing.
    """
    # Convert required_columns to a list if it's a string
    if isinstance(required_columns, str):
        required_columns = [required_columns]

    # Check which required columns are missing
    missing_columns = set(required_columns) - set(df.columns)
    
    # If there are missing columns, raise a ValueError with an informative message
    if missing_columns:
        raise ValueError(f"Missing required columns: {', '.join(missing_columns)}")


In [17]:
def add_concluded_column(df: pd.DataFrame, resolved_outcomes: List[str]) -> pd.DataFrame:
    """
    Add the 'concluded' column to the DataFrame based on resolved outcomes.
    
    Args:
        df (pd.DataFrame): The DataFrame with case data.
        resolved_outcomes (List[str]): List of outcomes considered resolved.
    
    Returns:
        pd.DataFrame: DataFrame with 'concluded' column added.
    """
    df['concluded'] = df['outcome'].apply(lambda outcome: is_concluded(outcome, resolved_outcomes))
    return df



In [18]:
def add_registered_column(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add the 'registered' column to the DataFrame based on case registration criteria.
    
    Args:
        df (pd.DataFrame): The DataFrame with case data.
    
    Returns:
        pd.DataFrame: DataFrame with 'registered' column added.
    """
    df['registered'] = df.apply(
        lambda row: is_case_registered(row['outcome'], row['activity_date'], row['filed_date']),
        axis=1
    )
    return df

In [19]:
from typing import Dict
from datetime import datetime, timedelta
import pandas as pd

TIME_LIMITS: Dict[str, int] = {
    'murder': 360,
    'revision': 90,
    'misc_application': 90,
    'suit': 360,
    'judicial_review': 180,
    'constitutional_petition': 180,
}

def is_case_within_time_limit(age: int, case_category: str, concluded: bool) -> bool:
    """
    Check if a case falls within the specified time limit for its category and is concluded.

    Args:
        age (int): The age of the case in days.
        case_category (str): The category of the case.
        concluded (bool): Whether the case is concluded.

    Returns:
        bool: True if the case is within the time limit and concluded, otherwise False.
    """
    time_limit = TIME_LIMITS.get(case_category, 0)
    return age <= time_limit and concluded

def calculate_case_age(filed_date: datetime, activity_date: datetime) -> int:
    """
    Calculate the age of a case in days.

    Args:
        filed_date (datetime): The date the case was filed.
        activity_date (datetime): The date of the latest activity.

    Returns:
        int: The age of the case in days.
    """
    return (activity_date - filed_date).days

def add_pmmu_timelines(df: pd.DataFrame) -> pd.DataFrame:
    """
    Process the case data by adding age and time limit compliance columns.

    Args:
        df (pd.DataFrame): The input DataFrame containing case data.

    Returns:
        pd.DataFrame: The processed DataFrame with new columns added.
    """
    df['age'] = df.apply(lambda row: calculate_case_age(row['filed_date'], row['activity_date']), axis=1)
    df['within_time_limit'] = df.apply(
        lambda row: is_case_within_time_limit(row['age'], row['broad_case_type'], row['concluded']),
        axis=1
    )
    return df

# Example usage
# df = pd.read_csv('case_data.csv')  # Load your data
# df['filed_date'] = pd.to_datetime(df['filed_date'])
# df['activity_date'] = pd.to_datetime(df['activity_date'])
# df['concluded'] = df['concluded'].astype(bool)
# processed_df = add_pmmu_timelines(df)



In [20]:
def monthly_case_stats(df, registered_col, concluded_col):
    """Calculates monthly statistics for registered and concluded cases.

    Args:
        df (pandas.DataFrame): The input DataFrame containing case data.
        registered_col (str): The name of the column containing registered cases.
        concluded_col (str): The name of the column containing concluded cases.

    Returns:
        pandas.DataFrame: A DataFrame with monthly statistics for registered and concluded cases.
    """

    monthly_cases = df.groupby(['court', 'date_mon', 'case_type']).agg(
        registered=(registered_col, 'sum'),
        concluded=(concluded_col, 'sum')
    ).reset_index()

    return monthly_cases

In [21]:
def strip_dataframe_columns(df):
    """Strips leading and trailing whitespace from all columns in a Pandas DataFrame.

    Args:
        df (pandas.DataFrame): The DataFrame to modify.

    Returns:
        pandas.DataFrame: The modified DataFrame with stripped columns.
    """

    try:
        df = df.astype(str).apply(lambda x: x.str.strip())
        logger.info("str.strip() applied successfully to all columns.")
        return df
    except Exception as e:
        logger.error(f"Error applying str.strip(): {e}")
        return None

In [22]:
#CONSTANTS
CRIMINAL_CASES = [
    'Murder Case',
    'Criminal Revision',
    'Criminal Appeal',
    'Murder - Gender Justice Criminal Case',
    'Criminal Court Martial Appeal',
    'Anti-Corruption and Economic Crimes Revision',
    'Criminal Miscellaneous Application',
    'Criminal Applications', 
    'COA Criminal Appeal'
]

CRIMINAL_CASES = [
    'Murder Case',
    'Criminal Revision',
    'Criminal Appeal',
    'Murder - Gender Justice Criminal Case',
    'Criminal Court Martial Appeal',
    'Anti-Corruption and Economic Crimes Revision',
    'Criminal Miscellaneous Application',
    'Criminal Applications', 
    'COA Criminal Appeal'
]
BROAD_CASE_TYPES = {
    'Civil Suit': [
        'Civil Suit',
        'Anti-Corruption and Economic Crimes Suit',
        'Family Originating Summons',
        'Family Civil Case',
        'HCC(OS) Family',
        'Commercial Admiralty',
        'Commercial Matters',
    ],
    
    'Adoption': 'Family Adoption',
    'Divorce': 'Family Divorce Cause',

    'Criminal Application': 'Criminal Miscellaneous Application',

    'Miscellaneous Application': [
        'Civil Case Miscellaneous',
        'Judicial Review Miscellaneous',
        'JR  Petition Miscellaneous',
        'Anti-Corruption and Economic Crimes Miscellaneous',
        'Commercial Miscellaneous',
        'Constitution and Human Rights Petitions Miscellaneous',
        'Family Miscellaneous',
        'Commercial Arbitration',
    ],
    'Judicial Review': [
        'Anti-Corruption and Economic Crime Judicial review',
        'Judicial Review ELC',
        'Judicial Review',
    ],
    'Criminal Revision': [
        'Criminal Revision',
        'Anti-Corruption and Economic Crimes Revision',
    ],
    'Criminal Appeal': [
        'Criminal Appeal',
        'Criminal Court Martial Appeal',
        'Anti-Corruption and Economic Crimes Appeal',
    ],
    'Civil Appeal': [
        'Family Appeal',
        'Civil Appeal',
        'Commercial Appeal',
        'Constitution and Human Rights Election Petition Appeal',
        'Constitution and Human Rights Petition Appeal',
        'Constitution and Human Rights Election Petition Appeal',
        'Gender Justice Civil Appeal',
        'Constitution and Human Rights Miscellaneous Election Petition Appeal (MEPA)',
    ],
    'Constitution Petition': [
        'Anti Corruption and Economic Crimes Petition',
        'High Court Criminal Petition',
        'Constitution and Human Rights Petition (Civil)',
        'Constitution and Human Rights Election Petition',
        'High Court Constitution and Human Rights Petitions (Criminal)',
        'Commercial Petition',
    ],
    'Probate Administration': [
        'Family P&A Intestate',
        'Family P&A Ad Litem',
        'Family P&A Ad Colligenda',
        'Family P&A Citation',
        'Family P&A Testate',
        'Family P&A Resealing of Grant',
        'Family P&A De Bonis Non',
        'Resealing of Grant',
        'Citation-Family',
    ],
    'Murder': [
        'Murder Case',
        'Murder - Gender Justice Criminal Case',
    ],
    'Tax Appeal': [
        'Commercial Income Tax Apperiod_startpeal',
        'Commercial Custom Tax Appeal',
    ],
    'Bankruptcy and Insolvency' : [
        'Commercial Insolvency Notice Petition',
        'Commercial Insolvency Petition',
        'Commercial Bankruptcy Notice',
        'Commercial Insolvency Cause',
        'Commercial Insolvency Notice',
        'Commercial Bankruptcy Cause',
        'Commercial Winding Up Cause',
    ]
}


RESOLVED_OUTCOMES = ['Ruling Delivered- Case Closed',
 'Terminated',
 'Matter Settled- Case Closed',
 'Application Dismissed - Case Closed',
 'Judgment Delivered- Case Closed',
 'Matter Withdrawn',
 'Application Allowed - Case Closed',
 'Application Withdrawn - Case Closed',
 'Judgment Delivered- Convicted',
 'Placed In Probation',
 'Dismissed',
 'Judgment Delivered',
 'Judgment Delivered- Acquittal',
 'Ruling Delivered- Accused Discharged',
 'Abated',
 'Consolidated- Case Closed',
 'Grant Confirmed',
 'Limited Grant Issued',
 'Struck Out',
 'Grant Revoked',
 'Consent Recorded - Case Closed',
 'Dismissed For Want Of Prosecution - Case Closed',
 'Out Of Court Settlement Reached',
 'Appeal Dismissed',
 'Retrial',
 'Appeal Rejected',
 'Sentence Commuted',
 'Ruling Delivered- Application Closed',
 'Probation Orders Issued',
 'Order Issued - Case Closed',
 'Revision Declined']


TRANSFERED_CASES = ['File Transfered -case Closed', 
        'File Transferred',]

MERIT_CATEGORY = {
    'Judgment Delivered': [
        'Judgment Delivered- Case Closed',
        'Judgment Delivered',
        'Judgment Delivered- Acquittal',
        'Judgment Delivered- Convicted',
        'Grant Revoked',
        'Retrial'
        ],
    'Ruling Case Closed': [
        'Ruling Delivered- Case Closed', 
        'Ruling Delivered- Accused Discharged',
        ],
    'Final Grant': [
        'Grant Confirmed',
        'Limited Grant Issued',
        ],
    'Case Withdrawn': [
        'Matter Withdrawn',
        'Application Withdrawn - Case Closed',
        ],
   'Out Of Court Settlement': [
        'Consent Recorded - Case Closed',
        'Matter Settled Through Mediat26	Jun	2024	ion',
        'Out Of Court Settlement Reached',
    ],
    'Dismissed':[
        'Dismissed For Want Of Prosecution - Case Closed',
        'Dismissed',
        'Appeal Dismissed',
        'Terminated'
    ],

    'Case Closed': [
        'Struck Out',
        'Application Dismissed - Case Closed',
        'Application Allowed - Case Closed',
        'Matter Settled- Case Closed',
        'Ruling Delivered- Application Closed',
        'Consolidated- Case Closed',
        'Abated',
        'Placed In Probation',
        'Revision Declined',
        'Probation Orders Issued',
        'Appeal Rejected',
        'Interlocutory Judgement Entered',
        'Order issued - Case closed'   
    ],
}

HC_PMMU_TIME_LINES = {
    'murder': 360,
    'revision': 90,
    'misc_application': 90,
    'suit': 360,
    'judicial_review': 180,
    'constitutional_petition': 180,
}

NON_ADJOURNABLE = [
    'Taxation and Issuance of Certificates',
    'Orders',
    'Appointments of  Mediator',
    'Screening of files for Mediation',
    'Post-judgment',
    'Re-activation',
    'Reactivation',
    'Notice of Taxation',
    'Entering Interlocutory Judgments',
    'Approval by DR', 
    'Registration/Filing-Application', 
    'Registration/Filing', 
    'Registration/Filing-Application',
 ]
MERIT_OUTCOMES = [
    'Ruling Delivered- Case Closed', 
    'Judgment Delivered- Case Closed',
    'Judgment Delivered',
    'Judgment Delivered- Acquittal',
    'Judgment Delivered- Convicted',
    'Grant Revoked',
    'Ruling Delivered- Accused Discharged',
    'Retrial'
]

In [25]:
# file_path = '/home/stanoo/dcrt/data/API/'
# raw_df = pd.read_csv(f'{file_path}/Hc/hc_23-24_data.csv')
# # # Load name of judges
# # judge_names = pd.read_csv(f'{file_path}/reports/judges.csv')
# # # Load name of judges
# # #pending_baseline = pd.read_csv(f'{file_path}/reports/pending_baseline.csv')
file_path = '/home/stanoo/dcrt/data/API/dcrt_2023-2024.csv'
raw_df = pd.read_csv(file_path)

In [26]:
raw_df.rename(columns={'court_name': 'court'}, inplace=True)
df = df.rename(columns={'court_name': 'court'})
try:
    validate_columns(df, ['outcome', 'activity_date', 'filed_date'])
    logger.info("Validation passed.")
except ValueError as e:
    logger.error(e)
raw_df = strip_dataframe_columns(raw_df)
# create activity date and filed date columns
raw_df = create_date_column(raw_df.copy(), ['date_dd', 'date_mon', 'date_yyyy'], 'activity_date')
raw_df = create_date_column(raw_df.copy(), ['filed_dd', 'filed_mon', 'filed_yyyy'], 'filed_date')
raw_df = create_date_column(raw_df.copy(), ['next_dd', 'next_mon','next_yyyy'], 'next_date')
#cutoff_date = pd.Timestamp('2024-06-30') 
#df = raw_df[raw_df['activity_date'] <= cutoff_date]
df = raw_df.copy()

df = drop_nan_columns(df, ['date_dd', 'date_mon', 'date_yyyy', 'caseid_type', 'caseid_no',
       'filed_dd', 'filed_mon', 'filed_yyyy', 'case_type', 'comingfor'])

df = remove_duplicates(df)

# Step 2: Add the 'concluded' column
df = add_concluded_column(df, resolved_outcomes)
    
    # Step 3: Add the 'registered' column
df = add_registered_column(df)

# missing outcomes
df = drop_null_values(df)

df = generate_case_num(df, 'court', 'caseid_type', 'caseid_no', 'filed_yyyy')

# Add broad case category of civil and criminal
df = categorize_cases(df, CRIMINAL_CASES)
# Add filed and resolved outcomes
df = process_case_status(df, RESOLVED_OUTCOMES)


# Add sojar case groupings
df['broad_case_type'] = df['case_type'].apply(lambda x: apply_dict(x, BROAD_CASE_TYPES))


# Apply the function to create a new column with keys
df['productivity'] = df['outcome'].apply(lambda x: apply_dict(x, MERIT_CATEGORY))

# Add time lines
df = add_pmmu_timelines(df)

# transferred
df['transfer'] = df['outcome'].apply(lambda x: x in TRANSFERED_CASES)

df['productivity_category'] = df.apply(
    lambda row: 'merit' if row['outcome'] in MERIT_OUTCOMES and row['concluded'] == 1 
    else 'non-merit' if row['outcome'] not in MERIT_OUTCOMES and row['concluded'] == 1 
    else None, axis=1
)

# Create data for output
filed_cases = analyze_court_outcomes(df, '2023-07-01', '2024-06-30', 'registered')
resolved_cases = analyze_court_outcomes(df, '2023-07-01', '2024-06-30', 'concluded')


2024-10-14 16:06:54,100 - ERROR - Missing required columns: filed_date, activity_date
2024-10-14 16:06:54,183 - INFO - str.strip() applied successfully to all columns.
2024-10-14 16:06:54,201 - INFO - Created new date column 'activity_date'. Valid dates: 12896/12896
2024-10-14 16:06:54,227 - INFO - Created new date column 'filed_date'. Valid dates: 1495/12896
2024-10-14 16:06:54,250 - INFO - Created new date column 'next_date'. Valid dates: 8957/12896
2024-10-14 16:06:54,270 - INFO - No rows were dropped.
2024-10-14 16:06:54,287 - INFO - 1221 duplicates found.
2024-10-14 16:06:54,308 - INFO - 1221 duplicates removed.


NameError: name 'resolved_outcomes' is not defined

In [None]:
# remove leading and trailing spaces on comingfor
df['comingfor'] = df['comingfor'].str.strip()

# create a new column of 1 if reason_adj is not null and comingfor is not in non_adjourned else 0
df['adjourned'] = (df['reason_adj'].notnull() & df['comingfor'].apply(lambda x: x not in NON_ADJOURNABLE)).astype(int)
# an event is adjournable if non_adjourned is not in comingfor
df['adjournable'] = df['comingfor'].apply(lambda x: x not in NON_ADJOURNABLE).astype(int)

In [None]:

monthly_filed_cases = df.groupby(['court','date_mon']).agg({'registered':'sum'}).reset_index()
monthly_concluded_cases = df.groupby(['court','date_mon']).agg({'concluded':'sum'}).reset_index()
concluded_cases = df.groupby('court').agg({'concluded':'sum'}).reset_index()


average_time_to_conclude = df.loc[df['concluded'] == 1].pivot_table(index='court', columns='nature', values='age', aggfunc='mean', fill_value=0).round(2)
pmmu_timelines = df[df['time_lines'] == 1].pivot_table(index='court', columns='broad_case_type', values='time_lines', aggfunc='count', fill_value=0)
total_concluded_per_court = df[df['concluded'] == 1].pivot_table(index='court', columns='broad_case_type', values='time_lines', aggfunc='count', fill_value=0)
resolved_within_pmmu_timeline = pmmu_timelines / total_concluded_per_court

court_productivity = df.pivot_table(index='court', columns='productivity', values='concluded', aggfunc='count', fill_value=0)
court_productivity = court_productivity.rename_axis(columns=None)

productivity_pivot_table = pd.pivot_table(
    df,
    values='concluded',  
    index='court',     
    columns='productivity_category', 
    aggfunc='count',   
    fill_value=0        
).rename(columns={'merit': 'Merit', 'non-merit': 'Non_Merit'})

monthly_stats = monthly_case_stats(df, 'registered', 'concluded')
adjourned_per_court = df.groupby(['court', 'reason_adj'])['adjourned'].sum().reset_index(name='count')
adjourned = df.groupby('court')['adjourned'].sum().reset_index(name='total_adjourned')
adjournable = df.groupby('court')['adjournable'].sum().reset_index(name='total_adjournable')
# deternine the rate of adjournments
adjourn_proportion = pd.merge(adjourned, adjournable, on=['court'])
adjourn_proportion['adjourn_proportion'] = (adjourn_proportion['total_adjourned']/adjourn_proportion['total_adjournable'])*100

In [None]:
#remove trailing on judge names
judge_names['judge_1'] = judge_names['judge_1'].str.strip()
judge_df = df[df['judge_1'].isin(judge_names['judge_1'])]
judge_productivity = judge_df.pivot_table(index='judge_1', columns='productivity', values='concluded', aggfunc='sum', fill_value=0)
judge_productivity = judge_productivity.rename_axis(columns=None)

In [80]:
df.groupby(pd.Grouper(key='activity_date', freq='M')).size().reset_index(name='count')

  df.groupby(pd.Grouper(key='activity_date', freq='M')).size().reset_index(name='count')


Unnamed: 0,activity_date,count
0,2024-07-31,516
1,2024-08-31,487
2,2024-09-30,432


In [None]:
# #remove white space at the begining and end of pending_baseline['case_number'] and pending_baseline['court']
# pending_baseline['case_number'] = pending_baseline['case_number'].str.strip()
# pending_baseline['court'] = pending_baseline['court'].str.strip()
# pending_baseline['case_number'] = pending_baseline ['court'] + "/" + pending_baseline['case_number']
# pending_baseline.drop(columns='case_type', inplace=True)

# # Perform a merge with indicator to find rows only in pending_baseline
# merged_df = pending_baseline.merge(df, on=['case_number', 'court'], how='left', indicator=True)
# # Filter to get only the rows that are in pending_baseline but not in df
# #only_in_pending_baseline = merged_df[merged_df['_merge'] == 'left_only']
# merged_df['resolved'] = merged_df.groupby('case_number')['concluded'].transform('max')
# merged_df['year_filed'] = merged_df['case_number'].str.split('/').str[-1]
# merged_df['year_filed'] = merged_df['year_filed'].astype(int)
# merged_df = merged_df.sort_values(by=['court', 'year_filed'], ascending=True)

In [None]:
# resolved_cases = analyze_court_outcomes(df, '2023-07-01', '2024-06-30', 'concluded')

In [82]:
# pivot of concluded cases by case_type per court 
resolved_pivot = df.pivot_table(index='court', columns='broad_case_type', values='concluded', aggfunc='sum', fill_value=0)

In [85]:
filed_pivot = df.pivot_table(index='court', columns='broad_case_type', values='registered', aggfunc='sum', fill_value=0)

In [86]:
filed_pivot

broad_case_type,Constitution Petition,Criminal Appeal,Criminal Application,Criminal Revision,Miscellaneous Application,Murder
court,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Kibera,11,73,65,153,0,5


In [90]:
# save data
column_order = ['Murder', 'Criminal Appeal', 'Criminal Application', 
                'Criminal Revision', 'Miscellaneous Application', 
                'Constitution Petition']

In [91]:
filed_pivot.to_csv('/home/stanoo/dcrt/data/HC/reports/kibera_filed_cases.csv', columns=column_order, index=True)
resolved_pivot.to_csv('/home/stanoo/dcrt/data/HC/reports/kibera_resolved_cases.csv', index=True)
#df.to_csv('/home/stanoo/dcrt/data-analysis/processed_df.csv', index=False)

In [92]:
# pivot of transfered cases by case_type per court 
transfer_pivot = df.pivot_table(index='court', columns='broad_case_type', values='transfer', aggfunc='sum', fill_value=0)

In [93]:
# save pivot of transfered cases by case_type per court 
transfer_pivot.to_csv('/home/stanoo/dcrt/data/HC/reports/kibera_transfer_cases.csv', columns=column_order, index=True)

In [94]:
df.groupby('court')['transfer'].sum().reset_index(name='total_transfered')

Unnamed: 0,court,total_transfered
0,Kibera,7


In [32]:
df.to_csv('/home/stanoo/dcrt/data-analysis/processed_df.csv', index=False)

In [67]:
court_productivity = df.pivot_table(index='court', columns='productivity', values='concluded', aggfunc='count', fill_value=0)
court_productivity = court_productivity.rename_axis(columns=None)

In [None]:
court_productivity = df.pivot_table(index='court', columns='productivity', values='concluded', aggfunc='count', fill_value=0)
court_productivity = court_productivity.rename_axis(columns=None)
judge_productivity = judge_df.pivot_table(index='judge_1', columns='productivity', values='concluded', aggfunc='sum', fill_value=0)
judge_productivity = judge_productivity.rename_axis(columns=None)

In [63]:
#remove trailing on judge names
df['judge_1'] = df['judge_1'].str.strip()

# create dataframe of judges only
judge_df_1 = df[df['judge_1'].isin(judge_names['judge_1'])]


In [64]:
court_productivity = judge_df_1.pivot_table(index='court', columns='productivity', values='concluded', aggfunc='count', fill_value=0)
court_productivity = court_productivity.rename_axis(columns=None)

In [69]:
court_productivity.to_csv(f'{file_path}/reports/judge_productivity.csv', index=True)

In [None]:
#pd.set_option('display.max_columns', 65)
df = df.sort_values('activity_date')
cutoff_date = pd.Timestamp('2024-06-30')  
processed_df = calculate_judgment_time(df)
judgement_df = determine_scheduled_judgment(processed_df, cutoff_date)
on_time_proportion = get_on_time_delivery_proportions(judgement_df)

In [77]:
# processed_df.to_csv(f'{file_path}/reports/hc_processed_df.csv', index=False)

In [78]:
# judgement_df.to_csv(f'{file_path}/reports/hc_judgement_df.csv', index=False)

In [27]:
# def determine_judgment_scheduling(df, cutoff_date):
#     judgment_date_set_outcomes = ["Judgment Date Given", "Judgment On Notice", "Judgment Date Set"]
#     judgment_delivered_outcomes = ["Grant Revoked", "Judgment Delivered", 
#                                    "Judgment Delivered- Acquittal", "Judgment Delivered- Case Closed", 
#                                    "Judgment Delivered- Convicted"]
    
#     result = df.copy()
    
#     result['judgment_delayed'] = False
#     result['set_date'] = pd.NaT
#     result['delivery_date'] = pd.NaT
    
#     grouped = df.groupby('case_number')
    
#     for case_number, group in grouped:
#         group = group.sort_values('activity_date')
        
#         judgment_set_rows = group[group['outcome'].isin(judgment_date_set_outcomes)]
#         for _, judgment_set_row in judgment_set_rows.iterrows():
#             set_date = judgment_set_row['activity_date']
#             scheduled_date = judgment_set_row['next_date']
            
#             # Skip this case if the scheduled date is after the cutoff date
#             if scheduled_date > cutoff_date:
#                 continue
            
#             result.loc[result['case_number'] == case_number, 'set_date'] = scheduled_date
            
#             # Find activities on or after the scheduled_date
#             subsequent_activities = group[group['activity_date'] >= scheduled_date]
            
#             if not subsequent_activities.empty:
#                 first_activity = subsequent_activities.iloc[0]
#                 if first_activity['activity_date'] == scheduled_date:
#                     if first_activity['outcome'] in judgment_delivered_outcomes:
#                         result.loc[result['case_number'] == case_number, 'delivery_date'] = scheduled_date
#                     else:
#                         # If the outcome is not judgment delivery on the scheduled date, it's delayed
#                         result.loc[result['case_number'] == case_number, 'judgment_delayed'] = True
#                 else:
#                     # If the first activity after scheduling is after the scheduled date, it's delayed
#                     result.loc[result['case_number'] == case_number, 'judgment_delayed'] = True
                
#                 # Check for eventual judgment delivery
#                 eventual_delivery = subsequent_activities[subsequent_activities['outcome'].isin(judgment_delivered_outcomes)]
#                 if not eventual_delivery.empty:
#                     result.loc[result['case_number'] == case_number, 'delivery_date'] = eventual_delivery.iloc[0]['activity_date']
#             else:
#                 # If there are no activities on or after the scheduled date, we assume it's delayed
#                 result.loc[result['case_number'] == case_number, 'judgment_delayed'] = True
            
#             # Break after processing the first judgment set date for this case
#             break
    
#     # Remove cases where set_date is NaT (which means they were skipped due to cutoff date)
#     result = result[result['set_date'].notna()]
    
#     return result

In [33]:
# case_level = scheduled_df.groupby(['court', 'case_number']).agg({
#     'set_date': lambda x: x.notna().any(),  # Was the case scheduled?
#     'delivery_date': lambda x: x.notna().any(),  # Was the judgment delivered?
#     'judgment_delayed': 'any'  # Was the judgment delayed for this case?
# }).reset_index()
# # Now aggregate to the judge level
# judgement_stats = case_level.groupby('court').agg({
#     'set_date': 'sum',  # Total scheduled cases
#     'delivery_date': 'sum',  # Total delivered judgments
#     'judgment_delayed': 'sum'  # Total delayed cases
# }).reset_index()


In [41]:
# def determine_judgment_scheduled(df, cutoff_date):
#     judgment_date_set_outcomes = ["Judgment Date Given", "Judgment On Notice", "Judgment Date Set"]
#     judgment_delivered_outcomes = ["Grant Revoked", "Judgment Delivered", 
#                                    "Judgment Delivered- Acquittal", "Judgment Delivered- Case Closed", 
#                                    "Judgment Delivered- Convicted"]
    
#     result = df.copy()
    
#     result['judgment_status'] = 'Not Scheduled'  # Default status
#     result['set_date'] = pd.NaT
#     result['delivery_date'] = pd.NaT
    
#     grouped = df.groupby('case_number')
    
#     for case_number, group in grouped:
#         group = group.sort_values('activity_date')
        
#         judgment_set_rows = group[group['outcome'].isin(judgment_date_set_outcomes)]
#         for _, judgment_set_row in judgment_set_rows.iterrows():
#             set_date = judgment_set_row['activity_date']
#             scheduled_date = judgment_set_row['next_date']
            
#             if scheduled_date > cutoff_date:
#                 continue
            
#             result.loc[result['case_number'] == case_number, 'set_date'] = scheduled_date
#             result.loc[result['case_number'] == case_number, 'judgment_status'] = 'Scheduled'
            
#             subsequent_activities = group[group['activity_date'] >= scheduled_date]
            
#             if not subsequent_activities.empty:
#                 first_activity = subsequent_activities.iloc[0]
#                 if first_activity['outcome'] in judgment_delivered_outcomes:
#                     result.loc[result['case_number'] == case_number, 'delivery_date'] = first_activity['activity_date']
#                     result.loc[result['case_number'] == case_number, 'judgment_status'] = 'Delivered'
#                 elif first_activity['activity_date'] > scheduled_date or first_activity['outcome'] not in judgment_delivered_outcomes:
#                     result.loc[result['case_number'] == case_number, 'judgment_status'] = 'Delayed'
#             elif cutoff_date >= scheduled_date:
#                 result.loc[result['case_number'] == case_number, 'judgment_status'] = 'Delayed'
            
#             break
    
#     result = result[result['set_date'].notna()]
    
#     return result

In [42]:
# scheduled_df = determine_judgment_scheduled(df, cutoff_date)

In [50]:
# ############################################
# def determine_judgment_scheduling_v2(df, cutoff_date):
#     judgment_date_set_outcomes = ["Judgment Date Given", "Judgment On Notice", "Judgment Date Set"]

    
#     judgment_delivered_outcomes = ["Grant Revoked", "Judgment Delivered", 
#                                    "Judgment Delivered- Acquittal", "Judgment Delivered- Case Closed", 
#                                    "Judgment Delivered- Convicted"]
    
#     result = df.copy()
    
#     result['judgment_status'] = 'Not Scheduled'  # Default status
#     result['set_date'] = pd.NaT
#     result['delivery_date'] = pd.NaT
    
#     grouped = df.groupby('case_number')
    
#     for case_number, group in grouped:
#         group = group.sort_values('activity_date')
        
#         judgment_set_rows = group[group['outcome'].isin(judgment_date_set_outcomes)]
#         for _, judgment_set_row in judgment_set_rows.iterrows():
#             set_date = judgment_set_row['activity_date']
#             scheduled_date = judgment_set_row['next_date']
            
#             if scheduled_date > cutoff_date:
#                 continue
            
#             result.loc[result['case_number'] == case_number, 'set_date'] = scheduled_date
#             result.loc[result['case_number'] == case_number, 'judgment_status'] = 'Scheduled'
            
#             # Check for judgment delivery between set_date and scheduled_date
#             early_delivery = group[(group['activity_date'] > set_date) & 
#                                    (group['activity_date'] < scheduled_date) & 
#                                    (group['outcome'].isin(judgment_delivered_outcomes))]
            
#             if not early_delivery.empty:
#                 result.loc[result['case_number'] == case_number, 'delivery_date'] = early_delivery.iloc[0]['activity_date']
#                 result.loc[result['case_number'] == case_number, 'judgment_status'] = 'Delivered'
#             else:
#                 subsequent_activities = group[group['activity_date'] >= scheduled_date]
                
#                 if not subsequent_activities.empty:
#                     first_activity = subsequent_activities.iloc[0]
#                     if first_activity['outcome'] in judgment_delivered_outcomes:
#                         result.loc[result['case_number'] == case_number, 'delivery_date'] = first_activity['activity_date']
#                         result.loc[result['case_number'] == case_number, 'judgment_status'] = 'Delivered'
#                     elif first_activity['activity_date'] > scheduled_date:
#                         result.loc[result['case_number'] == case_number, 'judgment_status'] = 'Delayed'
#                 elif cutoff_date >= scheduled_date:
#                     result.loc[result['case_number'] == case_number, 'judgment_status'] = 'Delayed'
            
#             break
    
#     result = result[result['set_date'].notna()]
    
#     return result

In [51]:
# scheduled_df_v2 = determine_judgment_scheduling_v2(df, cutoff_date)

In [55]:
# # Add a new column to categorize delivery timing
# def categorize_delivery(row):
#     if row['judgment_status'] == 'Delivered':
#         if row['delivery_date'] < row['set_date']:
#             return 'Early'
#         elif row['delivery_date'] == row['set_date']:
#             return 'On Time'
#         else:
#             return 'Late'
#     return row['judgment_status']

# scheduled_df_v2['delivery_category'] = scheduled_df_v2.apply(categorize_delivery, axis=1)


In [62]:
# # Group by judge and case_number to ensure each case is counted once per judge
# case_level = scheduled_df_v2.groupby(['court', 'case_number'])['delivery_category'].last().reset_index()

# # Now aggregate to the judge level
# judgememt_stats = case_level.groupby('court')['delivery_category'].value_counts().unstack(fill_value=0).reset_index()


In [None]:

# # Rename columns for clarity
# judge_stats.columns.name = None
# judge_stats = judge_stats.rename(columns={
#     'Scheduled': 'scheduled_cases',
#     'Early': 'early_deliveries',
#     'On Time': 'on_time_deliveries',
#     'Late': 'late_deliveries',
#     'Delayed': 'delayed_cases'
# })

# # Ensure all columns exist
# for col in ['scheduled_cases', 'early_deliveries', 'on_time_deliveries', 'late_deliveries', 'delayed_cases']:
#     if col not in judge_stats.columns:
#         judge_stats[col] = 0

# # Calculate total scheduled and delivered cases
# judge_stats['scheduled_cases'] = judge_stats['early_deliveries'] + judge_stats['on_time_deliveries'] + judge_stats['late_deliveries'] + judge_stats['delayed_cases']
# judge_stats['delivered_cases'] = judge_stats['early_deliveries'] + judge_stats['on_time_deliveries'] + judge_stats['late_deliveries']

# # Calculate proportions
# judge_stats['early_proportion'] = judge_stats['early_deliveries'] / judge_stats['scheduled_cases']
# judge_stats['on_time_proportion'] = judge_stats['on_time_deliveries'] / judge_stats['scheduled_cases']
# judge_stats['late_proportion'] = judge_stats['late_deliveries'] / judge_stats['scheduled_cases']
# judge_stats['delay_proportion'] = judge_stats['delayed_cases'] / judge_stats['scheduled_cases']
# judge_stats['delivered_as_scheduled_proportion'] = (judge_stats['on_time_deliveries'] + judge_stats['early_deliveries']) / judge_stats['scheduled_cases']

# # Sort by delay proportion in descending order
# judge_stats = judge_stats.sort_values('delay_proportion', ascending=False)

# # Display the results
# print(judge_stats)

In [102]:
cutoff_date = pd.Timestamp('2024-06-30')

In [105]:
def determine_scheduled_judgment(df, cutoff_date):
    judgment_date_set_outcomes = ["Judgment Date Given", "Judgment On Notice", "Judgment Date Set"]
    judgment_delivered_outcomes = ["Grant Revoked", "Judgment Delivered", 
                                   "Judgment Delivered- Acquittal", "Judgment Delivered- Case Closed", 
                                   "Judgment Delivered- Convicted"]
    
    df['judgment_status'] = 'Not Scheduled'
    df['set_date'] = pd.NaT
    df['delivery_date'] = pd.NaT
    df['delivery_category'] = ''
    
    # Filter rows with judgment set outcomes
    judgment_set_rows = df[df['outcome'].isin(judgment_date_set_outcomes)]
    
    # Only consider rows with a scheduled date before or equal to the cutoff date
    valid_scheduled = judgment_set_rows[judgment_set_rows['next_date'] <= cutoff_date]
    
    # For each valid schedule, find the earliest set date
    earliest_schedule = valid_scheduled.sort_values('activity_date').groupby('case_number').first().reset_index()
    
    # Create a dictionary to map case numbers to their schedule dates and statuses
    case_to_set_date = dict(zip(earliest_schedule['case_number'], earliest_schedule['next_date']))
    case_to_status = {case: 'Scheduled' for case in earliest_schedule['case_number']}
    
    # Update the result dataframe with schedule information
    df['set_date'] = df['case_number'].map(case_to_set_date)
    df['judgment_status'] = df['case_number'].map(case_to_status).fillna('Not Scheduled')
    df['delivery_category'] = df['case_number'].map(case_to_status).fillna('')
    
    # Filter rows with judgment delivered outcomes
    judgment_delivered_rows = df[df['outcome'].isin(judgment_delivered_outcomes)]
    
    # Find the first delivery date after set date
    for case_number, group in earliest_schedule.groupby('case_number'):
        set_date = group['next_date'].values[0]
        delivery = judgment_delivered_rows[(judgment_delivered_rows['case_number'] == case_number) & 
                                           (judgment_delivered_rows['activity_date'] >= set_date)]
        if not delivery.empty:
            delivery_date = delivery.sort_values('activity_date').iloc[0]['activity_date']
            df.loc[df['case_number'] == case_number, 'delivery_date'] = delivery_date
            df.loc[df['case_number'] == case_number, 'judgment_status'] = 'Delivered'
            
            if delivery_date <= set_date:
                df.loc[df['case_number'] == case_number, 'delivery_category'] = 'On Time'
            else:
                df.loc[df['case_number'] == case_number, 'delivery_category'] = 'Delayed'
        else:
            if cutoff_date >= set_date:
                df.loc[df['case_number'] == case_number, 'judgment_status'] = 'Delayed'
                df.loc[df['case_number'] == case_number, 'delivery_category'] = 'Delayed'
    
    return df[df['set_date'].notna()]



In [68]:
# def determine_judgment_scheduling_v3(df, cutoff_date):
#     judgment_date_set_outcomes = ["Judgment Date Given", "Judgment On Notice", "Judgment Date Set"]
#     judgment_delivered_outcomes = ["Grant Revoked", "Judgment Delivered", 
#                                    "Judgment Delivered- Acquittal", "Judgment Delivered- Case Closed", 
#                                    "Judgment Delivered- Convicted"]
    
#     result = df.copy()
    
#     result['judgment_status'] = 'Not Scheduled'  # Default status
#     result['set_date'] = pd.NaT
#     result['delivery_date'] = pd.NaT
#     result['delivery_category'] = ''
    
#     grouped = df.groupby('case_number')
    
#     for case_number, group in grouped:
#         group = group.sort_values('activity_date')
        
#         judgment_set_rows = group[group['outcome'].isin(judgment_date_set_outcomes)]
#         for _, judgment_set_row in judgment_set_rows.iterrows():
#             set_date = judgment_set_row['activity_date']
#             scheduled_date = judgment_set_row['next_date']
            
#             if pd.isnull(scheduled_date) or scheduled_date > cutoff_date:
#                 continue
            
#             result.loc[result['case_number'] == case_number, 'set_date'] = scheduled_date
#             result.loc[result['case_number'] == case_number, 'judgment_status'] = 'Scheduled'
#             result.loc[result['case_number'] == case_number, 'delivery_category'] = 'Scheduled'
            
#             # Check for judgment delivery after set_date
#             delivery = group[(group['activity_date'] >= set_date) & 
#                              (group['outcome'].isin(judgment_delivered_outcomes))]
            
#             if not delivery.empty:
#                 delivery_date = delivery.iloc[0]['activity_date']
#                 result.loc[result['case_number'] == case_number, 'delivery_date'] = delivery_date
#                 result.loc[result['case_number'] == case_number, 'judgment_status'] = 'Delivered'
                
#                 if delivery_date <= scheduled_date:
#                     result.loc[result['case_number'] == case_number, 'delivery_category'] = 'On Time'
#                 else:
#                     result.loc[result['case_number'] == case_number, 'delivery_category'] = 'Delayed'
#             else:
#                 if cutoff_date >= scheduled_date:
#                     result.loc[result['case_number'] == case_number, 'judgment_status'] = 'Delayed'
#                     result.loc[result['case_number'] == case_number, 'delivery_category'] = 'Delayed'
            
#             break
    
#     result = result[result['set_date'].notna()]
    
#     return result

In [69]:
# scheduled_df_v3 = determine_judgment_scheduling_v3(df, cutoff_date)

In [86]:
# def get_on_time_delivery_proportions_0(scheduled_cases):
#     # Group bjudgement_df.query('case_number =="Kiambu/HCCA/E291/2022"')y court and calculate statistics
#     court_stats = scheduled_cases.groupby('court').agg({
#         'case_number': 'count',
#         'delivery_category': lambda x: (x == 'On Time').sum()
#     }).rename(columns={
#         'case_number': 'total_scheduled',
#         'delivery_category': 'delivered_on_time'
#     })
    
#     # Calculate the proportion
#     court_stats['proportion_on_time'] = court_stats['delivered_on_time'] / court_stats['total_scheduled']
    
#     return court_stats

In [89]:
# # Calculate the proportion
# on_time_proportion = get_on_time_delivery_proportions(scheduled_df_v3)
# #on_time_proportion0 = get_on_time_delivery_proportions(scheduled_df_v3)
# #on_time_proportion = calculate_on_time_delivery_proportion(scheduled_df_v3)

In [43]:

# # Group by judge and case_number to ensure each case is counted once per judge
# case_level = scheduled_df.groupby(['court', 'case_number'])['judgment_status'].last().reset_index()

# case_level = scheduled_df.groupby(['court', 'case_number']).agg({
#     'set_date': lambda x: x.notna().any(),  # Was the case scheduled?
#     'delivery_date': lambda x: x.notna().any(),  # Was the judgment delivered?
#     'judgment_delayed': 'any'  # Was the judgment delayed for this case?
# }).reset_index()

# # Now aggregate to the judge level
# judge_stats = case_level.groupby('court')['judgment_status'].value_counts().unstack(fill_value=0).reset_index()


In [None]:

# # Rename columns for clarity
# judge_stats.columns.name = None
# judge_stats = judge_stats.rename(columns={
#     'Scheduled': 'scheduled_cases',
#     'Delivered': 'delivered_cases',
#     'Delayed': 'delayed_cases'
# })

# # Ensure all columns exist
# for col in ['scheduled_cases', 'delivered_cases', 'delayed_cases']:
#     if col not in judge_stats.columns:
#         judge_stats[col] = 0

# # Calculate total scheduled cases
# judge_stats['scheduled_cases'] = judge_stats['delivered_cases'] + judge_stats['delayed_cases']

# # Calculate proportions
# judge_stats['delay_proportion'] = judge_stats['delayed_cases'] / judge_stats['scheduled_cases']
# judge_stats['delivered_proportion'] = judge_stats['delivered_cases'] / judge_stats['scheduled_cases']

# # Sort by delay proportion in descending order
# judge_stats = judge_stats.sort_values('delay_proportion', ascending=False)


In [95]:

# # Group by judge and case_number to ensure each case is counted once per judge
# case_level = scheduled_df.groupby(['court', 'case_number']).agg({
#     'set_date': lambda x: x.notna().any(),  # Was the case scheduled?
#     'delivery_date': lambda x: x.notna().any(),  # Was the judgment delivered?
#     'judgment_delayed': 'any'  # Was the judgment delayed for this case?
# }).reset_index()

# # Now aggregate to the judge level
# judge_stats = case_level.groupby('court').agg({
#     'set_date': 'sum',  # Total scheduled cases
#     'delivery_date': 'sum',  # Total delivered judgments
#     'judgment_delayed': 'sum'  # Total delayed cases
# }).reset_index()

# # Rename columns for clarity
# judge_stats.columns = ['court', 'scheduled_cases', 'delivered_cases', 'delayed_cases']

# # Calculate the proportion of delayed cases among scheduled cases
# judge_stats['delay_proportion'] = judge_stats['delayed_cases'] / judge_stats['scheduled_cases']

# # Calculate the proportion of cases delivered on time
# judge_stats['delivered_on_time_proportion'] = (judge_stats['delivered_cases'] - judge_stats['delayed_cases']) / judge_stats['scheduled_cases']

# # Sort by delay proportion in descending order
# judge_stats = judge_stats.sort_values('delay_proportion', ascending=False)


In [78]:
# df = df.sort_values('activity_date')
# processed_df = calculate_judgment_time(df)
# scheduled_df = determine_judgment_scheduling(processed_df)

  result.loc[result['case_number'] == case_number, 'time_taken_days'] = time_taken_days


In [79]:
# # Group by judge and case_number to ensure each case is counted once per judge
# case_level = scheduled_df.groupby(['court', 'case_number']).agg({
#     'set_date': lambda x: x.notna().any(),  # Was the case scheduled?
#     'delivery_date': lambda x: x.notna().any(),  # Was the judgment delivered?
#     'judgment_delayed': 'any'  # Was the judgment delayed for this case?
# }).reset_index()

# # Now aggregate to the judge level
# judge_stats = case_level.groupby('court').agg({
#     'set_date': 'sum',  # Total scheduled cases
#     'delivery_date': 'sum',  # Total delivered judgments
#     'judgment_delayed': 'sum'  # Total delayed cases
# }).reset_index()

# # Rename columns for clarity
# judge_stats.columns = ['court', 'scheduled_cases', 'delivered_cases', 'delayed_cases']

# # Calculate the proportion of delayed cases among scheduled cases
# judge_stats['delay_proportion'] = judge_stats['delayed_cases'] / judge_stats['scheduled_cases']

# # Calculate the proportion of cases delivered as scheduled
# judge_stats['delivered_as_scheduled_proportion'] = (judge_stats['delivered_cases'] - judge_stats['delayed_cases']) / judge_stats['scheduled_cases']

# # Sort by number of delayed cases in descending order
# judge_stats = judge_stats.sort_values('delayed_cases', ascending=False)

In [82]:
# pd.set_option('display.max_columns', 65)

In [89]:
# filter where the set_date is not NaN 
set_df = scheduled_df[scheduled_df['set_date'].notna()]

In [90]:
set_df

(47096, 55)

In [None]:
scheduled_df = determine_judgment_scheduling(processed_df)
# Group by judge and case_number to ensure each case is counted once per judge
case_level = scheduled_df.groupby(['court', 'case_number']).agg({
    'first_scheduled_date': lambda x: x.notna().any(),  # Was the case scheduled?
    'judgment_delayed': 'any'  # Was the judgment delayed for this case?
}).reset_index()

# Now aggregate to the judge level
judge_stats = case_level.groupby('court').agg({
    'first_scheduled_date': 'sum',  # Total scheduled cases
    'judgment_delayed': 'sum'  # Total delayed cases
}).reset_index()

# Rename columns for clarity
judge_stats.columns = ['court', 'scheduled_cases', 'delayed_cases']

# Calculate the proportion of delayed cases among scheduled cases
judge_stats['delay_proportion'] = judge_stats['delayed_cases'] / judge_stats['scheduled_cases']

# Sort by number of delayed cases in descending order
judge_stats = judge_stats.sort_values('delayed_cases', ascending=False)

In [25]:
#concluded_cases = df.groupby('court').agg({'concluded':'sum'}).reset_index()
annual_df = df[
    (df['activity_date'] >= pd.to_datetime('2023-07-01')) &
    (df['activity_date'] <= pd.to_datetime('2024-06-30'))]

In [28]:

# save 
filed_cases.to_csv(f'{file_path}/reports/total_filed_cases.csv', index=False)
resolved_cases.to_csv(f'{file_path}/reports/total_resolved_cases.csv', index=False)


In [29]:
# group by court and month of activity_date month 
monthly_concluded_cases = df.groupby(['court', 'broad_case_type', df['activity_date'].dt.month_name()]).agg({'concluded':'sum'}).reset_index()
monthly_filed_cases = df.groupby(['court','broad_case_type',  df['activity_date'].dt.month_name()]).agg({'registered':'sum'}).reset_index()

In [None]:
monthly_concluded_cases

In [30]:
monthly_concluded_cases.to_csv(f'{file_path}/reports/all_concluded_cases.csv', index=False)
monthly_filed_cases.to_csv(f'{file_path}/reports/all_filed_cases.csv', index=False)

In [29]:
court_productivity.to_csv(f'{file_path}/reports/court_merit_productivity.csv', index=True)

In [28]:
# save data
column_order = ['Murder', 'Criminal Appeal', 'Criminal Application', 
                'Criminal Revision', 'Civil Suit', 'Civil Appeal', 'Miscellaneous Application', 
                'Constitution Petition', 'Judicial Review',
                  'Bankruptcy and Insolvency', 'Tax Appeal', 
                  'Adoption', 'Divorce', 'Probate Administration']
resolved_cases.to_csv(f'{file_path}/reports/annual_resolved_cases.csv', columns=column_order, index=True)
monthly_filed_cases.to_csv(f'{file_path}/reports/monthly_filed.csv', index=False)
monthly_concluded_cases.to_csv(f'{file_path}/reports/monthly_concluded.csv', index=False)
#court_productivity.to_csv(f'{file_path}/reports/court_productivity.csv', index=True)
#average_time_to_conclude.to_csv(f'{file_path}/reports/average_time_to_conclude.csv', index=True)
#resolved_within_pmmu_timeline.to_csv(f'{file_path}/reports/resolved_within_pmmu_timeline.csv', columns=column_order, index=True)
#proportion_resolved_within_timeline_per_court.to_csv(f'/{file_path}/reports/proportion_resolved_within_timeline.csv')
#judge_productivity.to_csv(f'{file_path}/reports/judge_productivity.csv', index=False)
#adjourned_per_court.to_csv(f'{file_path}/reports/adjourn_per_court.csv', index=False)
#adjourn_proportion.to_csv(f'{file_path}/reports/adjourn_proportion.csv', index=False)

In [29]:
# save data
column_order = ['Murder', 'Criminal Appeal', 'Criminal Application', 
                'Criminal Revision', 'Civil Suit', 'Civil Appeal', 'Miscellaneous Application', 
                'Constitution Petition', 'Judicial Review',
                  'Bankruptcy and Insolvency', 'Tax Appeal', 
                  'Adoption', 'Probate Administration']
filed_cases.to_csv(f'{file_path}/reports/annual_filed_cases.csv', columns=column_order, index=True)

### Determine Productivity per judge (Merit/Non Merit)

In [64]:
# df.groupby('judge_1')['concluded'].sum().sort_values(ascending=False).reset_index(name='count').to_csv(f'{file_path}/reports/judge_productivity.csv', index=False)

In [71]:
#judges = df['judge_1'].unique().tolist()
#judge_productivity = df.groupby('judge_1')['concluded'].sum().sort_values(ascending=False).reset_index(name='count')
#judge_productivity.to_csv(f'{output_path}/judge_productivity.csv', index=False)
#judge_df.groupby(['judge_1', 'court'])['court'].count()

In [72]:
# # productivity per court
# court_productivity = df.pivot_table(index='court', columns='productivity', values='concluded', aggfunc='sum', fill_value=0)
# court_pivot = court_productivity.rename_axis(columns=None)
# # matters handled by judge per court
# matters_handled = judge_df.groupby(['court','judge_1'])['court'].count().reset_index(name='count')
# # Resolved cases by judge per court
# judge_court_productivity = judge_df.groupby(['judge_1', 'court'])['concluded'].sum().reset_index(name='count')

## Pending Cases Analysis

In [105]:
unique_df_cases = merged_df[merged_df['resolved'] != 1]

In [106]:
unique_df = unique_df_cases.drop_duplicates(subset='case_number', keep='first')

In [107]:
unique_df = unique_df.reset_index(drop=True)

In [111]:
# Filter to get only the rows that are in pending_baseline but not in df
only_in_baseline = unique_df[unique_df['_merge'] == 'left_only']

In [None]:
only_in_baseline.groupby('court').size()

In [260]:
# def preprocess_dataframe(df):
#     # Sort DataFrame by 'activity_date' in descending order
#     df_sorted = df.sort_values(by=['court', 'year_filed'], ascending=False)
    
#     # Drop duplicates based on 'number_on_file' while keeping the first occurrence
#     df_unique_cases = df_sorted.drop_duplicates(subset='case_number', keep='first')
    
#     # Filter unresolved cases
#     unique_unresolved_cases = df_unique_cases[df_unique_cases['resolved'] == 0]
    
#     # Reset index
#     unique_unresolved_cases = unique_unresolved_cases.reset_index(drop=True)
    
#     return unique_unresolved_cases
# processed_df = preprocess_dataframe(merged_df)

In [110]:
unique_df.shape

(42628, 53)

In [None]:
unique_df.groupby('court')['court'].value_counts().sort_values(ascending=False)

In [None]:
processed_df.groupby('court')['court'].value_counts().sort_values(ascending=False)

In [None]:
# df = ['court', 'comingfor', 'outcome', 'activity_date', 'filed_date', 'activity_date_year', 'activity_date_month', 'nature', 'case_category', 'case_number']

In [None]:
# Convert both dataframes to sets of 'case_number' values
pending_baseline_cases = set(pending_baseline['case_number'].unique())

df_cases = set(df['case_number'].unique())

# Find the cases present in df but not in df_a
cases_not_in_pending = df_cases.difference(pending_baseline_cases)

# Print the cases
print("Cases present in df but not in df_a:")

In [193]:
cases_not_in_pending = pd.DataFrame(cases_not_in_pending)

In [194]:
cases_not_in_pending.shape

(85181, 1)

In [132]:
combined_df = pd.merge(df, pending_baseline, on='case_number', how='left', indicator=True)

In [133]:
combined_df = combined_df[combined_df['_merge'] == 'left_only'].drop(columns='_merge')

In [None]:
combined_df

In [None]:
combined_df = combined_df.sort_values(by=['court', 'activity_date', 'case_number'], ascending=True)

In [None]:
combined_df

In [142]:
# Convert df_a['case_number'] to a set
df_a_cases = set(pending_baseline['case_number'].unique())


In [146]:
filtered_df_closed = df[(df['concluded'] == 1) & (df['case_number'].isin(df_a_cases))]

In [None]:
filtered_df_closed

In [143]:
# Filter df based on 'outcome' == 1 and 'case_number' not in df_a_cases
filtered_df = df[(df['concluded'] == 1) & ~(df['case_number'].isin(df_a_cases))]

In [None]:
filtered_df.groupby('court')['case_number'].count().sort_values(ascending=False).reset_index(name='count')

In [None]:
filtered_df['resolved'] = filtered_df.groupby('case_number')['concluded'].transform('max')

In [152]:
  # Drop duplicates based on 'number_on_file' while keeping the first occurrence
filtered_df = filtered_df.drop_duplicates(subset='case_number', keep='first')
    

In [156]:
nakuru = filtered_df[filtered_df['court'] == 'Milimani Anti Corruption and Economic Crimes']

In [157]:
nakuru.to_csv(f'{output_path}/nakuru.csv', index=False)

In [158]:
pending_baseline.to_csv(f'{output_path}/pending_baseline_2.csv', index=False)

In [None]:
filtered_df.groupby('court')['case_number'].count().sort_values(ascending=False).reset_index(name='count')

In [145]:
pending_baseline[pending_baseline['case_number'] == 'Busia/HCCCMISC/E026/2023']

Unnamed: 0,court,case_number,case_type


#### Backlog determination

In [None]:
#import pandas as pd
# raw data of pending cases(next period baseline)
#df = pd.read_csv(f'{output_path}/raw_pending_cases.csv')

In [None]:
# The date for computation of backlog ought to be the end of the quarter
quarter_end = pd.to_datetime('2024-03-31')
df['end_date'] = quarter_end

In [25]:

# Define a function to categorize ages
def categorize_age(age):
    if age <= 365:  # 0-1 years
        return '0-1 years'
    elif age <= 3 * 365:  # 1-3 years
        return '1-3 years'
    else:
        return 'Over 3 years'


In [35]:
#df['filed_date'] = pd.to_datetime(df['filed_date'], format='%Y-%m-%d')
processed_df['pending_age'] = (processed_df['end_date'] - processed_df['filed_date']).dt.days
processed_df = processed_df[processed_df['pending_age'] >= 0]


In [None]:
processed_df['age_group'] = processed_df['pending_age'].apply(categorize_age)
backlog_category = processed_df.pivot_table(index='court', columns='age_group', values='unique_number', aggfunc='count', fill_value=0)

# Save the data to csv

In [249]:
column_order = ['Murder', 'Criminal Appeal', 'Criminal Application', 
                'Criminal Revision', 'Civil Suit', 'Civil Appeal', 'Miscellaneous Application', 
                'Constitution Petition', 'Judicial Review',
                  'Bankruptcy and Insolvency', 'Tax Appeal', 
                  'Adoption', 'Divorce', 'Probate Administratio']
# Total missing per month
missing_per_month.to_csv(f'{output_path}/missing_outcomes_per_month.csv', index=False)

# Total filed cases
filed_cases.to_csv(f'{output_path}/filed_cases.csv', columns=column_order, index=True)

# Total concluded cases
concluded_cases.to_csv(f'{output_path}/concluded_cases.csv', columns=column_order, index=True)

# Total filed, concluded and CCR per month
monthly_cases.to_csv(f'{output_path}/monthly_cases.csv', index=False)

# Productivity per court
df_pivot.to_csv(f'{output_path}/productivity.csv', index=True)

# Average time to conclude
average_time_to_conclude.to_csv(f'{output_path}/average_time_to_conclude.csv', index=True)

# Time lines
proportion_resolved_within_timeline_per_court.to_csv(f'{output_path}/proportion_resolved_within_timeline.csv', index=True)

judge_pivot.to_csv(f'{output_path}/judge_productivity.csv', index=True)

judge_court_productivity.to_csv(f'{output_path}/judge_court_productivity.csv', index=False)

court_pivot.to_csv(f'{output_path}/court_productivity.csv', index=True)

matters_handled.to_csv(f'{output_path}/judge_matters_handled.csv', index=False)

adjourned_per_court.to_csv(f'{output_path}/adjourned_per_court.csv', index=False)

adjourn_proportion.to_csv(f'{output_path}/adjourn_proportion.csv', index=False)

pending_cases.to_csv(f'{output_path}/hc_pending_cases.csv', index=True)

backlog_category.to_csv(f'{output_path}/hc_backlog.csv', index=True)

# raw data of pending cases(next period baseline)
processed_df.to_csv(f'{output_path}/raw_pending_cases.csv', index=False)


In [84]:
proportion_resolved_within_timeline_per_court.to_csv(f'{output_path}/proportion_resolved_within_timeline.csv', index=True)

In [92]:
### COA Saving
output_path = '/home/arch/devel/data/Report'
'''
column_order = ['murder', 'criminal_appeal', 'criminal_application', 
                'revision', 'suit', 'civil_appeal', 'misc_application', 
                'constitutional_petition', 'judicial_review',
                  'bankruptcy_and_insolvency', 'tax_appeal', 
                  'adoption', 'divorce', 'probate_and_admin']

'''
column_order  = ['COA Criminal Appeal', 'Criminal Applications', 'Civil Appeal', 'Civil Applications']
# Total missing per month
missing_per_month.to_csv(f'{output_path}/missing_outcomes_per_month.csv', index=False)

# Total filed cases
filed_cases.to_csv(f'{output_path}/filed_cases.csv', index=True)

# Total concluded cases
concluded_cases.to_csv(f'{output_path}/concluded_cases.csv', index=True)

# Total filed, concluded and CCR per month
monthly_cases.to_csv(f'{output_path}/monthly_cases.csv', index=False)

# Productivity per court
df_pivot.to_csv(f'{output_path}/productivity.csv', index=True)

# Average time to conclude
average_time_to_conclude.to_csv(f'{output_path}/average_time_to_conclude.csv', index=True)

# Time lines
proportion_resolved_within_timeline_per_court.to_csv(f'{output_path}/proportion_resolved_within_timeline.csv', index=True)


court_pivot.to_csv(f'{output_path}/court_productivity.csv', index=True)


adjourned_per_court.to_csv(f'{output_path}/adjourned_per_court.csv', index=False)

adjourn_proportion.to_csv(f'{output_path}/adjourn_proportion.csv', index=False)


In [None]:
adjourned_per_court.to_csv(f'{output_path}/KIBERA_adjourned_per_court.csv', index=False)

adjourn_proportion.to_csv(f'{output_path}/KIBERA_adjourn_proportion.csv', index=False)

In [137]:
proportion_resolved_within_timeline_per_court.to_csv(f'{output_path}/proportion_resolved_within_timeline.csv', index=True)

In [138]:
df.to_csv(f'{output_path}/coa_cases.csv', index=False)

### Exploratory analysis

In [177]:
def get_cases_per_quarter(df, column):
    # Group by quarters and count cases
    cases_per_quarter = df.groupby(pd.Grouper(key='activity_date', freq='QE'))[column].sum()

    # Reset index to make the quarters a column
    cases_per_quarter = cases_per_quarter.reset_index()

    # Rename the columns
    cases_per_quarter.columns = ['quarter', f'cases_{column}']

    return cases_per_quarter

In [105]:
def get_case_nature_per_quarter(df: pd.DataFrame, column: str, nature: str):
   
    # Set up date range for quarters
    quarters = df['activity_date'].dt.to_period('Q')

    # Filter cases where concluded == 1 and case nature == 'civil'
    filtered_df = df[(df[column] == 1) & (df['nature'] == nature)]

    # Group by quarter and count cases
    cases_per_quarter = filtered_df.groupby(quarters).size()

    return cases_per_quarter

In [7]:
# drop if activity_date is null
df = df.dropna(subset=['activity_date'])

In [178]:
quarterly_adjourned =  get_cases_per_quarter(df, 'adjourned')

In [181]:
quarterly_adjournable =  get_cases_per_quarter(df, 'adjournable')

In [182]:
quarterly_adjournable

Unnamed: 0,quarter,cases_adjournable
0,2023-09-30,87671
1,2023-12-31,102883
2,2024-03-31,86448


In [None]:
quarterly_concluded =  get_cases_per_quarter(df, 'concluded')

In [204]:
quarterly_concluded =  get_cases_per_quarter(df, 'concluded')
quarterly_registered =  get_cases_per_quarter(df, 'registered')


In [207]:
# merge quarterly_concluded and quarterly_registered on quarter column
merged_quarterly = pd.merge(quarterly_concluded, quarterly_registered, on='quarter')

In [211]:
df.drop('court', inplace=True, axis=1)

In [None]:
df.groupby(['court', 'month'])['concluded'].sum().reset_index(name='count')

In [106]:
quarterly_concluded_civil = get_case_nature_per_quarter(df, 'concluded', 'Civil')
quarterly_registered_civil = get_case_nature_per_quarter(df, 'registered', 'Civil')

quarterly_concluded_criminal = get_case_nature_per_quarter(df, 'concluded', 'Criminal')
quarterly_registered_criminal = get_case_nature_per_quarter(df, 'registered', 'Criminal')

In [None]:
# merge quarterly_concluded_civil quarterly_registered_civil quarterly_concluded_criminal quarterly_registered_criminal on quarter column 
merged_civil_criminal = pd.merge(quarterly_concluded_civil, quarterly_registered_civil, on='quarter')
merged_civil_criminal = pd.merge(merged_civil_criminal, quarterly_concluded_criminal, on='quarter')
merged_civil_criminal = pd.merge(merged_civil_criminal, quarterly_registered_criminal, on='quarter')

In [114]:
merged_civil_criminal.to_csv(f'{output_path}/quarterly_case_nature.csv', index=True)

TODO
### adjourn by event
### adjourn by case type

### backlog by case type
### check if there a courts that resolved more cases than pending

In [None]:
#df[(df['court'] == 'Milimani Commercial and Tax') & (df['activity_date_year'] == 2024)].groupby('judge_1').size().reset_index(name='count')
# group by case_type if outcome == 'Ruling Delivered- Case Closed' 
#df[df['productivity'] == 'judgment'].groupby('case_category').size().reset_index(name='count')
#court_productivity = df[df['productivity'] == 'ruling']
#court_productivity_pivot = court_productivity.pivot_table(index='court', columns='case_category', values='concluded', aggfunc='sum', fill_value=0)
#productivity_pivot = court_productivity_pivot.rename_axis(columns=None)
#df[df['productivity'] == 'ruling'].groupby('case_category').size().reset_index(name='count')
#df[df['outcome'] == 'Ruling Delivered- Accused Discharged'].groupby(['court', 'case_category'])['court'].size().reset_index(name='count')


In [44]:
pending_count = pending_df.groupby(['court', 'case_category']).size().reset_index(name='pending_count')

In [46]:
pending_pivot = pending_count.pivot_table(index='court', columns='case_category', values='pending_count', fill_value=0)

In [67]:
pending_pivot.to_csv('/home/arch/devel/data/pending_case_types.csv', index=True)

In [18]:
def categorize_concluded_cases(df):
    """
    Categorize concluded cases into '1-3 year' and 'over 3 year' based on age column.

    Args:
        df (pandas.DataFrame): DataFrame containing the data.

    Returns:
        pandas.DataFrame: DataFrame with an additional column 'age_category'.
    """
    # Filter only concluded cases
    df_concluded = df[df['concluded'] == 1]

    # Calculate age in years
    df_concluded['age_years'] = df_concluded['age'] / 365

    # Categorize based on age
    df_concluded.loc[:, 'age_category'] = df_concluded['age_years'].apply(lambda x: '1-3 year' if 1 <= x <= 3 else 'over 3 year')

    # Merge back to original DataFrame
    df = pd.merge(df, df_concluded[['case_number', 'age_category']], on='case_number', how='left')

    return df


In [None]:
df = categorize_concluded_cases(df)

In [None]:
# groupby judge_1 if court == 'Milimani Commercial and Tax' 
#df(df['court'] == 'Milimani Commercial and Tax').groupby([ 'judge_1',])['outcome'].size().reset_index(name='count')
df[df['court'] == 'Meru'].groupby(['judge_1'])['outcome'].size().reset_index(name='count')

In [None]:
# group by court if case_type == 'Murder Case' and registered == 1
df[(df['case_type'] == 'Criminal Appeal') & (df['outcome'] == 'File Transferred')].groupby('court').size().reset_index(name='count')

In [None]:
['comingfor', 'outcome',  'male_applicant', 'female_applicant',
       'organization_applicant', 'male_defendant', 'female_defendant',
       'organization_defendant', 'legalrep', 'court',
       'activity_date', 'filed_date', 'activity_date_year',
       'activity_date_month', 'nature', 'case_category', 'case_number',
       'registered', 'concluded', 'productivity', 'age', 'time_lines',
       'adjourned', 'adjournable']

In [184]:
   # Drop duplicates based on 'number_on_file' while keeping the first occurrence
df_unique_cases = df.drop_duplicates(subset='case_number', keep='last')
    

In [190]:
df_unique_cases.shape

(103156, 53)

In [32]:
filed_df = df[df['registered'] == 1]

In [None]:
filed_df

In [57]:
male_applicants = filed_df[(filed_df['male_applicant'] >= 1) & (filed_df['broad_case_type'] != 'Murder')].groupby('broad_case_type').size().reset_index(name='count')

In [56]:
female_applicants = filed_df[(filed_df['female_applicant'] > 0) & (filed_df['broad_case_type'] != 'Murder')].groupby('broad_case_type').size().reset_index(name='count')

In [36]:
male_accused = filed_df[(filed_df['male_defendant'] == 1) & (filed_df['broad_case_type'] == 'Murder')].groupby('broad_case_type').size().reset_index(name='count')

In [37]:
male_murder_defendants = filed_df[(filed_df['male_defendant'] == 1) & (filed_df['broad_case_type'] == 'Murder')].groupby('broad_case_type').size().reset_index(name='count')

In [38]:
female_murder_defendants = filed_df[(filed_df['female_defendant'] == 1) & (filed_df['broad_case_type'] == 'Murder')].groupby('broad_case_type').size().reset_index(name='count')

In [58]:
applicants = pd.merge(female_applicants, male_applicants, on='broad_case_type', how='outer', suffixes=('_female', '_male'))

In [45]:
respodents = pd.merge(female_murder_defendants, male_murder_defendants, on='broad_case_type', how='outer', suffixes=('_female', '_male'))

In [55]:
df[(df['broad_case_type'] == 'Murder') & (df['female_defendant'] > 0) & (df['registered'] == 1)].groupby('broad_case_type').size().reset_index(name='count')

Unnamed: 0,broad_case_type,count
0,Murder,202


In [50]:
df.columns

Index(['court', 'date_dd', 'date_mon', 'date_yyyy', 'caseid_type', 'caseid_no',
       'filed_dd', 'filed_mon', 'filed_yyyy', 'original_court',
       'original_code', 'original_number', 'original_year', 'case_type',
       'judge_1', 'judge_2', 'judge_3', 'judge_4', 'judge_5', 'judge_6',
       'judge_7', 'comingfor', 'outcome', 'reason_adj', 'next_dd', 'next_mon',
       'next_yyyy', 'male_applicant', 'female_applicant',
       'organization_applicant', 'male_defendant', 'female_defendant',
       'organization_defendant', 'legalrep', 'applicant_witness',
       'defendant_witness', 'custody', 'other_details', 'activity_date',
       'filed_date', 'next_date', 'nature', 'broad_case_type', 'case_number',
       'concluded', 'registered', 'productivity', 'age', 'time_lines',
       'adjourned', 'adjournable'],
      dtype='object')

In [59]:
applicants.to_csv(f'{file_path}/reports/applicants_gender.csv', index=False)

In [None]:
fig, ax = plt.subplots(figsize=(8, 4))

bplot = ax.boxplot(pending_baseline['total_events'],
                     vert=False,  # Set vert to False for a vertical box plot
                     patch_artist=True) 

# Add labels for median, mean, etc.
# Add labels for median, lower quartile, upper quartile, and outliers
for line in bplot.keys():
    if line == 'medians':
        for median in bplot[line]:
            ax.text(median.get_xdata()[0], median.get_ydata()[0],
                    f'{median.get_xdata()[0]:.1f}', ha='center', va='bottom', color='red', fontsize=10)
    elif line == 'whiskers':
        for whisker in bplot[line]:
            ax.text(whisker.get_xdata()[0], whisker.get_ydata()[0],
                    f'{whisker.get_xdata()[0]:.1f}', ha='center', va='bottom', color='green', fontsize=10)
            ax.text(whisker.get_xdata()[1], whisker.get_ydata()[1],
                    f'{whisker.get_xdata()[1]:.1f}', ha='center', va='bottom', color='green', fontsize=10)
    elif line == 'fliers':
        for fliers in bplot[line]:
            for f in fliers.get_xdata():
                ax.text(f, fliers.get_ydata()[0], f'{f:.1f}', ha='center', va='bottom', color='blue', fontsize=10)


ax.set_title('Workload Analysis')
ax.set_xlabel('Cases per Judge')

# Set face color for the box plot
colors = ['lightgreen']
for patch, color in zip(bplot['boxes'], colors):
    patch.set_facecolor(color)

plt.show()

data = [pending_baseline['total_events']]

# Create a box plot with horizontal orientation
fig, ax = plt.subplots()
bplot = ax.boxplot(data, vert=True, patch_artist=True)

# Add labels for median, mean, etc.
for line in bplot.keys():
    if line == 'medians':
        for median in bplot[line]:
            # Add label for median
            ax.text(median.get_xdata()[0], median.get_ydata()[0],
                    f'{median.get_xdata()[0]:.2f}', ha='center', va='bottom', color='red', fontsize=8)
    elif line == 'fliers':
        for fliers in bplot[line]:
            # Add labels for outliers
            for f in fliers.get_xdata():
                ax.text(f, fliers.get_ydata()[0], f'{f:.2f}', ha='center', va='bottom', color='blue', fontsize=8)

# Set labels and title
ax.set_xlabel('Workload')
ax.set_ylabel('Values')
ax.set_title('Workload Analysis')

plt.show()

In [35]:
# create a list of circuits
circuit_court = [
'Kapenguria',
'Garsen',
'Lodwar',
'Nyahururu',
'Kwale',
'Eldama',
'Iten',
'Kilgoris',
'Mandera',
'Maralal']



In [33]:
non_mobile = [
'Milimani Civil',
'Meru',
'Kericho']

In [34]:
# keep only the rows where 'court' is in the list
df2 = df1[df1['court'].isin(non_mobile)]

In [37]:
# keep only the rows where 'court' is in the list
df1 = df[df['court'].isin(circuit_court)]

In [43]:
judges = df2['judge_1'].unique()

In [None]:
df2.groupby('court').size().reset_index(name='count')

Hon Justice Jesse Nyaga
Hon Justice Lucy Gitari
Hon Justice Joseph Karanja


In [48]:
df2.groupby('judge_1').size().reset_index(name='judge')

Unnamed: 0,judge_1,judge
0,"Gitari, Lucy",1146
1,"Karanja, Joseph R.",554
2,"Njagi, Jesse Nyaga",1042


In [45]:
circut_judge = ['Njagi, Jesse Nyaga', 'Gitari, Lucy', 'Karanja, Joseph R.']

In [47]:
df2 = df2[df2['judge_1'].isin(circut_judge)]

In [50]:
df2.groupby(['judge_1', 'court'])['concluded'].sum().reset_index(name='count')

Unnamed: 0,judge_1,court,count
0,"Gitari, Lucy",Meru,101
1,"Karanja, Joseph R.",Kericho,105
2,"Njagi, Jesse Nyaga",Meru,0
3,"Njagi, Jesse Nyaga",Milimani Civil,98


In [51]:
# combine df1 and df2
combined_df = pd.concat([df1, df2])

In [53]:
combined_df.to_csv('court_circuits.csv', index=False)

In [52]:
combined_df.groupby(['judge_1', 'court'])['concluded'].sum().reset_index(name='count')

Unnamed: 0,judge_1,court,count
0,"Gikonyo, Francis Muthuku",Kilgoris,70
1,"Kipkorir, Weldon Kipyegon",Kapenguria,0
2,Anne Ong'injo,Garsen,0
3,Anne Ong'injo,Kwale,33
4,"Auka, Christine Kemuma",Kwale,0
...,...,...,...
62,Wambua.josephine Wayua,Kapenguria,0
63,"Wananda,robert Anuro",Iten,64
64,"Wananda,robert Anuro",Kapenguria,0
65,"Wanyanga, James Helekia Sijenyi",Maralal,0
