In [None]:
import pandas as pd
import os
from pathlib import Path
import pandas as pd
import os
from datetime import datetime
import logging
import matplotlib.pyplot as plt

In [None]:
# Column labels passed to the Excel reader, in sheet order.
# NOTE(review): 'defedant_witness' looks like a typo for 'defendant_witness';
# it is kept as-is because it is a runtime column key.
col_names = [
    # date_* parts
    'date_dd', 'date_mon', 'date_yyyy',
    # caseid_* parts
    'caseid_type', 'caseid_no',
    # filed_* parts
    'filed_dd', 'filed_mon', 'filed_yyyy',
    # original_* fields and case type
    'original_court', 'original_code', 'original_number', 'original_year',
    'case_type',
    # judge slots 1-7
    'judge_1', 'judge_2', 'judge_3', 'judge_4', 'judge_5', 'judge_6', 'judge_7',
    # listing purpose, result and adjournment reason
    'comingfor', 'outcome', 'reason_adj',
    # next_* parts
    'next_dd', 'next_mon', 'next_yyyy',
    # applicant columns
    'male_applicant', 'female_applicant', 'organization_applicant',
    # defendant columns
    'male_defendant', 'female_defendant', 'organization_defendant',
    # remaining columns
    'legalrep', 'applicant_witness', 'defedant_witness',
    'custody', 'other_details',
]

In [None]:
def generate_file_paths(root_folder, start_year, end_year):
    """Collect Excel workbook paths whose year folder falls in a year range.

    The tree under ``root_folder`` is walked recursively. For a workbook stored
    at ``<...>/<year>/<subfolder>/<name>.xlsx`` the name of the ``<year>``
    folder (the parent of the folder holding the file) is parsed as an integer
    and the file is kept when ``start_year <= year <= end_year``.

    Args:
        root_folder: Directory to search.
        start_year (int): First year to include (inclusive).
        end_year (int): Last year to include (inclusive).

    Returns:
        list[str]: Paths of the matching workbooks. Folders and file names are
        visited in sorted order so the result is reproducible between runs.
    """
    logging.info("Entering generate_file_paths")
    file_paths = []
    for root, dirs, files in os.walk(root_folder):
        # os.walk honours in-place edits of `dirs`; sorting makes the
        # traversal order (and therefore the output order) deterministic.
        dirs.sort()

        # Keep .xlsx workbooks (any letter case) but skip the "~$name.xlsx"
        # lock files Excel creates next to an open workbook: they are not
        # readable workbooks.
        workbooks = sorted(
            name for name in files
            if name.lower().endswith(".xlsx") and not name.startswith("~$")
        )
        if not workbooks:
            continue

        # The year is a property of the folder, not of each file: parse it
        # once per folder so a non-year folder is reported a single time.
        try:
            year = int(os.path.basename(os.path.dirname(root)))
        except ValueError as ve:
            logging.error(f"Error processing folder {root}: {ve}")
            continue

        if start_year <= year <= end_year:
            file_paths.extend(os.path.join(root, name) for name in workbooks)

    logging.info("Exiting generate_file_paths successfully")
    return file_paths

In [None]:
logging.basicConfig(level=logging.INFO)  # Set logging level to INFO

def process_file(file_path: str, col_names: list) -> pd.DataFrame | None:
    """
    Processes a single Excel file, extracting the court code and relevant data.

    Args:
        file_path (str): The path to the Excel file.
        col_names (list): List of column names for DataFrame.

    Returns:
        pd.DataFrame | None: A DataFrame containing the processed data, or None if an error occurred.
    """

    try:
        court_name = os.path.split(file_path)[0].split("\\")[-3]        
        try:
            df = pd.read_excel(file_path, header=4, names=col_names)  

            df = df.assign(court_name=court_name)
            df.reset_index(drop=True, inplace=True)
            
            # Print processing information
            logging.info(f"Processed file: {file_path}")
            
            return df
        except Exception as e:  
            print(f"Error processing file {file_path}: {e}")
            return None

    except ValueError:
        print(f"Skipping file {file_path} as code is not an integer.")
        return None


In [None]:
def process_files(file_paths: list[str], col_names: list[str]) -> pd.DataFrame:
    """
    Read the given Excel files one after another and stack them into a
    single DataFrame, logging which files were processed and which were not.

    Args:
        file_paths (list[str]): A list of file paths to process.
        col_names (list[str]): List of column names for DataFrame.

    Returns:
        pd.DataFrame: The combined rows of every file that could be read.
        When no file could be read (or ``file_paths`` is empty) an empty
        DataFrame with columns ``col_names + ['court_name']`` is returned
        instead of letting ``pd.concat`` raise.
    """

    frames = []
    for file_path in file_paths:
        result = process_file(file_path, col_names)

        if result is None:
            logging.warning(f"Unprocessed file: {file_path}")
            continue

        logging.info(f"Processed file: {file_path}")
        frames.append(result)

    if not frames:
        # pd.concat raises ValueError on an empty list; give callers an empty
        # frame with the expected columns so downstream cells still run.
        logging.warning("No files were processed; returning an empty DataFrame.")
        return pd.DataFrame(columns=[*col_names, "court_name"])

    return pd.concat(frames, ignore_index=True)


In [None]:
# Root of the data tree and the inclusive year range to load.
# NOTE(review): root_folder is an absolute, machine-specific Windows path; the
# notebook cannot find its data on another machine unless this is edited.
root_folder = r"C:\Users\Gichimu\Desktop\Data Science\utility\data\coa"
start_year = 2023
end_year = 2024
# Discover every workbook whose year folder lies in [start_year, end_year].
file_paths = generate_file_paths(root_folder, start_year, end_year)

In [None]:
# Read every discovered workbook and stack the rows into one DataFrame.
df = process_files(file_paths, col_names)

In [None]:
# Case types that keep their own label; every other case_type value is
# bucketed under 'Civil Applications'.
broad_case_types = [
    'COA  Criminal Appeal',
    'Civil Appeal',
    'Court of Appeal Election Petition Appeal',
    'Criminal Applications',
]
keeps_own_label = df['case_type'].isin(broad_case_types)
df.loc[:, 'broad_case_type'] = df['case_type'].where(keeps_own_label, 'Civil Applications')

In [None]:
# Persist the combined data (without the index) to the working directory.
# NOTE(review): the file name says 22_23 while the configured year range is
# 2023-2024 — confirm the intended label.
df.to_csv('raw_coa-22_23.csv', index=False)

In [None]:
# Replace df with the contents of a saved CSV.
# NOTE(review): this reads 'coa-22_23.csv' from an absolute local path, which
# is not the 'raw_coa-22_23.csv' written by the export cell — confirm which
# file is intended.
df = pd.read_csv(r'C:\Users\Gichimu\Desktop\Data Science\utility\utility\processor\coa-22_23.csv')

## Get bench sittings

In [None]:
def drop_nan_lists(column):
    """Return the items of ``column`` that carry no missing values.

    A list item is kept only when none of its members is NaN/None; any other
    item is kept when it is not NaN/None itself. Order is preserved.
    """
    def _is_complete(entry):
        if isinstance(entry, list):
            return all(not pd.isna(member) for member in entry)
        return not pd.isna(entry)

    return [entry for entry in column if _is_complete(entry)]

In [None]:
# Run drop_nan_lists over each row's judge_list value, removing the entries
# that helper rejects (NaN members, or inner lists containing NaN).
# NOTE(review): lean_df and its 'judge_list' column are not created in any
# visible cell — on a fresh kernel this cell raises NameError; confirm where
# they come from.
lean_df.loc[:, 'judge_list'] = lean_df['judge_list'].apply(lambda x: drop_nan_lists(x))

In [None]:
# Strip leading/trailing whitespace from judge_1 (only this judge column is
# cleaned here).
lean_df['judge_1'] = lean_df['judge_1'].str.strip()

In [None]:
# Rows whose judge_list holds exactly one entry.
single_bench_df = lean_df[lean_df['judge_list'].apply(lambda x: len(x) == 1)]

In [None]:
# Rows where judge_1 equals the placeholder 'Not Yet Assigned'
# (the filter is on judge_1, not on outcome).
not_assigned_df = lean_df[lean_df['judge_1'] == 'Not Yet Assigned']

In [None]:
# Keep the first row per (court, case id, filing date) in not_assigned_df.
# The result is reassigned instead of using inplace=True: not_assigned_df is a
# filtered selection of lean_df, and mutating such a selection in place can
# trigger pandas' SettingWithCopyWarning.
not_assigned_df = not_assigned_df.drop_duplicates(
    subset=['court_name', 'caseid_type', 'caseid_no', 'filed_dd', 'filed_mon', 'filed_yyyy'],
    keep='first',
)

In [None]:
# Number of not_assigned_df rows per court_name.
not_assigned_df.groupby('court_name')['court_name'].value_counts()

In [None]:
# Outcome counts per court among not_assigned_df rows, largest first.
not_assigned_df.groupby(['court_name', 'outcome'])['outcome'].value_counts().sort_values(ascending=False)

## Find the total of rows with missing outcome

In [None]:
# Rows with a null outcome, counted per court_name, largest first.
lean_df[lean_df['outcome'].isnull()].groupby('court_name')['court_name'].value_counts().sort_values(ascending=False)

In [None]:
# Rows with a null outcome, counted per (court_name, comingfor), largest first.
lean_df[lean_df['outcome'].isnull()].groupby(['court_name', 'comingfor'])['comingfor'].value_counts().sort_values(ascending=False)

### Number of adjournments per court

In [None]:
# Rows whose outcome is 'Adjournment', counted per court_name.
# NOTE(review): mapped_working_df is not defined in any visible cell (the
# neighbouring cells use lean_df) — confirm which frame is intended.
mapped_working_df[mapped_working_df['outcome'] == 'Adjournment'].groupby('court_name').size().reset_index(name='count')


In [None]:
# Group lean_df rows by court_name where outcome is 'Adjournment' and
# reason_adj is missing. Despite its name, adjourned_df is a GroupBy object,
# not a DataFrame.
adjourned_df = lean_df[(lean_df['outcome'] == 'Adjournment') & lean_df['reason_adj'].isnull()].groupby('court_name')

In [None]:
# Per-court count of 'Adjournment' rows that have no reason_adj.
adjourned_df['court_name'].value_counts()

### Where the matter was adjourned but the reason was not indicated

In [None]:
# Rows with a null outcome, counted per (court_name, comingfor), largest first.
# NOTE(review): this filters on a missing outcome, not on a missing
# adjournment reason as the heading above suggests — confirm the intent.
lean_df[lean_df['outcome'].isnull()].groupby(['court_name', 'comingfor'])['comingfor'].value_counts().sort_values(ascending=False)

In [None]:
# Counts of null-outcome rows per (court_name, comingfor), kept unsorted for
# export.
missing_outcomes = lean_df[lean_df['outcome'].isnull()].groupby(['court_name', 'comingfor'])['comingfor'].value_counts()

In [None]:
# Write the missing-outcome counts (index levels included) to the working
# directory.
missing_outcomes.to_csv('missing_outcomes.csv')

### Inconsistency in outcomes 

In [None]:
# Rows where judge_1 is 'Not Yet Assigned' but the outcome is anything other
# than 'Case Registered/Filed'. Null outcomes also satisfy `!=`, so they are
# included here.
inconsistent = lean_df[(lean_df['judge_1'] == 'Not Yet Assigned') & (lean_df['outcome'] != 'Case Registered/Filed')]


In [None]:
# Number of inconsistent rows per court_name.
inconsistent.groupby('court_name')['court_name'].value_counts()

## Create groups of panels

In [None]:
# Function to check similarity between lists
def are_lists_similar(list1, list2):
    """Return True when both lists hold the same items, ignoring order."""
    return sorted(list1) == sorted(list2)


def _panel_key(members):
    """Order-independent, hashable key: equal exactly when are_lists_similar is True."""
    return tuple(sorted(members))


# Function to create groupings of similar lists
def create_groups(df, col_name):
    """Group the list values of ``df[col_name]`` that contain the same items.

    Two lists belong to the same group when they are equal after sorting
    (order of members does not matter). Groups are named ``Group_1``,
    ``Group_2``, ... in order of first appearance.

    Args:
        df: DataFrame holding the lists.
        col_name: Name of the column whose values are lists of sortable,
            hashable items.

    Returns:
        dict: ``{group_name: [list, list, ...]}`` where each group contains
        every matching list from the column, in row order.
    """
    groups = {}
    group_of_key = {}

    # One dictionary lookup per row instead of re-sorting and comparing the
    # row against every list already stored in every group.
    for members in df[col_name]:
        key = _panel_key(members)
        group_name = group_of_key.get(key)
        if group_name is None:
            group_name = f"Group_{len(groups) + 1}"
            group_of_key[key] = group_name
            groups[group_name] = []
        groups[group_name].append(members)

    return groups

In [None]:
# Build the panel groups from every row's judge_list.
groupings = create_groups(lean_df, 'judge_list')

# Lookup from a list's string form to its group id (lists are unhashable, so
# str(list) is used as the dictionary key).
group_map = {
    str(members): panel_id
    for panel_id, panel_members in groupings.items()
    for members in panel_members
}

# Label each row with the group its judge_list belongs to.
lean_df['bench_panel'] = lean_df['judge_list'].apply(lambda judges: group_map[str(judges)])

In [None]:
# Flag column: 1 when reason_adj has a value, 0 when it is null.
lean_df['reason_adj_indicator'] = lean_df['reason_adj'].notna().astype(int)

In [None]:
# Row count per distinct reason_adj. groupby drops null keys by default, so
# only rows that have a reason are counted.
lean_df.groupby('reason_adj')['reason_adj_indicator'].count()

In [None]:
# Save lean_df (without the index) to the working directory.
lean_df.to_csv('half_year_23-24.csv', index=False)

In [None]:
# Filter rows where reason_adj_indicator is 1.
# .copy() makes filtered_df an independent frame, so the column assignment
# below does not write into a selection of lean_df (which can trigger pandas'
# SettingWithCopyWarning).
filtered_df = lean_df[lean_df['reason_adj_indicator'] == 1].copy()

# Convert lists in 'judge_list' column to tuples: lists are unhashable and
# cannot be used as a groupby key.
filtered_df['judge_list'] = filtered_df['judge_list'].apply(tuple)

# Group by 'bench_panel' and 'judge_list' and count the rows in each group.
grouped_df = filtered_df.groupby(['bench_panel', 'judge_list']).size().reset_index(name='count')


In [None]:
# Sort grouped_df by 'count' in descending order.
benched_df = grouped_df.sort_values('count', ascending=False)
# Save the sorted panel counts (without the index) to the working directory.
benched_df.to_csv('coa-panel-benched.csv', index=False)


## Drop where adjournment reason is not provided

In [None]:
def clean_data(df):
    """Return a new frame without the rows whose 'reason_adj' is missing."""
    required_columns = ['reason_adj']
    return df.dropna(subset=required_columns)

# Copy of df without the rows lacking reason_adj.
# NOTE(review): df_clean is not used by any later visible cell — confirm it is
# still needed.
df_clean = clean_data(df.copy())
# Drop the lean_df rows where reason_adj is missing (overwrites lean_df, so
# every later cell sees only rows that have a reason).
lean_df = lean_df[~lean_df['reason_adj'].isnull()]

In [None]:
# Group the reason_adj / bench_panel columns by bench_panel. adjourn_group is
# a GroupBy object, not a DataFrame.
adjourn_group = lean_df[['reason_adj', 'bench_panel']].groupby('bench_panel')

In [None]:
# adjourn_group is already grouped by 'bench_panel'; a GroupBy object has no
# .groupby method and no 'panel' column is created in the visible cells, so
# the column is selected on the existing groups. Each panel's reasons are
# joined into one comma-separated string.
adjourn_group['reason_adj'].apply(lambda x: ', '.join(x)).reset_index()

In [None]:
# Count each adjournment reason within each bench_panel. adjourn_group is
# already grouped by 'bench_panel', so the column is selected on the existing
# groups (a GroupBy object has no .groupby method). The counts are named
# 'count' so reset_index cannot collide with the 'reason_adj' index level.
adjournents = adjourn_group['reason_adj'].value_counts().rename('count').reset_index()

In [None]:
# Save the per-panel reason counts. index=False is not passed, so the row
# index is written as an extra first column.
adjournents.to_csv('coa-panel-adjournment.csv')

In [None]:
#TODO check how many benches each judge appears in 


In [None]:
# Row counts per (date_yyyy, date_mon).
# NOTE(review): df here is whatever was last assigned to df (the CSV re-read
# earlier in the notebook), not lean_df — confirm this is the intended frame.
fy_data = df.groupby(['date_yyyy', 'date_mon'])['date_mon'].value_counts()