In [1]:
import pandas as pd
from datetime import datetime

In [2]:
def process_disease_data(input_csv,
                         daily_output_csv='disease_daily_cases.csv',
                         active_output_csv='disease_active_cases.csv'):
    """Build daily confirmed / removed / active case counts for NCR rows.

    Reads a case line list, keeps the rows whose ``RegionRes`` and ``ProvRes``
    are both ``'NCR'``, and produces one record per calendar day from the
    earliest confirmation date to the later of the last confirmation date and
    the last removal date. Rows with a missing removal date never leave the
    active count. Removal dates that fall outside that calendar are not
    counted (the printed balance check will then be non-zero).

    Two CSV files are written and a short summary is printed.

    Args:
        input_csv: Path (or file-like object) accepted by ``pd.read_csv``.
            Must provide the columns ``RegionRes``, ``ProvRes``,
            ``DateRepConf`` and ``DateRepRem``.
        daily_output_csv: Destination for the full daily table with columns
            ``date``, ``new_confirmed_cases``, ``removed_cases`` and
            ``active_cases``.
        active_output_csv: Destination for the reduced table with columns
            ``date`` and ``active_cases``.

    Returns:
        None. Results are delivered through the two CSV files and stdout.

    Raises:
        KeyError: If a required column is absent from the input.
        ValueError: If no NCR row has a usable confirmation date.
    """
    needed_columns = {'RegionRes', 'ProvRes', 'DateRepConf', 'DateRepRem'}

    # Load only the columns used below. A callable (rather than a list) is
    # passed so that a missing column is simply not selected here and still
    # surfaces as a KeyError at first use, as it did when all columns were read.
    df = pd.read_csv(input_csv, usecols=lambda name: name in needed_columns)

    # Filter for NCR cases only
    ncr_cases = df[(df['RegionRes'] == 'NCR') & (df['ProvRes'] == 'NCR')]

    # Convert to datetime and drop any time-of-day part so every value sits on
    # a midnight boundary and lines up with the daily calendar built below.
    # NOTE(review): the recorded run shows a warning attributed to these two
    # conversions - presumably the date format could not be inferred; pass an
    # explicit format= once the source format is confirmed.
    confirmed_on = pd.to_datetime(ncr_cases['DateRepConf']).dt.normalize()
    removed_on = pd.to_datetime(ncr_cases['DateRepRem']).dt.normalize()

    first_day = confirmed_on.min()
    if pd.isna(first_day):
        raise ValueError(
            "No NCR cases with a valid DateRepConf were found in the input"
        )

    # The calendar ends at the later of the last confirmation and the last
    # removal. The removal maximum is NaT when no case has been removed yet;
    # the built-in max() would propagate that NaT, so compare explicitly.
    last_day = confirmed_on.max()
    last_removal = removed_on.max()
    if pd.notna(last_removal) and last_removal > last_day:
        last_day = last_removal

    date_range = pd.date_range(start=first_day, end=last_day, freq='D')

    # One pass per column: count events per day (value_counts skips NaT),
    # then align onto the full calendar so days without events become 0.
    new_per_day = confirmed_on.value_counts().reindex(date_range, fill_value=0)
    removed_per_day = removed_on.value_counts().reindex(date_range, fill_value=0)

    daily_df = pd.DataFrame({
        'date': date_range.strftime('%Y-%m-%d'),
        'new_confirmed_cases': new_per_day.to_numpy(),
        'removed_cases': removed_per_day.to_numpy(),
    })
    # Active cases are the running balance of confirmations minus removals.
    daily_df['active_cases'] = (
        daily_df['new_confirmed_cases'] - daily_df['removed_cases']
    ).cumsum()

    # Save both daily confirmed cases and active cases
    daily_df.to_csv(daily_output_csv, index=False)

    # Also save a simplified version with just date and active cases
    daily_df[['date', 'active_cases']].to_csv(active_output_csv, index=False)

    print(f"Successfully created {daily_output_csv} and {active_output_csv}")
    print(f"\nDate range: from {daily_df['date'].min()} to {daily_df['date'].max()}")
    peak_row = daily_df['active_cases'].idxmax()
    print(f"Maximum active cases: {daily_df['active_cases'].max()} on {daily_df.loc[peak_row, 'date']}")

    # Print some validation statistics
    total_confirmed = daily_df['new_confirmed_cases'].sum()
    total_removed = daily_df['removed_cases'].sum()
    final_active = daily_df['active_cases'].iloc[-1]

    print("\nValidation Statistics:")
    print(f"Total confirmed cases: {total_confirmed}")
    print(f"Total removed cases: {total_removed}")
    print(f"Final active cases: {final_active}")
    print(f"Balance check (should be 0): {total_confirmed - total_removed - final_active}")

In [3]:
# Run the NCR aggregation on the merged DOH COVID data drop. The input path is
# relative, so it is resolved against the kernel's working directory. Per the
# recorded output below, this run wrote disease_daily_cases.csv and
# disease_active_cases.csv and printed the validation summary.
process_disease_data('DOH_COVID_Data_Drop_20240103_2020-2023_Merged.csv')

  df = pd.read_csv(input_csv)
  ncr_cases['DateRepConf'] = pd.to_datetime(ncr_cases['DateRepConf'])
  ncr_cases['DateRepRem'] = pd.to_datetime(ncr_cases['DateRepRem'])


Successfully created disease_daily_cases.csv and disease_active_cases.csv

Date range: from 2020-03-08 to 2023-04-21
Maximum active cases: 96379 on 2021-04-17

Validation Statistics:
Total confirmed cases: 447385
Total removed cases: 447385
Final active cases: 0
Balance check (should be 0): 0
