In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import geopandas as gpd
import matplotlib.pyplot as plt
from shapely.geometry import Point
import os
from pathlib import Path
import sys
from tqdm import tqdm
from matplotlib.backends.backend_pdf import PdfPages
import shap
from sklearn.inspection import PartialDependenceDisplay
from sklearn.model_selection import KFold # Import KFold

# Locate the project root from the working directory so the paths below are anchored on the repository.
def find_project_root(current: Path, marker: str = ".git") -> Path:
    """Return the nearest directory, starting at ``current``, that contains ``marker``.

    The search checks ``current`` itself first and then each ancestor, nearest
    first. Checking ``current`` matters when the notebook is launched from the
    project root: ``Path.parents`` does not include the path itself, so without
    it a marker in an *outer* directory (e.g. an enclosing repository) would be
    picked instead of the one in ``current``.

    Parameters
    ----------
    current : Path
        Directory to start searching from (resolved to an absolute path).
    marker : str, default ".git"
        Name of the file or directory whose presence identifies the root.

    Returns
    -------
    Path
        The first directory containing ``marker``; if none is found, the
        resolved ``current`` is returned as a fallback.
    """
    start = current.resolve()
    for candidate in (start, *start.parents):
        if (candidate / marker).exists():
            return candidate
    return start  # fallback: no marker anywhere up the tree

# Anchor every project directory on the root located from the current working directory.
PROJECT_ROOT = find_project_root(Path.cwd())

# Data stages under <root>/data.
RAW_DIR = PROJECT_ROOT.joinpath("data", "raw")
INTERIM_DIR = PROJECT_ROOT.joinpath("data", "interim")
PROCESSED_DIR = PROJECT_ROOT.joinpath("data", "processed")
EXTERNAL_DIR = PROJECT_ROOT.joinpath("data", "external")

# Output locations for reports and models.
REPORTS_DIR = PROJECT_ROOT.joinpath("reports")
FIGURES_DIR = REPORTS_DIR.joinpath("figures")
MODELS_DIR = PROJECT_ROOT.joinpath("models")
TABLES_DIR = REPORTS_DIR.joinpath("tables")

# Read the processed monthly table a single time, up front, so later cells
# (and any sweep) reuse the same in-memory frame.
_monthly_csv = PROCESSED_DIR / "INDONESIA" / "monthly_dengue_env_id.csv"
df = pd.read_csv(_monthly_csv)

# --- REGION REASSIGNMENT ---
# Fold 'Maluku Islands' and 'Papua' into one 'Maluku-Papua' label. The column
# is kept for consistency only; nothing below groups by it.
_merged_regions = dict.fromkeys(['Maluku Islands', 'Papua'], 'Maluku-Papua')
df['Region_Group'] = df['Region'].replace(_merged_regions)
print("--- DataFrame after Region_Group creation ---")
print(df['Region_Group'].value_counts())
print("-" * 50)

# Parse YearMonth into real datetimes.
df['YearMonth'] = pd.to_datetime(df['YearMonth'])

# Column names grouped by category, plus the name of the target column.

# Environmental covariates: level variables first, then their *_ANOM counterparts.
env_vars = [
    'temperature_2m',
    'temperature_2m_min',
    'temperature_2m_max',
    'precipitation',
    'potential_evaporation_sum',
    'total_evaporation_sum',
    'evaporative_stress_index',
    'aridity_index',
    'temperature_2m_ANOM',
    'temperature_2m_min_ANOM',
    'temperature_2m_max_ANOM',
    'potential_evaporation_sum_ANOM',
    'total_evaporation_sum_ANOM',
    'precipitation_ANOM',
]

# Land-use columns are named Class_<code>; the order of the codes is kept as-is.
# NOTE(review): the codes look like land-cover class ids -- confirm the source legend.
land_use_vars = [
    f'Class_{code}' for code in (70, 60, 50, 40, 95, 30, 20, 10, 90, 80)
]

# Large-scale climate indices.
# NOTE(review): presumably ENSO region anomalies and Indian Ocean Dipole indices -- confirm.
climate_vars = [
    'ANOM1+2',
    'ANOM3',
    'ANOM4',
    'ANOM3.4',
    'DMI',
    'DMI_East',
]

target = 'Incidence_Rate'

# Order rows chronologically, then by the ID_2 column within each month.
_sort_keys = ['YearMonth', 'ID_2']
df = df.sort_values(by=_sort_keys)

--- DataFrame after Region_Group creation ---
Region_Group
Java             21168
Sumatra          18948
Sulawesi         11592
Kalimantan        8904
Maluku-Papua      7548
Nusa Tenggara     5712
Name: count, dtype: int64
--------------------------------------------------


In [None]:
# Distribution of the raw incidence rate (30 bins) with a KDE overlay.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df['Incidence_Rate'], bins=30, kde=True, color='skyblue', ax=ax)
ax.set_title('Histogram of Incidence Rate (IR)')
ax.set_xlabel('Incidence Rate')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()

In [None]:
# Same histogram on the log(1 + IR) scale; log1p keeps zero rates at zero.
log_incidence = np.log1p(df['Incidence_Rate'])

fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(log_incidence, bins=30, kde=True, color='salmon', ax=ax)
ax.set_title('Histogram of log(Incidence Rate + 1)')
ax.set_xlabel('log(Incidence Rate + 1)')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()

In [None]:
# Split the incidence rate into three equal-frequency bins (terciles) and
# colour a stacked histogram by the bin each row falls in.
tercile_names = ['Low', 'Medium', 'High']
quantiles = pd.qcut(df['Incidence_Rate'], q=3, labels=tercile_names)

fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(
    data=df,
    x='Incidence_Rate',
    hue=quantiles,
    bins=30,
    palette=['green', 'orange', 'red'],
    multiple='stack',
    ax=ax,
)
ax.set_title('Histogram of Incidence Rate (IR) by Quantile')
ax.set_xlabel('Incidence Rate')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()

# Number of rows per tercile, listed in Low -> Medium -> High order.
print('Quantile counts:')
print(quantiles.value_counts().sort_index())

In [None]:
# Histogram of Incidence Rate (IR) by quantile, showing only IR <= 50.
# The terciles are computed on ALL rows; only the plotted rows are restricted,
# so the colours match the full-data split.
quantiles = pd.qcut(df['Incidence_Rate'], 3, labels=['Low', 'Medium', 'High'])

# Build the display filter once and apply it to both the frame and the labels.
within_plot_range = df['Incidence_Rate'] <= 50
df_plot = df[within_plot_range]
quantiles_plot = quantiles[within_plot_range]

plt.figure(figsize=(8, 5))
sns.histplot(data=df_plot, x='Incidence_Rate', hue=quantiles_plot, bins=30, palette=['green', 'orange', 'red'], multiple='stack')
# The title previously contained a mis-encoded "less than or equal" sign; the
# \u2264 escape keeps it correct regardless of the file's encoding.
plt.title('Histogram of Incidence Rate (IR \u2264 50) by Quantile')
plt.xlabel('Incidence Rate')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()

# Print counts for each quantile (all data, not just the plotted subset)
print('Quantile counts (all data):')
print(quantiles.value_counts().sort_index())

In [None]:
# Print the value ranges for each quantile (low, medium, high).
# Without labels, qcut returns the interval each row falls in; its categories
# are the three bin intervals in ascending order.
quantile_bins = pd.qcut(df['Incidence_Rate'], 3)
labels = ['Low', 'Medium', 'High']

# qcut nudges the left edge of the first interval slightly below the smallest
# value so that value is included; printed with two decimals this shows up as
# e.g. "-0.00". Clip to the observed minimum so the range reflects real data.
observed_min = df['Incidence_Rate'].min()
for label, interval in zip(labels, quantile_bins.cat.categories):
    lower = max(interval.left, observed_min)
    print(f"{label}: {lower:.2f} to {interval.right:.2f}")

In [None]:
# Log-transform IR, separate into three quantiles, and plot histogram
log_IR = np.log1p(df['Incidence_Rate'])
log_quantiles = pd.qcut(log_IR, 3, labels=['Low', 'Medium', 'High'])
plt.figure(figsize=(8, 5))
sns.histplot(x=log_IR, hue=log_quantiles, bins=30, palette=['green', 'orange', 'red'], multiple='stack')
plt.title('Histogram of log(Incidence Rate + 1) by Quantile')
plt.xlabel('log(Incidence Rate + 1)')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()

# Print counts for each quantile
print('Quantile counts (log-transformed IR):')
print(log_quantiles.value_counts().sort_index())

# Print the value ranges for each quantile (on the log(IR + 1) scale).
log_quantile_bins = pd.qcut(log_IR, 3)
labels = ['Low', 'Medium', 'High']

# qcut places the first interval's left edge slightly below the smallest value
# so that value is included; clip to the observed minimum so the printed lower
# bound is a value that actually occurs (avoids output such as "-0.00").
log_min = log_IR.min()
for label, interval in zip(labels, log_quantile_bins.cat.categories):
    lower = max(interval.left, log_min)
    print(f"{label}: {lower:.2f} to {interval.right:.2f}")

In [None]:
# Count rows whose raw (untransformed) incidence rate falls below vs. at-or-above
# 2.1 per 100000. Note these are row counts, not disease case counts; rows with
# a missing rate fall in neither group.
threshold = 2.1
incidence = df['Incidence_Rate']
below = incidence.lt(threshold).sum()
above = incidence.ge(threshold).sum()
print(f"Cases below {threshold} per 100000: {below}")
print(f"Cases above or equal to {threshold} per 100000: {above}")

In [None]:
# Of the rows below the 2.1 threshold, count how many have a rate of exactly 0.
threshold = 2.1
below_mask = df['Incidence_Rate'].lt(threshold)
is_zero = df['Incidence_Rate'].eq(0)
zero_below = (below_mask & is_zero).sum()
print(f"Cases below {threshold} per 100000 and exactly 0: {zero_below}")

In [None]:
# Classify IR into four classes: rows with a rate of exactly 0 get 'Near-zero',
# and the strictly positive rates are split into terciles
# (Low-risk / Mid-risk / High-risk). These are NOT four quantiles of the full
# distribution: the zero class can be any size.
# Rows whose rate is missing or negative match neither mask and stay unlabelled
# (NaN); value_counts() below omits them.
classes = pd.Series(index=df.index, dtype='object')
classes[df['Incidence_Rate'] == 0] = 'Near-zero'
nonzero_mask = df['Incidence_Rate'] > 0
nonzero_IR = df.loc[nonzero_mask, 'Incidence_Rate']
nonzero_quantiles = pd.qcut(nonzero_IR, 3, labels=['Low-risk', 'Mid-risk', 'High-risk'])
# nonzero_IR keeps df's row order, so positional assignment lines up with the mask.
classes[nonzero_mask] = nonzero_quantiles.values

# Print counts for each class
print(classes.value_counts())

# Show the value ranges for each nonzero class.
quantile_bins = pd.qcut(nonzero_IR, 3)
labels = ['Low-risk', 'Mid-risk', 'High-risk']

# qcut places the first interval's left edge slightly below the smallest value
# so that value is included; clip to the observed minimum so the printed lower
# bound is a rate that actually occurs.
nonzero_min = nonzero_IR.min()
for label, interval in zip(labels, quantile_bins.cat.categories):
    lower = max(interval.left, nonzero_min)
    print(f"{label}: {lower:.2f} to {interval.right:.2f}")

In [None]:
# Classify IR into four classes using log(IR + 1): rows with a rate of exactly 0
# get 'Near-zero', and the strictly positive rates are split into terciles of
# their log-transformed values (Low-risk / Mid-risk / High-risk).
# NOTE(review): log1p is monotonic, so tercile membership should match the
# raw-IR split; only the printed ranges are on a different scale -- confirm
# that a separate log-based label is really needed.
# Rows whose rate is missing or negative match neither mask and stay unlabelled (NaN).
log_IR = np.log1p(df['Incidence_Rate'])
classes_log = pd.Series(index=df.index, dtype='object')
classes_log[df['Incidence_Rate'] == 0] = 'Near-zero'
nonzero_mask = df['Incidence_Rate'] > 0
nonzero_log_IR = log_IR[nonzero_mask]
nonzero_log_quantiles = pd.qcut(nonzero_log_IR, 3, labels=['Low-risk', 'Mid-risk', 'High-risk'])
# nonzero_log_IR keeps df's row order, so positional assignment lines up with the mask.
classes_log[nonzero_mask] = nonzero_log_quantiles.values

# Print counts for each class
print(classes_log.value_counts())

# Show the value ranges for each nonzero class (log-transformed)
quantile_bins_log = pd.qcut(nonzero_log_IR, 3)
labels = ['Low-risk', 'Mid-risk', 'High-risk']

# qcut places the first interval's left edge slightly below the smallest value
# so that value is included; clip to the observed minimum so the printed lower
# bound is a value that actually occurs.
nonzero_log_min = nonzero_log_IR.min()
for label, interval in zip(labels, quantile_bins_log.cat.categories):
    lower = max(interval.left, nonzero_log_min)
    print(f"{label}: {lower:.2f} to {interval.right:.2f} (log(IR+1))")

In [None]:
# Attach the log-based class labels to the main frame (aligned on the row
# index) and write the result to CSV without the index column.
# NOTE(review): the input CSV is read from PROCESSED_DIR / "INDONESIA", but this
# file is written directly under PROCESSED_DIR -- confirm that is intended.
df['IR_class_log'] = classes_log

output_path = PROCESSED_DIR.joinpath('monthly_dengue_env_id_with_class.csv')
df.to_csv(output_path, index=False)
print(f"Saved DataFrame with IR_class_log to {output_path}")

In [None]:
# Display the frame as the cell's rich output to check the result of the cells above.
# With pandas' default display options a long frame is shown as a truncated preview.
df