In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from scipy.spatial import cKDTree

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline

In [None]:
# Load datasets
# All paths are relative to the notebook's working directory (a `data/` folder next to it).
train = pd.read_csv("data/Train.csv")
test = pd.read_csv("data/Test.csv")
toilets = pd.read_csv("data/toilets.csv") # Information on toilet locations
waste_management = pd.read_csv("data/waste_management.csv") # Additional information on waste management locations in the area.
water_sources = pd.read_csv("data/water_sources.csv") # Additional information on water sources in the area.

In [None]:
# Summarise the test table: column names, dtypes and non-null counts.
test.info()

In [None]:
# Report the cardinality and the distinct values of selected columns in `train`
cols = ['Category_Health_Facility_UUID', 'Disease', 'Month', 'Year']

for col in cols:
    print(f"\n{col}:")
    column = train[col]
    print(f"Number of unique values: {column.nunique()}")
    print(f"Unique values: {column.unique()}")

In [None]:
# Combine train and test datasets for consistent preprocessing
# NOTE(review): concat keeps each frame's own row labels, so index values can repeat
# in `hospital_data` — confirm nothing downstream relies on a unique index.
hospital_data = pd.concat([train, test])

In [None]:
# Drop unnecessary columns from supplementary datasets
# (mutates the three frames in place; running this cell twice raises because the columns are gone)
for df in (toilets, waste_management, water_sources):
    df.drop(['Year', 'Month'], axis="columns", inplace=True)

In [None]:
# Rename columns for clarity
def rename_columns(df, prefix):
    """Prefix the columns of ``df`` with ``prefix`` in place, leaving the shared key columns alone.

    Parameters
    ----------
    df : DataFrame
        Frame whose columns are renamed; it is modified in place.
    prefix : str
        Text prepended (followed by an underscore) to every renamed column.

    Returns
    -------
    None

    Notes
    -----
    'Month_Year_lat_lon' and 'lat_lon' keep their names. The mapping is built
    from the original column names and applied in a single rename, so a column
    that already looks like ``f"{prefix}_{other}"`` cannot be renamed twice,
    and the frame is only relabelled once.
    """
    untouched = ('Month_Year_lat_lon', 'lat_lon')
    mapping = {col: f"{prefix}_{col}" for col in df.columns if col not in untouched}
    df.rename(columns=mapping, inplace=True)

# Tag each supplementary table's columns with its own prefix (in place).
# Note: running this cell a second time prefixes the already-prefixed columns again.
rename_columns(toilets, "toilet")
rename_columns(waste_management, "waste")
rename_columns(water_sources, "water")


In [None]:
# Fill missing values in the 'Total' count of diseases column.
# The result is assigned back rather than calling fillna(inplace=True) on the selected
# column: that chained in-place form acts on a temporary object under pandas
# Copy-on-Write and would leave `hospital_data` unchanged.
# NOTE(review): this also fills rows that came from `test` (presumably unlabeled),
# so they carry Total == 0 from here on — confirm that is intended.
hospital_data['Total'] = hospital_data['Total'].fillna(0)

In [None]:
# Drop rows with missing latitude or longitude in water sources.
# Both coordinates are checked: a row with only one of them missing would still
# put a NaN into the nearest-neighbour tree built from these two columns.
water_sources.dropna(
    subset=['water_Transformed_Latitude', 'water_Transformed_Longitude'],
    inplace=True,
)

In [None]:
# Visualize locations for a specific year and month
# Note the months/year should be in the given timeframe [2019, 2023]
def plot_locations(year=2022, month=1, month_name='January'):
    """Scatter-plot hospitals, water sources, waste sites and toilets for one month.

    Parameters
    ----------
    year : int
        Must lie in 2019..2023, otherwise a message is printed and nothing is drawn.
    month : int
        Must lie in 1..12, otherwise a message is printed and nothing is drawn.
    month_name : str
        English month name (any casing); only used for the figure title. It is
        validated on its own and is not cross-checked against ``month``.

    Reads the notebook-level frames ``hospital_data``, ``water_sources``,
    ``waste_management`` and ``toilets``.
    """
    known_month_names = (
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December',
    )

    if year < 2019 or year > 2023:
        print("Invalid year. Please choose a year between 2019 and 2023.")
        return

    if month < 1 or month > 12:
        print("Invalid month. Please choose a month between 1 and 12.")
        return

    title_month = month_name.capitalize()
    if title_month not in known_month_names:
        print("Invalid month name. Please choose from 'January' to 'December'.")
        return

    plt.figure(figsize=(12, 8))

    # (rows for the requested month, coordinate-column prefix, legend label, marker)
    layers = [
        (hospital_data.query(f"Year == {year} and Month == {month}"), 'Transformed', 'Hospital', 's'),
        (water_sources.query(f"water_Month_Year == '{month}_{year}'"), 'water_Transformed', 'Water', 'o'),
        (waste_management.query(f"waste_Month_Year == '{month}_{year}'"), 'waste_Transformed', 'Waste', 'x'),
        (toilets.query(f"toilet_Month_Year == '{month}_{year}'"), 'toilet_Transformed', 'Toilet', '^'),
    ]
    for points, coord_prefix, legend_label, marker_style in layers:
        longitudes = points[f'{coord_prefix}_Longitude']
        latitudes = points[f'{coord_prefix}_Latitude']
        plt.scatter(longitudes, latitudes, label=legend_label, alpha=0.6, marker=marker_style)

    plt.title(f'Locations ({title_month} {year})')
    plt.xlabel('Longitude')
    plt.ylabel('Latitude')
    plt.legend()
    plt.grid(True)
    plt.show()

In [None]:
# Default arguments: January 2022.
plot_locations()

In [None]:
# February 2023.
plot_locations(year=2023, month=2, month_name='February')

In [None]:
# December 2023.
plot_locations(year=2023, month=12, month_name='December')

In [None]:
def find_nearest(hospital_df, location_df, lat_col, lon_col, id_col):
    """Map every hospital ID to the identifier of its nearest site.

    Parameters
    ----------
    hospital_df : DataFrame
        Must contain 'ID', 'Transformed_Latitude' and 'Transformed_Longitude'.
    location_df : DataFrame
        Candidate sites; must contain ``lat_col``, ``lon_col`` and ``id_col``.
    lat_col, lon_col : str
        Coordinate columns of ``location_df`` used to build the search tree.
    id_col : str
        Column of ``location_df`` whose value is returned for the nearest site.

    Returns
    -------
    dict
        ``{hospital ID: id_col value of the nearest site}``. If an ID occurs
        more than once in ``hospital_df`` the last occurrence wins.

    Notes
    -----
    Distance is the tree's default (Euclidean) metric on the raw coordinate
    values. All hospitals are looked up in one batched query rather than one
    Python-level query per row.
    """
    # Create a cKDTree for efficient nearest neighbour search
    tree = cKDTree(location_df[[lat_col, lon_col]].to_numpy())
    if len(hospital_df) == 0:
        return {}

    hospital_coords = hospital_df[['Transformed_Latitude', 'Transformed_Longitude']].to_numpy()
    # One call returns, for each hospital, the row position of its closest site.
    _, nearest_pos = tree.query(hospital_coords)
    # Positional lookup (the tree knows nothing about location_df's index labels).
    nearest_ids = location_df[id_col].to_numpy()[nearest_pos]
    return dict(zip(hospital_df['ID'], nearest_ids))


In [None]:
# Ensure unique identifier columns exist in all supplementary datasets:
# key = "<Month_Year>_<latitude as text>_<longitude as text>", stored under a prefixed name
for df, prefix in ((toilets, 'toilet'), (waste_management, 'waste'), (water_sources, 'water')):
    month_year = df[f"{prefix}_Month_Year"]
    latitude_text = df[f"{prefix}_Transformed_Latitude"].astype(str)
    longitude_text = df[f"{prefix}_Transformed_Longitude"].astype(str)
    df[f"{prefix}_Month_Year_lat_lon"] = month_year + '_' + latitude_text + '_' + longitude_text

In [None]:
# Preview the toilets table, including the key column added above.
toilets.head()

In [None]:
# Merge datasets with nearest locations
merged_data = hospital_data.copy()
datasets = [
    (toilets, 'toilet', 'toilet_Month_Year_lat_lon'),
    (waste_management, 'waste', 'waste_Month_Year_lat_lon'),
    (water_sources, 'water', 'water_Month_Year_lat_lon'),
]

In [None]:
# For each supplementary table: find every hospital's nearest site, then attach
# that site's columns to the hospital rows.
for df, prefix, id_col in datasets:
    nearest = find_nearest(merged_data, df, f"{prefix}_Transformed_Latitude", f"{prefix}_Transformed_Longitude", id_col)
    nearest_df = pd.DataFrame(list(nearest.items()), columns=['ID', id_col])
    # Keep one row per key: if a key repeated in the site table, the join below
    # would emit one output row per repeat and duplicate hospital IDs.
    sites = df.drop_duplicates(subset=id_col)
    merged_data = (
        merged_data
        .merge(nearest_df, on="ID")
        .merge(sites, on=id_col, validate="many_to_one")
    )

## EDA

In [None]:
# Box plot of Total by Disease
# Diseases are ordered left to right by increasing median Total
median_by_disease = merged_data.groupby('Disease')['Total'].median()
order = list(median_by_disease.sort_values().index)
data = [
    merged_data.loc[merged_data['Disease'] == disease_name, 'Total'].values
    for disease_name in order
]

plt.figure(figsize=(16, 6))
# Outliers are hidden; pass showfliers=True to draw them
plt.boxplot(data, tick_labels=order, showfliers=False)
plt.title('Distribution of Total by Disease (sorted by median)')
plt.xlabel('Disease')
plt.ylabel('Total')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

In [None]:
from statsmodels.graphics.tsaplots import plot_pacf

# Upper bound on the number of lags shown per plot.
MAX_PACF_LAGS = 24

# Build a datetime 'Date' (first day of the month) when the frame does not carry
# one, so the monthly resample below always has a DatetimeIndex to work with.
if "Date" in merged_data.columns:
    dates = pd.to_datetime(merged_data["Date"])
else:
    dates = pd.to_datetime(
        dict(
            year=merged_data["Year"].astype(int),
            month=merged_data["Month"].astype(int),
            day=1,
        )
    )

# Set Date as index (on a new frame; `merged_data` is left as is)
df = merged_data.assign(Date=dates).set_index("Date")

# Full monthly index over the date range
all_months = pd.date_range(df.index.min(), df.index.max(), freq="MS")  # month start

# Aggregate over locations → regional totals, one column per disease
regional = (
    df
    .groupby("Disease")["Total"]
    .resample("MS")            # monthly start frequency
    .sum()
    .unstack("Disease")       # index = Date, columns = diseases
    .reindex(all_months)      # ensure all months in range
    .fillna(0)                # months with no cases → 0
)

regional.index.name = "Date"

# plot_pacf only accepts lags strictly below half the number of observations,
# so cap the requested lags by the length of the series.
n_lags = min(MAX_PACF_LAGS, len(regional) // 2 - 1)

if n_lags < 1:
    print("Not enough monthly observations to compute a partial autocorrelation.")
else:
    for disease in regional.columns:
        ts = regional[disease]

        # Skip diseases that are always zero
        if ts.sum() == 0:
            continue

        plot_pacf(ts, lags=n_lags)
        plt.title(f"Regional PACF – {disease}")
        plt.show()


## Feature engineering

In [None]:
def make_time_series_features(
    df: pd.DataFrame,
    id_cols=("Disease", "Location", "Category_Health_Facility_UUID"),
    target_col="Total",
    year_col="Year",
    month_col="Month",
    day_col=None,              # set to a column name if you have Day, else uses 1
    lags=(1, 2, 3, 6, 12),
    rolling_windows=(3, 6),
    add_month_sin_cos=True,
    add_time_index=True,
    drop_na_lags=True,
):
    """
    Add calendar, lag and rolling-window features to a long-format panel.

    Parameters
    ----------
    df : DataFrame
        Input rows. Needs ``id_cols``, ``target_col`` and either a 'Date'
        column or ``year_col`` / ``month_col`` (plus ``day_col`` if given).
        The input frame is not modified.
    id_cols : tuple
        Columns whose combination identifies one series.
    target_col : str
        Column the lag / rolling features are derived from.
    year_col, month_col, day_col : str
        Date components used when 'Date' is absent. With ``day_col`` None (or
        not a column of ``df``) the day is set to 1.
    lags : iterable of int
        Offsets passed to a per-series ``shift``. These are counted in ROWS of
        the series after sorting by date; they correspond to months only if
        each series has exactly one row per consecutive month.
    rolling_windows : iterable of int
        Window lengths (in rows) for mean and standard deviation of the
        target shifted by one row, so the current row's target is excluded.
    add_month_sin_cos : bool
        Add 'month_sin' / 'month_cos' (period 12).
    add_time_index : bool
        Add 'time_index': whole days between each row's date and the earliest
        date in the frame.
    drop_na_lags : bool
        Drop rows in which any created lag / rolling column is NaN and reset
        the index.

    Returns
    -------
    DataFrame
        Copy of ``df`` sorted by ``id_cols`` then 'Date', with 'Date',
        'month', 'year' and the requested feature columns added.
    """
    series_keys = list(id_cols)
    features = df.copy()

    # --- Date column: parse the existing one, or assemble it from its parts ---
    if "Date" in features.columns:
        features["Date"] = pd.to_datetime(features["Date"])
    else:
        use_day_col = day_col is not None and day_col in features.columns
        date_parts = {
            "year": features[year_col].astype(int),
            "month": features[month_col].astype(int),
        }
        date_parts["day"] = features[day_col].astype(int) if use_day_col else 1
        features["Date"] = pd.to_datetime(date_parts)

    sort_keys = series_keys + ["Date"]
    features = features.sort_values(sort_keys)

    # --- Days elapsed since the earliest date in the whole frame ---
    if add_time_index:
        features = features.sort_values("Date")
        earliest = features["Date"].min()
        features["time_index"] = (features["Date"] - earliest).dt.days

    # --- Calendar features ---
    features["month"] = features["Date"].dt.month
    features["year"] = features["Date"].dt.year

    if add_month_sin_cos:
        month_angle = 2 * np.pi * features["month"] / 12
        features["month_sin"] = np.sin(month_angle)
        features["month_cos"] = np.cos(month_angle)

    # --- Per-series lag features (rows ordered by date inside each series) ---
    features = features.sort_values(sort_keys)
    target_by_series = features.groupby(series_keys, group_keys=False)[target_col]

    new_columns = []
    for lag in lags:
        lag_name = f"{target_col}_lag{lag}"
        features[lag_name] = target_by_series.shift(lag)
        new_columns.append(lag_name)

    # --- Rolling mean / std over the previous rows ---
    # The one-row shift is done per series; the rolling window then runs over the
    # resulting plain Series. Each series starts with a NaN after the shift, and a
    # window containing a NaN yields NaN, so no window mixes values of two series.
    previous_target = target_by_series.shift(1)
    for window in rolling_windows:
        windowed = previous_target.rolling(window)
        mean_name = f"{target_col}_roll_mean_{window}"
        std_name = f"{target_col}_roll_std_{window}"
        features[mean_name] = windowed.mean()
        features[std_name] = windowed.std()
        new_columns += [mean_name, std_name]

    # --- Optionally keep only rows with a complete set of created features ---
    if drop_na_lags and new_columns:
        features = features.dropna(subset=new_columns).reset_index(drop=True)

    return features


## Modeling

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.metrics import make_scorer, mean_absolute_error
import numpy as np

target_col = "Total"

# 1) Feature engineering on the FULL merged data (train + 2023 test).
#    Rows with incomplete lag history are kept at this point: dropping them inside
#    the feature function would also remove 2023 rows, and those IDs would then be
#    missing from the submission. Incomplete rows are excluded from training below.
fe_df = make_time_series_features(merged_data, drop_na_lags=False)

# 2) Drop junk merge-key columns (but KEEP 'ID' for submission)
drop_obj_cols = [
    "toilet_Month_Year_lat_lon", "toilet_Month_Year",
    "lat_lon_x", "Month_Year_lat_lon_x",
    "waste_Month_Year_lat_lon", "waste_Month_Year",
    "lat_lon_y", "Month_Year_lat_lon_y",
    "water_Month_Year_lat_lon", "water_Month_Year",
    "lat_lon", "Month_Year_lat_lon",
]
fe_df = fe_df.drop(columns=[c for c in drop_obj_cols if c in fe_df.columns])

# 3) Sort by time (important for TimeSeriesSplit). A stable sort keeps the order of
#    rows that share a date reproducible from run to run.
fe_df = fe_df.sort_values("Date", kind="stable").reset_index(drop=True)

# Lag / rolling columns created by make_time_series_features
lag_cols = [
    c for c in fe_df.columns
    if c.startswith((f"{target_col}_lag", f"{target_col}_roll_"))
]
has_full_history = fe_df[lag_cols].notna().all(axis=1)

# 4) Define train (labelled) vs test (2023 competition) masks.
#    Training uses only rows with a complete set of lag / rolling features.
train_mask = (fe_df["Year"] < 2023) & fe_df[target_col].notna() & has_full_history
test_mask  = fe_df["Year"] == 2023

# 5) Define categorical & numeric feature columns
cat_cols = ["Disease", "Location", "Category_Health_Facility_UUID"]
cat_cols = [c for c in cat_cols if c in fe_df.columns]

numeric_cols = [
    c for c in fe_df.select_dtypes(include="number").columns
    if c != target_col
]

feature_cols = numeric_cols + cat_cols

# 6) Build train features / target
X_train = fe_df.loc[train_mask, feature_cols]
y_train = fe_df.loc[train_mask, target_col]

# 7) Build 2023 test features (no y_test here). Every 2023 row is kept; missing
#    lag / rolling values are replaced by the corresponding training medians.
# NOTE(review): 'Total' was filled with 0 upstream for unlabeled rows, so lags that
# reach into 2023 presumably hold 0 rather than observed counts — confirm.
X_test = fe_df.loc[test_mask, feature_cols].copy()
X_test[lag_cols] = X_test[lag_cols].fillna(X_train[lag_cols].median())
ids_test = fe_df.loc[test_mask, "ID"]   # needed for submission
print("2023 rows with imputed lag features:", int((test_mask & ~has_full_history).sum()))

# 8) Preprocessor + model pipeline
preprocess = ColumnTransformer(
    transformers=[
        ("num", "passthrough", numeric_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ]
)

model = RandomForestRegressor(
    n_estimators=300,
    random_state=42,
    n_jobs=-1,
)

pipe = Pipeline([
    ("prep", preprocess),
    ("model", model),
])

# 9) 5-fold time-series CV on training data only
tscv = TimeSeriesSplit(n_splits=5)
# greater_is_better=False makes the scorer return negated MAE; it is flipped back when printing
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

cv_scores = cross_val_score(
    pipe,
    X_train,
    y_train,
    cv=tscv,
    scoring=mae_scorer,
    n_jobs=-1,
)

print("CV MAE per fold:", -cv_scores)
print("Mean CV MAE:", -cv_scores.mean())


#### Make predictions on test

In [None]:
# Fit on all training data (< 2023), then predict for the 2023 competition test set
y_pred_2023 = pipe.fit(X_train, y_train).predict(X_test)

# Build submission DataFrame: the 2023 IDs plus a prediction column.
# IMPORTANT: the prediction column name must be 'Total'.
submission = ids_test.to_frame(name="ID").assign(Total=y_pred_2023)

submission.to_csv("submission_random_forest.csv", index=False)

print(submission.dtypes)
print(submission.head())