In [26]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import load_model
import joblib
from datetime import datetime
import sys

# === Load Data & Models ===
# Every artifact is read eagerly at import time; a missing file ends the run
# with a fatal message instead of a traceback.
try:
    # Historical records, with the "date" column parsed into datetimes.
    df = pd.read_csv("data/dataset2.csv")
    df["date"] = pd.to_datetime(df["date"])

    # The network is loaded uncompiled and then compiled here (adam / MSE).
    model = load_model("model/model.keras", compile=False)
    model.compile(optimizer="adam", loss="mse")

    # Feature scaler, target scaler and categorical encoder, in that order.
    scalerX, scalerY, encoder = map(
        joblib.load,
        ("model/scaler_X.pkl", "model/scaler_Y.pkl", "model/encoder.pkl"),
    )

except FileNotFoundError as e:
    sys.exit(f"[FATAL] Missing required file: {e.filename}. Please check dataset and model files.")


def safe_numeric(value, default=0.0):
    """
    Return ``value`` when it is a finite number, otherwise ``default``.

    ``default`` is returned for missing values (anything ``pd.isna`` reports
    as missing, such as ``None`` or NaN), for positive or negative infinity,
    and for values NumPy cannot test for infinity at all (for example
    strings).

    Args:
        value: Scalar to validate.
        default: Replacement returned when ``value`` is not usable.

    Returns:
        ``value`` unchanged if it passed every check, else ``default``.
    """
    if pd.isna(value):
        return default
    try:
        if np.isinf(value):
            return default
    except TypeError:
        # np.isinf rejects non-numeric input; such a value is not a finite
        # number, so fall back instead of crashing the caller.
        return default
    return value


def _build_model_input(window, scalerX, encoder):
    """
    Convert a window of rows into the scaled array handed to the model.

    The date and target columns are dropped, the "problem_type" and "region"
    columns are replaced by the encoder's output columns, the frame is scaled
    with ``scalerX`` and the result is wrapped in a leading batch axis of 1.
    """
    categorical_cols = ["problem_type", "region"]

    # The encoder output is wrapped in a frame with a fresh 0..n-1 index, and
    # pd.concat(axis=1) aligns on index labels. Resetting the window's index
    # keeps both sides row-for-row; otherwise the join adds NaN-filled rows.
    features = (
        window.drop(columns=["date", "reported_cases", "workforce_required"])
        .reset_index(drop=True)
    )

    encoded = encoder.transform(features[categorical_cols])
    if hasattr(encoded, "toarray"):
        # A sparse matrix cannot be turned into named columns directly.
        encoded = encoded.toarray()
    encoded_frame = pd.DataFrame(
        encoded, columns=encoder.get_feature_names_out(categorical_cols)
    )

    model_frame = pd.concat(
        [features.drop(columns=categorical_cols), encoded_frame], axis=1
    )
    return np.array([scalerX.transform(model_frame)])


def predict_future_cases(problem_type, target_date_str, df, model, scalerX, scalerY, encoder, sequence_length=30):
    """
    Predict future reported cases & workforce for a given problem type and date.

    Starting from the last ``sequence_length`` historical rows of the problem
    type, the model is applied one day at a time; each prediction is appended
    to the sequence and becomes part of the next day's input window, until the
    target date has been reached.

    Args:
        problem_type: Problem type name; stripped and title-cased before it is
            matched against ``df["problem_type"]``.
        target_date_str: Date to forecast (YYYY-MM-DD). Any time-of-day part is
            discarded. Must be after today and at most 60 days ahead.
        df: Historical data with a datetime "date" column.
        model: Object whose ``predict(X, verbose=0)`` returns the scaled target.
        scalerX: Fitted feature scaler (``transform``).
        scalerY: Fitted target scaler (``inverse_transform``).
        encoder: Fitted encoder for the "problem_type" and "region" columns.
        sequence_length: Number of rows in each model input window.

    Returns:
        dict with "requested_target_date", "problem_type", "predicted_cases"
        and "predicted_workforce" (the last two rounded to ints).

    Raises:
        ValueError: if the date cannot be parsed, is not in the future, is
            more than 60 days ahead, if there is no history for the problem
            type, or if no row for the target date can be found or generated.
    """
    # Private fixed-seed generator: yields the same noise sequence as seeding
    # the global NumPy RNG with 42, without reseeding it for the whole process.
    rng = np.random.RandomState(42)
    problem_type_input = problem_type.strip().title()

    try:
        target_date = pd.Timestamp(target_date_str)
    except ValueError as err:
        raise ValueError("Invalid date format. Use YYYY-MM-DD.") from err
    if pd.isna(target_date):
        # NaT passes every comparison below as False and would never be reached
        # by the forecasting loop.
        raise ValueError("Invalid date format. Use YYYY-MM-DD.")
    # Forecast rows are generated at whole-day steps, so compare on the day.
    target_date = target_date.normalize()

    today = pd.Timestamp(datetime.today().date())
    if target_date <= today:
        raise ValueError("Target date must be in the future from today.")
    if (target_date - today).days > 60:
        raise ValueError("Target date too far in future (>60 days). Limit your forecast window.")

    subset = df[df["problem_type"] == problem_type_input].sort_values("date").reset_index(drop=True)
    if subset.empty:
        raise ValueError(f"No historical data for problem type '{problem_type_input}'.")

    # Fill missing numeric values in history to avoid NaNs
    numeric_cols = ["reported_cases", "workforce_required", "severity_score",
                    "weather_score", "rainfall_mm", "problem_severity_interaction"]
    for col in numeric_cols:
        if col in subset.columns:
            subset[col] = subset[col].fillna(subset[col].median())  # safe fallback

    seq = subset.tail(sequence_length).reset_index(drop=True)

    # Too little history: pad with copies of the last row on consecutive days.
    shortfall = sequence_length - len(seq)
    if shortfall > 0:
        last_row = seq.iloc[-1]
        padding = []
        for offset in range(1, shortfall + 1):
            filler = last_row.copy()
            filler["date"] = last_row["date"] + pd.Timedelta(days=offset)
            padding.append(filler)
        seq = pd.concat([seq, pd.DataFrame(padding)], ignore_index=True)

    severity_factor = {1: 1, 2: 1.5, 3: 2, 4: 3}

    # Advance one day per iteration. Looping on "last date < target" (rather
    # than "target not present") guarantees termination even when the target
    # falls before the end of the sequence or in a gap between rows.
    while seq["date"].max() < target_date:
        last_seq = seq.tail(sequence_length)
        next_date = seq["date"].max() + pd.Timedelta(days=1)

        new_row = last_seq.iloc[-1].copy()
        new_row["date"] = next_date
        new_row["day_of_week"] = next_date.weekday()
        new_row["month"] = next_date.month
        new_row["is_weekend"] = int(next_date.weekday() >= 5)
        new_row["holiday_flag"] = 0

        prev_cases = safe_numeric(last_seq["reported_cases"].iloc[-1], default=0)
        avg_cases = safe_numeric(last_seq["reported_cases"].tail(3).mean(), default=prev_cases)

        new_row["prev_day_cases"] = prev_cases
        new_row["prev_3day_avg_cases"] = avg_cases

        # Carry the last observed conditions forward; weather and rainfall get
        # a small random perturbation and are kept inside their valid ranges.
        new_row["severity_score"] = safe_numeric(last_seq["severity_score"].iloc[-1], 1)
        new_row["weather_score"] = np.clip(
            safe_numeric(last_seq["weather_score"].iloc[-1], 0) + rng.normal(0, 0.05), 0.0, 1.0
        )
        new_row["rainfall_mm"] = max(0.0, safe_numeric(last_seq["rainfall_mm"].iloc[-1], 0) + rng.normal(0, 2))
        new_row["problem_severity_interaction"] = safe_numeric(
            last_seq["problem_severity_interaction"].iloc[-1], 1
        )

        # === Prepare Model Input (Exactly Like Training) ===
        X_input = _build_model_input(last_seq, scalerX, encoder)

        y_pred_scaled = model.predict(X_input, verbose=0)
        predicted_cases = safe_numeric(scalerY.inverse_transform(y_pred_scaled)[0, 0], 0)

        # Workforce scales with severity, clamped to the 1..4 keys of the table.
        sev = int(max(1, min(4, round(new_row["severity_score"]))))
        predicted_workforce = int(max(0, predicted_cases * severity_factor[sev]))

        new_row["reported_cases"] = predicted_cases
        new_row["workforce_required"] = predicted_workforce
        seq = pd.concat([seq, pd.DataFrame([new_row])], ignore_index=True)

    matches = seq[seq["date"] == target_date]
    if matches.empty:
        raise ValueError(
            f"No row for {target_date.strftime('%Y-%m-%d')} could be found or generated "
            "from the available history."
        )
    target_row = matches.iloc[-1]

    return {
        "requested_target_date": target_date.strftime("%Y-%m-%d"),
        "problem_type": problem_type_input,
        "predicted_cases": int(round(safe_numeric(target_row["reported_cases"], 0))),
        "predicted_workforce": int(round(safe_numeric(target_row["workforce_required"], 0)))
    }


if __name__ == "__main__":
    # Example run: forecast a single problem type for a single date using the
    # data and model artifacts loaded at module level.
    problem_type = "Garbage & Waste"
    target_date_str = "2025-10-20"

    try:
        result = predict_future_cases(
            problem_type,
            target_date_str,
            df,
            model,
            scalerX,
            scalerY,
            encoder,
        )
        # One print call; the newline separator reproduces the line-by-line report.
        print(
            "\n=== Future Prediction ===",
            f"Date: {result['requested_target_date']}",
            f"Problem Type: {result['problem_type']}",
            f"Predicted Reported Cases: {result['predicted_cases']}",
            f"Predicted Workforce Required: {result['predicted_workforce']}\n",
            sep="\n",
        )
    except Exception as e:
        # Any failure (validation or model error) is reported, not raised.
        print(f"[ERROR] Prediction failed: {e}")


KeyboardInterrupt: 