In [7]:
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import plotly.express as px


In [8]:
# Input directory with the raw files, and the output directory that receives
# everything this notebook writes (created if it does not exist yet).
BASE = Path("input")
OUT = Path("output")
OUT.mkdir(parents=True, exist_ok=True)

# Logical dataset name -> path of the corresponding file under BASE.
FILES = {
    key: BASE / filename
    for key, filename in (
        ("outbreaks", "Outbreaks.csv"),
        ("dons_raw", "DONsRaw.csv"),
        ("unique_dons", "UniqueDONs.csv"),
        ("covid_outbreaks", "COVIDOutbreaks.csv"),
        ("icd", "icd1011.csv"),
        ("who_covid", "WHO-COVID-19-global-data.csv"),
        ("isocodes", "isocodes.csv"),
        ("readme", "README.pdf"),
    )
}

In [9]:
def read_csv_smart(path: Path) -> pd.DataFrame:
    """Read a CSV file, trying several text encodings in turn.

    Encodings are attempted in this order: UTF-8, UTF-8 with BOM, Latin-1.
    Only a decoding failure moves on to the next candidate. Any other
    failure (missing file, malformed CSV, ...) is reported immediately,
    because retrying with a different encoding cannot fix it.

    Args:
        path: Location of the CSV file.

    Returns:
        The parsed table.

    Raises:
        RuntimeError: If the file could not be read. The underlying
            exception is attached as ``__cause__``.
    """
    last_error = None
    # pandas' default (encoding=None) already means UTF-8, so it needs no
    # separate attempt.
    for enc in ("utf-8", "utf-8-sig", "latin1"):
        try:
            return pd.read_csv(path, encoding=enc)
        except UnicodeDecodeError as exc:
            # Wrong guess for this file: remember why and try the next one.
            last_error = exc
        except Exception as exc:
            raise RuntimeError(f"Failed to read CSV: {path}") from exc
    raise RuntimeError(f"Failed to read CSV: {path}") from last_error

# Fail early with a clear message when the main input file is absent.
if not FILES["outbreaks"].exists():
    raise FileNotFoundError("input/Outbreaks.csv not found")

raw = read_csv_smart(FILES["outbreaks"])  # load the raw outbreaks table

# Column name in the source file -> name used in the rest of the notebook.
rename_map = {
    "Country": "country",
    "iso2": "iso2",
    "iso3": "iso3",
    "Year": "year",
    "Disease": "disease",
    "DONs": "source",
    "Definition": "definition",
}

# Make sure every required column is present before going any further.
missing = [c for c in rename_map if c not in raw.columns]
if missing:
    raise ValueError(f"Columns missing in Outbreaks.csv: {missing}")

cols = list(rename_map.keys())
df = raw[cols].rename(columns=rename_map).copy()  # select, rename, detach from raw

# Trim surrounding whitespace in the text columns. Missing values are kept as
# missing: a blanket astype(str) would turn NaN into the literal string "nan",
# which the dropna() below could no longer detect.
for c in ["country", "disease", "source", "definition"]:
    df[c] = df[c].where(df[c].isna(), df[c].astype(str).str.strip())

# These steps run exactly once, after all text columns have been cleaned.
# Unparseable years become <NA> and are removed together with rows that lack
# any of the other key fields.
df["year"] = pd.to_numeric(df["year"], errors="coerce").astype("Int64")
df = df.dropna(subset=["country", "iso3", "year", "disease"]).reset_index(drop=True)
clean_path = OUT / "outbreaks_clean.csv"

# Persist the cleaned data.
df.to_csv(clean_path, index=False)

def _count_rows(frame, keys, label, **groupby_kwargs):
    """Count the rows of `frame` per group of `keys`; the Series is named `label`."""
    return frame.groupby(keys, **groupby_kwargs).size().rename(label)


# Number of outbreak rows per year, ordered by year.
year_counts = (
    _count_rows(df, "year", "outbreaks", dropna=True)
    .reset_index()
    .sort_values("year")
)

# The 20 diseases with the most outbreak rows.
top_diseases = (
    _count_rows(df, "disease", "n_outbreaks")
    .sort_values(ascending=False)
    .head(20)
    .reset_index()
)

# The 20 countries with the most outbreak rows.
top_countries = (
    _count_rows(df, ["country", "iso3"], "n_outbreaks")
    .sort_values(ascending=False)
    .head(20)
    .reset_index()
)

# Number of outbreak rows per country and year, ordered by year then ISO-3 code.
country_year = (
    _count_rows(df, ["iso3", "country", "year"], "n_outbreaks", dropna=True)
    .reset_index()
    .sort_values(["year", "iso3"])
)

# Save the country/year table.
country_year.to_csv(OUT / "outbreaks_country_year_counts.csv", index=False)

# Line chart of the yearly outbreak counts, written to disk as a PNG.
fig_year, ax_year = plt.subplots(figsize=(10, 5))
ax_year.plot(year_counts["year"], year_counts["outbreaks"], marker="o")
ax_year.set_title("Number of recorded outbreaks per year")
ax_year.set_xlabel("Year")
ax_year.set_ylabel("Outbreaks")
ax_year.grid(True, linestyle="--", alpha=0.4)
fig_year.tight_layout()
fig_year.savefig(OUT / "outbreaks_per_year.png")
plt.close(fig_year)  # release the figure once it has been saved

# Animated choropleth: one frame per year, countries coloured by outbreak count.
# The colour range is pinned to the overall 0..max span so that a given colour
# stands for the same count in every frame of the animation.
max_outbreaks = int(country_year["n_outbreaks"].max()) if len(country_year) else 1
fig_anim = px.choropleth(
    country_year,
    locations="iso3",
    color="n_outbreaks",
    hover_name="country",
    animation_frame="year",
    locationmode="ISO-3",
    color_continuous_scale="Viridis",
    range_color=(0, max_outbreaks),
    title="Infectious disease outbreaks per country by year (1996–2024)",
)
fig_anim.update_layout(
    coloraxis_colorbar=dict(title="Outbreaks"),
    margin=dict(l=0, r=0, t=50, b=0),
)
# Self-contained HTML: the plotly.js bundle is embedded in the file.
fig_anim.write_html(OUT / "outbreaks_map_animated.html", include_plotlyjs=True)

# The 12 most frequently recorded diseases become the dropdown entries.
diseases_menu = (
    df.groupby("disease")
      .size()
      .sort_values(ascending=False)
      .head(12)
      .index
      .tolist()
)
if not diseases_menu:
    raise ValueError("No outbreak records available to build the per-disease map")

# Outbreak totals per country for each of those diseases.
disease_country_totals = (
    df[df["disease"].isin(diseases_menu)]
    .groupby(["disease", "iso3", "country"])
    .size()
    .rename("n_outbreaks")
    .reset_index()
)


def _disease_title(disease):
    """Figure title shown while `disease` is the selected dropdown entry."""
    return f"Outbreaks by disease (totals 1996–2024) — {disease}"


def _disease_map(frame, **choropleth_kwargs):
    """Build a choropleth of per-country outbreak totals from `frame`."""
    return px.choropleth(
        frame,
        locations="iso3",
        color="n_outbreaks",
        hover_name="country",
        locationmode="ISO-3",
        color_continuous_scale="Viridis",
        **choropleth_kwargs,
    )


# Base figure: the first disease, titled to match what is displayed before any
# dropdown entry has been clicked.
first = diseases_menu[0]
df0 = disease_country_totals[disease_country_totals["disease"] == first]
fig_drop = _disease_map(df0, title=_disease_title(first))

# Add the remaining diseases as extra traces on the same figure.
for dis in diseases_menu[1:]:
    dfi = disease_country_totals[disease_country_totals["disease"] == dis]
    fig_drop.add_trace(_disease_map(dfi).data[0])

# One dropdown button per disease: show only that disease's trace and retitle.
# The title is addressed through its `title.text` attribute path rather than by
# assigning a bare string to `title`.
buttons = [
    dict(
        label=dis,
        method="update",
        args=[
            {"visible": [k == i for k in range(len(diseases_menu))]},
            {"title.text": _disease_title(dis)},
        ],
    )
    for i, dis in enumerate(diseases_menu)
]

# Attach the dropdown menu and fix the figure size.
fig_drop.update_layout(
    updatemenus=[
        dict(
            type="dropdown",
            x=0.01, y=1.05,
            showactive=True,
            buttons=buttons,
        )
    ],
    margin=dict(l=0, r=0, t=50, b=0),
    width=1200,
    height=800,
)

# Initially only the first disease's trace is visible.
for j, tr in enumerate(fig_drop.data):
    tr.visible = (j == 0)

# Save the map as self-contained HTML (plotly.js bundle embedded).
fig_drop.write_html(OUT / "outbreaks_map_by_disease.html", include_plotlyjs=True)