In [1]:
# Install packages (quietly)
# NOTE(review): versions are unpinned, so a fresh runtime may resolve different
# releases than the ones this notebook was developed against. pyjanitor is
# installed here but is not imported anywhere in this cell.
!pip -q install pandas numpy matplotlib seaborn statsmodels patsy pyjanitor tqdm

# Standard-library helpers plus the core data/plotting stack used by later cells.
import os, re, io, warnings, numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
# Suppresses every warning for the rest of the session; remove this line while
# debugging, since it also hides warnings emitted by the libraries below.
warnings.filterwarnings("ignore")
# Default resolution for every figure created after this point.
plt.rcParams["figure.dpi"] = 120

# Modelling libraries: statsmodels (array and formula APIs, negative binomial
# count model aliased as NB2) and patsy (design matrices, B-spline basis).
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.discrete.discrete_model import NegativeBinomial as NB2
from patsy import dmatrices
from patsy import bs
from tqdm import tqdm
# Registers tqdm's progress-bar variants of apply (e.g. `progress_apply`) on pandas objects.
tqdm.pandas()


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/215.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m215.4/215.4 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
from google.colab import files
from zipfile import ZipFile

# Ask the user for a ZIP archive via the Colab upload widget. `uploaded` is
# used below as a mapping from file name to the uploaded bytes.
uploaded = files.upload()        # choose your ZIP (e.g., "ap stat dataset.zip")
if not uploaded:
    # Without this guard an empty/cancelled upload surfaces as a bare IndexError.
    raise RuntimeError("No file was uploaded. Re-run this cell and choose a ZIP archive.")

# Only the first uploaded file is used; any additional uploads are ignored.
zip_name = next(iter(uploaded))
print("Uploaded:", zip_name)

# Unpack the archive from memory into the working data directory.
os.makedirs("/content/data", exist_ok=True)
with ZipFile(io.BytesIO(uploaded[zip_name])) as zf:
    zf.extractall("/content/data")

# Show extracted files
for root, _, files_ in os.walk("/content/data"):
    for f in files_:
        if f.lower().endswith((".csv", ".xlsx", ".txt")):
            print(os.path.join(root, f))

# Every later cell writes its tables and figures here.
os.makedirs("/content/outputs", exist_ok=True)


Saving ap stat dataset.zip to ap stat dataset.zip
Uploaded: ap stat dataset.zip
/content/data/pollutants_and_hosp.csv


In [3]:
# Find first CSV (edit if you want a specific one)
csv_candidates = []
for root, _, files_ in os.walk("/content/data"):
    for f in files_:
        if f.lower().endswith(".csv"):
            csv_candidates.append(os.path.join(root, f))
# os.walk order depends on the filesystem; sorting makes "first" reproducible
# when the archive contains more than one CSV.
csv_candidates.sort()

assert len(csv_candidates) > 0, "No CSV found inside the ZIP. Please check the archive."
CSV_PATH = csv_candidates[0]
print("Using CSV:", CSV_PATH)

# Read with a separator guess: stop at the first separator that yields more
# than 3 columns; otherwise keep the widest successful parse so a genuinely
# narrow file is not replaced by a later one-column mis-parse.
df_raw = None
read_error = None
for sep in [",",";","\t","|"]:
    try:
        parsed = pd.read_csv(CSV_PATH, sep=sep, engine="python")
    except Exception as exc:
        # Best effort: remember the failure and try the next separator.
        read_error = exc
        continue
    if df_raw is None or parsed.shape[1] > df_raw.shape[1]:
        df_raw = parsed
    if parsed.shape[1] > 3:
        break

if df_raw is None:
    # Every attempt raised; fail with a clear message instead of a NameError.
    raise RuntimeError(f"Could not read {CSV_PATH} with any of the tried separators.") from read_error

# Work on a copy so the untouched parse stays available as df_raw.
df = df_raw.copy()

# Clean names: lower, underscores, strip punctuation
df.columns = (
    df.columns.str.strip()
              .str.lower()
              .str.replace(r"[^\w]+", "_", regex=True)
              .str.replace("__+", "_", regex=True)
              .str.strip("_")
)

print("Shape:", df.shape)
df.head(3)


Using CSV: /content/data/pollutants_and_hosp.csv
Shape: (1462, 21)


Unnamed: 0,data,sih_sul_i_old,sih_sul_j_old,sih_sul_i_kid,sih_sul_j_kid,sih_gv_i_old,sih_gv_j_old,sih_gv_i_kid,sih_gv_j_kid,pm25_gv,...,so2_gv,no2_gv,co_gv,o3_gv,co_sul,o3_sul,no2_sul,so2_sul,pm10_sul,pm25_sul
0,01/01/2015,0,3,0,1,11,5,1,9,11.680254,...,10.667604,7.143889,296.19325,31.951042,298.583542,35.724583,9.857808,25.510417,27.001052,5.857146
1,02/01/2015,0,1,0,1,10,6,0,8,12.645833,...,10.544443,9.440139,341.62775,30.016771,372.735464,36.766667,15.922458,37.404937,34.650446,9.369452
2,03/01/2015,1,2,0,1,2,2,1,2,11.020833,...,9.020417,9.303472,344.044,30.358958,327.966098,34.597917,12.472983,26.540933,33.638609,8.496304


In [4]:
# --- Date detection ---
# A column is a candidate when its cleaned name is one of the usual date labels
# or simply starts with "date"/"data"; the first candidate in column order wins.
date_candidates = []
for c in df.columns:
    is_known_label = c in ["date","data","dt","day","day_date"]
    if is_known_label or re.search(r"^(date|data)", c):
        date_candidates.append(c)
assert date_candidates, f"Couldn't auto-detect a date column. Found: {df.columns.tolist()[:20]}"
DATE_COL = date_candidates[0]

def try_parse_date(s, formats=("%Y-%m-%d","%d/%m/%Y","%d-%m-%Y","%m/%d/%Y")):
    """Parse date-like values by trying explicit formats in order.

    Parameters
    ----------
    s : Series or other input accepted by ``pd.to_datetime``
        The values to parse.
    formats : iterable of str, optional
        ``strftime``-style formats, tried in the given order. A format is
        accepted only if it parses *every* value, so order matters for
        ambiguous inputs: with the defaults, day-first ``%d/%m/%Y`` is tried
        before month-first ``%m/%d/%Y``.

    Returns
    -------
    The result of ``pd.to_datetime`` for the first format that parses all
    values. If no format fits, falls back to pandas' own inference with
    ``errors="coerce"``, so unparseable entries become ``NaT`` instead of
    raising.
    """
    for fmt in formats:
        try:
            return pd.to_datetime(s, format=fmt, errors="raise")
        except Exception:
            # This format does not fit all values; move on to the next one.
            continue
    return pd.to_datetime(s, errors="coerce")

df["date"] = try_parse_date(df[DATE_COL])
assert df["date"].notna().any(), "Date parsing failed—please set DATE_COL or add a format."

# Rows with an unparseable date cannot be placed on the time axis and would
# make the integer day index below fail, so drop them and say so explicitly.
n_unparsed = int(df["date"].isna().sum())
if n_unparsed:
    print(f"Warning: dropping {n_unparsed} row(s) whose date could not be parsed.")
    df = df[df["date"].notna()]
df = df.sort_values("date").reset_index(drop=True)

# --- Guess outcomes & pollutants ---
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()

# Outcomes (daily counts) heuristic
guess_outcomes = [c for c in num_cols if re.search(r"(sih|hospital|admi|resp|icd|cases|count)", c)]
if len(guess_outcomes) > 12:
    # NOTE(review): the ascending sort keeps the 12 series with the LOWEST
    # means — confirm this is the intended selection.
    means = df[guess_outcomes].mean().sort_values()
    guess_outcomes = means.index.tolist()[:12]

# Pollutants
# Match each key as a whole token (bounded by the start/end of the name or a
# non-alphanumeric character such as "_"), so that "co" does not pick up
# unrelated columns like "..._count".
pollutant_keys = ["pm25","pm10","no2","so2","co","o3"]
pollutant_pattern = re.compile(r"(?<![a-z0-9])(?:" + "|".join(pollutant_keys) + r")(?![a-z0-9])")
guess_pollutants = [c for c in num_cols if pollutant_pattern.search(c)]

# Region suffixes (e.g., _gv, _sul)
suffixes = set()
for c in guess_outcomes + guess_pollutants:
    m = re.search(r"_(gv|sul|north|south|east|west)$", c)
    if m: suffixes.add(m.group(1))
# Sorted so the reported order is the same on every run (set order is not).
suffixes = sorted(suffixes) if suffixes else ["gv","sul"]

print("Guessed DATE_COL:", DATE_COL)
print("Guessed OUTCOMES:", guess_outcomes)
print("Guessed POLLUTANTS:", guess_pollutants)
print("Detected/assumed region suffixes:", suffixes)

# ---- EDIT below if needed ----
OUTCOMES   = guess_outcomes      # e.g., ["sih_gv_j_kid", "sih_gv_j_old", ...]
POLLUTANTS = guess_pollutants    # e.g., ["no2_gv","co_gv","pm10_gv", "no2_sul", ...]
REGION_SUFFIX_PRIORITY = ["gv","sul"]
# --------------------------------

# Time helpers
df["dow"]  = df["date"].dt.day_name().str[:3]      # three-letter weekday label
df["year"] = df["date"].dt.year
df["t"]    = (df["date"] - df["date"].min()).dt.days.astype(int)  # days since first observation

print("Final OUTCOMES:", OUTCOMES)
print("Final POLLUTANTS:", POLLUTANTS)


Guessed DATE_COL: data
Guessed OUTCOMES: ['sih_sul_i_old', 'sih_sul_j_old', 'sih_sul_i_kid', 'sih_sul_j_kid', 'sih_gv_i_old', 'sih_gv_j_old', 'sih_gv_i_kid', 'sih_gv_j_kid']
Guessed POLLUTANTS: ['pm25_gv', 'pm10_gv', 'so2_gv', 'no2_gv', 'co_gv', 'o3_gv', 'co_sul', 'o3_sul', 'no2_sul', 'so2_sul', 'pm10_sul', 'pm25_sul']
Detected/assumed region suffixes: ['gv', 'sul']
Final OUTCOMES: ['sih_sul_i_old', 'sih_sul_j_old', 'sih_sul_i_kid', 'sih_sul_j_kid', 'sih_gv_i_old', 'sih_gv_j_old', 'sih_gv_i_kid', 'sih_gv_j_kid']
Final POLLUTANTS: ['pm25_gv', 'pm10_gv', 'so2_gv', 'no2_gv', 'co_gv', 'o3_gv', 'co_sul', 'o3_sul', 'no2_sul', 'so2_sul', 'pm10_sul', 'pm25_sul']


In [5]:
# Table 1: Descriptive statistics (save for report)
# One row per outcome/pollutant column (deduplicated, alphabetical), with the
# summary statistics as columns.
summary_columns = sorted(set(OUTCOMES + POLLUTANTS))
desc = df[summary_columns].describe().transpose()
desc.to_csv("/content/outputs/descriptive_stats.csv")
desc.head(10)


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
co_gv,1462.0,398.980719,87.866808,189.541667,337.502458,389.209333,451.858792,752.367232
co_sul,1445.0,303.176184,95.516844,85.983,242.43,293.034583,350.116042,1926.607986
no2_gv,1451.0,16.822951,6.602466,4.425,11.735208,15.605833,21.491267,41.885652
no2_sul,1459.0,6.697719,3.03775,1.425639,4.431506,6.188553,8.409635,20.101683
o3_gv,1462.0,29.484663,9.541588,9.208472,22.399358,28.317708,35.065486,78.57625
o3_sul,1109.0,37.803455,15.443234,0.318182,28.56875,36.694583,45.955417,339.02
pm10_gv,1462.0,24.207882,7.691013,8.333333,18.709725,22.936632,28.533282,58.625
pm10_sul,1451.0,20.779864,7.539263,4.172766,15.35437,19.641146,25.125401,56.539015
pm25_gv,1457.0,10.947671,3.350204,1.8,8.625,10.208333,12.520833,35.289474
pm25_sul,1462.0,5.743247,2.731668,1.498056,3.784722,5.196135,6.890375,19.564083


In [7]:
def save_timeseries_plot(frame, column, out_dir="/content/outputs"):
    """Plot one daily series against ``date`` and save it as ``ts_<column>.png``.

    Parameters
    ----------
    frame : DataFrame
        Must contain a ``date`` column and ``column``.
    column : str
        Name of the series to plot; also used in the title and file name.
    out_dir : str, optional
        Directory the PNG is written to.
    """
    ax = frame.plot(x="date", y=column, figsize=(8,3), legend=False, title=f"Daily {column}")
    ax.set_xlabel("")
    fig = ax.get_figure()
    fig.tight_layout()
    fig.savefig(os.path.join(out_dir, f"ts_{column}.png"))
    # Close this specific figure so looping over many columns does not
    # accumulate open figures in memory.
    plt.close(fig)

# Time-series (outcomes)
for y in OUTCOMES:
    save_timeseries_plot(df, y)

# Time-series (pollutants)
for p in POLLUTANTS:
    save_timeseries_plot(df, p)


In [8]:
# Pollutant correlation (matrix + heatmap)
# Skipped when fewer than two pollutant columns are configured.
if len(POLLUTANTS) >= 2:
    cm = df[POLLUTANTS].corr(method="pearson")
    fig, ax = plt.subplots(figsize=(6,5))
    sns.heatmap(
        cm,
        vmin=-1,
        vmax=1,
        cmap="coolwarm",
        square=True,
        cbar_kws={"shrink":0.8},
        ax=ax,
    )
    ax.set_title("Pollutant correlation")
    fig.tight_layout()
    fig.savefig("/content/outputs/corr_pollutants.png")
    plt.close(fig)
    # Keep the numeric matrix alongside the figure for the report.
    cm.to_csv("/content/outputs/corr_pollutants.csv")
