# GECCO2018 Water Quality — Notebook 02: Time-Aware Visualizations

Este notebook foca em visualizações orientadas a séries temporais do dataset GECCO2018.

Objetivos:
- Carregar o dataset e aplicar limpezas básicas (parse de `Time`, coerção de tipos, remoção de nulos).
- Visualizar séries temporais das variáveis sensoriais com indicação de `EVENT`.
- Explorar distribuições e correlações entre sensores.
- Salvar figuras principais em `reports/figures/`.

In [None]:
# Ensure project root on sys.path for `import src.*`
import sys
from pathlib import Path
# The notebook runs one level below the project root, so the parent of the
# current working directory is what must be importable for `import src.*`.
PROJECT_ROOT = Path.cwd().parent
_root_entry = str(PROJECT_ROOT)
if _root_entry not in sys.path:
    # Prepend so the project's own packages are found first.
    sys.path.insert(0, _root_entry)
print("sys.path bootstrapped with:", PROJECT_ROOT)

In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from src.utils.paths import ensure_directories_exist, FIGURES_DIR
from src.data.loaders import load_gecco2018_csv

# Use seaborn's "whitegrid" theme for the plots in this notebook.
sns.set_theme(style="whitegrid")
# Default (width, height) for figures that do not pass their own figsize.
plt.rcParams["figure.figsize"] = (14, 5)

In [None]:
# Ensure data/ and reports/figures directories exist
# NOTE(review): the exact set of directories is decided by the project helper,
# which is not visible in this notebook — confirm in src/utils/paths.
ensure_directories_exist()
# Show the directory that later cells save figures into.
print("FIGURES_DIR:", FIGURES_DIR)

In [None]:
# Load dataset
try:
    df = load_gecco2018_csv()
except FileNotFoundError as missing_file:
    # Put the loader's message in the cell output, then re-raise unchanged so
    # the notebook stops here instead of failing later on an undefined `df`.
    print(missing_file)
    raise
else:
    print(df.shape)

# Last expression of the cell: rendered as the cell's output.
df.head(10)

In [None]:
# Basic cleaning: drop unnamed, parse time, coerce types, drop nulls
# Drop unnamed index column if present
if "Unnamed: 0" in df.columns:
    df = df.drop(columns=["Unnamed: 0"])

# Parse Time; values that cannot be parsed become NaT and are dropped below
df["Time"] = pd.to_datetime(df["Time"], errors="coerce")

# Coerce numeric columns; values that cannot be parsed become NaN and are
# dropped below
numeric_cols = ["Tp", "Cl", "pH", "Redox", "Leit", "Trueb", "Cl_2", "Fm", "Fm_2"]
for c in numeric_cols:
    df[c] = pd.to_numeric(df[c], errors="coerce")

# Ensure EVENT is boolean.
# The comparison is done on a stripped, lower-cased string form so that
# padded values (" TRUE ") and float-coded flags ("1.0", which is what a 0/1
# column turns into once it contains a missing value) are still recognised
# as events instead of silently becoming False.
if df["EVENT"].dtype != bool:
    truthy_tokens = ["true", "1", "1.0", "t", "yes"]
    df["EVENT"] = (
        df["EVENT"].astype(str).str.strip().str.lower().isin(truthy_tokens)
    )

# Drop rows with nulls in Time or numeric sensors
df_clean = df.dropna(subset=["Time"] + numeric_cols)

# Sort by time and set index
df_clean = df_clean.sort_values("Time").reset_index(drop=True)
df_ts = df_clean.set_index("Time").sort_index()

print("After cleaning:", df_ts.shape)
df_ts.head(5)

In [None]:
# Info and missingness check
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line to the output.
df_ts.info()
# Per-column count of missing values (rendered as the cell output).
df_ts.isnull().sum()

In [None]:
# Event count and timeframe
# Summing the EVENT column counts the rows flagged as events.
n_events = int(df_ts["EVENT"].sum())
print("EVENT true count:", n_events)

# First and last timestamps of the time index.
first_ts = df_ts.index.min()
last_ts = df_ts.index.max()
print("Time range:", first_ts, "->", last_ts)

In [None]:
# Time series plots with event overlays
import matplotlib.pyplot as plt

def plot_timeseries_with_events(df_ts, columns, event_col="EVENT", max_cols_per_fig=3, save=False, prefix="ts"):
    """Plot columns as stacked time-series panels and mark event timestamps.

    ``columns`` is split into consecutive groups of at most
    ``max_cols_per_fig``. Each group becomes one figure with one panel per
    column, all panels sharing the x-axis. Index values of the rows whose
    ``event_col`` is truthy are drawn as thin translucent vertical lines on
    every panel.

    Args:
        df_ts: DataFrame whose index supplies the x values.
        columns: Iterable of column names of ``df_ts`` to plot.
        event_col: Name of the event-flag column. When the column is absent
            no event markers are drawn.
        max_cols_per_fig: Maximum number of panels per figure; must be >= 1.
        save: When true, each figure is written to ``FIGURES_DIR`` as
            ``{prefix}_{k}.png`` with ``k`` counting figures from 1.
        prefix: Filename prefix used for saved figures.

    Raises:
        ValueError: If ``max_cols_per_fig`` is smaller than 1.
    """
    # A negative step would make range() empty (nothing plotted, no error) and
    # zero would raise an unrelated-looking range() error; fail clearly instead.
    if max_cols_per_fig < 1:
        raise ValueError("max_cols_per_fig must be >= 1")

    cols = list(columns)

    # Event timestamps do not depend on the plotted column: compute them once.
    # The mask is coerced to bool so a 0/1 integer flag column is treated as a
    # mask rather than as positional indices into the index.
    # NOTE: missing values in the flag column are assumed to have been
    # handled by the caller before plotting.
    event_times = None
    if event_col in df_ts.columns:
        event_mask = df_ts[event_col].astype(bool).to_numpy()
        event_times = df_ts.index[event_mask]
    has_events = event_times is not None and len(event_times) > 0

    for start in range(0, len(cols), max_cols_per_fig):
        subset = cols[start:start + max_cols_per_fig]
        n = len(subset)
        fig, axes = plt.subplots(n, 1, figsize=(14, 4 * n), sharex=True)
        if n == 1:
            # plt.subplots returns a bare Axes for a single panel.
            axes = [axes]
        for ax, col in zip(axes, subset):
            ax.plot(df_ts.index, df_ts[col], color="C0", linewidth=0.8)
            ax.set_ylabel(col)
            ax.set_title(f"{col} over time")
            if has_events:
                ymin, ymax = ax.get_ylim()
                ax.vlines(event_times, ymin=ymin, ymax=ymax, color="crimson", alpha=0.2, linewidth=0.5)
                # vlines extends the data limits and triggers autoscaling, which
                # would pad the y-axis beyond the sensor range; pin it back.
                ax.set_ylim(ymin, ymax)
        # Lay out this figure explicitly rather than whichever one is current.
        fig.tight_layout()
        if save:
            from pathlib import Path as _Path
            out = _Path(FIGURES_DIR) / f"{prefix}_{start//max_cols_per_fig+1}.png"
            fig.savefig(out, dpi=150, bbox_inches="tight")
        plt.show()

# Sensor columns to plot: the same names that the cleaning cell coerces to numeric.
sensor_cols = ["Tp", "Cl", "pH", "Redox", "Leit", "Trueb", "Cl_2", "Fm", "Fm_2"]
# Up to three panels per figure by default; each figure is also saved as
# timeseries_<k>.png under FIGURES_DIR.
plot_timeseries_with_events(df_ts, sensor_cols, save=True, prefix="timeseries")

In [None]:
# Distributions: one 30-bin histogram per numeric column
numeric_cols = df_ts.select_dtypes(include=[np.number]).columns.tolist()
numeric_frame = df_ts[numeric_cols]
numeric_frame.hist(bins=30, figsize=(16, 12))
plt.tight_layout()
plt.show()

In [None]:
# Correlation heatmap
# Pairwise correlation between the numeric columns, drawn on a diverging
# palette centred at zero.
numeric_cols = df_ts.select_dtypes(include=[np.number]).columns.tolist()
corr = df_ts[numeric_cols].corr()
fig_corr, ax_corr = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=False, cmap="vlag", center=0, ax=ax_corr)
ax_corr.set_title("Correlation heatmap (numeric features)")
fig_corr.tight_layout()
plt.show()

In [None]:
# Resampled means as a smoother view
sensor_cols = ["Tp", "Cl", "pH", "Redox", "Leit", "Trueb", "Cl_2", "Fm", "Fm_2"]
# Hourly bins. The lowercase "h" alias is used because the uppercase "H"
# alias is deprecated since pandas 2.2 and emits a FutureWarning.
window = "1h"
# Mean of each sensor within every bin of the time index.
df_roll = df_ts[sensor_cols].resample(window).mean()
# All sensors share a single axes (subplots=False).
_ = df_roll.plot(subplots=False, figsize=(14,6), linewidth=1.2)
plt.title(f"Resampled mean ({window})")
plt.tight_layout()
plt.show()

In [None]:
# Save correlation heatmap to figures
from pathlib import Path as _Path

# Recompute the correlation matrix so this cell does not depend on an earlier
# plotting cell having been run.
numeric_cols = df_ts.select_dtypes(include=[np.number]).columns.tolist()
corr = df_ts[numeric_cols].corr()

heatmap_fig = plt.figure(figsize=(10, 8))
heatmap_ax = sns.heatmap(corr, annot=False, cmap="vlag", center=0)
heatmap_ax.set_title("Correlation heatmap (numeric features)")
heatmap_fig.tight_layout()

# Write the file before showing the figure.
out = _Path(FIGURES_DIR) / "correlation_heatmap.png"
heatmap_fig.savefig(out, dpi=150, bbox_inches="tight")
plt.show()
print("Saved:", out)

## Resumo
- `df_ts`: DataFrame limpo e indexado por tempo.
- Visualizações: séries temporais com marcação de eventos, distribuições e correlação.
- Figuras salvas em `reports/figures/`.