# GECCO2018 Water Quality — EDA

This notebook explores the dataset structure, basic statistics, and distributions to inform model design.


In [None]:
# Imports and setup
%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from src.data.loaders import load_gecco2018_csv
from src.features.preprocess import impute_and_scale
from src.models.baselines import run_all_baselines
from src.visualization.plots import plot_pairwise_histograms


In [None]:
# Load the dataset into `df`
try:
    df = load_gecco2018_csv()
except FileNotFoundError as err:
    # Surface the loader's message on stdout, then re-raise it unchanged.
    print(err)
    raise
else:
    # Reached only when the load succeeded.
    print(df.shape)

df.head()


In [None]:
# Basic info and missingness
# DataFrame.info() writes its report (dtypes, non-null counts) straight to
# stdout and returns None, so it is called directly; wrapping it in print()
# would append a stray "None" line to the output.
df.info()
# Summary statistics for every column, transposed so each row is a column of
# df; only the first 20 rows of that summary are displayed.
df.describe(include='all').T.head(20)


In [None]:
# Distributions of the numeric columns
# "number" is the string form of the np.number dtype selector.
numeric_cols = list(df.select_dtypes(include="number").columns)
plot_pairwise_histograms(df[numeric_cols], max_cols=12)


In [None]:
# Preprocessing quick pass
# impute_and_scale returns a pair, unpacked here as (scaled_df, scaler).
# NOTE(review): the full frame is passed rather than df[numeric_cols] —
# confirm the helper deals with any non-numeric columns itself.
scaled_df, scaler = impute_and_scale(df)
# Quick sanity check of the result: its shape, then its first rows.
print(scaled_df.shape)
scaled_df.head()


In [None]:
# Baseline anomaly scores
# run_all_baselines is imported with the other project imports in the import
# cell at the top of the notebook, so all dependencies are visible in one place.
X = scaled_df.to_numpy()  # to_numpy() is the recommended replacement for .values
scores = run_all_baselines(X)
# `scores` is consumed as a mapping (name -> score values) via .items().
# For each entry print its name, the array shape, and the NaN-ignoring mean.
for name, s in scores.items():
    print(name, np.asarray(s).shape, np.nanmean(s))
