In [4]:
import pandas as pd
from ydata_profiling import ProfileReport
from pathlib import Path
import json

RAW = Path("https://raw.githubusercontent.com/martinsasia/Aeropuertos/main/data/raw/aeropuertos.csv")
DOCS = Path("https://raw.githubusercontent.com/martinsasia/Aeropuertos/tree/main/docs")
#INTERIM = Path("data/interim")
#DOCS.mkdir(exist_ok=True, parents=True)
#INTERIM.mkdir(exist_ok=True, parents=True)

# 1) Carga
flights = pd.read_csv(RAW/"S2_flights.csv")
airports = pd.read_csv(RAW/"S1_airports.csv")

# 2) Perfilado HTML
ProfileReport(flights, title="Profiling - Flights", explorative=True)\
    .to_file(DOCS/"flights_profile.html")
ProfileReport(airports, title="Profiling - Airports", explorative=True)\
    .to_file(DOCS/"airports_profile.html")

# 3) Resumen estructural
def schema_summary(df):
    return (pd.DataFrame({
        "column": df.columns,
        "dtype": [str(t) for t in df.dtypes],
        "null_pct": df.isna().mean().values,
        "n_unique": [df[c].nunique(dropna=True) for c in df.columns]
    }))

sch_f = schema_summary(flights)
sch_a = schema_summary(airports)
sch_f.to_csv(INTERIM/"schema_flights.csv", index=False)
sch_a.to_csv(INTERIM/"schema_airports.csv", index=False)

# 4) Chequeos mínimos
summary = {
    "flights": {
        "rows": len(flights),
        "cols": flights.shape[1],
        "min_rows_ok": len(flights) >= 10000,
        "min_cols_ok": flights.shape[1] >= 15
    },
    "airports": {
        "rows": len(airports),
        "cols": airports.shape[1]
    }
}
(DOCS/"profiling_summary.json").write_text(json.dumps(summary, indent=2))

# 5) Sugerencias rápidas (ejemplo)
suggestions = {
    "possible_keys": {
        "airports": ["IATA", "ICAO"],
        "flights_to_airports_join": {
            "left": ["Origin", "Destination"],
            "right": "IATA"
        }
    },
    "datetime_candidates": [c for c in flights.columns if "sched" in c.lower() or "actual" in c.lower()],
}
(DOCS/"profiling_suggestions.json").write_text(json.dumps(suggestions, indent=2))

print("✅ Profiling completado. Revisa docs/*.html y docs/profiling_summary.json")


URLError: <urlopen error no host given>