# Imports and Libraries

In [27]:
import pandas as pd
import numpy as np
from pathlib import Path

# Input locations. DATA_DIR is a relative path, so it is resolved against the
# directory the notebook kernel was started in.
DATA_DIR = Path("./Data")
FLIGHTS_PATH = DATA_DIR.joinpath("Flight_on_time_HIX_CA.csv")
WEATHER_PATH = DATA_DIR.joinpath("weather_cleaned_HIX.csv")


In [31]:
# ---- Flights ----------------------------------------------------------------
# Read the flights CSV and normalise every header: lower-case, trimmed, spaces
# replaced by underscores, so the column checks below use one spelling.
f = pd.read_csv(FLIGHTS_PATH)
f.columns = f.columns.map(lambda name: name.lower().strip().replace(" ", "_"))

required_flight_cols = {"flightdate", "scheduled_departure_time"}
if required_flight_cols - set(f.columns):
    raise ValueError("Expected 'FlightDate' and 'scheduled_departure_time' in flights.")

# Build the departure timestamp from "<date> <time>" text. Anything that cannot
# be parsed becomes NaT (errors="coerce") and is removed further down.
flight_date_text = f["flightdate"].astype(str).str.strip()
flight_time_text = f["scheduled_departure_time"].astype(str).str.strip()
f["dep_dt"] = pd.to_datetime(
    flight_date_text + " " + flight_time_text,
    errors="coerce",
    utc=False,
)

# Report how many rows are lost, then keep only parseable rows in time order.
flight_invalid = f["dep_dt"].isna()
bad_f = flight_invalid.sum()
if bad_f > 0:
    print(f"Dropping {bad_f} flights with invalid dep_dt")
f = f.loc[~flight_invalid].sort_values("dep_dt").reset_index(drop=True)

# ---- Weather ----------------------------------------------------------------
# Same treatment for the weather CSV: normalise headers, require a 'datetime'
# column, parse it into wx_dt, drop unparseable rows, sort chronologically.
w = pd.read_csv(WEATHER_PATH)
w.columns = w.columns.map(lambda name: name.lower().strip().replace(" ", "_"))

if "datetime" not in set(w.columns):
    raise ValueError("Expected 'datetime' column in weather.")

w["wx_dt"] = pd.to_datetime(w["datetime"], errors="coerce", utc=False)

weather_invalid = w["wx_dt"].isna()
bad_w = weather_invalid.sum()
if bad_w > 0:
    print(f"Dropping {bad_w} weather rows with invalid wx_dt")
w = w.loc[~weather_invalid].sort_values("wx_dt").reset_index(drop=True)


Dropping 66 flights with invalid dep_dt


In [32]:
# Sanity check: show the first and last timestamp on each side of the join so
# the overlap between flights and weather can be compared by eye.
dep_first, dep_last = f["dep_dt"].min(), f["dep_dt"].max()
wx_first, wx_last = w["wx_dt"].min(), w["wx_dt"].max()

print("FLIGHTS dep_dt range:", dep_first, "→", dep_last)
print("WEATHER wx_dt range:", wx_first, "→", wx_last)


FLIGHTS dep_dt range: 2019-12-01 00:15:00 → 2019-12-31 21:27:00
WEATHER wx_dt range: 2019-11-30 00:00:00 → 2019-12-31 23:00:00


In [33]:
# Attach to every flight the most recent weather observation taken at or before
# its scheduled departure (direction="backward"), provided that observation is
# no more than 3 hours old. Flights without such an observation keep NaN/NaT in
# the weather columns and are flagged through `wx_missing`.

# Integer join keys, in nanoseconds.
# - `Series.view("int64")` is deprecated and emitted a FutureWarning here;
#   `astype` is the supported replacement.
# - The explicit cast to datetime64[ns] pins the unit of the integer keys to
#   nanoseconds so they always agree with TOL_NS (Timedelta.value is in ns),
#   even if the parsed datetime columns use a coarser resolution.
f["dep_key"] = f["dep_dt"].astype("datetime64[ns]").astype("int64")
w["wx_key"] = w["wx_dt"].astype("datetime64[ns]").astype("int64")

# merge_asof requires both inputs to be sorted on their join key.
f = f.sort_values("dep_key").reset_index(drop=True)
w = w.sort_values("wx_key").reset_index(drop=True)

# Maximum allowed age of a weather observation, expressed in nanoseconds.
# Built from keyword arguments instead of the deprecated "3H" string alias.
TOL_NS = pd.Timedelta(hours=3).value

merged = pd.merge_asof(
    f, w,
    left_on="dep_key",
    right_on="wx_key",
    direction="backward",
    tolerance=TOL_NS
)

# wx_dt is NaT exactly when no observation fell inside the tolerance window.
merged["wx_missing"] = merged["wx_dt"].isna()
# Age of the matched observation at departure time, in minutes (NaN if unmatched).
merged["wx_staleness_min"] = (merged["dep_dt"] - merged["wx_dt"]).dt.total_seconds() / 60

merged[["flightdate","scheduled_departure_time","dep_dt","wx_dt","wx_staleness_min","wx_missing"]].head(10)


  f["dep_key"] = f["dep_dt"].view("int64")
  w["wx_key"]  = w["wx_dt"].view("int64")
  TOL_NS = pd.Timedelta("3H").value


Unnamed: 0,flightdate,scheduled_departure_time,dep_dt,wx_dt,wx_staleness_min,wx_missing
0,2019-12-01,00:15:00,2019-12-01 00:15:00,2019-12-01 00:00:00,15.0,False
1,2019-12-01,00:15:00,2019-12-01 00:15:00,2019-12-01 00:00:00,15.0,False
2,2019-12-01,00:15:00,2019-12-01 00:15:00,2019-12-01 00:00:00,15.0,False
3,2019-12-01,00:15:00,2019-12-01 00:15:00,2019-12-01 00:00:00,15.0,False
4,2019-12-01,06:00:00,2019-12-01 06:00:00,2019-12-01 06:00:00,0.0,False
5,2019-12-01,06:00:00,2019-12-01 06:00:00,2019-12-01 06:00:00,0.0,False
6,2019-12-01,06:15:00,2019-12-01 06:15:00,2019-12-01 06:00:00,15.0,False
7,2019-12-01,06:20:00,2019-12-01 06:20:00,2019-12-01 06:00:00,20.0,False
8,2019-12-01,06:55:00,2019-12-01 06:55:00,2019-12-01 06:00:00,55.0,False
9,2019-12-01,07:00:00,2019-12-01 07:00:00,2019-12-01 07:00:00,0.0,False


In [34]:
# Coverage report: how many flights received a weather observation within the
# tolerance window, and how old those observations were at departure.
has_weather = ~merged["wx_missing"]

n_total = len(merged)
n_match = has_weather.sum()
# Guard against an empty merge result so the percentage does not divide by zero.
match_share = n_match / n_total if n_total else np.nan

print(f"Rows: {n_total:,}")
print(f"Matched within tolerance: {n_match:,} ({match_share:.1%})")

print("\nStaleness (minutes) — matched only:")
staleness_matched = merged.loc[has_weather, "wx_staleness_min"]
print(staleness_matched.describe())


Rows: 5,190
Matched within tolerance: 5,190 (100.0%)

Staleness (minutes) — matched only:
count    5190.000000
mean       27.764162
std        18.491144
min         0.000000
25%        10.000000
50%        30.000000
75%        45.000000
max        59.000000
Name: wx_staleness_min, dtype: float64
