In [None]:
# Project plotting helpers; apply_style() is invoked once at notebook start
# (presumably it configures the shared chart style — confirm in viz/charts.py).
# NOTE(review): this import runs before the following cell adds src/ to sys.path,
# so on a fresh kernel it only works if airline_revenue_analytics is already
# importable (e.g. installed into the environment) — confirm.
from airline_revenue_analytics.viz.charts import apply_style, PLOT_COLORS
apply_style()
# Light green / light red hex colors; by name they are meant for pass / fail highlighting.
PASS_COLOR = "#D9F2E6"
FAIL_COLOR = "#FCE4E4"
# The negative-value background deliberately reuses the fail color.
NEG_BG_COLOR = FAIL_COLOR


In [None]:
# Project paths (booking pipeline)
from pathlib import Path
import sys

def find_repo_root(start: Path) -> Path:
    """Locate the repository root by walking upwards from ``start``.

    A directory counts as the root when it holds both a ``pyproject.toml``
    and a ``src/airline_revenue_analytics`` entry. ``start`` itself is
    checked first, then each of its parents in order.

    Args:
        start: Directory from which the upward search begins.

    Returns:
        The first matching directory, or ``start`` unchanged when no
        ancestor matches.
    """
    for candidate in (start, *start.parents):
        has_pyproject = (candidate / "pyproject.toml").exists()
        if has_pyproject and (candidate / "src" / "airline_revenue_analytics").exists():
            return candidate
    # Nothing matched: fall back to the starting directory.
    return start

# Resolve the repository root from the current working directory;
# PROJECT_ROOT is kept as an alias of REPO_ROOT.
REPO_ROOT = find_repo_root(Path.cwd())
PROJECT_ROOT = REPO_ROOT
SRC_ROOT = REPO_ROOT / "src"
# Put src/ on sys.path once; the membership check avoids duplicate entries on re-runs.
if str(SRC_ROOT) not in sys.path:
    sys.path.append(str(SRC_ROOT))

# Imported only after sys.path has been prepared above.
from airline_revenue_analytics.config import get_paths

# Path bundle for the "booking" pipeline; the attributes read below are the
# locations this notebook works with.
PATHS = get_paths("booking")
DATA_DIR = REPO_ROOT / "data"
RAW_DIR = PATHS.data_raw
DB_PATH = PATHS.db_path
OUTPUT_DIR = PATHS.outputs_root
FIG_DIR = PATHS.figures
TAB_DIR = PATHS.tables
ART_DIR = PATHS.artifacts

def _rel(p: Path) -> str:
    """Render ``p`` for display without exposing an absolute location.

    Args:
        p: Path (or path-like value) to display.

    Returns:
        The resolved path relative to ``REPO_ROOT`` when that conversion
        succeeds; otherwise only the final path component.
    """
    target = Path(p)
    try:
        relative = target.resolve().relative_to(REPO_ROOT)
    except Exception:
        # Best effort: e.g. the path lies outside the repository.
        return target.name
    return str(relative)

# Echo the resolved locations; only the repo folder name and repo-relative
# paths are printed, so absolute local paths do not end up in saved outputs.
print("REPO_ROOT:", REPO_ROOT.name)
print("DB_PATH:", _rel(DB_PATH))
print("OUTPUT_DIR:", _rel(OUTPUT_DIR))


 
00 — Setup & Data Intake
Goal: prepare the environment, locate the SQLite DB under data/raw/, load the core tables,
and export the Step 2.1–2.2 evidence (table shapes and 5-row samples).


In [None]:
# Environment report, raw-data directory resolution, and output folders
import sys, warnings, pathlib
# NOTE: this silences every warning for the remainder of the kernel session.
warnings.filterwarnings("ignore")

print("Python:", sys.version.split()[0])

# Only the configured raw directory is probed. It is kept as a list so extra
# fallback locations can be added here, and because the DB-loading cell lists
# these candidates in its error hint.
CANDIDATE_RAW_DIRS = [RAW_DIR]

# The first existing candidate wins. If none exists yet, fall back to the first
# entry so the later DB lookup has a concrete location to report as missing.
RAW_DIR = next(
    (candidate.resolve() for candidate in CANDIDATE_RAW_DIRS if candidate.exists()),
    CANDIDATE_RAW_DIRS[0].resolve(),
)

OUT_DIR = OUTPUT_DIR
# NOTE(review): FIG_DIR / TAB_DIR were already set from PATHS in the path-setup
# cell; they are rebound here relative to OUT_DIR — confirm both agree.
FIG_DIR = OUT_DIR / "figures"
TAB_DIR = OUT_DIR / "tables"
for out_dir in (FIG_DIR, TAB_DIR):
    out_dir.mkdir(parents=True, exist_ok=True)

# Print repo-relative paths, consistent with the other path printouts, so that
# absolute local paths are not stored in the notebook output.
print("RAW_DIR:", _rel(RAW_DIR))
print("OUT_DIR:", _rel(OUT_DIR))

In [None]:
import numpy as np, pandas as pd, matplotlib
import matplotlib.pyplot as plt

import sys, pathlib
# Add the repository root to sys.path so Python can find src/. The membership
# check keeps the cell idempotent: re-running it no longer appends duplicates.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

from airline_revenue_analytics.io import find_sqlite, load_core_tables, replace_literal_N
from airline_revenue_analytics.features.segment import parse_json_en

# Record library versions alongside the results for reproducibility.
print("numpy:", np.__version__, "| pandas:", pd.__version__, "| matplotlib:", matplotlib.__version__)

In [None]:
# Find DB and load core tables
try:
    db_path = find_sqlite(RAW_DIR)
except FileNotFoundError as e:
    # Re-raise with a hint listing where the SQLite file is expected. `from e`
    # records the original error as the explicit cause in the traceback.
    raise FileNotFoundError(
        f"{e}\n\n> Tip: put your SQLite file under one of these paths:\n"
        + "\n".join(f"  - {p.resolve()}" for p in CANDIDATE_RAW_DIRS)
    ) from e

# Outside the try block so that only the lookup itself is guarded; printed
# repo-relative like the other path printouts.
print("Using DB:", _rel(db_path))

tables = load_core_tables(db_path)
print("Loaded tables:", list(tables.keys()))

In [None]:
# Clean literal '\N' sentinels via replace_literal_N and write the evidence CSVs
shape_records = []
for table_name in list(tables):
    cleaned = replace_literal_N(tables[table_name])
    tables[table_name] = cleaned  # later cells read the cleaned frame
    n_rows, n_cols = cleaned.shape
    shape_records.append((table_name, n_rows, n_cols))
    # Five-row preview per table, kept as appendix evidence
    cleaned.head(5).to_csv(TAB_DIR / f"sample_{table_name}.csv", index=False)

# One row per table with its dimensions, ordered by table name
shapes_df = (
    pd.DataFrame(shape_records, columns=["table", "rows", "cols"])
    .sort_values("table")
)
shapes_df.to_csv(TAB_DIR / "table_shapes.csv", index=False)

# Plain list of the loaded table names
table_list_df = pd.DataFrame({"table": list(tables.keys())})
table_list_df.to_csv(TAB_DIR / "table_list.csv", index=False)

print("Saved evidence CSVs to:", _rel((TAB_DIR)))
shapes_df

In [None]:
# Optional step: derive English-only lookup columns from multilingual JSON text.
# Each entry is (table, source column, new column); entries whose table or
# source column is absent are skipped.
JSON_TEXT_COLUMNS = (
    ("airports_data", "airport_name", "airport_name_en"),
    ("aircrafts_data", "model", "model_en"),
)

for table_name, source_col, english_col in JSON_TEXT_COLUMNS:
    if table_name in tables and source_col in tables[table_name].columns:
        tables[table_name][english_col] = parse_json_en(tables[table_name][source_col])

print("Parsed multilingual JSON to English where applicable.")

In [None]:
# Preview the bookings table when it was loaded
if "bookings" in tables:
    display(tables["bookings"].head(3))

# One (table, column, dtype) record per column across all tables; the CSV is a
# starting point for writing the data dictionary.
cols_rows = [
    (table_name, column, str(frame[column].dtype))
    for table_name, frame in tables.items()
    for column in frame.columns
]
columns_overview_path = TAB_DIR / "columns_overview.csv"
pd.DataFrame(cols_rows, columns=["table", "column", "dtype"]).to_csv(
    columns_overview_path, index=False
)

print("Wrote:", _rel(columns_overview_path))

 
**Next:** open `01_data_understanding.ipynb` for Step 2.3–2.4 (EDA & data quality).
Figures will be saved under `outputs/booking/figures/`.
