In [4]:
import os
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
import plotly.express as px
import plotly.graph_objects as go

# Configuration: input file and the directory that receives generated artefacts
DATA_PATH = "hotel_bookings.csv"
OUTDIR = "outputs"
os.makedirs(OUTDIR, exist_ok=True)  # tolerate an already-existing directory

# Matplotlib typography and resolution defaults
plt.rcParams.update(
    {
        "figure.dpi": 160,
        "savefig.dpi": 200,
        "font.size": 11,
        "axes.titlesize": 14,
        "axes.labelsize": 11,
        "legend.fontsize": 10,
    }
)

# Plotly template shared by the interactive visualizations
PLOTLY_TEMPLATE = "plotly_white"

# Load the bookings table and normalise the columns used by the charts
df = pd.read_csv(DATA_PATH)
df["is_canceled"] = df["is_canceled"].astype(int)

# "children" can contain missing values; count them as zero
if "children" in df.columns:
    df["children"] = df["children"].fillna(0)

# Ordered categorical so months sort in calendar order
month_order = [
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
]
df["arrival_date_month"] = pd.Categorical(
    df["arrival_date_month"], categories=month_order, ordered=True
)

# Assemble "arrival_date" from the year / month-name / day-of-month columns;
# combinations that do not form a valid date become NaT (errors="coerce")
month_to_num = {name: number for number, name in enumerate(month_order, start=1)}
df["arrival_month_num"] = df["arrival_date_month"].astype(str).map(month_to_num)
arrival_parts = {
    "year": df["arrival_date_year"],
    "month": df["arrival_month_num"],
    "day": df["arrival_date_day_of_month"],
}
df["arrival_date"] = pd.to_datetime(arrival_parts, errors="coerce")

# Derived features for storytelling
df["total_nights"] = df["stays_in_weekend_nights"].add(df["stays_in_week_nights"])
df["party_size"] = df["adults"].add(df["children"]).add(df["babies"])
df["has_children"] = df["children"].gt(0) | df["babies"].gt(0)

# Tame ADR outliers by clipping to the 1st-99th percentile range
adr_bounds = df["adr"].quantile([0.01, 0.99])
adr_p01, adr_p99 = adr_bounds.iloc[0], adr_bounds.iloc[1]
df["adr_clip"] = df["adr"].clip(lower=adr_p01, upper=adr_p99)

# First day of the arrival month (derived from arrival_date, despite the name)
df["booking_month"] = df["arrival_date"].dt.to_period("M").dt.to_timestamp()

# Save matplotlib figure
def save_mpl(fig, filename):
    """Write `fig` to OUTDIR/`filename`, then close it.

    The figure gets a tight layout before saving and is saved with
    bbox_inches="tight"; it is closed afterwards via plt.close.
    """
    destination = os.path.join(OUTDIR, filename)
    fig.tight_layout()
    fig.savefig(destination, bbox_inches="tight")
    plt.close(fig)


# ILLUSTRATION 1 - TIME SERIES: bookings per arrival month and hotel (left axis)
# with the overall monthly cancellation rate on a secondary axis (right axis).
monthly = (
    df.groupby(["booking_month", "hotel"], dropna=False)
    .agg(
        bookings=("hotel", "size"),
        cancel_rate=("is_canceled", "mean"),
        adr=("adr_clip", "mean"),
    )
    .reset_index()
    .sort_values("booking_month")
)

# Overall cancellation rate per month, pooled over every booking. Averaging the
# per-hotel rates stored in `monthly` would weight each hotel equally no matter
# how many bookings it has, which is not the overall rate the legend promises.
# Computed once (the groupby result is sorted by month) and reused for x and y.
overall_cancel_rate = df.groupby("booking_month")["is_canceled"].mean()

fig1 = go.Figure()
# One bookings line per hotel
for hotel_name in monthly["hotel"].dropna().unique():
    hotel_rows = monthly[monthly["hotel"] == hotel_name]
    fig1.add_trace(
        go.Scatter(
            x=hotel_rows["booking_month"],
            y=hotel_rows["bookings"],
            mode="lines",
            name=f"Bookings — {hotel_name}",
            yaxis="y1",
        )
    )
# Dashed cancellation-rate line bound to the secondary axis
fig1.add_trace(
    go.Scatter(
        x=overall_cancel_rate.index,
        y=overall_cancel_rate.values,
        mode="lines",
        name="Cancellation rate (overall)",
        yaxis="y2",
        line=dict(dash="dash"),
    )
)
fig1.update_layout(
    template=PLOTLY_TEMPLATE,
    title="Monthly demand and cancellation risk (seasonality + trend)",
    xaxis_title="Arrival month",
    yaxis=dict(title="Bookings (count)", side="left"),
    yaxis2=dict(title="Cancellation rate", overlaying="y", side="right", tickformat=".0%"),
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="left", x=0),
    margin=dict(l=60, r=60, t=70, b=50),
)


# ILLUSTRATION 2 - CATEGORIC + NUMERIC: lead-time distribution per booking outcome
outcome_label = np.where(df["is_canceled"].eq(1), "Canceled", "Not canceled")
df_v2 = df.assign(**{"Canceled?": outcome_label})  # new frame; df itself is untouched
fig2 = px.violin(
    df_v2,
    x="Canceled?",
    y="lead_time",
    color="Canceled?",
    box=True,
    points="suspectedoutliers",
    template=PLOTLY_TEMPLATE,
    title="Lead time differs strongly between canceled vs. completed bookings",
)
fig2.update_layout(
    xaxis_title="Outcome",
    yaxis_title="Lead time (days)",
    legend_title_text="Outcome",
)


# ILLUSTRATION 3 - RELATIONAL: cancellation rate per market segment x distribution channel
segment_channel_groups = df.groupby(["market_segment", "distribution_channel"], dropna=False)
pivot = segment_channel_groups.agg(
    bookings=("is_canceled", "size"),
    cancel_rate=("is_canceled", "mean"),
).reset_index()
# Keep only combinations with at least 150 bookings
pivot = pivot.loc[pivot["bookings"] >= 150].copy()
# Segments as rows, channels as columns; filtered-out combinations show up as NaN
heat = pivot.pivot(index="market_segment", columns="distribution_channel", values="cancel_rate")
heatmap_trace = go.Heatmap(
    z=heat.values,
    x=heat.columns.astype(str),
    y=heat.index.astype(str),
    colorbar=dict(title="Cancel rate", tickformat=".0%"),
)
fig3 = go.Figure(data=heatmap_trace)
fig3.update_layout(
    template=PLOTLY_TEMPLATE,
    title="Cancellation rate by market segment and distribution channel (frequent combinations only)",
    xaxis_title="Distribution channel",
    yaxis_title="Market segment",
    margin=dict(l=90, r=40, t=70, b=50),
)


# ILLUSTRATION 4 - RELATIONAL: lead time vs. clipped ADR on a seeded random sample
sample_size = min(len(df), 25000)  # cap the number of plotted points
df_s = df.sample(n=sample_size, random_state=42).assign(
    **{"Canceled?": lambda rows: np.where(rows["is_canceled"] == 1, "Canceled", "Not canceled")}
)
fig4 = px.scatter(
    df_s,
    x="lead_time",
    y="adr_clip",
    color="Canceled?",
    opacity=0.35,
    trendline="lowess",
    template=PLOTLY_TEMPLATE,
    title="Relationship between lead time and ADR (with LOWESS trend) by cancellation outcome",
    labels={"lead_time": "Lead time (days)", "adr_clip": "ADR (clipped to 1st–99th pct)"},
)
fig4.update_layout(legend=dict(orientation="h", y=1.02, x=0))


# ILLUSTRATION 5 - MAP: cancellation rate by guest country
# Rows with a missing country are left out of the grouping (groupby drops NaN
# keys by default). Keeping them and casting to str would produce a literal
# "nan" location code, which is not a country and cannot be placed on the map.
by_country = (
    df.groupby("country")
    .agg(
        bookings=("country", "size"),
        cancel_rate=("is_canceled", "mean"),
        adr=("adr_clip", "mean"),
    )
    .reset_index()
)
# Only countries with at least 200 bookings, so rates are not driven by tiny samples
by_country = by_country[by_country["bookings"] >= 200].copy()
by_country["country"] = by_country["country"].astype(str)
fig5 = px.choropleth(
    by_country,
    locations="country",
    # Explicit for readability; assumes the column holds ISO-3 codes — TODO confirm
    locationmode="ISO-3",
    color="cancel_rate",
    hover_name="country",
    hover_data={"bookings": ":,", "adr": ":.1f", "cancel_rate": ":.0%"},
    color_continuous_scale="Reds",
    template=PLOTLY_TEMPLATE,
    title="Global cancellation rate by guest country (countries with ≥200 bookings)",
)
fig5.update_layout(margin=dict(l=10, r=10, t=70, b=10))


# ILLUSTRATION 6 - FLOW: market segment -> deposit type -> outcome (six largest segments)
flow = df.assign(Outcome=np.where(df["is_canceled"] == 1, "Canceled", "Not canceled"))
top_segments = flow["market_segment"].value_counts().head(6).index
flow = flow[flow["market_segment"].isin(top_segments)].copy()

# Link tables: one row per (left node, right node) pair with its booking count
links1 = flow.groupby(["market_segment", "deposit_type"]).size().reset_index(name="value")
links2 = flow.groupby(["deposit_type", "Outcome"]).size().reset_index(name="value")

# Every distinct label becomes one Sankey node; node_map gives its position
node_labels = pd.concat([links1["market_segment"], links1["deposit_type"], links2["Outcome"]])
nodes = pd.Index(node_labels.unique())
node_map = {label: position for position, label in enumerate(nodes)}

# Segment -> deposit links first, then deposit -> outcome links
source = [node_map[label] for label in links1["market_segment"]]
source += [node_map[label] for label in links2["deposit_type"]]
target = [node_map[label] for label in links1["deposit_type"]]
target += [node_map[label] for label in links2["Outcome"]]
value = [int(count) for count in links1["value"]]
value += [int(count) for count in links2["value"]]

sankey_trace = go.Sankey(
    node=dict(pad=18, thickness=18, label=[str(n) for n in nodes]),
    link=dict(source=source, target=target, value=value),
)
fig6 = go.Figure(data=[sankey_trace])
fig6.update_layout(
    template=PLOTLY_TEMPLATE,
    title="Flow of bookings: Market segment → Deposit type → Outcome (top segments)",
    margin=dict(l=10, r=10, t=70, b=10),
)

In [5]:
# Export Plotly figures as HTML fragments
import plotly.io as pio

# Write every figure as an embeddable HTML fragment (no <html> wrapper) into
# OUTDIR. Paths are derived from OUTDIR rather than a hard-coded "outputs/"
# prefix, so the export keeps working if the output directory is changed.
# Only the first fragment references plotly.js (from the CDN); the remaining
# ones omit it and rely on that script when embedded in the same page.
figures_in_order = [fig1, fig2, fig3, fig4, fig5, fig6]
for position, figure in enumerate(figures_in_order, start=1):
    pio.write_html(
        figure,
        file=os.path.join(OUTDIR, f"viz{position}.html"),
        include_plotlyjs="cdn" if position == 1 else False,
        full_html=False,
    )


In [6]:
# Assemble final storytelling HTML using Jinja2
from jinja2 import Template

def load_html(path):
    """Return the full text of the file at `path`, decoded as UTF-8."""
    with open(path, mode="r", encoding="utf-8") as handle:
        contents = handle.read()
    return contents
# Read the Jinja2 page template
with open("report_template.html", "r", encoding="utf-8") as template_file:
    template = Template(template_file.read())

# Load the six exported fragments from OUTDIR (instead of a hard-coded
# "outputs/" prefix) and pass them to the template as viz1 ... viz6.
fragments = {
    f"viz{position}": load_html(os.path.join(OUTDIR, f"viz{position}.html"))
    for position in range(1, 7)
}
html = template.render(**fragments)

# Write the assembled report next to the notebook
with open("Hotel_Booking_Story.html", "w", encoding="utf-8") as report_file:
    report_file.write(html)
print("Scrollable storytelling report created: Hotel_Booking_Story.html")


Scrollable storytelling report created: Hotel_Booking_Story.html
