In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler



In [None]:
# Load the preprocessed bookings table.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling —
# only point this at files produced by this project's own pipeline.
data_path = Path("../processed data") / "hotel_bookings_final.pkl"
df = pd.read_pickle(data_path)

# Split the frame into the cancellation label and the remaining feature columns.
y = df["is_canceled"]
X = df.drop(columns=y.name)

# Bring every feature to unit variance before PCA. Mean-centering is skipped
# here (with_mean=False); PCA centers its input itself, so the fitted
# components are unaffected by that choice.
# NOTE(review): assumes every column of X is numeric — confirm upstream encoding.
scaler = StandardScaler(with_mean=False)
X_scaled = scaler.fit(X).transform(X)



In [None]:
# Fit a two-component PCA on the scaled feature matrix and keep the projected
# scores (one row per booking, one column per component).
# random_state is pinned so that any randomized solver path is repeatable.
pca = PCA(n_components=2, random_state=42)
pca_components = pca.fit_transform(X_scaled)

# Fraction of total variance captured by each retained component, plus their sum.
explained_var = pca.explained_variance_ratio_
print("Explained variance ratios:", explained_var)
print("Cumulative variance (2 components):", explained_var.sum())


In [None]:
# Scatter the two principal-component scores, coloring each booking by its
# cancellation label.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    x=pca_components[:, 0],
    y=pca_components[:, 1],
    hue=y,
    palette="coolwarm",
    alpha=0.6,
    ax=ax,
)
ax.set_title("PCA: Bookings Colored by Cancellation")
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.legend(title="is_canceled")
fig.tight_layout()
plt.show()



In [None]:
# Component loadings: one row per principal component, one column per input
# feature, so each row shows how strongly every feature weighs on that axis.
loadings = pd.DataFrame(
    data=pca.components_,
    index=["PC1", "PC2"],
    columns=X.columns,
)

# Rank features by loading magnitude (sign ignored) and report the ten
# strongest contributors for each component.
abs_loadings = loadings.abs()
for pc in abs_loadings.index:
    top_features = abs_loadings.loc[pc].sort_values(ascending=False).iloc[:10]
    print(f"\nTop contributors for {pc}:")
    print(top_features)



In [None]:
# Persist the PCA scores and loadings alongside the other processed artifacts
# so downstream notebooks can plot them without refitting.
output_dir = Path("../processed data")
output_dir.mkdir(parents=True, exist_ok=True)

# The score frame gets a fresh 0..n-1 index, so the labels are re-indexed the
# same way before being attached; otherwise assignment would align on the
# original index of `y`.
pca_df = pd.DataFrame(pca_components, columns=["PC1", "PC2"]).assign(
    is_canceled=y.reset_index(drop=True)
)

for frame, filename in (
    (pca_df, "hotel_pca_components.pkl"),
    (loadings, "hotel_pca_loadings.pkl"),
):
    frame.to_pickle(output_dir / filename)

pca_df.head()



In [None]:
from sklearn.cluster import KMeans

# Fit a three-cluster KMeans on the 2-D PCA scores. The seed and the number of
# initializations are pinned so repeated runs give the same assignment.
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10).fit(pca_components)
clusters = kmeans.labels_

# Attach each row's cluster label to the score frame.
# Note: this column is added after the frame was pickled in the previous cell,
# so the saved file does not contain it.
pca_df["cluster"] = clusters
pca_df.head()


In [None]:
# Plot the PCA scores colored by KMeans cluster assignment.
# Labels are cast to strings so seaborn treats them as categories rather than
# a continuous numeric hue. For string hues seaborn orders categories by first
# appearance in the data, so an explicit hue_order is passed to keep legend
# entries and palette colors in ascending cluster order regardless of row order.
cluster_labels = pca_df["cluster"].astype(str)
cluster_order = [str(c) for c in sorted(pca_df["cluster"].unique())]

plt.figure(figsize=(8,6))
sns.scatterplot(
    x=pca_df["PC1"],
    y=pca_df["PC2"],
    hue=cluster_labels,
    hue_order=cluster_order,
    palette="tab10",
    alpha=0.6
)
plt.title("PCA Clusters (KMeans on PC1 & PC2)")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.legend(title="Cluster")
plt.tight_layout()
plt.show()


In [None]:
# For each component, list the features with the most positive and the most
# negative signed loadings (n_top of each), to help name the two ends of the axis.
n_top = 6
for pc in ("PC1", "PC2"):
    comp = loadings.loc[pc]
    top_pos = comp.sort_values(ascending=False).iloc[:n_top]
    top_neg = comp.sort_values(ascending=True).iloc[:n_top]
    # One print call, newline-separated, yields the same text as four prints.
    print(
        f"\n{pc} — top +loadings:",
        top_pos,
        f"\n{pc} — top -loadings:",
        top_neg,
        sep="\n",
    )



## How to read these components
- **PC1**: Driven by the features with the largest positive and negative loadings listed above. A booking scores high on PC1 when it has high values on the +loading features and/or low values on the -loading features; a strongly negative score is the reverse. Interpret the axis by checking which operational/booking behaviors these features represent.
- **PC2**: Same idea—use the loadings table and the top +/- summaries to label this axis (e.g., “long lead / higher ADR” vs “short lead / lower ADR” if those appear in your outputs).
- **Clusters**: Points are grouped in PCA space; inspect each cluster’s centroid (`kmeans.cluster_centers_`, one PC1/PC2 pair per cluster label) to see which PC directions it leans toward. You can also join `pca_df` back to the original data by row position (`pca_df` has a fresh 0…n-1 index, so reset the original frame’s index first) to profile clusters by room type, deposit type, or lead time.
- **Next step (optional)**: Add a small profiling cell to compute per-cluster means of the original key features (lead_time, adr, deposit_type indicators, etc.) to narrate “Cluster 0 looks like X, Cluster 1 like Y.”


In [None]:
# Per-cluster means of a handful of interpretable features.
# NOTE(review): the means are computed from `df` as loaded, not from `X_scaled`;
# the "scaled" wording in the label below is only accurate if the pickle already
# holds scaled values — confirm against the preprocessing step.
key_cols = [
    # booking behavior
    "lead_time",
    "adr",
    "total_of_special_requests",
    "previous_cancellations",
    "booking_changes",
    # deposit indicators
    "deposit_type_Non Refund",
    "deposit_type_Refundable",
    # channel / segment indicators
    "market_segment_Online TA",
    "market_segment_Groups",
    "distribution_channel_Online TA",
]

# Keep only the columns that actually exist in the loaded frame; absent ones
# are skipped silently.
present_cols = [name for name in key_cols if name in df.columns]

# Line the features up with the cluster labels by row position: both sides
# carry a 0..n-1 index once the original index is dropped.
profile_input = pd.concat(
    [df.reset_index(drop=True), pca_df[["cluster"]]],
    axis="columns",
)
cluster_profile = profile_input.groupby("cluster")[present_cols].mean()

print("Cluster profile (scaled feature means):")
cluster_profile
