In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler



In [None]:
# Load the preprocessed bookings table.
# NOTE: read_pickle deserializes arbitrary Python objects; only point it at
# files this project produced itself.
data_path = Path("../processed data/hotel_bookings_final.pkl")
df = pd.read_pickle(data_path)

# The cancellation flag is the target; every other column is treated as a feature.
y = df["is_canceled"]
X = df.drop(columns=["is_canceled"])

# Rescale each feature to unit variance so no single column dominates the PCA.
# with_mean=False skips centering here (scikit-learn's PCA centers its input itself).
scaler = StandardScaler(with_mean=False)
X_scaled = scaler.fit(X).transform(X)



In [None]:
# Fit a two-component PCA on the scaled features and keep the projected coordinates.
pca = PCA(random_state=42, n_components=2)
pca_components = pca.fit_transform(X_scaled)

# Fraction of total variance captured by each component, then by both together.
explained_var = pca.explained_variance_ratio_
for label, value in (
    ("Explained variance ratios:", explained_var),
    ("Cumulative variance (2 components):", np.sum(explained_var)),
):
    print(label, value)


In [None]:
# Scatter the two principal components, coloring each booking by its cancellation label.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    x=pca_components[:, 0],
    y=pca_components[:, 1],
    hue=y,
    palette="coolwarm",
    alpha=0.6,  # semi-transparent markers so overlapping points remain visible
    ax=ax,
)
ax.set_title("PCA: Bookings Colored by Cancellation")
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.legend(title="is_canceled")
fig.tight_layout()
plt.show()



In [None]:
# Loadings matrix: one row per principal component, one column per input feature.
loadings = pd.DataFrame(pca.components_, index=["PC1", "PC2"], columns=X.columns)

# Rank features by loading magnitude and report the ten largest per component.
# The printed values are absolute, so the sign (direction) of each loading is not shown.
abs_loadings = loadings.abs()
for pc in abs_loadings.index:
    top_features = abs_loadings.loc[pc].sort_values(ascending=False).iloc[:10]
    print(f"\nTop contributors for {pc}:")
    print(top_features)



In [None]:
# Persist the projection and the loadings so downstream visualizations can reuse them.
output_dir = Path("../processed data")
output_dir.mkdir(parents=True, exist_ok=True)

# Attach the target positionally: dropping y's original index lines it up with
# the fresh 0..n-1 index of the projection frame.
pca_df = pd.DataFrame(pca_components, columns=["PC1", "PC2"]).assign(
    is_canceled=y.reset_index(drop=True)
)

loadings.to_pickle(output_dir / "hotel_pca_loadings.pkl")
pca_df.to_pickle(output_dir / "hotel_pca_components.pkl")

# Preview the saved projection.
pca_df.head()

