In [42]:
# Mount Google Drive into the Colab runtime at /content/drive so the
# project database stored on Drive can be opened by later cells.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [43]:
import os
import sqlite3

import numpy as np
import pandas as pd

# Location of the cleaned review database on the mounted Google Drive.
CLEAN_DB = "/content/drive/MyDrive/is5126/data/reviews_clean.db"

# sqlite3.connect() silently creates a brand-new, empty database when the
# file does not exist, which would only surface later as a confusing
# "no such table: hotel_features" error. Fail fast with a clear message.
if not os.path.isfile(CLEAN_DB):
    raise FileNotFoundError(
        f"Clean review database not found at {CLEAN_DB!r}; "
        "check that Google Drive is mounted and the path is correct."
    )

# Shared connection used by every read/write below; closed in the last cell.
conn = sqlite3.connect(CLEAN_DB)



# Summary

This notebook applies Gaussian Mixture Models to segment hotels into experience-based competitive sets and performs cluster-aware benchmarking. By comparing hotels only against similar peers, we identify meaningful performance gaps and best practices, enabling hotel managers to allocate improvement resources more effectively.

# Load Data
We first load hotel-level performance features computed from cleaned review data. These features summarize guest experience across service, cleanliness, value, location, and consistency, and form the basis for fair competitive benchmarking.

In [44]:
# Read every row of the hotel_features table through the open SQLite connection.
hotel = pd.read_sql(sql="SELECT * FROM hotel_features;", con=conn)

# Show the frame's dimensions together with a preview of its first rows.
(hotel.shape, hotel.head())

((3863, 12),
    offering_id  n_reviews  avg_overall  avg_service  avg_cleanliness  \
 0        72572        377     4.381963     4.557423         4.642254   
 1        72579        189     3.703704     4.073446         4.159091   
 2        72586        231     3.796537     4.051643         4.066667   
 3        72598         60     2.966667     3.290909         3.222222   
 4        73236         23     3.173913     4.217391         2.913043   
 
    avg_value  avg_location  var_overall  n_service  n_cleanliness  n_value  \
 0   4.299720      4.528701     0.761266        357            355      357   
 1   3.965909      4.114650     1.213796        177            176      176   
 2   3.910377      4.406250     1.149079        213            210      212   
 3   3.109091      3.113208     1.832222         55             54       55   
 4   3.739130      4.190476     1.534972         23             23       23   
 
    n_location  
 0         331  
 1         157  
 2         192  
 3 

To ensure fair comparisons, we restrict benchmarking to hotels with sufficient review volume and aspect coverage. This avoids noisy estimates driven by sparse or incomplete feedback.

In [45]:
# Keep only hotels with enough evidence behind their averages: at least 50
# reviews overall and at least 20 ratings for each individual aspect.
aspect_count_cols = ["n_service", "n_cleanliness", "n_value", "n_location"]
has_enough_reviews = hotel["n_reviews"].ge(50)
has_enough_aspect_ratings = hotel[aspect_count_cols].ge(20).all(axis=1)

# Independent copy so later column additions never touch `hotel`.
hotel_stable = hotel.loc[has_enough_reviews & has_enough_aspect_ratings].copy()

hotel_stable.shape

(2089, 12)

# Feature Engineering
Feature engineering focuses on interpretable experience signals, such as service–value gaps and experience consistency, which are directly actionable for hotel managers.

In [46]:
# Upper bound for rating variance: the 95th percentile across stable hotels.
# Clipping at this value limits the pull of extreme-variance hotels.
cap = hotel_stable["var_overall"].quantile(0.95)

# Derived columns are added in one chained step; later entries may read the
# columns created by earlier entries of the same call.
hotel_stable = hotel_stable.assign(
    # variance with the top tail clipped to `cap`
    var_overall_capped=lambda df: df["var_overall"].clip(upper=cap),
    # profile gaps: how far service / cleanliness sit above (or below) value
    service_value_gap=lambda df: df["avg_service"] - df["avg_value"],
    clean_value_gap=lambda df: df["avg_cleanliness"] - df["avg_value"],
    # consistency proxy: approaches 1 as the capped variance approaches 0
    consistency=lambda df: 1 / (1 + df["var_overall_capped"]),
)


This check confirms that every feature column is fully populated, i.e., contains no missing values.

In [47]:
# Clustering inputs: four aspect averages, two profile gaps, and the
# consistency score derived from the capped rating variance.
FEATURES = [
    "avg_service", "avg_cleanliness", "avg_value", "avg_location",
    "service_value_gap", "clean_value_gap",
    "consistency",
]

# Independent copy of just the feature columns.
X = hotel_stable.loc[:, FEATURES].copy()

# Fraction of missing values per feature column.
X.isnull().mean()


Unnamed: 0,0
avg_service,0.0
avg_cleanliness,0.0
avg_value,0.0
avg_location,0.0
service_value_gap,0.0
clean_value_gap,0.0
consistency,0.0


# Standardization + PCA output shape
Features are standardized and optionally decorrelated using PCA to improve clustering stability and interpretability.

In [48]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Standardize every feature (fit, then transform) so that no single feature
# dominates the covariance structure because of its scale.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# A fractional n_components tells PCA to keep as many components as are
# needed to explain that share (here 90%) of the variance; this also
# decorrelates the aspect ratings, which tend to move together.
pca = PCA(random_state=42, n_components=0.9)
X_pca = pca.fit_transform(X_scaled)

X_pca.shape

(2089, 4)

The number of hotel segments is selected by minimizing the Bayesian Information Criterion (BIC), which balances goodness of fit against model complexity (lower is better).

In [49]:
from sklearn.mixture import GaussianMixture

# Fit one full-covariance mixture per candidate component count (2..8),
# keyed by k so the winning model can be reused without refitting.
models = {}
for k in range(2, 9):
    gmm = GaussianMixture(
        n_components=k, covariance_type="full", random_state=42
    ).fit(X_pca)
    models[k] = gmm

# (k, BIC) pairs in ascending k; the dict preserves insertion order.
bics = [(n_comp, fitted.bic(X_pca)) for n_comp, fitted in models.items()]

bics


[(2, np.float64(22922.32678672775)),
 (3, np.float64(22560.33139026581)),
 (4, np.float64(22531.96753206027)),
 (5, np.float64(22514.882035148457)),
 (6, np.float64(22521.824299794152)),
 (7, np.float64(22562.73694503004)),
 (8, np.float64(22623.73582820059))]

In [50]:
# Choose the component count with the lowest BIC (lower is better) and keep
# the matching fitted mixture for the segmentation steps that follow.
best_k = min(bics, key=lambda x: x[1])[0]
best_gmm = models[best_k]

# Must be the cell's last expression: a bare name in the middle of a cell is
# a no-op, so the selected k was previously never displayed.
best_k


In [51]:
# Posterior membership probabilities, one column per mixture component.
probs = best_gmm.predict_proba(X_pca)

# Attach the hard segment label and the probability of the most likely
# component (used downstream as an assignment-confidence score).
hotel_stable = hotel_stable.assign(
    cluster=best_gmm.predict(X_pca),
    cluster_confidence=probs.max(axis=1),
)

hotel_stable.loc[:, ["cluster", "cluster_confidence"]].describe()


Unnamed: 0,cluster,cluster_confidence
count,2089.0,2089.0
mean,1.487315,0.803933
std,1.268147,0.162824
min,0.0,0.350026
25%,0.0,0.679531
50%,1.0,0.8363
75%,3.0,0.954042
max,4.0,1.0


# Cluster profile table (means per cluster)
Cluster profiles reveal distinct hotel experience archetypes, highlighting systematic differences in service quality, value perception, location strength, and consistency across segments.


*   High service, low value → premium but expensive
*   High location, low consistency → great spot, uneven experience



In [52]:
# Columns to profile: the clustering features plus headline outcome metrics.
profile_cols = FEATURES + ["avg_overall", "n_reviews", "var_overall"]

# Per-segment mean, median and member count for every profiled column.
by_cluster = hotel_stable.groupby("cluster")
cluster_profile = by_cluster[profile_cols].agg(["mean", "median", "count"])

cluster_profile


Unnamed: 0_level_0,avg_service,avg_service,avg_service,avg_cleanliness,avg_cleanliness,avg_cleanliness,avg_value,avg_value,avg_value,avg_location,...,consistency,avg_overall,avg_overall,avg_overall,n_reviews,n_reviews,n_reviews,var_overall,var_overall,var_overall
Unnamed: 0_level_1,mean,median,count,mean,median,count,mean,median,count,mean,...,count,mean,median,count,mean,median,count,mean,median,count
cluster,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,4.191094,4.188543,618,4.308882,4.311399,618,4.1461,4.12021,618,4.305431,...,618,4.047149,4.039046,618,239.564725,151.5,618,1.070643,1.027624,618
1,4.515832,4.518072,515,4.644613,4.649789,515,4.245813,4.235915,515,4.652251,...,515,4.42165,4.424936,515,409.180583,305.0,515,0.719689,0.688947,515
2,4.059395,4.063141,396,4.25743,4.271227,396,3.689619,3.694382,396,4.374244,...,396,3.878464,3.880876,396,459.54798,345.0,396,1.159169,1.146064,396
3,3.638451,3.643773,440,3.706016,3.72892,440,3.644851,3.656892,440,3.951188,...,440,3.42118,3.430876,440,215.920455,132.0,440,1.460166,1.427735,440
4,2.786111,2.859874,120,2.708199,2.729259,120,2.926631,2.982451,120,3.465243,...,120,2.520349,2.546232,120,132.416667,77.0,120,1.687274,1.651944,120


# Competitive benchmarking a target hotel
We select a target hotel to demonstrate how competitive benchmarking can guide hotel-specific improvement decisions.

In [53]:
# Demo target: the hotel at the top of the frame once it is ordered by
# descending assignment confidence.
ranked_by_confidence = hotel_stable.sort_values(
    by="cluster_confidence", ascending=False
)
target_id = ranked_by_confidence["offering_id"].iloc[0]

target_id


np.int64(673661)

In [54]:
# Row for the chosen hotel (first match on offering_id) and its segment label.
is_target = hotel_stable["offering_id"] == target_id
target = hotel_stable.loc[is_target].iloc[0]

# Cast to a plain int so it compares cleanly against the cluster column.
target_cluster = int(target.loc["cluster"])

target_cluster


4

### Define the competitors

Competitors are defined as hotels within the same GMM-derived segment, ensuring comparisons are made between properties with similar guest experience profiles rather than arbitrary categories.

In [55]:
# A competitor shares the target's segment, is not the target itself, and was
# assigned to that segment with a confidence of at least 0.6.
in_target_segment = hotel_stable["cluster"].eq(target_cluster)
is_other_hotel = hotel_stable["offering_id"].ne(target_id)
is_confident = hotel_stable["cluster_confidence"].ge(0.6)

competitors = hotel_stable.loc[
    in_target_segment & is_other_hotel & is_confident
].copy()

competitors.shape


(109, 18)

### Find closest peers within the cluster (distance in feature space)
Within each competitive segment, we further identify the closest peers based on distance in feature space, yielding a refined set of directly comparable hotels.

In [56]:
# Re-number rows 0..n-1. Row positions in hotel_stable are what tie each
# hotel to its row in the PCA matrix (X_pca was built from hotel_stable's
# feature columns in the same row order).
hotel_stable = hotel_stable.reset_index(drop=True)
X_use = X_pca  # distances are measured in PCA space

# offering_id -> row position in hotel_stable / X_use
offering_ids = hotel_stable["offering_id"].to_numpy()
idx_map = dict(zip(offering_ids, range(len(offering_ids))))
t_idx = idx_map[target_id]

# Row positions of every competitor, in the competitors frame's own order.
comp_idx = np.array(
    [idx_map[oid] for oid in competitors["offering_id"].to_numpy()]
)

# Euclidean distance from each competitor to the target.
offsets = X_use[comp_idx] - X_use[t_idx]
dists = np.linalg.norm(offsets, axis=1)
competitors["dist"] = dists

# The 20 nearest competitors form the refined peer set; preview the top 10.
closest_peers = competitors.sort_values(by="dist").head(20)
closest_peers.loc[
    :, ["offering_id", "dist", "avg_overall", "n_reviews", "var_overall"]
].head(10)


Unnamed: 0,offering_id,dist,avg_overall,n_reviews,var_overall
2709,267183,0.489438,1.661972,71,0.92799
1170,99287,0.662587,1.69863,73,0.950272
319,81246,1.312138,1.674699,83,1.183336
2966,559383,1.595113,1.86,50,1.0804
2095,217613,1.631803,2.09375,96,1.293294
1259,100508,1.941247,1.923077,65,1.301775
2371,224229,2.189233,1.994624,186,1.284917
391,82105,2.398832,2.101266,79,1.230252
2945,549441,2.400674,2.054795,73,1.449052
1843,119928,2.451746,1.952381,84,1.283447


### Gap analysis (where should the hotel invest?)
Gap analysis quantifies how the target hotel performs relative to its competitive set, highlighting priority areas for improvement as well as existing competitive advantages.

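gap < 0: the target scores below its cluster average (underperforming peers on rating metrics)

gap > 0: the target scores above its cluster average (outperforming peers on rating metrics)

Note: for `var_overall`, lower is better, so the interpretation of the sign is reversed.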

In [57]:
# Metrics compared between the target and its segment.
gap_metrics = FEATURES + ["avg_overall", "var_overall"]

# Segment baseline: mean of every metric over all hotels in the target's cluster.
# NOTE(review): this averages the whole cluster (target included), not the
# confidence-filtered `competitors` frame -- confirm that is intended.
in_target_cluster = hotel_stable["cluster"] == target_cluster
cluster_mean = hotel_stable.loc[in_target_cluster, gap_metrics].mean()

# One row per metric: the target's value, the cluster average, and the
# signed difference (target minus cluster average).
gap_df = pd.DataFrame(
    {
        "metric": gap_metrics,
        "target": target[gap_metrics].tolist(),
        "cluster_avg": cluster_mean[gap_metrics].tolist(),
    }
)
gap_df["gap"] = gap_df["target"] - gap_df["cluster_avg"]

# Most negative gaps first.
gap_df.sort_values(by="gap")


Unnamed: 0,metric,target,cluster_avg,gap
0,avg_service,1.797619,2.786111,-0.988492
1,avg_cleanliness,1.771084,2.708199,-0.937114
8,var_overall,0.803939,1.687274,-0.883335
7,avg_overall,1.674157,2.520349,-0.846192
2,avg_value,2.120482,2.926631,-0.806149
3,avg_location,3.24,3.465243,-0.225243
4,service_value_gap,-0.322863,-0.140519,-0.182343
5,clean_value_gap,-0.349398,-0.218432,-0.130966
6,consistency,0.554343,0.389659,0.164684


### What are similar hotels doing better than us?
The weakest-performing dimension relative to peers is identified to prioritize limited improvement resources.

In [58]:
# Only metrics where a larger value is better can be ranked by signed gap.
# `var_overall` is lower-is-better, and the two *_gap features are profile
# differences rather than scores, so a very negative gap on any of them does
# not mean "weak". The best-practice cell ranks peers by this metric in
# descending order, so picking `var_overall` here would surface the most
# erratic hotels as role models.
higher_is_better = [
    "avg_service",
    "avg_cleanliness",
    "avg_value",
    "avg_location",
    "avg_overall",
    "consistency",
]

# Most negative gap among the comparable metrics = weakest dimension.
comparable_gaps = gap_df[gap_df["metric"].isin(higher_is_better)]
weakest_metric = comparable_gaps.sort_values("gap").iloc[0]["metric"]
weakest_metric

'avg_service'

### Best-practice peers
Best-performing hotels within the same segment — i.e., those that excel where the target struggles — provide concrete benchmarks and best practices that the target hotel can emulate.

In [59]:
# Columns shown for each best-practice peer.
peer_cols = ["offering_id", weakest_metric, "avg_overall", "n_reviews", "var_overall"]

# Hotels in the target's segment, strongest first on the target's weakest metric.
segment_hotels = hotel_stable.loc[hotel_stable["cluster"] == target_cluster]
segment_ranked = segment_hotels.sort_values(by=weakest_metric, ascending=False)

# Top ten performers on that metric.
best_practices = segment_ranked.loc[:, peer_cols].head(10)

best_practices


Unnamed: 0,offering_id,avg_service,avg_overall,n_reviews,var_overall
332,83988,3.932203,3.758065,62,1.989854
2016,1724101,3.612245,3.211538,52,1.820636
1284,220106,3.455882,3.123288,146,2.450554
21,73889,3.380435,3.031579,95,2.430582
664,95286,3.378378,3.082803,157,2.165118
2058,1950129,3.372549,2.672727,55,1.783802
1439,235456,3.357143,3.093023,86,2.223905
689,98073,3.354167,2.796296,54,2.310357
1876,1176612,3.352941,3.092593,54,1.750686
1176,122721,3.309091,2.982759,58,2.120392


## Write GMM results to DB

In [None]:
# Persist just the hotel key, its segment label and the assignment confidence.
segments = hotel_stable.loc[
    :, ["offering_id", "cluster", "cluster_confidence"]
].copy()

# Overwrite any previous run's table; the DataFrame index is not stored.
segments.to_sql(
    name="hotel_segments_gmm",
    con=conn,
    if_exists="replace",
    index=False,
)

# Sanity check: read back the number of hotels written per segment.
pd.read_sql(
    sql="SELECT cluster, COUNT(*) n_hotels FROM hotel_segments_gmm GROUP BY cluster ORDER BY cluster;",
    con=conn,
)


In [60]:
# Release the SQLite connection opened at the top of the notebook; any cell
# that uses `conn` must run before this one.
conn.close()
