In [1]:
import pandas as pd
df = pd.read_csv("/content/drive/MyDrive/hotel_bookings.csv")

# Add experimental groups (50/50 random split)
import numpy as np
np.random.seed(42)
df["assigned_group"] = np.where(np.random.rand(len(df)) < 0.5, "control", "treatment")

# Introduce a treatment effect (simulate slightly fewer no-shows)
mask = (df["assigned_group"]=="treatment") & (df["reservation_status"]=="No-show")
flip = np.random.rand(mask.sum()) < 0.25   # 25% of treatment no-shows flip to 'Check-Out'
df.loc[mask[mask].index[flip], "reservation_status"] = "Check-Out"


In [2]:
df.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,assigned_group
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,,,0,Transient,0.0,0,0,Check-Out,2015-07-01,control
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,,,0,Transient,0.0,0,0,Check-Out,2015-07-01,treatment
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,,,0,Transient,75.0,0,0,Check-Out,2015-07-02,treatment
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02,treatment
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03,control


In [3]:
# Focus only on reservations that are not cancelled
df_active = df[df["reservation_status"] != "Canceled"]

# Calculate no-show rates for each group
group_rates = (
    df_active.groupby("assigned_group")["reservation_status"]
    .apply(lambda x: (x == "No-show").mean())
)

print(group_rates)


assigned_group
control      0.0
treatment    0.0
Name: reservation_status, dtype: float64


In [4]:
group_counts = df_active.groupby("assigned_group")["reservation_status"].count()
print(group_counts)


assigned_group
control      38155
treatment    38218
Name: reservation_status, dtype: int64


In [5]:
df_active = df[df["reservation_status"] != "Canceled"]
group_rates = (
    df_active.groupby("assigned_group")["reservation_status"]
    .apply(lambda x: (x == "No-show").mean())
)
print(group_rates)


assigned_group
control      0.0
treatment    0.0
Name: reservation_status, dtype: float64


In [6]:
import numpy as np

# Start with only active (not canceled) bookings
df_active = df[df["reservation_status"] != "Canceled"].copy()

# Simulate baseline no-show probability (e.g., 8%)
np.random.seed(42)
df_active["no_show"] = np.where(np.random.rand(len(df_active)) < 0.08, 1, 0)

# Inject treatment effect: lower no-shows in treatment group
mask = (df_active["assigned_group"] == "treatment") & (df_active["no_show"] == 1)
flip = np.random.rand(mask.sum()) < 0.25   # 25% of no-shows prevented
df_active.loc[mask[mask].index[flip], "no_show"] = 0


In [7]:
group_rates = df_active.groupby("assigned_group")["no_show"].mean()
print(group_rates)


assigned_group
control      0.079570
treatment    0.060783
Name: no_show, dtype: float64


In [8]:
import math

# 1) Counts
n_c = (df_active["assigned_group"] == "control").sum()
n_t = (df_active["assigned_group"] == "treatment").sum()

x_c = df_active.loc[df_active["assigned_group"] == "control", "no_show"].sum()
x_t = df_active.loc[df_active["assigned_group"] == "treatment", "no_show"].sum()

phat_c = x_c / n_c
phat_t = x_t / n_t

# 2) Pooled proportion under H0 (pc == pt)
p_pool = (x_c + x_t) / (n_c + n_t)

# 3) Standard error (pooled)
SE = math.sqrt(p_pool * (1 - p_pool) * (1/n_c + 1/n_t))

# 4) Z-stat (one-sided test: pt < pc, so use phat_c - phat_t)
z = (phat_c - phat_t) / SE

# 5) One-sided p-value
p_value = 1 - 0.5 * (1 + math.erf(z / math.sqrt(2)))

print(f"Control: n={n_c}, x={x_c}, rate={phat_c:.5f}")
print(f"Treatment: n={n_t}, x={x_t}, rate={phat_t:.5f}")
print(f"z = {z:.3f}")
print(f"one-sided p-value = {p_value:.6f}")

# (bonus) 95% CI for (pt - pc), unpooled SE
SE_unpooled = math.sqrt(phat_t*(1-phat_t)/n_t + phat_c*(1-phat_c)/n_c)
ci_low = (phat_t - phat_c) - 1.96*SE_unpooled
ci_high = (phat_t - phat_c) + 1.96*SE_unpooled
print(f"95% CI for (pt - pc): [{ci_low:.4f}, {ci_high:.4f}]")


Control: n=38155, x=3036, rate=0.07957
Treatment: n=38218, x=2323, rate=0.06078
z = 10.163
one-sided p-value = 0.000000
95% CI for (pt - pc): [-0.0224, -0.0152]


In [9]:
import pandas as pd
import numpy as np
import math

# 0) If you filtered earlier, use the full df for guardrails (don’t drop cancellations)
# df: original dataframe with assigned_group already added

# 1) Cancellation rate by group
cancel_rate = (
    df.groupby("assigned_group")["reservation_status"]
      .apply(lambda s: (s == "Canceled").mean())
)
print("Cancellation rate:\n", cancel_rate)

# 2) ADR by group (mean, median, and a quick Welch t-test)
adr_summary = df.groupby("assigned_group")["adr"].agg(["count","mean","median","std"])
print("\nADR summary:\n", adr_summary)

# Welch’s t-test (no equal-variance assumption)
g1 = df.loc[df.assigned_group=="control", "adr"].dropna().values
g2 = df.loc[df.assigned_group=="treatment", "adr"].dropna().values

def welch_ttest(a, b):
    m1, m2 = a.mean(), b.mean()
    s1, s2 = a.var(ddof=1), b.var(ddof=1)
    n1, n2 = len(a), len(b)
    t = (m1 - m2) / math.sqrt(s1/n1 + s2/n2)
    # df (Welch–Satterthwaite)
    dfw = (s1/n1 + s2/n2)**2 / ((s1**2)/((n1**2)*(n1-1)) + (s2**2)/((n2**2)*(n2-1)))
    return t, dfw

t_stat, df_w = welch_ttest(g1, g2)
print(f"\nWelch t-stat for ADR (control - treatment): t={t_stat:.3f}, df≈{df_w:.0f}")
# (If |t| > ~1.96 you likely have a significant difference at ~5% level.)

# 3) Channel mix balance (should be similar across groups)
channel_mix = pd.crosstab(df["assigned_group"], df["market_segment"], normalize="index")
print("\nChannel mix (row-normalized):\n", channel_mix)

# Optional chi-square test of independence for channel mix
obs = pd.crosstab(df["assigned_group"], df["market_segment"])
# quick chi-square (manual)
row_sums = obs.sum(axis=1).values.reshape(-1,1)
col_sums = obs.sum(axis=0).values.reshape(1,-1)
total = obs.values.sum()
expected = row_sums @ col_sums / total
chi2 = ((obs.values - expected)**2 / expected).sum()
print(f"\nChi-square (channel mix) ≈ {chi2:.2f} (bigger => more imbalance)")

# 4) Lead time balance (means similar?)
lead_summary = df.groupby("assigned_group")["lead_time"].agg(["count","mean","median","std"])
print("\nLead time summary:\n", lead_summary)


Cancellation rate:
 assigned_group
control      0.359665
treatment    0.360946
Name: reservation_status, dtype: float64

ADR summary:
                 count        mean  median        std
assigned_group                                      
control         59586  101.940688  94.875  52.877535
treatment       59804  101.721955  94.500  48.089501

Welch t-stat for ADR (control - treatment): t=0.748, df≈118246

Channel mix (row-normalized):
 market_segment  Aviation  Complementary  Corporate    Direct    Groups  \
assigned_group                                                           
control         0.001762       0.006243   0.043081  0.105528  0.168093   
treatment       0.002207       0.006204   0.045616  0.105645  0.163785   

market_segment  Offline TA/TO  Online TA  Undefined  
assigned_group                                       
control              0.201608   0.473652   0.000034  
treatment            0.204100   0.472443   0.000000  

Chi-square (channel mix) ≈ 13.67 (bigger =>