In [55]:
import pandas as pd
import numpy as np
from glob import glob
from scipy.stats import mannwhitneyu


In [56]:
# load all daily csv files
# glob() returns paths in arbitrary order; sorting fixes the row order of the
# concatenated frame, which keeps df.head() and the position-based bootstrap
# resampling further down reproducible across machines.
raw_files = sorted(glob("../data/raw/*"))
if not raw_files:
    # pd.concat would otherwise fail with a generic "No objects to concatenate"
    raise FileNotFoundError("No snapshot files found under ../data/raw/")

df = pd.concat((pd.read_csv(f) for f in raw_files), ignore_index=True)

print("Shape:", df.shape)
df.head()


Shape: (31567, 30)


Unnamed: 0,product_id,brand,gender,category,master_category,sub_category,article_type,mrp,price,discount,...,has_multiple_sizes,snapshot_date,season,is_fast_fashion,promotion_tags,has_promotion,year,preferred_delivery_tag,delivery_promise,source_sort
0,36716624,House of Sal,Women,Dresses,Apparel,Dress,Dresses,1990,1419,571,...,True,2026-01-13,Summer,True,,False,2025,EXPRESS,Delivery By Jan 15,popularity
1,33810216,all about you,Women,Dresses,Apparel,Dress,Dresses,2999,779,2220,...,True,2026-01-13,Fall,True,,False,2025,EXPRESS,Delivery By Jan 15,popularity
2,31157428,SANSKRUTIHOMES,Women,Dresses,Apparel,Dress,Dresses,2899,724,2175,...,True,2026-01-13,Spring,True,Crazy_Deal|Festive_Price_Crash|Myntra_Unique,True,2024,EXPRESS,Delivery By Jan 15,popularity
3,30082902,Bannos Swagger,Women,Dresses,Apparel,Dress,Dresses,4799,863,3936,...,True,2026-01-13,Fall,True,Crazy_Deal|GST_Benefit_Included|Festive_Price_...,True,2024,EXPRESS,Delivery By Jan 16,popularity
4,36266565,Phosphorus,Women,Dresses,Apparel,Dress,Dresses,2599,1039,1560,...,True,2026-01-13,Fall,True,,False,2025,EXPRESS,Delivery By Jan 15,popularity


In [57]:
# Coerce the numeric-looking columns; anything unparseable becomes NaN.
numeric_cols = ["mrp", "price", "discount", "rating", "rating_count"]
df = df.assign(**{col: pd.to_numeric(df[col], errors="coerce") for col in numeric_cols})

# Discount as a fraction of MRP; left as NaN wherever MRP is missing or not positive.
df["discount_pct"] = (df["discount"] / df["mrp"]).where(df["mrp"] > 0)

# Keep only rows whose discount fraction lies in [0, 1]; NaN never satisfies between().
df = df[df["discount_pct"].between(0, 1)]


In [58]:
# feature indicating how many unique days each product appears
days_present = df.groupby("product_id")["snapshot_date"].nunique().reset_index(name="days_present")

# Drop any days_present column left over from an earlier run of this cell before
# merging; otherwise a re-run produces days_present_x / days_present_y and the
# cells below can no longer find df["days_present"].
df = df.drop(columns="days_present", errors="ignore").merge(days_present, on="product_id", how="left")


In [59]:
# create continuence buckets
# pd.cut uses right-closed intervals here: (0, 2], (2, 5], (5, 8], (8, 11].
# A days_present value above 11 falls outside every bin and becomes NaN.
continuence_bins = [0, 2, 5, 8, 11]
continuence_labels = ["1_2_days", "3_5_days", "6_8_days", "9_11_days"]

df["continuence_bucket"] = pd.cut(
    df["days_present"],
    bins=continuence_bins,
    labels=continuence_labels,
)

df["continuence_bucket"].value_counts()


continuence_bucket
9_11_days    17912
6_8_days      6977
3_5_days      4647
1_2_days      2031
Name: count, dtype: int64

In [60]:
# discount buckets
# Edges are fractions of MRP; include_lowest=True makes the first interval
# closed on the left so a discount of exactly 0 lands in "<40%".
discount_bins = [0, 0.4, 0.6, 0.75, 1.0]
discount_labels = ["<40%", "40–60%", "60–75%", "75%+"]

df["discount_bucket"] = pd.cut(
    x=df["discount_pct"],
    bins=discount_bins,
    labels=discount_labels,
    include_lowest=True,
)

df["discount_bucket"].value_counts()


discount_bucket
75%+      13929
60–75%    10546
40–60%     5056
<40%       2036
Name: count, dtype: int64

In [61]:
# Checking how many products fall in each cohort
# continuence_bucket is categorical, so `observed` is passed explicitly:
# observed=False keeps one row per bucket (even an empty one) and avoids relying
# on the pandas default, which is scheduled to change.
cohort_summary = df.groupby("continuence_bucket", observed=False).agg(
    rows=("product_id", "size"),                      # snapshot rows in the cohort
    unique_products=("product_id", "nunique"),        # distinct products in the cohort
    rated_rows_pct=("rating_count", lambda x: (x.gt(0).mean() * 100)),  # % of rows with at least one rating
    avg_discount_pct=("discount_pct", "mean"),        # row-level mean, not per-product
)

cohort_summary
    

  cohort_summary = df.groupby("continuence_bucket").agg(rows=("product_id", "size"),


Unnamed: 0_level_0,rows,unique_products,rated_rows_pct,avg_discount_pct
continuence_bucket,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1_2_days,2031,1478,81.733136,0.644431
3_5_days,4647,1166,79.018722,0.700981
6_8_days,6977,960,79.303426,0.711524
9_11_days,17912,1540,82.481018,0.704672


Conclusion:
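- The average discount level is not constant across cohorts (roughly 0.64 to 0.71).
- Stable products (9–11 days, avg ≈ 0.70) carry a higher average discount than transient products (1–2 days, avg ≈ 0.64); the 6–8 day cohort is marginally the highest (≈ 0.71).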

In [62]:
# Comparing stable vs transient products on rating and discount
# Only snapshot rows that have at least one rating are considered.
rated = df.loc[df["rating_count"] > 0]

# The two extreme continuence cohorts.
stable = rated.loc[rated["continuence_bucket"] == "9_11_days"]
transient = rated.loc[rated["continuence_bucket"] == "1_2_days"]

# One record per cohort, transient first.
summary_overall = pd.DataFrame(
    [
        {
            "cohort": cohort_name,
            "unique_products": cohort_rows["product_id"].nunique(),
            "avg_rating": cohort_rows["rating"].mean(),
            "median_rating": cohort_rows["rating"].median(),
            "avg_discount_pct": cohort_rows["discount_pct"].mean(),
        }
        for cohort_name, cohort_rows in (("transient", transient), ("stable", stable))
    ]
)

summary_overall


Unnamed: 0,cohort,unique_products,avg_rating,median_rating,avg_discount_pct
0,transient,1197,4.027189,4.131827,0.643213
1,stable,1253,4.076077,4.183369,0.678792


Stable products are rated slightly higher than transient products (average 4.08 vs 4.03, median 4.18 vs 4.13).


In [63]:
# calculating weighted avg rating to check the importance of products with more reviews
def weighted_avg_rating(x):
    """Return the mean of ``x["rating"]`` weighted by ``x["rating_count"]``.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame with ``rating`` and ``rating_count`` columns.

    Returns
    -------
    float
        The weighted mean rating, or NaN when no usable row remains.

    Notes
    -----
    Rows are ignored when ``rating_count`` is not greater than 0 or when
    ``rating`` is missing. Without the second condition a single NaN rating
    would turn the whole weighted average into NaN.
    """
    usable = x[(x["rating_count"] > 0) & x["rating"].notna()]
    if len(usable) == 0:
        return np.nan
    return np.average(usable["rating"], weights=usable["rating_count"])

# Only the two columns the helper reads are handed to apply(), so the grouping
# column is not part of each group's frame. observed=False is spelled out to keep
# one row per bucket instead of relying on the pandas default.
weighted_summary = (
    rated.groupby("continuence_bucket", observed=False)[["rating", "rating_count"]]
    .apply(weighted_avg_rating)
    .to_frame("weighted_avg_rating")
)

weighted_summary


  weighted_summary = rated.groupby("continuence_bucket").apply(weighted_avg_rating).to_frame("weighted_avg_rating")
  weighted_summary = rated.groupby("continuence_bucket").apply(weighted_avg_rating).to_frame("weighted_avg_rating")


Unnamed: 0_level_0,weighted_avg_rating
continuence_bucket,Unnamed: 1_level_1
1_2_days,4.242926
3_5_days,4.224635
6_8_days,4.25744
9_11_days,4.266864


The stable group has the highest review-weighted average rating (4.267), although the spread across all four cohorts is narrow (about 4.22 to 4.27).

In [64]:
#Effect size
def cohens_d(a, b):
    """Cohen's d for two samples, using the pooled standard deviation.

    Both inputs are converted to float arrays and NaN entries are removed.
    The sign follows ``mean(a) - mean(b)``. NaN is returned when either sample
    has fewer than two values left or when the pooled standard deviation is
    not strictly positive.
    """
    cleaned = []
    for values in (a, b):
        arr = np.array(values, dtype=float)
        cleaned.append(arr[~np.isnan(arr)])
    first, second = cleaned

    size_first, size_second = len(first), len(second)
    if min(size_first, size_second) < 2:
        return np.nan

    # Sample variances (ddof=1), combined with degrees-of-freedom weights.
    var_first = np.var(first, ddof=1)
    var_second = np.var(second, ddof=1)
    pooled_std = np.sqrt(
        ((size_first - 1) * var_first + (size_second - 1) * var_second)
        / (size_first + size_second - 2)
    )

    if not pooled_std > 0:
        return np.nan
    return (np.mean(first) - np.mean(second)) / pooled_std

# Sign convention: a positive value means the stable cohort's mean rating is higher.
d_rating = cohens_d(a=stable["rating"], b=transient["rating"])
d_rating


np.float64(0.11204271818296153)

The effect size is small (Cohen's d ≈ 0.11), so the practical difference is limited, although its direction (stable higher than transient) matches the earlier comparisons.

In [65]:
# Running statistical tests to check if the rating difference is random or consistent

# Missing ratings are dropped first, matching cohens_d and bootstrap_mean_diff;
# with SciPy's default NaN handling a single NaN would make the p-value NaN.
_, p_val = mannwhitneyu(
    stable["rating"].dropna(),
    transient["rating"].dropna(),
    alternative="two-sided",
)

# bootstrap confidence interval for mean difference
def bootstrap_mean_diff(a, b, n_boot=2000, seed=42):
    """Bootstrap the difference in means ``mean(a) - mean(b)``.

    Parameters
    ----------
    a, b : array-like
        Samples to compare. They are converted to float arrays and NaN entries
        are removed before resampling.
    n_boot : int, default 2000
        Number of bootstrap replicates.
    seed : int, default 42
        Seed for ``np.random.default_rng`` so the result is reproducible.

    Returns
    -------
    tuple
        ``(mean_of_replicates, lower, upper)`` where ``lower`` and ``upper`` are
        the 2.5th and 97.5th percentiles of the replicate differences (a 95%
        percentile interval). All three are NaN when either sample is empty
        after NaN removal or when ``n_boot`` is less than 1.
    """
    rng = np.random.default_rng(seed)
    a = np.array(a, dtype=float)
    b = np.array(b, dtype=float)

    a = a[~np.isnan(a)]
    b = b[~np.isnan(b)]

    # Nothing to resample: return NaNs up front instead of averaging empty draws.
    if len(a) == 0 or len(b) == 0 or n_boot < 1:
        return np.nan, np.nan, np.nan

    diffs = np.empty(n_boot)
    for i in range(n_boot):
        # Draw from a first, then b, every replicate; this order fixes the random stream.
        sa = rng.choice(a, size=len(a), replace=True)
        sb = rng.choice(b, size=len(b), replace=True)
        diffs[i] = np.mean(sa) - np.mean(sb)

    ci_low, ci_high = np.percentile(diffs, [2.5, 97.5])
    return np.mean(diffs), ci_low, ci_high

# Differences are stable minus transient, so a positive interval favours the stable cohort.
mean_diff, ci_low, ci_high = bootstrap_mean_diff(a=stable["rating"], b=transient["rating"])

print("p-value: ", p_val)
print("mean_diff, ci_low, ci_high:", mean_diff, ci_low, ci_high)

p-value:  4.041942331602596e-05
mean_diff, ci_low, ci_high: 0.04831004503135777 0.02429942335765518 0.07254941646954202


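- The p-value is 4.04e-05 (significant), which indicates that the rating distributions differ between stable and transient products.
- The 95% bootstrap CI for the mean difference (about 0.024 to 0.073) is entirely positive, which implies that the stable mean rating is reliably higher. However, the tests are run on snapshot rows, so a product seen on several days contributes several non-independent observations; the resulting large sample size may overstate the significance.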

It shows the effect is small but consistent.

In [66]:
# compare stable vs transient inside each discount bucket
# Restrict to the two extreme cohorts, then aggregate per (discount, cohort) pair.
# observed=True drops category combinations that have no rows.
in_extreme_cohort = rated["continuence_bucket"].isin(["1_2_days", "9_11_days"])

pivot = (
    rated.loc[in_extreme_cohort]
    .groupby(["discount_bucket", "continuence_bucket"], observed=True)
    .agg(
        unique_products=("product_id", "nunique"),
        avg_rating=("rating", "mean"),
        median_rating=("rating", "median"),
    )
    .reset_index()
)

pivot


Unnamed: 0,discount_bucket,continuence_bucket,unique_products,avg_rating,median_rating
0,<40%,1_2_days,113,4.173426,4.238521
1,<40%,9_11_days,118,4.151557,4.237313
2,40–60%,1_2_days,246,4.072073,4.208179
3,40–60%,9_11_days,363,4.150602,4.240941
4,60–75%,1_2_days,535,4.005509,4.142857
5,60–75%,9_11_days,557,4.092441,4.1875
6,75%+,1_2_days,337,3.976961,4.051244
7,75%+,9_11_days,498,4.005295,4.125809


Conclusion:

- Stable > transient is most visible in the mid-high discount range (40–75%)
- Both cohorts drop in 75%+ (trust-risk zone)

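This further shows that extreme discounts correlate with lower ratings, and that stable products maintain slightly better trust at the same discount depth in every bucket except <40%, where transient products are marginally higher on average (4.17 vs 4.15).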