In [None]:
import numpy as np
import matplotlib.pyplot as plt
from lifelines import KaplanMeierFitter

# Simulated example data
# Seed the global NumPy RNG first: without it this cell draws a different
# cohort (and therefore a different curve) on every Restart & Run All.
np.random.seed(42)
durations = np.random.exponential(scale=180, size=100)  # days to upgrade or censor
event_observed = np.random.binomial(1, 0.7, size=100)  # 1 if upgraded, 0 if censored

# Fit a single Kaplan-Meier estimator on the whole simulated cohort.
kmf = KaplanMeierFitter()
kmf.fit(durations, event_observed=event_observed, label="Green Tier Upgrade")

# Draw the estimated survival function on a fresh figure.
plt.figure(figsize=(8,5))
kmf.plot_survival_function()
plt.title("Kaplan–Meier Curve: Time to Upgrade from Green Tier")
plt.xlabel("Days since entry to Green Tier")
plt.ylabel("Proportion still in Green Tier")
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from lifelines import KaplanMeierFitter
from lifelines.statistics import logrank_test

# Simulated data setup
np.random.seed(42)

# Synthetic durations (days until upgrade or censor) and event indicators
# (1=upgraded, 0=censored) are drawn for three visit-frequency groups.
n = 100  # number of customers per group

# The draws below stay in low -> medium -> high order (durations first, then
# events) so the seeded random stream is consumed in a fixed sequence.

# Low frequency: slower upgrades (longer times), 60% upgraded
durations_low = np.random.exponential(scale=250, size=n)
events_low = np.random.binomial(1, 0.6, size=n)

# Medium frequency: medium speed upgrades
durations_med = np.random.exponential(scale=180, size=n)
events_med = np.random.binomial(1, 0.7, size=n)

# High frequency: faster upgrades
durations_high = np.random.exponential(scale=120, size=n)
events_high = np.random.binomial(1, 0.8, size=n)

# (legend label, durations, events) for each curve, in plotting order
curve_specs = [
    ("Low Visit Frequency", durations_low, events_low),
    ("Medium Visit Frequency", durations_med, events_med),
    ("High Visit Frequency", durations_high, events_high),
]

kmf = KaplanMeierFitter()

plt.figure(figsize=(10,6))

# One fitter is refit per group; each curve is drawn on the current axes.
for curve_label, curve_durations, curve_events in curve_specs:
    kmf.fit(curve_durations, event_observed=curve_events, label=curve_label)
    kmf.plot_survival_function()

plt.title("Kaplan–Meier Curves by Visit Frequency Group")
plt.xlabel("Days Since Entering Green Tier")
plt.ylabel("Proportion Still in Green Tier (Not Upgraded)")
plt.legend()
plt.grid(True)
plt.show()

# Statistical test example: Low vs High frequency groups
results = logrank_test(
    durations_low,
    durations_high,
    event_observed_A=events_low,
    event_observed_B=events_high,
)
print(f"Log-rank test p-value (Low vs High frequency): {results.p_value:.4f}")


In [None]:
# Illustrative (hard-coded) hazard ratios, one per factor.
factors = ['Visit Frequency', 'Average Spend', 'App Usage', 'Reminder Sent']
hazard_ratios = [1.5, 1.2, 1.8, 2.0]

plt.figure(figsize=(7,4))
bars = plt.bar(factors, hazard_ratios, color='skyblue')

# Dashed reference line at a hazard ratio of 1.
plt.axhline(1, color='red', linestyle='--')
plt.title("Cox Model: Impact on Upgrade Speed (Hazard Ratios)")
plt.ylabel("Hazard Ratio (>1 = faster upgrade)")
plt.ylim(0, 2.5)

# Print each value just below the top of its bar, horizontally centred.
for bar, hr in zip(bars, hazard_ratios):
    label_x = bar.get_x() + bar.get_width()/2
    label_y = bar.get_height() - 0.15
    plt.text(label_x, label_y, f"{hr:.2f}", ha='center', color='black', fontsize=12)

plt.show()


In [None]:
import seaborn as sns
import pandas as pd

# Transition probabilities written row-major: each row is a current tier and
# each column a next state, so every row can be read (and summed) directly.
current_tiers = ['Green', 'Silver', 'Gold']
next_states = ['Green', 'Silver', 'Gold', 'Exit']
matrix = pd.DataFrame(
    [
        [0.7, 0.25, 0.0, 0.05],   # from Green
        [0.1, 0.8, 0.08, 0.02],   # from Silver
        [0.0, 0.05, 0.9, 0.05],   # from Gold
    ],
    index=current_tiers,
    columns=next_states,
)

# Annotated heatmap, two decimals per cell, no colour bar.
plt.figure(figsize=(6,5))
sns.heatmap(matrix, annot=True, cmap='Blues', cbar=False, fmt=".2f")
plt.title("Markov Chain: Monthly Tier Transition Probabilities")
plt.ylabel("Current Tier")
plt.xlabel("Next Tier")
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Simulated visit frequencies (visits per month)
before = np.random.poisson(lam=2.5, size=100)
# Each customer gains one extra visit with probability 0.3, otherwise none.
uplift = np.random.choice([0,1], size=100, p=[0.7,0.3])
after = before + uplift

# Side-by-side box plots of the two samples.
visit_samples = [before, after]
plt.figure(figsize=(7,5))
plt.boxplot(visit_samples, labels=['Before Reminder', 'After Reminder'])
plt.title("Visit Frequency Before vs After Reminder")
plt.ylabel("Average Monthly Visits")
plt.show()


In [None]:
import pandas as pd
import numpy as np

# Fix the global NumPy RNG so the synthetic transactions are repeatable.
np.random.seed(42)

# Parameters: cohort size and the inclusive simulation window.
num_customers = 10
start_date, end_date = pd.Timestamp('2024-01-01'), pd.Timestamp('2025-07-31')
# Every calendar day in the window (daily frequency).
date_range = pd.date_range(start=start_date, end=end_date, freq='D')

# Generate random transaction dates for each customer (biweekly approx)
def random_dates(num, start, end):
    """Return `num` distinct days drawn from [start, end], sorted ascending.

    Days are sampled without replacement from the daily range between
    `start` and `end` using the global NumPy random state.
    """
    candidate_days = pd.date_range(start, end)
    sampled_days = np.random.choice(candidate_days, num, replace=False)
    return pd.to_datetime(sampled_days).sort_values()

# One dict per simulated transaction; the DataFrame is built once at the end.
data = []

for cust_id in range(1, num_customers + 1):
    num_tx = np.random.randint(20, 40)  # transactions per customer (upper bound is exclusive)
    tx_dates = random_dates(num_tx, start_date, end_date)
    cumulative_points = 0  # points earned from 'Normal' rows since the last reset
    group_num = 0          # incremented at every reset row

    # Iterate the dates directly; the positional index was never used.
    for tx_date in tx_dates:
        # Randomly assign transaction category
        if np.random.rand() < 0.1:
            # 10% chance of upgrade/downgrade
            category = np.random.choice(['Upgrade', 'Downgrade'])
            point_reset_flag = 1
            # Points at upgrade equal cumulative_points
            issued_points = cumulative_points
            # Reset cumulative points after upgrade/downgrade
            cumulative_points = 0
            # The reset row itself is recorded under the NEW group number.
            group_num += 1
        else:
            category = 'Normal'
            point_reset_flag = 0
            issued_points = np.random.randint(10, 100)
            cumulative_points += issued_points

        data.append({
            'LOYALTY_CUSTOMER_REF': f'CUST{cust_id:03d}',
            'LOYALTY_TRX_DATE': tx_date,
            'LOYALTY_TRX_CATEGORY_REF': category,
            'ISSUED_LOYALTY_POINTS': issued_points,
            'POINT_RESET_FLAG': point_reset_flag,
            'GROUP': group_num
        })

df = pd.DataFrame(data)

# Calculate cumulative points per customer and group (mimic SQL logic)
# NOTE(review): a reset row carries the previous balance as its issued points
# and sits in the new GROUP, so each new group's running total starts from
# that balance rather than from zero - confirm this matches the source system.
df = df.sort_values(['LOYALTY_CUSTOMER_REF', 'LOYALTY_TRX_DATE'])
df['CUMULATIVE_POINTS'] = df.groupby(['LOYALTY_CUSTOMER_REF', 'GROUP'])['ISSUED_LOYALTY_POINTS'].cumsum()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Reseed so this cell regenerates the same synthetic data on every run.
np.random.seed(42)

# === Data Generation ===
num_customers = 10
start_date, end_date = (pd.Timestamp(day) for day in ('2024-01-01', '2025-07-31'))

def random_dates(num, start, end):
    """Sample `num` unique days between `start` and `end`, returned in date order."""
    all_days = pd.date_range(start, end)
    # Without replacement, so no day can be picked twice.
    chosen = np.random.choice(all_days, num, replace=False)
    chosen_index = pd.to_datetime(chosen)
    return chosen_index.sort_values()

# Collect one record per transaction, then build the frame in a single step.
data = []

for cust_id in range(1, num_customers + 1):
    num_tx = np.random.randint(20, 40)  # transactions per customer
    tx_dates = random_dates(num_tx, start_date, end_date)
    cumulative_points, group_num = 0, 0

    for tx_date in tx_dates:
        # One uniform draw decides whether this row is a tier change (10%).
        tier_change = np.random.rand() < 0.1
        if not tier_change:
            category = 'Normal'
            point_reset_flag = 0
            issued_points = np.random.randint(10, 100)
            cumulative_points += issued_points
        else:
            category = np.random.choice(['Upgrade', 'Downgrade'])
            point_reset_flag = 1
            # The reset row reports the balance so far, then the balance
            # restarts and the row is filed under the next group.
            issued_points = cumulative_points
            cumulative_points = 0
            group_num += 1

        data.append(dict(
            LOYALTY_CUSTOMER_REF=f'CUST{cust_id:03d}',
            LOYALTY_TRX_DATE=tx_date,
            LOYALTY_TRX_CATEGORY_REF=category,
            ISSUED_LOYALTY_POINTS=issued_points,
            POINT_RESET_FLAG=point_reset_flag,
            GROUP=group_num,
        ))

# Order rows by customer and date, then take a running total of issued
# points within each (customer, group) pair.
df = pd.DataFrame(data).sort_values(['LOYALTY_CUSTOMER_REF', 'LOYALTY_TRX_DATE'])
points_by_group = df.groupby(['LOYALTY_CUSTOMER_REF', 'GROUP'])['ISSUED_LOYALTY_POINTS']
df['CUMULATIVE_POINTS'] = points_by_group.cumsum()

# === Analysis Functions with plots ===

def time_to_threshold(df, threshold=1000):
    """Days from each customer's first transaction to first reaching `threshold`.

    Reads LOYALTY_CUSTOMER_REF, LOYALTY_TRX_DATE and CUMULATIVE_POINTS from
    `df`. Returns one row per customer with FIRST_TX_DATE,
    DATE_THRESHOLD_REACHED and DAYS_TO_THRESHOLD; customers with no row at or
    above the threshold keep missing values in the last two columns
    (left join).
    """
    key = 'LOYALTY_CUSTOMER_REF'
    date_col = 'LOYALTY_TRX_DATE'

    def earliest_date(frame, out_name):
        # Earliest transaction date per customer, exposed under `out_name`.
        per_customer = frame.groupby(key)[date_col].min()
        return per_customer.reset_index().rename(columns={date_col: out_name})

    at_or_above = df[df['CUMULATIVE_POINTS'] >= threshold]
    threshold_dates = earliest_date(at_or_above, 'DATE_THRESHOLD_REACHED')
    first_dates = earliest_date(df, 'FIRST_TX_DATE')

    result = first_dates.merge(threshold_dates, on=key, how='left')
    elapsed = result['DATE_THRESHOLD_REACHED'] - result['FIRST_TX_DATE']
    result['DAYS_TO_THRESHOLD'] = elapsed.dt.days
    return result

def plot_time_to_threshold(df_res, threshold=1000):
    """Bar chart of DAYS_TO_THRESHOLD per customer.

    Parameters
    ----------
    df_res : DataFrame
        Must provide LOYALTY_CUSTOMER_REF and DAYS_TO_THRESHOLD columns
        (the frame produced by `time_to_threshold`).
    threshold : int, default 1000
        Threshold shown in the title. It only affects the label; pass the
        same value that was given to `time_to_threshold` so the title matches
        the data. The default reproduces the previous fixed title.
    """
    plt.figure(figsize=(10, 6))
    # NOTE(review): recent seaborn releases warn when `palette` is given
    # without `hue`; left as-is so older seaborn versions keep working.
    sns.barplot(data=df_res, x='LOYALTY_CUSTOMER_REF', y='DAYS_TO_THRESHOLD', palette='viridis')
    plt.title(f'Days to Reach {threshold} Points Threshold')
    plt.ylabel('Days')
    plt.xlabel('Customer')
    plt.xticks(rotation=45)  # tilt the customer labels on the x-axis
    plt.tight_layout()
    plt.show()

def visit_based_pace(df):
    """Per-customer totals and average issued points per transaction row.

    Returns one row per LOYALTY_CUSTOMER_REF with TOTAL_POINTS (sum of
    ISSUED_LOYALTY_POINTS), TOTAL_VISITS (count of LOYALTY_TRX_DATE values)
    and POINTS_PER_VISIT (their ratio).
    """
    aggregations = {
        'TOTAL_POINTS': ('ISSUED_LOYALTY_POINTS', 'sum'),
        'TOTAL_VISITS': ('LOYALTY_TRX_DATE', 'count'),
    }
    per_customer = df.groupby('LOYALTY_CUSTOMER_REF').agg(**aggregations)
    visits = per_customer.reset_index()
    points, trips = visits['TOTAL_POINTS'], visits['TOTAL_VISITS']
    visits['POINTS_PER_VISIT'] = points / trips
    return visits

def plot_visit_based_pace(df_visits):
    """Bar chart of POINTS_PER_VISIT for each LOYALTY_CUSTOMER_REF in `df_visits`."""
    plt.figure(figsize=(10, 6))
    ax = sns.barplot(
        data=df_visits,
        x='LOYALTY_CUSTOMER_REF',
        y='POINTS_PER_VISIT',
        palette='coolwarm',
    )
    ax.set_title('Average Points per Visit by Customer')
    ax.set_ylabel('Points per Visit')
    ax.set_xlabel('Customer')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

def max_span_since_last_upgrade(df):
    """Days between each customer's latest reset row and their last transaction.

    A reset row is any row with POINT_RESET_FLAG == 1 (the generator in this
    notebook sets that flag for both 'Upgrade' and 'Downgrade' rows). For a
    customer with no reset row the span is measured from their first
    transaction instead.

    Parameters
    ----------
    df : DataFrame
        Needs LOYALTY_CUSTOMER_REF, LOYALTY_TRX_DATE and POINT_RESET_FLAG.

    Returns
    -------
    DataFrame
        Columns LOYALTY_CUSTOMER_REF and DAYS_SINCE_LAST_UPGRADE, one row per
        customer. Both columns are present even when `df` has no rows, so
        downstream plotting code can always select them.
    """
    spans = []
    for cust, group_df in df.groupby('LOYALTY_CUSTOMER_REF'):
        tx_dates = group_df['LOYALTY_TRX_DATE']
        # Filter inside the customer's own rows rather than re-scanning a
        # global reset frame once per customer.
        reset_dates = tx_dates[group_df['POINT_RESET_FLAG'] == 1]
        # Fall back to the first transaction when the customer never reset.
        anchor = tx_dates.min() if reset_dates.empty else reset_dates.max()
        spans.append({
            'LOYALTY_CUSTOMER_REF': cust,
            'DAYS_SINCE_LAST_UPGRADE': (tx_dates.max() - anchor).days,
        })
    return pd.DataFrame(spans, columns=['LOYALTY_CUSTOMER_REF', 'DAYS_SINCE_LAST_UPGRADE'])

def plot_max_span(df_spans):
    """Bar chart of DAYS_SINCE_LAST_UPGRADE for each LOYALTY_CUSTOMER_REF in `df_spans`."""
    plt.figure(figsize=(10, 6))
    ax = sns.barplot(
        data=df_spans,
        x='LOYALTY_CUSTOMER_REF',
        y='DAYS_SINCE_LAST_UPGRADE',
        palette='magma',
    )
    ax.set_title('Days Since Last Upgrade per Customer')
    ax.set_ylabel('Days Since Last Upgrade')
    ax.set_xlabel('Customer')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

def flag_zero_upgrades(df):
    """Flag customers that have no row with POINT_RESET_FLAG == 1.

    Returns LOYALTY_CUSTOMER_REF (in order of first appearance in `df`) and
    NO_UPGRADE_FLAG, which is 1 when the customer has zero reset rows and 0
    otherwise.
    """
    key = 'LOYALTY_CUSTOMER_REF'

    # Number of reset rows per customer; customers without any are absent here.
    reset_rows = df.loc[df['POINT_RESET_FLAG'] == 1]
    upgrades_count = reset_rows.groupby(key).size().reset_index(name='UPGRADE_COUNT')

    # Left-join onto the full customer list so absent customers get a count of 0.
    customers = pd.DataFrame({key: df[key].unique()})
    upgrades_flag = customers.merge(upgrades_count, on=key, how='left').fillna(0)

    never_reset = upgrades_flag['UPGRADE_COUNT'] == 0
    upgrades_flag['NO_UPGRADE_FLAG'] = never_reset.astype(int)
    return upgrades_flag[[key, 'NO_UPGRADE_FLAG']]

def plot_zero_upgrade_flag(df_flag):
    """Bar chart of NO_UPGRADE_FLAG for each LOYALTY_CUSTOMER_REF in `df_flag`."""
    plt.figure(figsize=(10, 6))
    ax = sns.barplot(
        data=df_flag,
        x='LOYALTY_CUSTOMER_REF',
        y='NO_UPGRADE_FLAG',
        palette='Set2',
    )
    ax.set_title('Customers with Zero Upgrades Flag (1 = No Upgrades)')
    ax.set_ylabel('No Upgrade Flag')
    ax.set_xlabel('Customer')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

def cohort_analysis(df):
    """Count transactions by cohort month and transaction month.

    A customer's cohort month is the calendar month of their earliest
    LOYALTY_TRX_DATE; the transaction month is the calendar month of each row.

    Parameters
    ----------
    df : DataFrame
        Needs LOYALTY_CUSTOMER_REF and LOYALTY_TRX_DATE (datetime). It is not
        modified: the derived month columns live on a local copy, so calling
        this function does not add columns to the caller's frame.

    Returns
    -------
    DataFrame
        Rows indexed by COHORT_MONTH, columns by TX_MONTH, values are row
        counts with 0 for combinations that never occur.
    """
    # Column selection + assign produce a new frame, leaving `df` untouched.
    months = df[['LOYALTY_CUSTOMER_REF', 'LOYALTY_TRX_DATE']].assign(
        COHORT_MONTH=lambda d: (
            d.groupby('LOYALTY_CUSTOMER_REF')['LOYALTY_TRX_DATE']
            .transform('min')
            .dt.to_period('M')
        ),
        TX_MONTH=lambda d: d['LOYALTY_TRX_DATE'].dt.to_period('M'),
    )
    cohort_data = months.groupby(['COHORT_MONTH', 'TX_MONTH']).size().unstack(fill_value=0)
    return cohort_data

def plot_cohort_analysis(cohort_df):
    """Annotated heatmap of the cohort-month by transaction-month count table."""
    heatmap_options = {'annot': True, 'fmt': 'd', 'cmap': 'YlGnBu'}
    plt.figure(figsize=(12, 8))
    sns.heatmap(cohort_df, **heatmap_options)
    plt.title('Cohort Analysis: Transaction Counts by Cohort Month and Transaction Month')
    plt.xlabel('Transaction Month')
    plt.ylabel('Cohort Month')
    plt.tight_layout()
    plt.show()

# === Run and plot ===

# Compute every summary table first ...
time_thresh_res = time_to_threshold(df)
visit_pace_res = visit_based_pace(df)
max_span_res = max_span_since_last_upgrade(df)
zero_upgrade_flag_res = flag_zero_upgrades(df)
cohort_res = cohort_analysis(df)

# ... then render them in the same order, one figure per table.
for render, table in (
    (plot_time_to_threshold, time_thresh_res),
    (plot_visit_based_pace, visit_pace_res),
    (plot_max_span, max_span_res),
    (plot_zero_upgrade_flag, zero_upgrade_flag_res),
    (plot_cohort_analysis, cohort_res),
):
    render(table)


In [None]:
# Calculate time differences and points differences per customer transaction
df = df.sort_values(['LOYALTY_CUSTOMER_REF', 'LOYALTY_TRX_DATE'])

# Previous row's date and running total within each customer (first row -> missing).
previous_row = df.groupby('LOYALTY_CUSTOMER_REF')[['LOYALTY_TRX_DATE', 'CUMULATIVE_POINTS']].shift(1)
df['PREV_DATE'] = previous_row['LOYALTY_TRX_DATE']
df['PREV_POINTS'] = previous_row['CUMULATIVE_POINTS']

elapsed = df['LOYALTY_TRX_DATE'] - df['PREV_DATE']
df['DAYS_DIFF'] = elapsed.dt.days
df['POINTS_DIFF'] = df['CUMULATIVE_POINTS'] - df['PREV_POINTS']

# Points per day; rows with a zero-day gap are blanked to avoid divide by zero.
points_per_day = df['POINTS_DIFF'] / df['DAYS_DIFF']
df['VELOCITY'] = points_per_day.mask(df['DAYS_DIFF'] == 0)

# Plot velocity (points/day) over time, one line per customer
plt.figure(figsize=(14, 7))
for cust, cust_data in df.groupby('LOYALTY_CUSTOMER_REF', sort=False):
    plt.plot(cust_data['LOYALTY_TRX_DATE'], cust_data['VELOCITY'], marker='o', label=cust)

plt.title('Velocity of Points Accumulation (Points/Day) Over Time by Customer')
plt.xlabel('Transaction Date')
plt.ylabel('Velocity (Points/Day)')
plt.legend(loc='upper right')
plt.tight_layout()
plt.show()


In [None]:
from lifelines import KaplanMeierFitter

# Prepare survival data: per customer, the first transaction date and the
# date of the first row with POINT_RESET_FLAG == 1 (missing if there is none).
# NOTE(review): the reset flag is set for 'Downgrade' rows as well as
# 'Upgrade' rows, so the "upgrade" event here includes downgrades - confirm
# that is intended.
customer_key = 'LOYALTY_CUSTOMER_REF'
date_col = 'LOYALTY_TRX_DATE'

df_sorted = df.sort_values([customer_key, date_col])
first_tx = df_sorted.groupby(customer_key)[date_col].min().reset_index()
reset_rows = df_sorted[df_sorted['POINT_RESET_FLAG'] == 1]
upgrade_tx = reset_rows.groupby(customer_key)[date_col].min().reset_index()

surv_data = first_tx.merge(upgrade_tx, on=customer_key, how='left', suffixes=('_first', '_upgrade'))
event_date = surv_data['LOYALTY_TRX_DATE_upgrade']
surv_data['EVENT_OCCURRED'] = event_date.notna().astype(int)

# Calculate duration (days) between first transaction and upgrade or censor date (end of data)
end_date = df[date_col].max()
stop_date = event_date.fillna(end_date)
surv_data['DURATION'] = (stop_date - surv_data['LOYALTY_TRX_DATE_first']).dt.days

# Fit Kaplan-Meier
kmf = KaplanMeierFitter()
kmf.fit(durations=surv_data['DURATION'], event_observed=surv_data['EVENT_OCCURRED'])

# Plot survival curve
plt.figure(figsize=(10, 6))
kmf.plot_survival_function()
plt.title('Kaplan-Meier Survival Curve: Time to Upgrade Event')
plt.xlabel('Days Since First Transaction')
plt.ylabel('Probability of No Upgrade Yet')
plt.grid(True)
plt.show()
