In [None]:
import numpy as np
import pandas as pd
import random
from scipy import stats

# Seed every random source this notebook draws from so that a fresh
# "Restart & Run All" reproduces the same data. NumPy's global RNG drives the
# marketing-mode assignment and the retention draw; Python's built-in `random`
# module drives the day offset picked inside retention_date_func, so it has to
# be seeded as well (seeding NumPy alone leaves those offsets non-deterministic).
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

# Generate synthetic data: one user per consecutive calendar day starting on
# 2020-01-01, each assigned one of three marketing modes at random.
n_samples = 10000
data = {
    'user_id': np.arange(1, n_samples + 1),
    # Dates are stored as DD-MM-YYYY strings, not as datetimes.
    'date': pd.date_range(start='2020-01-01', periods=n_samples, freq='D').strftime('%d-%m-%Y'),
    'marketing_mode': np.random.choice(['sms', 'obd', 'email'], size=n_samples),
}

# Simulate retention: SMS, OBD, and Email retention probabilities
retention_probs = {'sms': 0.6, 'obd': 0.5, 'email': 0.4}

def retention_date_func(marketing_mode):
    """Simulate whether one user is retained and, if so, on which date.

    Args:
        marketing_mode: Key into the module-level ``retention_probs`` mapping
            (raises ``KeyError`` if the key is absent).

    Returns:
        A ``pd.Timestamp`` between 2020-01-02 and 2020-01-06 (inclusive) when
        the user is retained, otherwise ``pd.NaT``.

    Notes:
        The retention decision uses NumPy's global RNG, while the day offset
        uses Python's built-in ``random`` module; these are two independent
        generators.
        NOTE(review): the returned date is anchored to the fixed 2020-01-01
        start, not to the individual user's ``date`` value.
    """
    # One uniform draw per call decides retention against the mode's probability.
    is_retained = np.random.rand() < retention_probs[marketing_mode]
    if not is_retained:
        return pd.NaT

    # The offset is drawn only for retained users: 1 to 5 days, both ends inclusive.
    days_after_start = random.randint(1, 5)
    return pd.Timestamp('2020-01-01') + pd.DateOffset(days=days_after_start)

# Draw one retention outcome per user, in row order, from that user's marketing mode
data['retention_date'] = list(map(retention_date_func, data['marketing_mode']))

# Assemble all generated columns into a DataFrame
df = pd.DataFrame(data)

# Render retention dates as DD-MM-YYYY text; users without a date stay missing
retention_as_text = df['retention_date'].dt.strftime('%d-%m-%Y')
df['retention_date'] = retention_as_text

# Preview the first rows of the generated data
print("Generated Data:\n", df.head())

# Flag each user: 1 when a retention date is present, 0 when it is missing
has_retention_date = ~df['retention_date'].isna()
df['retained'] = has_retention_date.astype(int)

# The mean of the 0/1 flag within each marketing mode is that mode's retention rate
retained_by_mode = df.groupby('marketing_mode')['retained']
retention_rates = retained_by_mode.mean()
print("\nRetention Rates by Marketing Mode:\n", retention_rates)

# Cross-tabulate marketing mode (rows) against the retained flag (columns);
# this table of counts is the input to the chi-square test below.
contingency_table = pd.crosstab(index=df['marketing_mode'], columns=df['retained'])

# Chi-square test on the contingency table
chi2, p, dof, expected = stats.chi2_contingency(contingency_table)
print("\nChi-Square Test Results:")
print("Chi2 Statistic:", chi2)
print("p-value:", p)

# Report the outcome at the 0.05 significance level
verdict = (
    "Result: The marketing mode has a significant effect on retention."
    if p < 0.05
    else "Result: No significant effect of marketing mode on retention."
)
print(verdict)


Generated Data:
    user_id        date marketing_mode retention_date
0        1  01-01-2020          email     06-01-2020
1        2  02-01-2020            sms     05-01-2020
2        3  03-01-2020          email            NaN
3        4  04-01-2020          email            NaN
4        5  05-01-2020            sms     05-01-2020

Retention Rates by Marketing Mode:
 marketing_mode
email    0.395878
obd      0.500448
sms      0.597496
Name: retained, dtype: float64

Chi-Square Test Results:
Chi2 Statistic: 270.50353567290375
p-value: 1.8234907346928163e-59
Result: The marketing mode has a significant effect on retention.


In [None]:
# Display the DataFrame, which at this point also carries the derived 'retained' column.
df

Unnamed: 0,user_id,date,marketing_mode,retention_date,retained
0,1,01-01-2020,email,06-01-2020,1
1,2,02-01-2020,sms,05-01-2020,1
2,3,03-01-2020,email,,0
3,4,04-01-2020,email,,0
4,5,05-01-2020,sms,05-01-2020,1
...,...,...,...,...,...
9995,9996,14-05-2047,email,,0
9996,9997,15-05-2047,sms,05-01-2020,1
9997,9998,16-05-2047,obd,05-01-2020,1
9998,9999,17-05-2047,obd,02-01-2020,1
