# Synthetic Marketing Analytics Project
This notebook demonstrates a complete workflow for synthetic marketing analytics, including data generation, cleaning, customer segmentation, and A/B testing. The workflow is based on the code in `marketing_analytics.py`.

## 1. Import Required Libraries
We import numpy and pandas for data processing, KMeans from scikit-learn for clustering, proportions_ztest from statsmodels for statistical testing, and Path from pathlib for file handling.

In [2]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from statsmodels.stats.proportion import proportions_ztest
from pathlib import Path

## 2. Generate Synthetic Marketing Data
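We generate a synthetic marketing dataset with customer demographics, RFM features, group assignment, and purchase outcomes. Missing values and outliers are injected on purpose so that the cleaning step has something to fix. The data is saved to a CSV file.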

In [3]:
RNG = np.random.default_rng(seed=42)


def generate_data(path: Path, n: int = 5000) -> pd.DataFrame:
    """Build a deliberately messy synthetic marketing table and write it to CSV.

    All random draws come from the module-level ``RNG``, so the result
    depends on that generator's current state.

    Args:
        path: Destination CSV file; missing parent directories are created.
        n: Number of customer rows to generate.

    Returns:
        The generated frame (the same data that was written to ``path``).
    """

    def pick(options):
        # One uniformly chosen option per customer.
        return RNG.choice(options, size=n)

    def day_offsets(upper):
        # Random whole-day offsets in [0, upper), one per customer.
        return pd.to_timedelta(RNG.integers(0, upper, size=n), unit="D")

    # NOTE: the order of the draws below determines the random stream.
    customer_ids = np.arange(1, n + 1)
    age_col = RNG.integers(18, 70, size=n)
    gender_col = pick(
        ["M", "F", "male", "female", "Other", "unknown", "FEMALE", "MALE"]
    )
    raw_income = RNG.normal(50000, 20000, size=n)
    region_col = pick(["North", "South", "East", "West", "Unknown", None])
    signed_up = pd.to_datetime("2020-01-01") + day_offsets(2000)
    base_loyalty = RNG.uniform(0, 1, size=n)
    channel_col = pick(["Email", "SMS", "App", "Web", "Phone", None])
    device_col = pick(["Mobile", "Desktop", "Tablet", "Other", None])
    recency_col = RNG.integers(1, 365, size=n).astype(float)
    frequency_col = RNG.integers(1, 30, size=n).astype(float)
    gamma_spend = RNG.gamma(2.0, 100.0, size=n)
    spend_scale = RNG.uniform(0.5, 2.0, size=n)
    tenure_days = (pd.Timestamp("2025-08-19") - signed_up).days
    last_purchase = signed_up + day_offsets(1800)
    group_col = pick(["A", "B", "C", "D"])

    # Per-group conversion probability; one Bernoulli draw per customer.
    rates = {"A": 0.05, "B": 0.08, "C": 0.03, "D": 0.10}
    bought = [RNG.random() < rates.get(label, 0.05) for label in group_col]

    # Interaction / non-linear effects: buyers get a loyalty bump, and
    # spend grows with income * loyalty.
    income_col = np.abs(raw_income)
    loyalty_col = np.clip(base_loyalty + 0.2 * np.array(bought), 0, 1)
    monetary_col = (
        np.abs(gamma_spend * spend_scale) + 0.1 * income_col * loyalty_col
    )

    columns = {
        "customer_id": customer_ids,
        "age": age_col,
        "gender": gender_col,
        "income": income_col,
        "region": region_col,
        "signup_date": signed_up,
        "loyalty_score": loyalty_col,
        "preferred_channel": channel_col,
        "device_type": device_col,
        "recency": recency_col,
        "frequency": frequency_col,
        "monetary": monetary_col,
        "account_age": tenure_days,
        "last_purchase_date": last_purchase,
        "group": group_col,
        "purchase": bought,
    }
    data = pd.DataFrame(columns)

    # Blank out a random 30-99 rows in each numeric feature.
    for col in ["recency", "frequency", "monetary", "income", "loyalty_score"]:
        how_many = RNG.integers(30, 100)
        blank_rows = RNG.choice(n, size=how_many, replace=False)
        data.loc[blank_rows, col] = None

    # Flip the sign of 50 spend values (negative spend).
    flipped = RNG.choice(n, size=50, replace=False)
    data.loc[flipped, "monetary"] = -data.loc[flipped, "monetary"]

    # Plant 30 implausible / placeholder values in each of these columns.
    planted = [
        ("age", 999),  # impossible age
        ("income", -10000),  # negative income
        ("loyalty_score", 2.0),  # out of bounds
        ("region", ""),  # empty region
        ("preferred_channel", "Unknown"),
    ]
    for col, bad_value in planted:
        data.loc[RNG.choice(n, size=30, replace=False), col] = bad_value

    path.parent.mkdir(parents=True, exist_ok=True)
    data.to_csv(path, index=False)
    return data

# Generate the synthetic dataset with the default row count and write it to
# generated_data/marketing_data.csv (relative to the working directory);
# the returned frame is kept in `df`.
data_path = Path("generated_data/marketing_data.csv")
df = generate_data(data_path)

## 3. Load and Clean Data
We load the generated CSV data, fix invalid or missing values in 'recency' and 'monetary', and standardize the 'gender' column.

In [None]:
def load_and_clean(path: Path) -> pd.DataFrame:
    """Load the marketing CSV and repair the RFM and gender columns.

    Cleaning steps:
      * negative ``recency`` values are treated as missing;
      * ``monetary`` is replaced by its absolute value;
      * missing ``recency``, ``monetary`` and (when the column exists)
        ``frequency`` values are filled with the column median;
      * ``gender`` is reduced to its upper-cased first letter
        (e.g. ``"female"`` -> ``"F"``, ``"unknown"`` -> ``"U"``).

    Args:
        path: CSV file to read.

    Returns:
        The cleaned frame.

    Raises:
        KeyError: If ``recency``, ``monetary`` or ``gender`` is absent.
    """
    df = pd.read_csv(path)

    # Vectorised replacement for the per-row lambda; NaN < 0 is False, so
    # existing NaNs are left for the median fill below.
    df["recency"] = df["recency"].mask(df["recency"] < 0)
    df["monetary"] = df["monetary"].abs()

    # Assign the filled column back instead of calling
    # `df[col].fillna(..., inplace=True)`: the chained in-place form acts on
    # a temporary under pandas copy-on-write and leaves `df` unchanged.
    # `frequency` is filled as well so downstream clustering on the RFM
    # columns does not receive NaN.
    for col in ("recency", "frequency", "monetary"):
        if col in df.columns:
            df[col] = df[col].fillna(df[col].median())

    df["gender"] = df["gender"].str.upper().str[0]
    return df

# Re-read the dataset from the CSV and clean it; this rebinds `df` to the
# cleaned frame. `df.head()` is the cell's final expression so the notebook
# displays the first rows.
data_path = Path("generated_data/marketing_data.csv")
df = load_and_clean(data_path)
df.head()

## 4. Customer Segmentation with KMeans
We apply KMeans clustering to the RFM features to segment customers into groups. The segment labels are added to the DataFrame and segment counts are displayed.

In [None]:
def segment_customers(df: pd.DataFrame) -> pd.DataFrame:
    """Assign each customer to one of three KMeans segments based on RFM.

    The ``recency``, ``frequency`` and ``monetary`` columns are copied,
    median-imputed and standardised before clustering; the original columns
    in ``df`` are not modified.

    Args:
        df: Frame containing numeric ``recency``, ``frequency`` and
            ``monetary`` columns.

    Returns:
        The same ``df`` object with an integer ``segment`` column added
        (the input frame is modified in place).
    """
    rfm = df[["recency", "frequency", "monetary"]].astype(float)

    # KMeans rejects NaN input; impute on the working copy only so the
    # caller's feature columns keep their original values.
    rfm = rfm.fillna(rfm.median())

    # Standardise so that the feature with the largest numeric range does
    # not dominate the Euclidean distance. A constant column has zero
    # spread; divide by 1 there to avoid producing NaN.
    spread = rfm.std(ddof=0).replace(0, 1.0)
    scaled = (rfm - rfm.mean()) / spread

    # n_init is set explicitly because its default differs between
    # scikit-learn releases.
    kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
    df["segment"] = kmeans.fit_predict(scaled)
    return df

# Segment customers: add the `segment` column to `df`, then show how many
# customers were assigned to each segment.
df = segment_customers(df)
df["segment"].value_counts()

## 5. A/B Testing on Conversion Rates
We perform a z-test to compare conversion rates between groups A and B. Conversion rates, z-statistic, and p-value are printed.

In [None]:
def ab_test(df: pd.DataFrame, group_a: str = "A", group_b: str = "B") -> None:
    """Run a two-sample z-test on the conversion rates of two groups.

    ``proportions_ztest`` supports at most two samples, so when the data
    holds more groups only ``group_a`` and ``group_b`` are compared. If
    those labels are not both present but the data contains exactly two
    groups, those two groups are compared instead.

    Args:
        df: Frame with a ``group`` column and a boolean/0-1 ``purchase``
            column.
        group_a: Label of the first group in the comparison.
        group_b: Label of the second group in the comparison.

    Returns:
        None. Conversion rates, z-statistic and p-value are printed.

    Raises:
        ValueError: If the requested groups are not both present and the
            data does not contain exactly two groups.
    """
    summary = df.groupby("group")["purchase"].agg(["sum", "count"])

    wanted = [group_a, group_b]
    if all(label in summary.index for label in wanted):
        summary = summary.loc[wanted]
    elif len(summary) != 2:
        raise ValueError(
            f"cannot compare {wanted}: groups present are "
            f"{list(summary.index)}"
        )
    # Otherwise the data has exactly two groups under other labels; compare
    # those, as this function did before the group parameters existed.

    successes = summary["sum"].to_numpy()
    trials = summary["count"].to_numpy()
    stat, pval = proportions_ztest(successes, trials)
    rates = successes / trials
    print("Conversion rates:\n", rates)
    print(f"Z-statistic: {stat:.3f}, p-value: {pval:.3f}")

# Run the A/B test on the segmented frame; results are printed rather than
# returned.
ab_test(df)