# Notebook 1: Synthetic Data Generation

This notebook generates synthetic player data with four distinct segments for uplift modeling research.

In [None]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from src.data_generator import PlayerDataGenerator

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply the 'seaborn-v0_8-whitegrid' matplotlib style first, then set
# seaborn's default colour palette to 'Set2' for the plots below.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('Set2')

## 1. Generate Synthetic Players

In [None]:
# Build the generator from the simulation config file.
generator = PlayerDataGenerator('../configs/simulation_config.yaml')

# Create the players, then pass the frame through each labelling step in
# order (innermost call runs first): true uplift -> LTV labels -> churn labels.
df = generator.generate_churn_labels(
    generator.generate_ltv_labels(
        generator.calculate_true_uplift(
            generator.generate_players()
        )
    )
)

player_count = len(df)
print(f"Generated {player_count} players")

# Last expression of the cell: show the first rows as rich output.
df.head()

## 2. Explore Player Segments

In [None]:
# Absolute and relative size of each player segment.
segment_labels = df['segment']

segment_counts = segment_labels.value_counts()
segment_percentages = segment_labels.value_counts(normalize=True) * 100

print("Segment Distribution:")
print(segment_counts)
print(f"\nPercentages:")
print(segment_percentages)

In [None]:
# 2x2 overview figure of the generated segments.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
(ax_sizes, ax_uplift), (ax_engagement, ax_deposits) = axes

# Top-left: number of players per segment.
segment_sizes = df['segment'].value_counts()
segment_sizes.plot(kind='bar', ax=ax_sizes, color=sns.color_palette('Set2'))
ax_sizes.set_title('Player Segment Distribution', fontweight='bold')
ax_sizes.set_ylabel('Number of Players')

# Top-right: mean ground-truth uplift per segment.
# Bars are green for positive, red for negative, gray otherwise.
segment_uplift = df.groupby('segment')['true_uplift'].mean()
colors = []
for uplift_value in segment_uplift.values:
    if uplift_value > 0:
        colors.append('green')
    elif uplift_value < 0:
        colors.append('red')
    else:
        colors.append('gray')
segment_uplift.plot(kind='bar', ax=ax_uplift, color=colors)
ax_uplift.axhline(y=0, color='black', linestyle='--')
ax_uplift.set_title('True Treatment Effect by Segment', fontweight='bold')
ax_uplift.set_ylabel('True Uplift')

# Bottom-left: mean outcome per segment, one bar per treatment value.
# NOTE(review): the legend labels assume the unstacked treatment columns
# come out in control-then-treatment order — confirm against the data.
outcome_by_group = df.groupby(['segment', 'treatment'])['outcome']
engagement_data = outcome_by_group.mean().unstack()
engagement_data.plot(kind='bar', ax=ax_engagement)
ax_engagement.set_title('Engagement Rate by Segment and Treatment', fontweight='bold')
ax_engagement.set_ylabel('Engagement Rate')
ax_engagement.legend(['Control', 'Treatment'])

# Bottom-right: distribution of total deposits within each segment.
df.boxplot(column='total_deposits', by='segment', ax=ax_deposits)
ax_deposits.set_title('Total Deposits by Segment', fontweight='bold')
ax_deposits.set_xlabel('Segment')

# Blank out the figure-level title (presumably the automatic one added by
# the grouped boxplot — verify) before tightening the layout.
plt.suptitle('')
plt.tight_layout()
plt.show()

## 3. Validate Experiment Design

In [None]:
# How many players fall in each treatment value, and what share that is.
treatment_flags = df['treatment']
treated_share = treatment_flags.mean()

print("Treatment Assignment:")
print(treatment_flags.value_counts())
print(f"\nTreatment proportion: {treated_share:.2%}")

# Outcome count, total and mean for each treatment value.
outcome_by_treatment = df.groupby('treatment')['outcome']
outcome_stats = outcome_by_treatment.agg(['count', 'sum', 'mean'])

print("\nOutcome Distribution:")
print(outcome_stats)

## 4. Feature Statistics by Segment

In [None]:
# Features to average within each segment.
feature_cols = [
    'total_deposits',
    'avg_transaction_size',
    'login_frequency_30d',
    'session_count_30d',
    'days_since_last_login',
    'account_age_days',
]

# Per-segment mean of every listed feature, rounded to two decimals.
features_by_segment = df.groupby('segment')[feature_cols]
segment_summary = features_by_segment.mean().round(2)

print("\nFeature Means by Segment:")
print(segment_summary)

## 5. Save Datasets

In [None]:
from sklearn.model_selection import train_test_split

# Persist the complete labelled dataset before splitting it.
generator.save_dataset(df, '../data/player_data.csv')

# 70/30 train/test split. Stratifying on 'segment' keeps the segment mix
# the same in both parts; the fixed random_state makes the split repeatable.
train_df, test_df = train_test_split(
    df, test_size=0.3, random_state=42, stratify=df['segment']
)

# Invariant: the split must partition the data — no rows lost or duplicated.
assert len(train_df) + len(test_df) == len(df), \
    "train/test split does not cover the full dataset"

generator.save_dataset(train_df, '../data/player_data_train.csv')
generator.save_dataset(test_df, '../data/player_data_test.csv')

# The literal used to contain mojibake ("âœ“": the UTF-8 bytes of U+2713
# CHECK MARK decoded as cp1252). An explicit escape keeps the check mark
# intact regardless of how the notebook file is re-encoded.
print("\n\u2713 Data generation complete!")