# Motivation: Ilya Sutskever's [presentation @ NuerIPS](https://www.youtube.com/watch?v=WQQdd6qGxNs&t=525s)
## _age of pre-training will end due to the fact that data is not growing_

In [1]:
# Data Generation
from scipy import stats
import numpy as np
import random
import pandas as pd
from collections import defaultdict
from datetime import datetime, timedelta
from typing import Dict, List, Tuple

# Data Transformation
import duckdb

%load_ext sql
conn = duckdb.connect()
%sql conn --alias duckdb

#Visualization
import matplotlib.pyplot as plt
import seaborn as sns

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sql/magic.py", line 196, in execute
    conn = sql.connection.Connection.set(
  File "/usr/local/lib/python3.10/dist-packages/sql/connection.py", line 82, in set
    raise ConnectionError(
sql.connection.ConnectionError: Environment variable $DATABASE_URL not set, and no connect string given.

Connection info needed in SQLAlchemy format, example:
               postgresql://username:password@hostname/dbname
               or an existing connection: dict_keys([])


In [2]:
class BeautyRetailDataGenerator:
    def __init__(self, seed: int = 42):
        """Initialize the data generator with constants and distributions."""
        np.random.seed(seed)
        random.seed(seed)
        
        # Product categories and their characteristics
        self.product_categories = {
            'Skincare': {'avg_price': 45, 'std_price': 15, 'seasonality': 0.1},
            'Makeup': {'avg_price': 35, 'std_price': 12, 'seasonality': 0.2},
            'Fragrance': {'avg_price': 85, 'std_price': 25, 'seasonality': 0.3},
            'Haircare': {'avg_price': 30, 'std_price': 10, 'seasonality': 0.15}
        }
        
        # Customer segments and their characteristics
        self.customer_segments = {
            'Luxury': {'price_sensitivity': 0.2, 'purchase_frequency': 2.5},
            'Premium': {'price_sensitivity': 0.4, 'purchase_frequency': 2.0},
            'Mainstream': {'price_sensitivity': 0.7, 'purchase_frequency': 1.5},
            'Value': {'price_sensitivity': 0.9, 'purchase_frequency': 1.0}
        }

    def generate_customer_profiles(self, n_customers: int) -> pd.DataFrame:
        """Generate synthetic customer profiles with realistic attributes."""
        
        # Age distribution (bimodal to represent different customer groups)
        age_dist1 = np.random.normal(25, 5, n_customers // 2)
        age_dist2 = np.random.normal(45, 8, n_customers - n_customers // 2)
        ages = np.concatenate([age_dist1, age_dist2])
        np.random.shuffle(ages)
        
        # Generate other customer attributes
        data = {
            'customer_id': [f'CUST_{i:06d}' for i in range(n_customers)],
            'age': ages.astype(int),
            'signup_date': [
                (datetime.now() - timedelta(days=random.randint(1, 1825))).strftime('%Y-%m-%d')
                for _ in range(n_customers)
            ],
            'segment': np.random.choice(
                list(self.customer_segments.keys()),
                size=n_customers,
                p=[0.1, 0.2, 0.5, 0.2]  # Probability distribution for segments
            )
        }
        
        # Add derived features
        df = pd.DataFrame(data)
        df['loyalty_score'] = self._calculate_loyalty_score(df)
        df['email_engagement_rate'] = self._generate_engagement_rates(df)
        
        return df
    
    def generate_transactions(self, 
                            customer_profiles: pd.DataFrame, 
                            start_date: str,
                            end_date: str) -> pd.DataFrame:
        """Generate synthetic transaction data based on customer profiles."""
        
        start = datetime.strptime(start_date, '%Y-%m-%d')
        end = datetime.strptime(end_date, '%Y-%m-%d')
        days = (end - start).days
        
        transactions = []
        
        for _, customer in customer_profiles.iterrows():
            # Calculate number of transactions based on segment and loyalty
            segment_freq = self.customer_segments[customer['segment']]['purchase_frequency']
            loyalty_multiplier = 1 + (customer['loyalty_score'] / 100) * 0.5
            n_transactions = int(days / 30 * segment_freq * loyalty_multiplier)
            
            # Generate transactions for this customer
            for _ in range(n_transactions):
                transaction_date = start + timedelta(days=random.randint(0, days))
                transactions.append(self._generate_single_transaction(customer, transaction_date))
        
        return pd.DataFrame(transactions)
    
    def _calculate_loyalty_score(self, df: pd.DataFrame) -> np.ndarray:
        """Calculate customer loyalty scores based on various factors."""
        base_scores = np.random.normal(60, 15, len(df))
        
        # Adjust based on segment
        segment_adjustments = {
            'Luxury': 20,
            'Premium': 15,
            'Mainstream': 0,
            'Value': -5
        }
        
        # Calculate tenure in days
        tenure = (datetime.now() - pd.to_datetime(df['signup_date'])).dt.days
        
        # Adjust scores
        adjusted_scores = base_scores + \
                         [segment_adjustments[seg] for seg in df['segment']] + \
                         tenure / 365 * 5  # 5 points per year
        
        return np.clip(adjusted_scores, 0, 100)
    
    def _generate_engagement_rates(self, df: pd.DataFrame) -> np.ndarray:
        """Generate email engagement rates correlated with loyalty and segment."""
        base_rates = np.random.beta(2, 5, len(df))  # Beta distribution for percentage
        
        # Adjust based on loyalty score
        loyalty_factor = df['loyalty_score'] / 100
        
        # Adjust based on segment
        segment_multiplier = {
            'Luxury': 1.3,
            'Premium': 1.2,
            'Mainstream': 1.0,
            'Value': 0.8
        }
        
        adjusted_rates = base_rates * \
                        loyalty_factor * \
                        [segment_multiplier[seg] for seg in df['segment']]
        
        return np.clip(adjusted_rates, 0, 1)
    
    def _generate_single_transaction(self, 
                                   customer: pd.Series, 
                                   transaction_date: datetime) -> Dict:
        """Generate a single transaction for a customer."""
        
        # Select random category with seasonal adjustment
        season_factor = np.sin(2 * np.pi * transaction_date.timetuple().tm_yday / 365)
        category_probs = [1 + cat['seasonality'] * season_factor 
                         for cat in self.product_categories.values()]
        category_probs = np.array(category_probs) / sum(category_probs)
        
        category = np.random.choice(list(self.product_categories.keys()), p=category_probs)
        
        # Calculate base price with customer segment adjustment
        base_price = np.random.normal(
            self.product_categories[category]['avg_price'],
            self.product_categories[category]['std_price']
        )
        
        # Adjust quantity based on customer characteristics
        quantity = np.random.poisson(
            2 * (1 + customer['loyalty_score'] / 100)
        ) + 1
        
        return {
            'transaction_id': f'TXN_{random.randint(0, 999999):06d}',
            'customer_id': customer['customer_id'],
            'date': transaction_date.strftime('%Y-%m-%d'),
            'category': category,
            'quantity': quantity,
            'unit_price': base_price,
            'total_amount': base_price * quantity
        }

# Example usage and validation
if __name__ == "__main__":
    # Initialize generator
    generator = BeautyRetailDataGenerator(seed=42)
    
    # Generate customer profiles
    customers = generator.generate_customer_profiles(1000)
    
    # Generate transactions for the past year
    end_date = datetime.now().strftime('%Y-%m-%d')
    start_date = (datetime.now() - timedelta(days=365)).strftime('%Y-%m-%d')
    transactions = generator.generate_transactions(customers, start_date, end_date)
    
    # Basic validation
    # print("\nCustomer Profile Summary:")
    # print(customers['segment'].value_counts(normalize=True))
    # print("\nAverage Loyalty Score by Segment:")
    # print(customers.groupby('segment')['loyalty_score'].mean())
    
    # print("\nTransaction Summary:")
    # print(f"Total Transactions: {len(transactions)}")
    # print("\nAverage Transaction Amount by Category:")
    # print(transactions.groupby('category')['total_amount'].mean())t

customers

Unnamed: 0,customer_id,age,signup_date,segment,loyalty_score,email_engagement_rate
0,CUST_000000,35,2021-06-14,Mainstream,93.081185,0.211317
1,CUST_000001,28,2024-05-30,Luxury,80.355544,0.187622
2,CUST_000002,20,2024-11-23,Premium,54.239559,0.254868
3,CUST_000003,25,2020-11-17,Mainstream,89.767342,0.154536
4,CUST_000004,28,2023-06-30,Mainstream,57.621680,0.315264
...,...,...,...,...,...,...
995,CUST_000995,46,2022-08-20,Luxury,77.999895,0.387725
996,CUST_000996,38,2021-04-30,Mainstream,44.438342,0.116449
997,CUST_000997,24,2022-12-17,Premium,67.494126,0.070790
998,CUST_000998,28,2024-08-25,Mainstream,52.202432,0.088321


In [3]:
transactions

Unnamed: 0,transaction_id,customer_id,date,category,quantity,unit_price,total_amount
0,TXN_357806,CUST_000000,2024-12-11,Skincare,8,48.527557,388.220452
1,TXN_890844,CUST_000000,2024-01-21,Fragrance,4,61.553292,246.213169
2,TXN_862276,CUST_000000,2024-08-17,Haircare,8,37.358774,298.870189
3,TXN_110665,CUST_000000,2024-09-21,Skincare,4,43.358752,173.435010
4,TXN_379781,CUST_000000,2024-08-23,Haircare,4,23.248625,92.994500
...,...,...,...,...,...,...,...
26381,TXN_778123,CUST_000999,2024-02-27,Skincare,4,52.116852,208.467409
26382,TXN_542317,CUST_000999,2024-06-22,Haircare,6,49.535322,297.211932
26383,TXN_054181,CUST_000999,2024-07-07,Haircare,3,40.263499,120.790496
26384,TXN_717607,CUST_000999,2024-04-05,Makeup,4,34.409350,137.637400


In [4]:
query = """
select *
from customers c
right join transactions t
on c.customer_id = t.customer_id
"""
duckdb.query(query).df()

Unnamed: 0,customer_id,age,signup_date,segment,loyalty_score,email_engagement_rate,transaction_id,customer_id_1,date,category,quantity,unit_price,total_amount
0,CUST_000000,35,2021-06-14,Mainstream,93.081185,0.211317,TXN_357806,CUST_000000,2024-12-11,Skincare,8,48.527557,388.220452
1,CUST_000000,35,2021-06-14,Mainstream,93.081185,0.211317,TXN_890844,CUST_000000,2024-01-21,Fragrance,4,61.553292,246.213169
2,CUST_000000,35,2021-06-14,Mainstream,93.081185,0.211317,TXN_862276,CUST_000000,2024-08-17,Haircare,8,37.358774,298.870189
3,CUST_000000,35,2021-06-14,Mainstream,93.081185,0.211317,TXN_110665,CUST_000000,2024-09-21,Skincare,4,43.358752,173.435010
4,CUST_000000,35,2021-06-14,Mainstream,93.081185,0.211317,TXN_379781,CUST_000000,2024-08-23,Haircare,4,23.248625,92.994500
...,...,...,...,...,...,...,...,...,...,...,...,...,...
26381,CUST_000937,27,2023-12-02,Luxury,76.233726,0.234103,TXN_341439,CUST_000937,2024-10-06,Haircare,5,32.102439,160.512196
26382,CUST_000937,27,2023-12-02,Luxury,76.233726,0.234103,TXN_526988,CUST_000937,2024-11-29,Skincare,4,19.686930,78.747719
26383,CUST_000937,27,2023-12-02,Luxury,76.233726,0.234103,TXN_994223,CUST_000937,2024-05-30,Fragrance,7,60.590756,424.135290
26384,CUST_000937,27,2023-12-02,Luxury,76.233726,0.234103,TXN_280128,CUST_000937,2024-05-27,Makeup,5,29.185655,145.928275


In [5]:
# Generating Category, Product levels: 1 and 2 separately

# Provided product hierarchy
product_hierarchy = {
    'Skincare': {
        'Moisturizers': ['Day Cream', 'Night Cream', 'Eye Cream'],
        'Cleansers': ['Foam Cleanser', 'Oil Cleanser', 'Micellar Water'],
        'Treatments': ['Vitamin C Serum', 'Retinol', 'Hyaluronic Acid']
    },
    'Makeup': {
        'Face': ['Foundation', 'Concealer', 'Blush', 'Bronzer'],
        'Eyes': ['Mascara', 'Eyeshadow', 'Eyeliner'],
        'Lips': ['Lipstick', 'Lip Gloss', 'Lip Liner']
    },
    'Fragrance': {
        'Perfume': ['Eau de Parfum', 'Eau de Toilette'],
        'Sets': ['Gift Set', 'Travel Set']
    },
    'Haircare': {
        'Shampoo': ['Regular', 'Dry', 'Oily'],
        'Conditioner': ['Regular', 'Deep', 'Leave-in'],
        'Styling': ['Hair Oil', 'Heat Protectant', 'Hair Spray']
    }
}

# Generate DataFrame with category, product_level_1, and product_level_2
def generate_product_data(product_hierarchy, num_rows=100):
    """Generate a DataFrame with random category, product_level_1, and product_level_2."""
    data = []
    for _ in range(num_rows):
        # Randomly select a category
        category = random.choice(list(product_hierarchy.keys()))
        # Randomly select a product_level_1 (subcategory) within the category
        product_level_1 = random.choice(list(product_hierarchy[category].keys()))
        # Randomly select a product_level_2 (specific product) within the product_level_1
        product_level_2 = random.choice(product_hierarchy[category][product_level_1])
        # Append the row to the data list
        data.append({
            'category': category,
            'product_level_1': product_level_1,
            'product_level_2': product_level_2
        })
    return pd.DataFrame(data)

# Generate the DataFrame
products = generate_product_data(product_hierarchy, num_rows=100)

# Display the DataFrame
products

Unnamed: 0,category,product_level_1,product_level_2
0,Fragrance,Perfume,Eau de Parfum
1,Haircare,Styling,Hair Spray
2,Skincare,Moisturizers,Eye Cream
3,Skincare,Cleansers,Foam Cleanser
4,Makeup,Face,Bronzer
...,...,...,...
95,Haircare,Conditioner,Regular
96,Skincare,Cleansers,Oil Cleanser
97,Haircare,Conditioner,Regular
98,Fragrance,Sets,Travel Set


# Check Quality and Realism of Synthetic Data

In [6]:
class SyntheticDataValidator:
    def __init__(self):
        """Initialize expected distributions and validation thresholds."""
        self.expected_distributions = {
            'age': {
                'mean': 35,
                'std': 12,
                'range': (18, 80)
            },
            'transaction_amount': {
                'mean': 75,
                'std': 45,
                'range': (10, 500)
            },
            'purchase_frequency': {
                'mean': 2,
                'std': 1,
                'range': (0, 10)
            }
        }
        
        self.expected_ratios = {
            'customer_segments': {
                'Luxury': 0.1,
                'Premium': 0.2,
                'Mainstream': 0.5,
                'Value': 0.2
            },
            'product_categories': {
                'Skincare': 0.3,
                'Makeup': 0.35,
                'Fragrance': 0.15,
                'Haircare': 0.2
            }
        }

    def validate_distributions(self, 
                             customers: pd.DataFrame, 
                             transactions: pd.DataFrame)-> Dict:#,
                             #products: pd.DataFrame) -> Dict
        """Perform comprehensive validation of synthetic data distributions."""
        validation_results = {}
        
        # Customer Demographics Validation
        validation_results['demographics'] = self._validate_demographics(customers)
        
        # Transaction Patterns Validation
        validation_results['transactions'] = self._validate_transactions(transactions)
        
        # Product Distribution Validation
        # validation_results['products'] = self._validate_products(products)
        
        # Cross-table Relationship Validation
        validation_results['relationships'] = self._validate_relationships(
            customers, transactions#, products
        )
        
        return validation_results

    def _validate_demographics(self, customers: pd.DataFrame) -> Dict:
        """Validate customer demographic distributions."""
        results = {
            'tests': {},
            'warnings': []
        }
        
        # Age distribution test
        age_stats = stats.describe(customers['age'])
        results['tests']['age'] = {
            'ks_test': stats.kstest(
                customers['age'],
                'norm',
                args=(self.expected_distributions['age']['mean'],
                      self.expected_distributions['age']['std'])
            ),
            'mean_diff': abs(age_stats.mean - self.expected_distributions['age']['mean']),
            'std_diff': abs(np.sqrt(age_stats.variance) - 
                          self.expected_distributions['age']['std'])
        }
        
        # Segment distribution test
        segment_dist = customers['segment'].value_counts(normalize=True)
        for segment, expected_ratio in self.expected_ratios['customer_segments'].items():
            actual_ratio = segment_dist.get(segment, 0)
            if abs(actual_ratio - expected_ratio) > 0.05:
                results['warnings'].append(
                    f"Segment {segment} distribution off by "
                    f"{abs(actual_ratio - expected_ratio):.2%}"
                )
        
        # Loyalty score distribution
        loyalty_stats = stats.describe(customers['loyalty_score'])
        results['tests']['loyalty'] = {
            'mean': loyalty_stats.mean,
            'std': np.sqrt(loyalty_stats.variance),
            'skew': loyalty_stats.skewness
        }
        
        return results

    def _validate_transactions(self, transactions: pd.DataFrame) -> Dict:
        """Validate transaction patterns and distributions."""
        results = {
            'tests': {},
            'warnings': []
        }
        
        # Transaction amount distribution
        amount_stats = stats.describe(transactions['total_amount'])
        results['tests']['amount'] = {
            'ks_test': stats.kstest(
                transactions['total_amount'],
                'gamma',
                args=(2, 0, 50)  # shape, loc, scale
            ),
            'mean': amount_stats.mean,
            'std': np.sqrt(amount_stats.variance)
        }
        
        # Category distribution test
        category_dist = transactions['category'].value_counts(normalize=True)
        for category, expected_ratio in self.expected_ratios['product_categories'].items():
            actual_ratio = category_dist.get(category, 0)
            if abs(actual_ratio - expected_ratio) > 0.05:
                results['warnings'].append(
                    f"Category {category} distribution off by "
                    f"{abs(actual_ratio - expected_ratio):.2%}"
                )
        
        # Purchase frequency patterns
        purchase_freq = transactions.groupby('customer_id').size()
        freq_stats = stats.describe(purchase_freq)
        results['tests']['frequency'] = {
            'mean': freq_stats.mean,
            'std': np.sqrt(freq_stats.variance),
            'max': freq_stats.minmax[1]
        }
        
        return results

    def _validate_products(self, products: pd.DataFrame) -> Dict:
        """Validate product catalog distributions."""
        results = {
            'tests': {},
            'warnings': []
        }
        
        # Price distribution by category
        for category in products['category'].unique():
            cat_prices = products[products['category'] == category]['price']
            price_stats = stats.describe(cat_prices)
            results['tests'][f'price_{category}'] = {
                'mean': price_stats.mean,
                'std': np.sqrt(price_stats.variance),
                'skew': price_stats.skewness
            }
        
        # Brand tier distribution
        tier_dist = products['brand_tier'].value_counts(normalize=True)
        results['tests']['brand_tiers'] = dict(tier_dist)
        
        # Rating distribution
        rating_stats = stats.describe(products['rating'])
        if rating_stats.mean < 3.5 or rating_stats.mean > 4.5:
            results['warnings'].append(
                f"Average rating of {rating_stats.mean:.2f} seems unrealistic"
            )
        
        return results

    def _validate_relationships(self, 
                              customers: pd.DataFrame,
                              transactions: pd.DataFrame)-> Dict:#,
                              #products: pd.DataFrame) -> Dict:
        """Validate relationships between different tables."""
        results = {
            'tests': {},
            'warnings': []
        }
        
        # Customer segment vs price point correlation
        merged_data = transactions.merge(
            customers[['customer_id', 'segment']],
            on='customer_id'
        )#.merge(
        #     products[['product_id', 'price']],
        #     on='product_id'
        # )
        
        segment_avg_price = merged_data.groupby('segment')['unit_price'].mean()
        
        # Check if luxury customers buy more expensive products
        if not segment_avg_price['Luxury'] > segment_avg_price['Mainstream']:
            results['warnings'].append(
                "Luxury segment not showing expected premium purchase behavior"
            )
        
        # Check purchase frequency by segment
        segment_frequency = merged_data.groupby('segment')['customer_id'].count()
        results['tests']['segment_frequency'] = dict(segment_frequency)
        
        return results

    def generate_validation_report(self, validation_results: Dict) -> str:
        """Generate a human-readable validation report."""
        report = ["Synthetic Data Validation Report", "=" * 30, ""]
        
        # Demographics validation
        report.append("Customer Demographics Validation:")
        report.append("-" * 30)
        for test, result in validation_results['demographics']['tests'].items():
            report.append(f"{test}: {result}")
        report.extend(validation_results['demographics']['warnings'])
        report.append("")
        
        # Transaction validation
        report.append("Transaction Patterns Validation:")
        report.append("-" * 30)
        for test, result in validation_results['transactions']['tests'].items():
            report.append(f"{test}: {result}")
        report.extend(validation_results['transactions']['warnings'])
        report.append("")
        
        # Product validation
        # report.append("Product Catalog Validation:")
        # report.append("-" * 30)
        # for test, result in validation_results['products']['tests'].items():
        #     report.append(f"{test}: {result}")
        # report.extend(validation_results['products']['warnings'])
        
        return "\n".join(report)

# Example usage
if __name__ == "__main__":
    # Assuming you have your synthetic data generator
    # generator = BeautyRetailDataGenerator(seed=42)
    # customers = generator.generate_customer_profiles(1000)
    # products = generator.product_catalog
    # transactions = generator.generate_transactions(
    #     customers,
    #     '2024-01-01',
    #     '2024-01-31'
    # )
    
    # Validate the data
    validator = SyntheticDataValidator()
    validation_results = validator.validate_distributions(
        customers,
        transactions
        # ,
        # products
    )
    
    # Generate and print report
    report = validator.generate_validation_report(validation_results)
    print(report)

Synthetic Data Validation Report

Customer Demographics Validation:
------------------------------
age: {'ks_test': KstestResult(statistic=0.13146246127401312, pvalue=1.5727515123852372e-15, statistic_location=29, statistic_sign=1), 'mean_diff': 0.3639999999999972, 'std_diff': 0.059308891098648786}
loyalty: {'mean': 75.79595794857315, 'std': 17.38118109540282, 'skew': -0.42134067179003243}

Transaction Patterns Validation:
------------------------------
amount: {'ks_test': KstestResult(statistic=0.40998961367411735, pvalue=0.0, statistic_location=132.42237329481435, statistic_sign=-1), 'mean': 223.17208242999047, 'std': 163.73082266042226}
frequency: {'mean': 26.386, 'std': 8.395682907988576, 'max': 45}
Category Makeup distribution off by 9.86%
Category Fragrance distribution off by 9.63%
Category Haircare distribution off by 5.10%

