# Activation

## Fake data

In [9]:
import pandas as pd
import numpy as np
from faker import Faker
from scipy.stats import nbinom, bernoulli

def create_fake_data(num_customers=100, num_records=10000, start_date='2020-01-01', end_date='2024-01-01'):
    fake = Faker()
    Faker.seed(42)
    np.random.seed(42)
    
    customer_ids = [fake.uuid4() for _ in range(num_customers)]
    
    data = []
    
    # Parameters for Zero-Inflated Negative Binomial distributions
    zinb_params = [
        {'n': 5, 'p': 0.5, 'inflation_prob': 0.2},
        {'n': 7, 'p': 0.3, 'inflation_prob': 0.1},
        {'n': 10, 'p': 0.4, 'inflation_prob': 0.3},
        {'n': 4, 'p': 0.6, 'inflation_prob': 0.25},
        {'n': 8, 'p': 0.2, 'inflation_prob': 0.15},
        {'n': 6, 'p': 0.35, 'inflation_prob': 0.2},
        {'n': 3, 'p': 0.5, 'inflation_prob': 0.1},
        {'n': 9, 'p': 0.45, 'inflation_prob': 0.3},
        {'n': 5, 'p': 0.55, 'inflation_prob': 0.25},
        {'n': 7, 'p': 0.25, 'inflation_prob': 0.2}
    ]
    
    for _ in range(num_records):
        customer_id = np.random.choice(customer_ids)
        workspace_creation_date = pd.Timestamp(fake.date_between(start_date='-2y', end_date='today'))
        t = np.random.randint(0, 1000)  # Days since workspace creation
        converted = bernoulli.rvs(0.5)  # Randomly assign conversion status (0 or 1)
        
        features = []
        for params in zinb_params:
            if bernoulli.rvs(params['inflation_prob']):
                features.append(0)
            else:
                features.append(nbinom.rvs(params['n'], 1 - params['p']))
        
        dim_1 = fake.random_int(min=1, max=5)
        dim_2 = fake.random_element(elements=('A', 'B', 'C', 'D', 'E'))
        dim_3 = fake.random_number(digits=2)
        
        data.append([t, customer_id, workspace_creation_date, converted] + features + [dim_1, dim_2, dim_3])
    
    columns = ['t', 'customer_id', 'workspace_creation_date', 'converted'] + [f'feature_{i}' for i in range(1, 11)] + ['dim_1', 'dim_2', 'dim_3']
    return pd.DataFrame(data, columns=columns)

# Generate the data
data = create_fake_data(num_customers=10, num_records=1000)
data

Unnamed: 0,t,customer_id,workspace_creation_date,converted,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,dim_1,dim_2,dim_3
0,435,b74d0fb1-32e7-4629-8fad-c1a606cb0fb3,2024-02-07,1,0,1,0,0,2,2,0,4,3,9,4,C,35
1,251,b74d0fb1-32e7-4629-8fad-c1a606cb0fb3,2022-10-19,0,1,1,0,0,1,3,5,10,6,5,3,A,11
2,347,c241330b-01a9-471f-9e8a-774bcf36d58b,2023-04-01,0,0,2,3,3,0,3,12,14,7,0,3,C,77
3,571,972a8469-1641-4f82-8b9d-2434e465e150,2023-01-07,0,0,0,15,0,2,1,0,7,0,7,1,D,68
4,927,17fc695a-07a0-4a6e-8822-e8f36c031199,2022-09-27,1,5,3,11,4,1,0,7,0,0,1,4,A,70
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,40,9a1de644-815e-46d1-bb8f-aa1837f8a88b,2023-04-14,1,0,1,0,0,0,0,6,10,4,3,1,A,65
996,605,b74d0fb1-32e7-4629-8fad-c1a606cb0fb3,2024-06-27,1,0,1,0,5,5,1,0,10,0,2,3,E,82
997,844,b74d0fb1-32e7-4629-8fad-c1a606cb0fb3,2023-09-05,0,0,1,0,10,6,1,8,9,0,5,3,C,87
998,816,b74d0fb1-32e7-4629-8fad-c1a606cb0fb3,2022-09-15,1,1,1,0,2,1,0,2,0,12,3,1,C,35
