In [None]:
# Package to generate fake data
from faker import Faker
import logging
import random 
import json
import yaml
from datetime import datetime, timedelta
import os
import pandas as pd

In [None]:
# Load the product catalogue from product_data.yaml.
# YAML is read as UTF-8 explicitly so non-ASCII product names decode the same
# way on every platform instead of depending on the locale's default encoding.
with open(file="product_data.yaml", mode='r', encoding="utf-8") as file:
    product_data = yaml.safe_load(file)

# The generator cells call product_data.keys() and product_data[category], so
# fail here with a clear message rather than later with an AttributeError.
if not isinstance(product_data, dict) or not product_data:
    raise ValueError("product_data.yaml must contain a non-empty mapping of category -> products")

In [None]:
# Inspect the parsed catalogue (a bare last expression is rendered as the cell output).
# The generator cells below pick a key of this mapping as the category and read
# 'name' and 'price' from the items stored under that key.
product_data

In [None]:
# Seed every random source used by the generators so that
# "Restart Kernel -> Run All" reproduces the same synthetic data.
# Set SEED to None to get fresh random data on each run.
SEED = 42
if SEED is not None:
    random.seed(SEED)   # stdlib RNG: random.randint / random.choices below
    Faker.seed(SEED)    # RNG shared by Faker instances

# Instantiate Faker object
faker = Faker()

In [None]:
# This is the data generator from Carmine's repo.
# Each iteration simulates a small burst of customers; every customer session is
# appended as one row to user_data.csv and logged as pretty-printed JSON.

# Configure logging with custom format
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S')

# Transition weights: row = current action, columns = next action (ACTIONS order).
#
#            click    view  purchase
# click        0.1     0.7     0.2
# view         0.3     0.4     0.3
# purchase     0.0     0.0     1.0
ACTIONS = ["click", "view", "purchase"]
TRANSITION_PROBABILITIES = {
    "click": [0.1, 0.7, 0.2],
    "view": [0.3, 0.4, 0.3],
    "purchase": [0.0, 0.0, 1.0]
}

csv_file_path = 'user_data.csv'
# Rows are appended one at a time, so emit the column header only when the file
# is new or empty; otherwise a header line would be repeated before every row.
write_header = not os.path.exists(csv_file_path) or os.path.getsize(csv_file_path) == 0

total_iterations = 10_000
# range() yields exactly total_iterations bursts (the previous `<=` loop ran one extra).
for iteration in range(total_iterations):
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    n_customers = random.randint(1, 10)

    for _ in range(n_customers):
        category = faker.random_element(elements=product_data.keys())
        if not product_data[category]:
            # Nothing to pick from in an empty category; skip this customer
            # (same guard as the seasonal generator further down).
            continue
        product = faker.random_element(elements=product_data[category])

        # Every session starts with a "view" and then follows the transition table
        # for at most max_user_action further steps.
        action = "view"
        actions = [action]
        max_user_action = random.randint(2, 10)
        for _step in range(max_user_action):
            action = random.choices(population=ACTIONS, weights=TRANSITION_PROBABILITIES[action])[0]
            actions.append(action)
            if action == "purchase":
                # A purchase ends the session.
                break

        data = {
            "timestamp": timestamp,
            "user_id": faker.uuid4(),
            # Wrapped in one-element lists so each value lands in a single cell
            # of the one-row DataFrame built below.
            "location": [faker.location_on_land()],
            "action": [actions],
            # Number of recorded actions, including the initial "view".
            "nr_user_action": len(actions),
            "product_name": product['name'],
            "category": category,
            "price": product['price'],
            # Only sessions that reached a purchase carry a quantity.
            "quantity": None if "purchase" not in actions else faker.random_digit_not_null()
        }
        df_data = pd.DataFrame(data=data, index=[0])
        df_data.to_csv(csv_file_path, index=False, mode='a', header=write_header)
        write_header = False  # header is written at most once per file

        json_data_incoming = json.dumps(data, indent=4)
        logging.info("INCOMING DATA:\n\n %s", json_data_incoming)
    
    







In [None]:
# This is a revised version of the above to reflect seasonality (season multiplier) and date variation.
# It regenerates user_data.csv from scratch: one row per simulated user per day
# over the last two years, with more users on days inside a seasonal event.

# Parameters
# Dates are normalised to midnight so that (a) a day compares correctly against
# the event windows, whose bounds are also at midnight, and (b) adding a random
# second-of-day offset keeps every timestamp inside its own calendar day.
_today = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
start_date = _today - timedelta(days=2 * 365)
end_date = _today
daily_user_base = 1000  # Base number of users per day
batch_size = 30  # Number of days to process in a single batch
SEASONAL_MULTIPLIER = 5  # 5x more users during events
SECONDS_PER_DAY = 24 * 60 * 60

# Seasonal events with higher data volume, as ((month, day), (month, day))
# inclusive windows. The same fixed window is reused for every year, so
# movable events are only approximated.
SEASONAL_EVENT_WINDOWS = {
    "Easter": ((4, 18), (4, 21)),
    "Amazon Prime Day": ((7, 16), (7, 17)),
    "Back to School": ((9, 1), (9, 8)),
    "Singles Day": ((11, 11), (11, 13)),
    "Black Friday": ((11, 24), (11, 28)),
    "New Year's Day": ((12, 26), (12, 31)),
}

# Expand the windows for every calendar year touched by [start_date, end_date],
# so events in the current year are boosted as well.
# Shape: event name -> list of (start, end) datetimes, one pair per year.
_event_years = range(start_date.year, end_date.year + 1)
SEASONAL_EVENTS = {
    event_name: [
        (datetime(year, *first_day), datetime(year, *last_day))
        for year in _event_years
    ]
    for event_name, (first_day, last_day) in SEASONAL_EVENT_WINDOWS.items()
}

# Transition weights: row = current action, columns = next action in the order
# click, view, purchase.
TRANSITION_PROBABILITIES = {
    "click": [0.1, 0.7, 0.2],
    "view": [0.3, 0.4, 0.3],
    "purchase": [0.0, 0.0, 1.0]
}

csv_file_path = 'user_data.csv'

# If the CSV file already exists, remove it to start fresh.
# Note: this is the same path the earlier generator cell appends to, so any rows
# written by that cell are discarded here.
if os.path.exists(csv_file_path):
    os.remove(csv_file_path)

# Generate data
current_date = start_date
all_data = []  # Store data in memory before writing in bulk
days_in_batch = 0

while current_date <= end_date:
    # A day is "seasonal" when it falls inside any window of any event (inclusive).
    in_seasonal_event = any(
        start <= current_date <= end
        for date_ranges in SEASONAL_EVENTS.values()
        for start, end in date_ranges
    )
    seasonal_multiplier = SEASONAL_MULTIPLIER if in_seasonal_event else 1

    # Calculate the number of users for the day
    n_users = daily_user_base * seasonal_multiplier

    for _user in range(n_users):
        user_id = faker.uuid4()
        location = faker.city()

        # Walk the transition table starting from "view"; only the actions taken
        # after that starting state are recorded. Stop at the first purchase.
        actions = []
        max_actions = random.randint(5, 20)  # More interactions per user
        action = "view"
        for _step in range(max_actions):
            action = random.choices(
                population=["click", "view", "purchase"],
                weights=TRANSITION_PROBABILITIES[action]
            )[0]
            actions.append(action)
            if action == "purchase":
                break

        # Randomly select a category and product
        category = faker.random_element(elements=product_data.keys())
        if not product_data[category]:
            continue  # Skip if no products in the category
        product = faker.random_element(elements=product_data[category])

        # Random moment within the current day (offset 0 .. 86399 seconds).
        event_time = current_date + timedelta(seconds=random.randrange(SECONDS_PER_DAY))

        # Add data entry
        all_data.append({
            "timestamp": event_time.strftime('%Y-%m-%d %H:%M:%S'),
            "user_id": user_id,
            "location": location,  # Ensure location is a string
            "action": ", ".join(actions),  # Convert list of actions to a string
            "category": category,
            "product_name": product["name"],  # Use the product's name
            "price": product["price"]         # Use the product's price
        })

    current_date += timedelta(days=1)
    days_in_batch += 1

    # Flush to the CSV every batch_size days, and once more after the final day,
    # so memory use stays bounded by one batch.
    if days_in_batch == batch_size or current_date > end_date:
        if all_data:
            df_data = pd.DataFrame(all_data)
            # Header only for the first write (the file was removed above).
            write_header = not os.path.exists(csv_file_path)
            df_data.to_csv(csv_file_path, index=False, mode='a', header=write_header)
            all_data = []  # Clear the batch after writing
        days_in_batch = 0

print("Data generation complete.")