In [1]:
# env: zanalytics
import polars as pl
import numpy as np
import random
from datetime import datetime, timedelta

In [2]:
# Fix the seed of every random source this notebook draws from, so that a
# fresh "Restart & Run All" regenerates exactly the same synthetic data.
random.seed(42)     # stdlib `random` (drives the random transaction dates)
np.random.seed(42)  # NumPy's legacy global RNG (drives the np.random.* draws)

In [3]:
# 1. Build the users table: one row per synthetic user.
n_users = 100
user_ids = np.arange(1, n_users + 1)  # consecutive ids 1..n_users (inclusive)
names = [f"User_{i}" for i in user_ids]

# Each user's country is drawn independently with the given weights:
# Japan and USA at 0.3 each, Germany and Brazil at 0.2 each.
countries = np.random.choice(
    ["Japan", "USA", "Germany", "Brazil"],
    size=n_users,
    p=[0.3, 0.3, 0.2, 0.2],
)

users_df = pl.DataFrame(
    {
        "user_id": user_ids,
        "name": names,
        "country": countries,
    }
)

# Preview the first rows, then persist the full table as Parquet in the
# current working directory.
display(users_df.head())
users_df.write_parquet("users.parquet")

user_id,name,country
i64,str,str
1,"""User_1""","""USA"""
2,"""User_2""","""Brazil"""
3,"""User_3""","""Germany"""
4,"""User_4""","""USA"""
5,"""User_5""","""Japan"""


In [4]:
# 2. Draw the raw columns of the transactions table.
# NOTE: the three draws below all consume NumPy's global RNG, so their order
# determines the generated values -- do not reorder them.
n_transactions = 1000

# Buyer of each transaction, sampled from the existing user ids
# (np.random.choice samples with replacement by default).
transaction_user_ids = np.random.choice(user_ids, size=n_transactions)

# Product bought in each transaction, all four products equally likely.
products = np.random.choice(
    ["TV", "Laptop", "Phone", "Camera"],
    size=n_transactions,
)

# Amounts drawn uniformly between 100 and 2000, rounded to two decimals.
amounts = np.random.uniform(100, 2000, size=n_transactions).round(2)

# Generate random dates from 2023 to 2025
def random_date(start_year=2023, end_year=2025):
    """Return a random day, as a midnight ``datetime``, within a year range.

    The result falls between January 1 of ``start_year`` and December 31 of
    ``end_year`` (both inclusive). The day offset is drawn from the stdlib
    ``random`` module's global generator, so seeding ``random`` makes the
    result reproducible.

    Args:
        start_year: First calendar year of the range.
        end_year: Last calendar year of the range.

    Returns:
        A ``datetime`` whose time-of-day component is 00:00:00.
    """
    first_day = datetime(start_year, 1, 1)
    last_day = datetime(end_year, 12, 31)
    span_days = (last_day - first_day).days
    # randint includes both endpoints, so last_day itself can be drawn.
    day_offset = random.randint(0, span_days)
    return first_day + timedelta(days=day_offset)

In [5]:
# One ISO-8601 string per transaction; random_date() is called exactly
# n_transactions times, each call consuming the stdlib `random` generator.
timestamps = [
    random_date().isoformat()
    for _ in range(n_transactions)
]

# Assemble the transactions table from the previously drawn columns.
# The timestamp column holds the ISO strings built above.
transactions_df = pl.DataFrame(
    {
        "user_id": transaction_user_ids,
        "product": products,
        "amount": amounts,
        "timestamp": timestamps,
    }
)

# Preview the first rows, then persist the full table as Parquet in the
# current working directory.
display(transactions_df.head())
transactions_df.write_parquet("transactions.parquet")

user_id,product,amount,timestamp
i64,str,f64,str
63,"""Phone""",452.37,"""2023-08-17T00:00:00"""
96,"""Phone""",977.98,"""2023-02-21T00:00:00"""
52,"""Laptop""",102.34,"""2024-07-17T00:00:00"""
96,"""Laptop""",1236.82,"""2024-05-16T00:00:00"""
4,"""Phone""",1253.88,"""2024-04-02T00:00:00"""
