# In [None]:
# -------------------------------------------------------------------
# Script to generate synthetic datasets for Data Engineering / Analytics
# 
# Datasets included:
#   1. orders (E-commerce Transactions)
#   2. api_logs (Web/API Logs)
#   3. financial_transactions (Financial / Fraud Transactions)
#
# Luciana Falcon
# 2025-12-01
# -------------------------------------------------------------------

import pandas as pd
import numpy as np
import os
import random
from datetime import datetime, timedelta

# ---------------------------------------------------------------------------
# Dataset 1: orders (e-commerce transactions)
#
# Builds `n_rows` synthetic orders with NumPy's seeded global generator and
# writes them to orders.csv.  The columns are drawn in a fixed order because
# every np.random call advances the seeded generator.
# ---------------------------------------------------------------------------
np.random.seed(42)

n_rows = 2000

# One city per country.  The city is looked up from the country instead of
# being sampled on its own, so a row can never pair a country with a city
# that belongs to a different country.
_CITY_BY_COUNTRY = {
    "Argentina": "Buenos Aires",
    "Chile": "Santiago",
    "Brazil": "Sao Paulo",
    "Uruguay": "Montevideo",
}

df = pd.DataFrame({
    "order_id": range(1, n_rows + 1),
    "customer_id": np.random.randint(1000, 2000, n_rows),
    "product_id": np.random.randint(500, 800, n_rows),
    # Random whole-day offset (0..364) added to 2024-01-01.
    "order_timestamp": pd.to_datetime("2024-01-01") +
                       pd.to_timedelta(np.random.randint(0, 365, n_rows), unit="D"),
    # randint's upper bound is exclusive: quantities are 1..4.
    "quantity": np.random.randint(1, 5, n_rows),
    "unit_price": np.round(np.random.uniform(5, 200, n_rows), 2),
    "currency": "USD",
    "payment_method": np.random.choice(
        ["credit_card", "debit_card", "paypal", "transfer"], n_rows),
    "order_status": np.random.choice(
        ["created", "paid", "shipped", "delivered", "cancelled"], n_rows),
    "country": np.random.choice(list(_CITY_BY_COUNTRY), n_rows),
})

# Derive the city from the country drawn for the same row.
df["city"] = df["country"].map(_CITY_BY_COUNTRY)
df["sales_channel"] = np.random.choice(["web", "mobile_app"], n_rows)
df["shipping_cost"] = np.round(np.random.uniform(2, 20, n_rows), 2)
df["discount"] = np.round(np.random.uniform(0, 30, n_rows), 2)

# Line total after discount; clipped so a discount larger than the line value
# never yields a negative amount.
df["total_amount"] = (
    df["quantity"] * df["unit_price"] - df["discount"]
).clip(lower=0)

# Generation time of this run (the only non-reproducible column).
df["created_at"] = pd.Timestamp.now()

# Preview of the first rows; has no visible effect when run as a plain script.
df.head()

carpeta = "C:/Users/wallo/OneDrive/Desktop/Web_Projects_Collection/data_science/data"
os.makedirs(carpeta, exist_ok=True)
ruta_csv = os.path.join(carpeta, "orders.csv")
df.to_csv(ruta_csv, index=False)

#-------------------------------------------------------------------
n = 2_000  # number of rows to generate for the API-log dataset

def random_ip():
    """Return a dotted-quad string made of four random octets in 0..255."""
    octets = []
    for _ in range(4):
        octets.append(str(random.randint(0, 255)))
    return ".".join(octets)

def random_endpoint():
    """Return one API route picked at random from the fixed route list."""
    routes = (
        "/api/login",
        "/api/logout",
        "/api/users",
        "/api/orders",
        "/api/products",
        "/api/cart",
        "/api/checkout",
    )
    return random.choice(routes)

def random_http_method():
    """Return one of the HTTP verbs GET, POST, PUT or DELETE at random."""
    verbs = ("GET", "POST", "PUT", "DELETE")
    return random.choice(verbs)

def random_status():
    """Return a random HTTP status code from a fixed set of 2xx/4xx/5xx codes."""
    status_codes = (200, 201, 400, 401, 403, 404, 500, 502)
    return random.choice(status_codes)

def random_user_agent():
    """Return one of four short user-agent tokens chosen at random."""
    agents = ("Mozilla/5.0", "Chrome/108.0", "Safari/605.1", "Edge/107.0")
    return random.choice(agents)

def random_country():
    """Return one of the four supported country names chosen at random."""
    countries = ("Argentina", "Chile", "Uruguay", "Brazil")
    return random.choice(countries)

def random_city(country):
    """Return a random city belonging to ``country``.

    ``country`` must be one of "Argentina", "Chile", "Uruguay" or "Brazil";
    any other value raises ``KeyError``.
    """
    cities_by_country = {
        "Argentina": ("Buenos Aires", "Cordoba", "Rosario"),
        "Chile": ("Santiago", "Valparaiso", "Concepcion"),
        "Uruguay": ("Montevideo", "Salto", "Paysandu"),
        "Brazil": ("Sao Paulo", "Rio de Janeiro", "Brasilia"),
    }
    candidates = cities_by_country[country]
    return random.choice(candidates)

# ---------------------------------------------------------------------------
# Dataset 2: api_logs (web/API request logs)
#
# Builds `n` synthetic request-log rows with the helper functions defined
# above and writes them to api_logs.csv.
# ---------------------------------------------------------------------------

# Seed the standard-library generator as well: the script only seeds NumPy,
# while every value below is drawn from `random`, so without this the dataset
# would change on every run.
random.seed(42)

# Draw each row's country once so that its city can be picked from that same
# country; a second, independent country draw for the city column would pair
# rows with cities from the wrong country.
log_countries = [random_country() for _ in range(n)]

df_logs = pd.DataFrame({
    # Random minute offset (0..525600, i.e. up to 365 days) from 2025-01-01.
    "timestamp": [datetime(2025, 1, 1) + timedelta(minutes=random.randint(0, 525600)) for _ in range(n)],
    "ip_address": [random_ip() for _ in range(n)],
    "endpoint": [random_endpoint() for _ in range(n)],
    "http_method": [random_http_method() for _ in range(n)],
    "status_code": [random_status() for _ in range(n)],
    "response_time_ms": [random.randint(10, 2000) for _ in range(n)],
    "user_agent": [random_user_agent() for _ in range(n)],
    "bytes_sent": [random.randint(100, 50000) for _ in range(n)],
    # Session and request ids are random, not unique: repeats are possible.
    "session_id": [f"sess_{random.randint(1000,9999)}" for _ in range(n)],
    "referer": [random_endpoint() for _ in range(n)],
    "country": log_countries,
    "city": [random_city(country) for country in log_countries],
    "browser": [random.choice(["Chrome", "Firefox", "Safari", "Edge"]) for _ in range(n)],
    "os": [random.choice(["Windows", "MacOS", "Linux", "Android", "iOS"]) for _ in range(n)],
    "device_type": [random.choice(["desktop", "mobile", "tablet"]) for _ in range(n)],
    "request_id": [f"req_{random.randint(100000,999999)}" for _ in range(n)],
})

# Preview of the first rows; has no visible effect when run as a plain script.
df_logs.head()

carpeta = "C:/Users/wallo/OneDrive/Desktop/Web_Projects_Collection/data_science/data"
os.makedirs(carpeta, exist_ok=True)
ruta_csv = os.path.join(carpeta, "api_logs.csv")
# Persist the API-log frame (df_logs), not the orders frame `df`.
df_logs.to_csv(ruta_csv, index=False)

#-------------------------------------------------------------------
n = 2_000  # number of rows

# Helper functions
def random_transaction_type():
    """Return a random transaction type: deposit, withdrawal, purchase or transfer."""
    transaction_types = ("deposit", "withdrawal", "purchase", "transfer")
    return random.choice(transaction_types)

def random_currency():
    """Return a random currency code among USD, EUR, ARS and BRL."""
    currency_codes = ("USD", "EUR", "ARS", "BRL")
    return random.choice(currency_codes)

def random_channel():
    """Return a random transaction channel: online, ATM or branch."""
    channels = ("online", "ATM", "branch")
    return random.choice(channels)

def random_target():
    """Return a random fraud label: 0 = not fraud, 1 = fraud."""
    labels = (0, 1)
    return random.choice(labels)

# NOTE(review): this redefines random_city with the same body as the earlier
# definition in this file; the redefinition looks redundant.
def random_city(country):
    """Return a random city located in ``country``.

    Raises ``KeyError`` when ``country`` is not Argentina, Chile, Uruguay or
    Brazil.
    """
    options_by_country = {
        "Argentina": ("Buenos Aires", "Cordoba", "Rosario"),
        "Chile": ("Santiago", "Valparaiso", "Concepcion"),
        "Uruguay": ("Montevideo", "Salto", "Paysandu"),
        "Brazil": ("Sao Paulo", "Rio de Janeiro", "Brasilia"),
    }
    city_options = options_by_country[country]
    return random.choice(city_options)

# NOTE(review): same body as the random_country defined earlier in this file;
# the redefinition looks redundant.
def random_country():
    """Return a random country name among Argentina, Chile, Uruguay and Brazil."""
    country_names = ("Argentina", "Chile", "Uruguay", "Brazil")
    return random.choice(country_names)

# ---------------------------------------------------------------------------
# Dataset 3: financial_transactions (financial / fraud transactions)
#
# Builds `n` synthetic transactions with the helper functions defined above
# and writes them to financial_transactions.csv.
# ---------------------------------------------------------------------------

# Seed the standard-library generator: the script only seeds NumPy, while
# every value below is drawn from `random`, so without this the dataset would
# change on every run.
random.seed(42)

# Draw each row's country once so that location_city is picked from the same
# country that is stored in location_country.
txn_countries = [random_country() for _ in range(n)]

# Build the DataFrame
df_finanzas = pd.DataFrame({
    "transaction_id": range(1, n + 1),
    "customer_id": [random.randint(1000, 9999) for _ in range(n)],
    "account_id": [random.randint(10000, 99999) for _ in range(n)],
    # Random whole-day offset (0..365) from 2024-01-01.
    "transaction_date": [datetime(2024, 1, 1) + timedelta(days=random.randint(0, 365)) for _ in range(n)],
    "transaction_time": [f"{random.randint(0,23):02d}:{random.randint(0,59):02d}:{random.randint(0,59):02d}" for _ in range(n)],
    "transaction_type": [random_transaction_type() for _ in range(n)],
    "amount": [round(random.uniform(10, 5000), 2) for _ in range(n)],
    # NOTE(review): balance_before and balance_after are drawn independently
    # of each other and of `amount`, so they do not reconcile - confirm
    # whether that is intended for this dataset.
    "balance_before": [round(random.uniform(100, 10000), 2) for _ in range(n)],
    "balance_after": [round(random.uniform(100, 10000), 2) for _ in range(n)],
    "currency": [random_currency() for _ in range(n)],
    "merchant": [f"Merchant_{random.randint(1,100)}" for _ in range(n)],
    "category": [random.choice(["food", "electronics", "clothing", "travel", "health"]) for _ in range(n)],
    "location_country": txn_countries,
    "location_city": [random_city(country) for country in txn_countries],
    "channel": [random_channel() for _ in range(n)],
    # Fraud label: 0 = not fraud, 1 = fraud.
    "target": [random_target() for _ in range(n)],
})

# Show the first rows; has no visible effect when run as a plain script.
df_finanzas.head()

carpeta = "C:/Users/wallo/OneDrive/Desktop/Web_Projects_Collection/data_science/data"
os.makedirs(carpeta, exist_ok=True)
ruta_csv = os.path.join(carpeta, "financial_transactions.csv")
# Persist the financial frame (df_finanzas), not the orders frame `df`.
df_finanzas.to_csv(ruta_csv, index=False)

