In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

random.seed(42)
np.random.seed(42)

# Parameters
n_cases = 3333
start_date = datetime(2025, 1, 1, 8, 0, 0)

activities = [
    "Receber Pedido",
    "Validar Dados",
    "Analisar Crédito",
    "Aprovar Pedido",
    "Revisar Documentação",  # atividade opcional (variant)
    "Finalizar",
]

resources = ["Marina", "Pedro"]

# Produtividade dos recursos (multiplicador de duração)
resource_speed = {
    "Pedro": 10,  # muito lento → outlier extremo
    "Marina": 1.3,
}

# Backlog base para atividades (simula carga do processo)
base_backlog = {
    "Validar Dados": 1.0,
    "Analisar Crédito": 1.2,
    "Aprovar Pedido": 30,
}

rows = []

for case_id in range(1, n_cases + 1):

    # --------------------------
    # 1. START DATE
    # --------------------------
    start_offset_days = np.random.randint(0, 30)
    start_offset_minutes = np.random.randint(0, 12 * 60)
    current_time = start_date + timedelta(
        days=start_offset_days, minutes=start_offset_minutes
    )

    # Informação contextual
    hour = current_time.hour
    is_bad_day = np.random.rand() < 0.05  # 5% dos casos → tudo mais lento

    # --------------------------
    # 2. RECEBER PEDIDO
    resource_rp = random.choice(resources)
    rows.append(
        {
            "case_id": case_id,
            "activity": "Receber Pedido",
            "timestamp": current_time,
            "resource": resource_rp,
            "cost": round(np.random.normal(120, 15), 2),
        }
    )

    # ---------------------------------------------------
    # 3. VALIDAR DADOS (1–4 vezes, dependendo do backlog)
    # ---------------------------------------------------
    base_rework = np.random.choice([1, 2, 3, 4], p=[0.5, 0.3, 0.15, 0.05])
    for _ in range(base_rework):

        # backlog adiciona tempo extra
        backlog_factor = base_backlog["Validar Dados"] * np.random.uniform(0.8, 2.0)

        # recurso escolhido
        r = random.choice(resources)
        speed_factor = resource_speed[r]

        # turno influencia velocidade
        shift_factor = 1.2 if hour > 18 or hour < 7 else 1.0

        # dia ruim
        bad_day_factor = 1.5 if is_bad_day else 1.0

        dur_min = np.random.randint(20, 80)
        total_delay = (
            dur_min * backlog_factor * speed_factor * shift_factor * bad_day_factor
        )

        current_time += timedelta(minutes=total_delay)

        rows.append(
            {
                "case_id": case_id,
                "activity": "Validar Dados",
                "timestamp": current_time,
                "resource": r,
                "cost": round(np.random.normal(80, 10), 2),
            }
        )

    # -------------------------------
    # 4. ANALISAR CRÉDITO (1–2 vezes)
    # -------------------------------
    for _ in range(np.random.choice([1, 2], p=[0.85, 0.15])):

        backlog_factor = base_backlog["Analisar Crédito"] * np.random.uniform(0.7, 1.8)
        r = random.choice(resources)
        speed_factor = resource_speed[r]
        shift_factor = 1.2 if current_time.hour > 18 or current_time.hour < 7 else 1.0
        bad_day_factor = 1.5 if is_bad_day else 1.0

        dur_min = np.random.randint(30, 120)
        total_delay = (
            dur_min * backlog_factor * speed_factor * shift_factor * bad_day_factor
        )

        current_time += timedelta(minutes=total_delay)

        rows.append(
            {
                "case_id": case_id,
                "activity": "Analisar Crédito",
                "timestamp": current_time,
                "resource": r,
                "cost": round(np.random.normal(90, 12), 2),
            }
        )

    # -------------------------------
    # 5. REVISAR DOCUMENTAÇÃO (20% dos casos)
    # -------------------------------
    if np.random.rand() < 0.20:

        r = random.choice(resources)
        backlog_factor = np.random.uniform(1.0, 2.5)
        speed_factor = resource_speed[r]
        bad_day_factor = 1.5 if is_bad_day else 1.0

        dur_min = np.random.randint(40, 120)
        total_delay = dur_min * backlog_factor * speed_factor * bad_day_factor

        current_time += timedelta(minutes=total_delay)

        rows.append(
            {
                "case_id": case_id,
                "activity": "Revisar Documentação",
                "timestamp": current_time,
                "resource": r,
                "cost": round(np.random.normal(70, 10), 2),
            }
        )

    # -------------------------------
    # 6. APROVAR PEDIDO (90%)
    # -------------------------------
    if np.random.rand() < 0.9:

        r = random.choice(resources)
        backlog_factor = base_backlog["Aprovar Pedido"] * np.random.uniform(0.7, 1.8)
        speed_factor = resource_speed[r]
        shift_factor = 1.2 if current_time.hour > 18 or current_time.hour < 7 else 1.0

        dur_min = np.random.randint(30, 180)
        total_delay = dur_min * backlog_factor * speed_factor * shift_factor

        current_time += timedelta(minutes=total_delay)

        rows.append(
            {
                "case_id": case_id,
                "activity": "Aprovar Pedido",
                "timestamp": current_time,
                "resource": r,
                "cost": round(np.random.normal(60, 8), 2),
            }
        )

    # -------------------------------
    # 7. FINALIZAR (com atraso controlado)
    # -------------------------------
    if np.random.rand() < 0.20:
        # Espera longa intencional
        wait_hours = np.random.randint(24, 72)
        current_time += timedelta(hours=wait_hours)
    else:
        wait_min = np.random.randint(60, 300)
        current_time += timedelta(minutes=wait_min)

    r = random.choice(resources)

    rows.append(
        {
            "case_id": case_id,
            "activity": "Finalizar",
            "timestamp": current_time,
            "resource": r,
            "cost": round(np.random.normal(40, 6), 2),
        }
    )

# Build DataFrame
df_log = pd.DataFrame(rows)
df_log["timestamp"] = df_log["timestamp"].dt.strftime("%Y-%m-%d %H:%M:%S")

# Save to CSV
file_path = f"../raw/event_log_sintetico_{n_cases}_cases.csv"
df_log.to_csv(file_path, index=False)

df_log.head(), df_log.tail(), file_path

(   case_id          activity            timestamp resource    cost
 0        1    Receber Pedido  2025-01-07 15:15:00   Marina  129.72
 1        1     Validar Dados  2025-01-07 15:53:30   Marina   95.23
 2        1  Analisar Crédito  2025-01-08 03:12:09    Pedro   84.21
 3        1         Finalizar  2025-01-09 23:12:09   Marina   40.98
 4        2    Receber Pedido  2025-01-01 15:39:00   Marina   83.41,
        case_id          activity            timestamp resource   cost
 20002     3333     Validar Dados  2025-01-20 03:53:04    Pedro  84.37
 20003     3333     Validar Dados  2025-01-20 04:41:59   Marina  73.94
 20004     3333  Analisar Crédito  2025-01-21 12:46:22    Pedro  95.31
 20005     3333    Aprovar Pedido  2025-03-12 05:20:04    Pedro  54.35
 20006     3333         Finalizar  2025-03-12 08:25:04    Pedro  50.22,
 '../raw/event_log_sintetico_3333_cases.csv')