# Simulating data

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Number of simulated rows requested from each sampling call.
N = 100_000

# Generator created with a fixed seed (42) so the draws start from a known state.
rng = np.random.default_rng(seed=42)

In [3]:
# ------------------------
# User & Account Features
# ------------------------
# Category label -> sampling probability for each categorical feature.
role_weights = {"admin": 0.35, "end_user": 0.65}
plan_weights = {"free": 0.5, "pro": 0.35, "enterprise": 0.15}
country_weights = {"US": 0.4, "EU": 0.3, "LATAM": 0.2, "APAC": 0.1}

# Sequential ids 0..N-1 (no randomness involved).
user_id = np.arange(N)

# The draws below each advance `rng`, so their order determines the output.
# `high` is exclusive: account ids fall in 1..14_999.
account_id = rng.integers(low=1, high=15_000, size=N)

user_role = rng.choice(
    list(role_weights), size=N, p=list(role_weights.values())
)
plan_type = rng.choice(
    list(plan_weights), size=N, p=list(plan_weights.values())
)
country = rng.choice(
    list(country_weights), size=N, p=list(country_weights.values())
)

# Integers in -8..9 (`high` exclusive), drawn independently of `country`.
timezone = rng.integers(low=-8, high=10, size=N)

In [4]:
# ------------------------
# Session Behavior
# ------------------------
# Gamma(shape=2.5, scale=120) draw, cast to int.
session_length_seconds = rng.gamma(2.5, 120, N).astype(int)

# Poisson count with mean 4.
pages_visited = rng.poisson(4, N)

# Exponential(scale=5) draw, cast to int.
last_seen_days_ago = rng.exponential(5, N).astype(int)

# 0/1 flag that is 1 with probability 0.25.
conversation_open = rng.binomial(n=1, p=0.25, size=N)

In [5]:
# ------------------------
# Messaging History
# ------------------------
previous_messages_sent = rng.poisson(2.2, N)

# A second Poisson draw, capped element-wise at the number of messages sent.
previous_messages_clicked = np.minimum(
    rng.poisson(1.1, N),
    previous_messages_sent
)

# Sent minus clicked, floored at 0 (clicked is already capped at sent above).
previous_messages_dismissed = np.maximum(
    previous_messages_sent - previous_messages_clicked,
    0
)

In [6]:
# ------------------------
# Intent & Message Type
# ------------------------
# Message category -> sampling probability; "none" is sampled like any other label.
message_type_weights = {
    "onboarding_tip": 0.35,
    "pricing_help": 0.25,
    "bug_help": 0.15,
    "none": 0.25,
}

message_type = rng.choice(
    list(message_type_weights),
    size=N,
    p=list(message_type_weights.values())
)

In [7]:
# ------------------------
# Derived Signals
# ------------------------
# Weighted sum of session features: session length, pages visited and an open
# conversation add to the score; days since last seen subtract from it.
intent_score = (
    session_length_seconds * 0.003
    + pages_visited * 0.25
    + conversation_open * 1.0
    - last_seen_days_ago * 0.3
)

# Count of earlier messages that were sent but not clicked.
fatigue_score = np.subtract(previous_messages_sent, previous_messages_clicked)

In [8]:
# Plan effect
# Per-row multiplier: 1.3 for enterprise, 1.1 for pro, 0.9 for every other plan.
# NOTE(review): no other cell visible in this notebook reads plan_multiplier —
# confirm whether it was meant to feed into the logits.
plan_multiplier = np.select(
    [plan_type == "enterprise", plan_type == "pro"],
    [1.3, 1.1],
    default=0.9,
)

In [9]:
# Message relevance
# The first matching rule wins; rows matching neither rule get 0.9.
pricing_for_paid_plan = (message_type == "pricing_help") & (plan_type != "free")
onboarding_for_recent_user = (message_type == "onboarding_tip") & (last_seen_days_ago < 2)

relevance = np.select(
    [pricing_for_paid_plan, onboarding_for_recent_user],
    [1.2, 1.1],
    default=0.9,
)

In [10]:
# ------------------------
# Decision: Send Message?
# ------------------------
# Standard-normal noise added on top of the deterministic part of the logit.
send_noise = rng.normal(loc=0, scale=1, size=N)
send_message_logit = intent_score * 0.8 - fatigue_score * 1.2 + send_noise

# Logistic transform to a probability, then one Bernoulli draw per row.
send_message_prob = 1 / (1 + np.exp(-send_message_logit))
send_message = rng.binomial(n=1, p=send_message_prob.clip(0, 1))

In [12]:
# ------------------------
# Outcomes
# ------------------------
def _logistic(logit):
    """Map logits to probabilities with the logistic function 1 / (1 + e^-x)."""
    return 1 / (1 + np.exp(-logit))


# NOTE(review): clicked/replied are drawn for every row, including rows where
# send_message == 0 (send_message only adds 0.5 to the click logit) — confirm
# this baseline behaviour is intended.

# Click: noise is drawn first, then one Bernoulli trial per row.
click_noise = rng.normal(loc=0, scale=1, size=N)
click_logit = (
    intent_score * 1.2
    - fatigue_score * 1.5
    + relevance * 1.0
    + send_message * 0.5
    + click_noise
)
clicked_prob = _logistic(click_logit)
clicked = rng.binomial(n=1, p=clicked_prob.clip(0, 1))

# Reply: built from the click logit (not the realised click) plus the
# open-conversation flag and fresh noise.
reply_noise = rng.normal(loc=0, scale=1, size=N)
reply_logit = click_logit * 0.8 + conversation_open * 0.6 + reply_noise
replied_prob = _logistic(reply_logit)
replied = rng.binomial(n=1, p=replied_prob.clip(0, 1))

# Resolution probability is 0.3 without a reply and 0.3 + 0.4 with one.
resolved_prob = (0.3 + 0.4 * replied).clip(0, 1)
conversation_resolved = rng.binomial(n=1, p=resolved_prob)

In [13]:
# ------------------------
# Final DataFrame
# ------------------------
# Assemble the per-row arrays into one table; keyword order fixes column order.
df = pd.DataFrame(dict(
    user_id=user_id,
    account_id=account_id,
    user_role=user_role,
    plan_type=plan_type,
    country=country,
    timezone=timezone,
    session_length_seconds=session_length_seconds,
    pages_visited=pages_visited,
    last_seen_days_ago=last_seen_days_ago,
    conversation_open=conversation_open,
    previous_messages_sent=previous_messages_sent,
    previous_messages_clicked=previous_messages_clicked,
    previous_messages_dismissed=previous_messages_dismissed,
    message_type=message_type,
    send_message=send_message,
    clicked=clicked,
    replied=replied,
    conversation_resolved=conversation_resolved,
))

# Preview the first five rows.
display(df.head(5))

Unnamed: 0,user_id,account_id,user_role,plan_type,country,timezone,session_length_seconds,pages_visited,last_seen_days_ago,conversation_open,previous_messages_sent,previous_messages_clicked,previous_messages_dismissed,message_type,send_message,clicked,replied,conversation_resolved
0,0,1339,end_user,pro,US,-1,311,5,1,0,1,1,0,bug_help,1,1,1,1
1,1,11609,end_user,pro,APAC,8,353,8,7,1,0,0,0,bug_help,1,1,1,0
2,2,9818,end_user,pro,LATAM,-3,216,4,0,0,2,1,1,none,0,1,1,1
3,3,6583,admin,pro,LATAM,-5,348,5,10,0,2,0,2,onboarding_tip,0,0,0,0
4,4,6495,end_user,pro,EU,6,301,4,13,0,1,1,0,onboarding_tip,0,0,0,1


In [14]:
# Write the simulated table to a CSV file in the working directory, without the row index.
df.to_csv(path_or_buf="simulated_data.csv", index=False)