## Install Dependencies

Install the required dependencies to execute this notebook.

In [1]:
# Install/upgrade the NeMo Microservices SDK (with the data-designer extra),
# python-dotenv, and pandas. `-qqq` keeps pip's output quiet.
# NOTE(review): versions are unpinned and `--upgrade` pulls the latest release;
# consider pinning exact versions so the notebook stays reproducible.
%pip install --upgrade "nemo-microservices[data-designer]" python-dotenv pandas -qqq


Note: you may need to restart the kernel to use updated packages.


## Configure Data Designer

Load our NVIDIA API Key (available from `https://build.nvidia.com/`), import required libraries, and configure our base API URL endpoint.

> We assume you have an API key called `NVIDIA_API_KEY` in a `.env` file in the same directory as this notebook. 

In [2]:
import os
from dotenv import load_dotenv
import pandas as pd

from nemo_microservices.data_designer.essentials import (
    CategorySamplerParams,
    DataDesignerConfigBuilder,
    LLMTextColumnConfig,
    NeMoDataDesignerClient,
    PersonSamplerParams,
    SamplerColumnConfig,
    SamplerType,
    SubcategorySamplerParams,
    UniformSamplerParams,
)

# Load variables from a .env file (if one is found) into the process environment,
# then read the NVIDIA API key from it.
load_dotenv()
api_key = os.getenv("NVIDIA_API_KEY")

# Fail fast when the key is missing: otherwise the Authorization header below
# would literally be "Bearer None" and the failure would only surface later as
# an opaque authentication error from the service.
if not api_key:
    raise RuntimeError(
        "NVIDIA_API_KEY is not set. Add it to a .env file next to this notebook "
        "or export it in your environment (keys are available from "
        "https://build.nvidia.com/)."
    )

# Initialize the hosted NeMo Data Designer client; the key is sent as a bearer token.
data_designer_client = NeMoDataDesignerClient(
    base_url="https://ai.api.nvidia.com/v1/nemo/dd",
    default_headers={"Authorization": f"Bearer {api_key}"},
)

# Model alias referenced by every LLM-generated text column in this notebook.
model_alias = "nemotron-nano-v2"


## Define Data Schema

Data Designer supports generating a single table at a time. We have chosen to create a larger omnitable containing all of the desired data, and then split that table into many smaller tables in a post-processing step.

Each group of columns is segmented by the name of the eventual target table to which it will be written.

You could generate each table as a dedicated operation if you wished to do so.

In [3]:
config_builder = DataDesignerConfigBuilder()

# Company: three category-sampled attributes, followed by an LLM-generated name
# whose prompt references the sampled segment and region.
company_id_pool = [f"COMP-{n:03d}" for n in range(1, 51)]  # COMP-001 .. COMP-050

company_sampler_columns = [
    SamplerColumnConfig(
        name="company_id",
        sampler_type=SamplerType.CATEGORY,
        params=CategorySamplerParams(values=company_id_pool),
    ),
    SamplerColumnConfig(
        name="company_segment",
        sampler_type=SamplerType.CATEGORY,
        params=CategorySamplerParams(
            values=["SMB", "Mid-Market", "Enterprise"],
            weights=[3, 2, 1],
        ),
    ),
    SamplerColumnConfig(
        name="company_region",
        sampler_type=SamplerType.CATEGORY,
        params=CategorySamplerParams(
            values=["North America", "Europe", "APAC", "LATAM"]
        ),
    ),
]
for company_column in company_sampler_columns:
    config_builder.add_column(company_column)

company_name_column = LLMTextColumnConfig(
    name="company_name",
    prompt=(
        "Generate a realistic B2B SaaS company name operating in the {{ company_segment }} "
        "segment in {{ company_region }}. Respond with only the company name."
    ),
    system_prompt=(
        "You are a helpful assistant that generates realistic but fictional company names for "
        "B2B SaaS businesses. Respond with only the company name, no quotes or extra text."
    ),
    model_alias=model_alias,
)
config_builder.add_column(company_name_column)

# User: an identifier drawn from a pool of 1,000 IDs (USER-0001 .. USER-1000),
# a role, and a person profile restricted to ages 22-65.
user_sampler_columns = [
    SamplerColumnConfig(
        name="user_id",
        sampler_type=SamplerType.CATEGORY,
        params=CategorySamplerParams(
            values=[f"USER-{n:04d}" for n in range(1, 1001)]
        ),
    ),
    SamplerColumnConfig(
        name="user_role",
        sampler_type=SamplerType.CATEGORY,
        params=CategorySamplerParams(
            values=["admin", "manager", "individual_contributor", "viewer"]
        ),
    ),
    SamplerColumnConfig(
        name="user_profile",
        sampler_type=SamplerType.PERSON,
        params=PersonSamplerParams(age_range=[22, 65]),
    ),
]
for user_column in user_sampler_columns:
    config_builder.add_column(user_column)

# Subscription: an identifier drawn from a pool of 5,000 possible IDs
# (SUB-00001 .. SUB-05000), plus a weighted plan tier and billing period.
subscription_id_pool = [f"SUB-{n:05d}" for n in range(1, 5001)]

subscription_sampler_columns = [
    SamplerColumnConfig(
        name="subscription_id",
        sampler_type=SamplerType.CATEGORY,
        params=CategorySamplerParams(values=subscription_id_pool),
    ),
    SamplerColumnConfig(
        name="plan_tier",
        sampler_type=SamplerType.CATEGORY,
        params=CategorySamplerParams(
            values=["Free", "Starter", "Growth", "Scale", "Enterprise"],
            weights=[1, 3, 3, 2, 1],
        ),
    ),
    SamplerColumnConfig(
        name="billing_period",
        sampler_type=SamplerType.CATEGORY,
        params=CategorySamplerParams(
            values=["monthly", "annual"],
            weights=[3, 2],
        ),
    ),
]
for subscription_column in subscription_sampler_columns:
    config_builder.add_column(subscription_column)

# Invoice: an identifier drawn from a pool of 10,000 IDs (INV-000001 .. INV-010000),
# a weighted status, and a uniformly sampled amount between 50 and 5000.
invoice_sampler_columns = [
    SamplerColumnConfig(
        name="invoice_id",
        sampler_type=SamplerType.CATEGORY,
        params=CategorySamplerParams(
            values=[f"INV-{n:06d}" for n in range(1, 10001)]
        ),
    ),
    SamplerColumnConfig(
        name="invoice_status",
        sampler_type=SamplerType.CATEGORY,
        params=CategorySamplerParams(
            values=["paid", "unpaid", "overdue", "refunded"],
            weights=[6, 2, 1, 1],
        ),
    ),
    SamplerColumnConfig(
        name="invoice_amount_usd",
        sampler_type=SamplerType.UNIFORM,
        params=UniformSamplerParams(low=50, high=5000),
    ),
]
for invoice_column in invoice_sampler_columns:
    config_builder.add_column(invoice_column)

# Events: an identifier drawn from a pool of 50,000 IDs (EVT-0000001 .. EVT-0050000),
# an event type, the channel it occurred on, and an LLM-written log line.
config_builder.add_column(
    SamplerColumnConfig(
        name="event_id",
        sampler_type=SamplerType.CATEGORY,
        params=CategorySamplerParams(
            values=[f"EVT-{i:07d}" for i in range(1, 50001)]
        ),
    )
)

config_builder.add_column(
    SamplerColumnConfig(
        name="event_type",
        sampler_type=SamplerType.CATEGORY,
        params=CategorySamplerParams(
            values=[
                "signup",
                "login",
                "feature_use",
                "admin_action",
                "billing_page_view",
                "cancel_attempt",
            ]
        ),
    )
)

config_builder.add_column(
    SamplerColumnConfig(
        name="event_channel",
        sampler_type=SamplerType.CATEGORY,
        params=CategorySamplerParams(
            values=["web_app", "mobile_app", "api", "support"],
        ),
    )
)

# The prompt pulls in columns defined earlier (user role, company name, plan tier,
# billing period) together with the event type and channel sampled above.
# The sentence-count range uses a real en dash; the previous text was mis-encoded
# and would have been sent to the model as garbage characters.
config_builder.add_column(
    LLMTextColumnConfig(
        name="event_description",
        prompt=(
            "You are describing a single SaaS product event. The user with role {{ user_role }} at "
            "{{ company_name }} (plan tier {{ plan_tier }}, billed {{ billing_period }}) performed "
            "an event of type '{{ event_type }}' via '{{ event_channel }}'. "
            "Describe this as a short event log line (1–2 sentences)."
        ),
        model_alias=model_alias,
    )
)


## Generate Preview Data

The `preview` function allows us to probe Data Designer for a certain number of samples. We've chosen 100 samples here.

In [None]:
# Ask Data Designer for a small preview batch of synthetic SaaS event records.
preview_record_count = 100
preview = data_designer_client.preview(
    config_builder,
    num_records=preview_record_count,
)

# Keep the generated rows (the "omnitable") for the splitting step and show the first few.
events_df = preview.dataset
events_df.head()


[13:37:30] [INFO] ✅ Validation passed
[13:37:30] [INFO] 🚀 Starting preview generation
[13:37:32] [INFO] ⛓️ Sorting column configs into a Directed Acyclic Graph
[13:37:32] [INFO] 🩺 Running health checks for models...
[13:37:34] [INFO]   |-- 👀 Checking 'nvidia/nvidia-nemotron-nano-9b-v2'...
[13:37:34] [INFO]   |-- ✅ Passed!
[13:37:36] [INFO]   |-- 👀 Checking 'nvidia/llama-3.3-nemotron-super-49b-v1.5'...
[13:37:36] [INFO]   |-- ✅ Passed!
[13:37:37] [INFO]   |-- 👀 Checking 'mistralai/mistral-small-24b-instruct'...
[13:37:37] [INFO]   |-- ✅ Passed!
[13:37:37] [INFO]   |-- 👀 Checking 'openai/gpt-oss-20b'...
[13:37:37] [INFO]   |-- ✅ Passed!
[13:37:39] [INFO]   |-- 👀 Checking 'openai/gpt-oss-120b'...
[13:37:39] [INFO]   |-- ✅ Passed!
[13:37:40] [INFO]   |-- 👀 Checking 'meta/llama-4-scout-17b-16e-instruct'...
[13:37:40] [INFO]   |-- ✅ Passed!
[13:37:40] [INFO] ⏳ Processing batch 1 of 1
[13:37:40] [INFO] 🎲 Preparing samplers to generate 100 records

Unnamed: 0,company_id,company_segment,company_region,user_id,user_role,user_profile,subscription_id,plan_tier,billing_period,invoice_id,invoice_status,invoice_amount_usd,event_id,event_type,event_channel,company_name,company_name__reasoning_trace,event_description,event_description__reasoning_trace
0,COMP-037,Enterprise,North America,USER-0248,admin,"{'age': 57, 'bachelors_field': 'no degree', 'b...",SUB-00461,Growth,monthly,INV-000694,unpaid,510.540892,EVT-0010197,admin_action,web_app,EnterprisePrime,"Okay, the user wants a realistic B2B SaaS comp...",Admin user from EnterprisePrime on the Growth ...,"Okay, let's tackle this query. The user wants ..."
1,COMP-003,Mid-Market,LATAM,USER-0169,admin,"{'age': 32, 'bachelors_field': 'business', 'bi...",SUB-04103,Scale,monthly,INV-004516,unpaid,2653.049858,EVT-0040227,login,support,NubeCrecimiento,"Okay, the user wants a realistic B2B SaaS comp...","Admin user at NubeCrecimiento (Scale plan, bil...","Okay, let's tackle this query. The user wants ..."
2,COMP-040,Enterprise,LATAM,USER-0296,viewer,"{'age': 34, 'bachelors_field': 'no degree', 'b...",SUB-01543,Growth,annual,INV-005508,unpaid,922.032216,EVT-0007361,signup,web_app,AstraSphere,"Okay, the user wants a realistic B2B SaaS comp...",User with role 'viewer' at AstraSphere signed ...,"Okay, let's tackle this query. The user wants ..."
3,COMP-005,Mid-Market,Europe,USER-0342,viewer,"{'age': 42, 'bachelors_field': 'education', 'b...",SUB-02448,Enterprise,annual,INV-000750,paid,2403.899953,EVT-0027114,signup,api,NexaCore Solutions,"Okay, the user wants a realistic B2B SaaS comp...",Viewer user at NexaCore Solutions (Enterprise ...,"Okay, let's tackle this query. The user wants ..."
4,COMP-032,Enterprise,Europe,USER-0166,individual_contributor,"{'age': 31, 'bachelors_field': 'no degree', 'b...",SUB-02851,Starter,annual,INV-007150,overdue,67.55489,EVT-0018708,admin_action,mobile_app,AstraGlobal Solutions,"Okay, the user wants a realistic B2B SaaS comp...",An individual_contributor at AstraGlobal Solut...,"Okay, let's tackle this query. The user wants ..."


## Split Omnitable into Many Tables

Our data generated with Data Designer has been output as a `pandas` dataframe. This is an easily transformable data format that enables us to split our monolithic omnitable into several smaller tables relatively simply.

In [None]:
def _split_table(source_df: pd.DataFrame, columns: list, key: str) -> pd.DataFrame:
    """Project `columns` out of `source_df`, keeping one row per `key` value.

    Args:
        source_df: The wide omnitable to split.
        columns: Column names that make up the target table (must include `key`).
        key: Column acting as the target table's identifier.

    Returns:
        A new DataFrame with a fresh 0..n-1 index. When a key value occurs on
        several rows, only its first occurrence is kept.
    """
    return (
        source_df[columns]
        .drop_duplicates(subset=[key])
        .reset_index(drop=True)
    )


# Companies
companies = _split_table(
    events_df,
    ["company_id", "company_name", "company_segment", "company_region"],
    key="company_id",
)

# Users
users = _split_table(
    events_df,
    ["user_id", "company_id", "user_role", "user_profile"],
    key="user_id",
)

# Subscriptions
subscriptions = _split_table(
    events_df,
    ["subscription_id", "company_id", "plan_tier", "billing_period"],
    key="subscription_id",
)

# Invoices
invoices = _split_table(
    events_df,
    ["invoice_id", "subscription_id", "invoice_status", "invoice_amount_usd"],
    key="invoice_id",
)

# Product events
# event_id values are sampled from a finite pool, so the same ID can appear on
# more than one row; de-duplicate here as well so event_id is unique like the
# identifiers of the other tables.
product_events = _split_table(
    events_df,
    [
        "event_id",
        "company_id",
        "user_id",
        "subscription_id",
        "invoice_id",
        "event_type",
        "event_channel",
        "event_description",
    ],
    key="event_id",
)

companies.head(), users.head(), subscriptions.head(), invoices.head(), product_events.head()


(  company_id           company_name company_segment company_region
 0   COMP-037        EnterprisePrime      Enterprise  North America
 1   COMP-003        NubeCrecimiento      Mid-Market          LATAM
 2   COMP-040            AstraSphere      Enterprise          LATAM
 3   COMP-005     NexaCore Solutions      Mid-Market         Europe
 4   COMP-032  AstraGlobal Solutions      Enterprise         Europe,
      user_id company_id               user_role  \
 0  USER-0248   COMP-037                   admin   
 1  USER-0169   COMP-003                   admin   
 2  USER-0296   COMP-040                  viewer   
 3  USER-0342   COMP-005                  viewer   
 4  USER-0166   COMP-032  individual_contributor   
 
                                         user_profile  
 0  {'age': 57, 'bachelors_field': 'no degree', 'b...  
 1  {'age': 32, 'bachelors_field': 'business', 'bi...  
 2  {'age': 34, 'bachelors_field': 'no degree', 'b...  
 3  {'age': 42, 'bachelors_field': 'education', 'b...

## Save Data

Now that the data is split, we can persist it as a collection of CSV files for analysis with the MOSTLY AI Assistant.

In [9]:
import os

# Ensure the ./data output directory exists (no error if it is already there).
os.makedirs("./data", exist_ok=True)

# Write each split table to its own CSV file, leaving out the DataFrame index.
csv_targets = {
    "./data/companies.csv": companies,
    "./data/users.csv": users,
    "./data/subscriptions.csv": subscriptions,
    "./data/invoices.csv": invoices,
    "./data/product_events.csv": product_events,
}
for csv_path, table in csv_targets.items():
    table.to_csv(csv_path, index=False)
