In [1]:
import os
from dotenv import load_dotenv

# Let python-dotenv populate the process environment first, so the lookups
# below can be satisfied by a local .env file as well as by real env vars.
load_dotenv()

# Subscript access (not .get) is deliberate: a missing key raises KeyError
# naming the absent variable instead of passing None on to the API clients.
INDOX_API_KEY = os.environ["INDOX_API_KEY"]
NVIDIA_API_KEY = os.environ["NVIDIA_API_KEY"]

In [2]:
# Schema for the simple "person" dataset.
columns = ["name", "age", "occupation"]

# Seed rows for the generator, written as positional tuples and zipped with
# `columns` so every example dict carries exactly the schema's keys, in order.
example_data = [
    dict(zip(columns, row))
    for row in (
        ("Alice Johnson", 35, "Manager"),
        ("Bob Williams", 42, "Accountant"),
    )
]

In [3]:
# Schema for the medical-billing dataset.
columns_medical = [
    "Patient ID",
    "Patient Name",
    "Diagnosis Code",
    "Procedure Code",
    "Total Charge",
    "Insurance Claim Amount",
]

# Seed examples: one free-text "Field: value" record per dict, under the key "example".
# Adjacent string literals are concatenated by the parser, so wrapping a record
# across source lines adds no newline or indentation to the text itself; each
# record stays a single clean line with every field name intact.
examples_medical = [
    {
        "example": (
            "Patient ID: 123456, Patient Name: John Doe, Diagnosis Code: J20.9, "
            "Procedure Code: 99203, Total Charge: $500, Insurance Claim Amount: $350"
        )
    },
    {
        "example": (
            "Patient ID: 789012, Patient Name: Johnson Smith, Diagnosis Code: M54.5, "
            "Procedure Code: 99213, Total Charge: $150, Insurance Claim Amount: $120"
        )
    },
    {
        "example": (
            "Patient ID: 345678, Patient Name: Emily Stone, Diagnosis Code: E11.9, "
            "Procedure Code: 99214, Total Charge: $300, Insurance Claim Amount: $250"
        )
    },
]

In [4]:
from indoxGen.llms import OpenAi, IndoxApi

# Indox API client, authenticated with the key read from the environment.
indox = IndoxApi(
    api_key=INDOX_API_KEY,
)

# Nemotron-4 340B Instruct, reached through the OpenAi wrapper pointed at
# NVIDIA's API base URL instead of the wrapper's default endpoint.
nemotron = OpenAi(
    api_key=NVIDIA_API_KEY,
    model="nvidia/nemotron-4-340b-instruct",
    base_url="https://integrate.api.nvidia.com/v1",
)

In [5]:
from indoxGen.synthCore import SyntheticDataGenerator
# Generator for the simple person dataset (name / age / occupation).
# `nemotron` is passed as the generating model and `indox` as the judge model.
generator = SyntheticDataGenerator(
    generator_llm=nemotron,
    judge_llm=indox,
    columns=columns,
    example_data=example_data,
    # The instruction refers only to attributes that exist in `columns`; asking
    # for fields the schema lacks (procedures, dates, ...) gives the model
    # conflicting guidance.
    user_instruction=(
        "Generate realistic data including name, age and occupation. "
        "Ensure a mix of common and rare occupations, culturally diverse names, "
        "and a wide range of adult ages."
    ),
    verbose=1,
)

In [6]:
# Generator for the medical-billing dataset described by `columns_medical`.
# `nemotron` is passed as the generating model and `indox` as the judge model.
medical_billing_generator = SyntheticDataGenerator(
    generator_llm=nemotron,
    judge_llm=indox,
    columns=columns_medical,
    example_data=examples_medical,
    # The instruction names every column in `columns_medical` (including
    # Procedure Code) and asks only for properties those columns can express.
    user_instruction=(
        "Generate realistic medical billing data including Patient ID, Patient Name, "
        "Diagnosis Code, Procedure Code, Total Charge, and Insurance Claim Amount. "
        "Ensure a mix of common and rare procedures and varying charge amounts "
        "typical of a healthcare provider."
    ),
    verbose=1,
)

In [7]:
# Request 20 synthetic name/age/occupation records from the person generator.
generated_data = generator.generate_data(
    num_samples=20,
)

Generated data point: {'name': 'Dr. Maya Patel', 'age': '37', 'occupation': 'Neurosurgeon'}
Generated data point: {'name': 'Capt. Jamal Al-Rashid', 'age': '36', 'occupation': 'Aircraft Pilot'}
Generated data point: {'name': 'Prof. Yuko Sato', 'age': '39', 'occupation': 'Marine Biologist'}
Generated data point: {'name': 'Rev. Carlos Mendoza', 'age': '40', 'occupation': 'Social Worker'}
Generated data point: {'name': 'Dr. Indira Patel', 'age': '37', 'occupation': 'Neurosurgeon'}
Generated data point: {'name': 'Maj. Thandiwe Nkosi', 'age': '36', 'occupation': 'Aerospace Engineer'}
Generated data point: {'name': 'Prof. Hana Yamaguchi', 'age': '39', 'occupation': 'Marine Biologist'}
Generated data point: {'name': "Rev. James O'Connell", 'age': '38', 'occupation': 'Chaplain'}
Generated data point: {'name': 'Dr. Amal Al-Mansouri', 'age': '37', 'occupation': 'Neurosurgeon'}
Generated data point: {'name': 'Sgt. Major Fatima Singh', 'age': '40', 'occupation': 'Military Officer'}
Progress: 10/20 

In [8]:
# Bare last expression: the notebook renders the generated records as this cell's output.
generated_data

Unnamed: 0,name,age,occupation
0,Dr. Maya Patel,37,Neurosurgeon
1,Capt. Jamal Al-Rashid,36,Aircraft Pilot
2,Prof. Yuko Sato,39,Marine Biologist
3,Rev. Carlos Mendoza,40,Social Worker
4,Dr. Indira Patel,37,Neurosurgeon
5,Maj. Thandiwe Nkosi,36,Aerospace Engineer
6,Prof. Hana Yamaguchi,39,Marine Biologist
7,Rev. James O'Connell,38,Chaplain
8,Dr. Amal Al-Mansouri,37,Neurosurgeon
9,Sgt. Major Fatima Singh,40,Military Officer


In [9]:
# Request 6 synthetic billing records from the medical-billing generator.
medical_billing_data = medical_billing_generator.generate_data(
    num_samples=6,
)

Generated data point: {'Patient ID': '987654', 'Patient Name': 'Olivia Brown', 'Diagnosis Code': 'I10', 'Procedure Code': '36415', 'Total Charge': '$2,500', 'Insurance Claim Amount': '$2,000'}
Generated data point: {'Patient ID': '654321', 'Patient Name': 'Michael Davis', 'Diagnosis Code': 'K21.9', 'Procedure Code': '43235', 'Total Charge': '$1,800', 'Insurance Claim Amount': '$1,500'}
Generated data point: {'Patient ID': '135792', 'Patient Name': 'Sophia Williams', 'Diagnosis Code': 'G47.33', 'Procedure Code': '92551', 'Total Charge': '$1,250', 'Insurance Claim Amount': '$1,000'}
Generated data point: {'Patient ID': '246813', 'Patient Name': 'Ava Thompson', 'Diagnosis Code': 'F32.9', 'Procedure Code': '90837', 'Total Charge': '$1,750', 'Insurance Claim Amount': '$1,400'}
Generated data point: {'Patient ID': '987654', 'Patient Name': 'Benjamin Brown', 'Diagnosis Code': 'I10', 'Procedure Code': '36.12', 'Total Charge': '$2,500', 'Insurance Claim Amount': '$2,000'}
Generated data point: 

In [10]:
# Bare last expression: the notebook renders the generated billing records as this cell's output.
medical_billing_data

Unnamed: 0,Patient ID,Patient Name,Diagnosis Code,Procedure Code,Total Charge,Insurance Claim Amount
0,987654,Olivia Brown,I10,36415.0,"$2,500","$2,000"
1,654321,Michael Davis,K21.9,43235.0,"$1,800","$1,500"
2,135792,Sophia Williams,G47.33,92551.0,"$1,250","$1,000"
3,246813,Ava Thompson,F32.9,90837.0,"$1,750","$1,400"
4,987654,Benjamin Brown,I10,36.12,"$2,500","$2,000"
5,678901,Isabella Johnson,N18.9,50.59,"$3,800","$3,200"
