In [1]:
import os
from dotenv import load_dotenv

# Call load_dotenv() first so the environment lookups below run after it.
load_dotenv()

# Both keys are mandatory: a missing variable raises KeyError right here
# instead of failing later inside an API call.
INDOX_API_KEY, NVIDIA_API_KEY = (
    os.environ[key_name] for key_name in ("INDOX_API_KEY", "NVIDIA_API_KEY")
)

In [2]:
# Column schema for the simple "person" dataset.
# The second column was previously misspelled as "all"; the example rows below
# (and the generated output) use "age", so the schema must say "age" as well.
columns = ["name", "age", "occupation"]

# Seed rows shown to the generator; each dict carries exactly the keys in `columns`.
example_data = [
    {"name": "Alice Johnson", "age": 35, "occupation": "Manager"},
    {"name": "Bob Williams", "age": 42, "occupation": "Accountant"},
]

In [3]:
# Column schema for the synthetic medical-billing records.
columns_medical = [
    "Patient ID",
    "Patient Name",
    "Diagnosis Code",
    "Procedure Code",
    "Total Charge",
    "Insurance Claim Amount",
]

# Few-shot examples, one "label: value" line per record.
# Each example is assembled with implicit string concatenation rather than a
# triple-quoted literal: the triple-quoted form embedded a line break plus the
# source indentation in the text, which even split the "Diagnosis Code" label
# in the second example so it no longer matched the column name.
examples_medical = [
    {
        "example": (
            "Patient ID: 123456, Patient Name: John Doe, Diagnosis Code: J20.9, "
            "Procedure Code: 99203, Total Charge: $500, Insurance Claim Amount: $350"
        )
    },
    {
        "example": (
            "Patient ID: 789012, Patient Name: Johnson Smith, Diagnosis Code: M54.5, "
            "Procedure Code: 99213, Total Charge: $150, Insurance Claim Amount: $120"
        )
    },
    {
        "example": (
            "Patient ID: 345678, Patient Name: Emily Stone, Diagnosis Code: E11.9, "
            "Procedure Code: 99214, Total Charge: $300, Insurance Claim Amount: $250"
        )
    },
]

In [5]:
from IndoxGen.llms import OpenAi, IndoxApi

# Indox client, authenticated with the key read from the environment.
indox = IndoxApi(api_key=INDOX_API_KEY)

# Nemotron client: the OpenAi wrapper pointed at NVIDIA's endpoint through
# `base_url`, using the NVIDIA key.
nemotron = OpenAi(
    base_url="https://integrate.api.nvidia.com/v1",
    model="nvidia/nemotron-4-340b-instruct",
    api_key=NVIDIA_API_KEY,
)

In [7]:
from SynthCore import SyntheticDataGenerator
# Generator for the simple person dataset (name / age / occupation).
# Nemotron produces candidate rows and the Indox model is passed as the judge.
# The instruction previously carried leftovers from the medical-billing prompt
# ("procedures", "date ranges for age") that do not apply to this schema; it now
# only talks about the three columns actually being generated.
generator = SyntheticDataGenerator(
    generator_llm=nemotron,
    judge_llm=indox,
    columns=columns,
    example_data=example_data,
    user_instruction=(
        "Generate realistic data including name, age and occupation. "
        "Ensure a mix of common and rare occupations, varied names, "
        "and ages that are plausible for each occupation."
    ),
    verbose=1,
)

In [8]:
# Generator for the medical-billing dataset: same generator/judge pair as the
# person dataset, but with the billing schema and its few-shot examples.
medical_billing_generator = SyntheticDataGenerator(
    columns=columns_medical,
    example_data=examples_medical,
    generator_llm=nemotron,
    judge_llm=indox,
    # Long prompt split across lines; the pieces concatenate to one string.
    user_instruction=(
        "Generate realistic medical billing data including patient IDs, Patient Name, "
        "diagnosis codes, Total Charge, and Insurance Claim Amount. "
        "Ensure a mix of common and rare procedures, varying charge amounts, "
        "and appropriate date ranges for a typical healthcare provider."
    ),
    verbose=1,
)

In [9]:
# Request 20 synthetic person records from the name/age/occupation generator.
# NOTE(review): the saved output below prints each generated row and the judge
# feedback; presumably this is driven by verbose=1 on the generator — confirm.
generated_data = generator.generate_data(num_samples=20)

Generated data point: {'name': 'Dr. Maya Patel', 'age': '36', 'occupation': 'Neurosurgeon'}
Generated data point: {'name': 'Capt. James H.нныесм Благодаря', 'age': '39', 'occupation': 'Aircraft Pilot'}
Generated data point: {'name': 'Prof. Eduardo Miguel Sánchez', 'age': '40', 'occupation': 'Marine Biologist'}
Feedback for generator: Generated data: {"name": "Sgt. clenamaae dw gebruiksrech v Makohe funktier ballon n\u0632\u0645\u0627\u062a zangtal Riesling \u0646 th\u1ea7n Saito \u05e0\u05ea\u05d9 caja scris wassail Gonzalez Park \u062a\u0647\u0627\u6bdb attori tavolo \u0643\u0645\u0633\u064a Schriftsteller humaym \u0643\u0641\u064a Niraj Soriano \ud55c alanine \u0648\u0631\u062f \u062a\u064a\u0660\u4e00\u79cd propozycj \u05d1\u05d9\u05dc\u6bd4 \u9a6c \u0447\u0442\u043e Impro \u0633\u0645\u0629 \u0111\u01b0\u1ee3c \u03c4\u03bf\u10ec\u0bcd \u0647\u0627\u0660zasYP \u09b6 Phe \u0411\u0430\u0430", "age": "37", "occupation": "Cardiothoracic Surgeon"}
Score: 0.2
Reason: Low score
Feedback fo

In [10]:
# Display the generated person records as the cell output.
generated_data

Unnamed: 0,name,age,occupation
0,Dr. Maya Patel,36,Neurosurgeon
1,Capt. James H.нныесм Благодаря,39,Aircraft Pilot
2,Prof. Eduardo Miguel Sánchez,40,Marine Biologist
3,Rev. Dr. Jameela Abdul-Rahman,37,Theologian and Astrophysicist
4,"Dr. Maya Habib, MPH",36,Epidemiologist and Data Scientist
5,Capt. Awa Ning труб Corona,39,Aerospace Engineer andws balconies design spec...
6,HajjaтугOpen Sofíа,40,Neurosurgeon and Indigenous Rights Advocate
7,Prof. Ysetzenfa воск Belán,38,Marine Biologist and Ethnomusicologist
8,Dr. Mahστοιχείο monitoramento Jagirdar,36,Environmental Scientist and Wheelchair Basketb...
9,Dr. María Nguyn estadual amlàire,39,Pediatric Surgeon and Flamenco Dancer


In [11]:
# Request 6 synthetic medical-billing records from the billing generator.
medical_billing_data = medical_billing_generator.generate_data(num_samples=6)

Generated data point: {'Patient ID': '987654', 'Patient Name': 'Michael Johnson', 'Diagnosis Code': 'K21.9', 'Procedure Code': '43210', 'Total Charge': '$850', 'Insurance Claim Amount': '$600'}
Generated data point: {'Patient ID': '654321', 'Patient Name': 'Olivia Brown', 'Diagnosis Code': 'I10', 'Procedure Code': '36010', 'Total Charge': '$1,200', 'Insurance Claim Amount': '$800'}
Generated data point: {'Patient ID': '135792', 'Patient Name': 'Sophia Garcia', 'Diagnosis Code': 'F32.9', 'Procedure Code': '90832', 'Total Charge': '$2,500', 'Insurance Claim Amount': '$1,750'}
Generated data point: {'Patient ID': '2468AC', 'Patient Name': 'Noah Adams', 'Diagnosis Code': 'M17.1', 'Procedure Code': '29876', 'Total Charge': '$1,800', 'Insurance Claim Amount': '$1,300'}
Generated data point: {'Patient ID': '9876zy', 'Patient Name': 'Lucas Patel', 'Diagnosis Code': 'G47.33', 'Procedure Code': '64535', 'Total Charge': '$3,750', 'Insurance Claim Amount': '$2,600'}
Generated data point: {'Patient

In [12]:
# Display the generated medical-billing records as the cell output.
medical_billing_data

Unnamed: 0,Patient ID,Patient Name,Diagnosis Code,Procedure Code,Total Charge,Insurance Claim Amount
0,987654,Michael Johnson,K21.9,43210,$850,$600
1,654321,Olivia Brown,I10,36010,"$1,200",$800
2,135792,Sophia Garcia,F32.9,90832,"$2,500","$1,750"
3,2468AC,Noah Adams,M17.1,29876,"$1,800","$1,300"
4,9876zy,Lucas Patel,G47.33,64535,"$3,750","$2,600"
5,QRST6789,Amelia Harris,I10,36010,"$1,200",$800


In [15]:
# Inspect the feedback history kept on the billing generator
# (an empty list in the saved output of this run).
medical_billing_generator.feedback_history

[]