In [1]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file (python-dotenv) before reading them.
load_dotenv()

# Required credentials, looked up in order. A missing variable raises KeyError
# here rather than surfacing later as an authentication failure.
INDOX_API_KEY, OPENAI_API_KEY, NVIDIA_API_KEY = (
    os.environ[variable_name]
    for variable_name in ("INDOX_API_KEY", "OPENAI_API_KEY", "NVIDIA_API_KEY")
)

In [2]:

from models import OpenAi, IndoxApi

# Indox-hosted model client (passed as the judge LLM further down).
indox = IndoxApi(api_key=INDOX_API_KEY)

# Nemotron is reached by pointing the OpenAi wrapper at NVIDIA's endpoint via
# base_url (passed as the generator LLM further down).
# Get api key from NVIDIA (https://build.nvidia.com/nvidia/nemotron-4-340b-instruct?api_key=true&)
NEMOTRON_MODEL = "nvidia/nemotron-4-340b-instruct"
NVIDIA_BASE_URL = "https://integrate.api.nvidia.com/v1"
nemotron = OpenAi(
    api_key=NVIDIA_API_KEY,
    model=NEMOTRON_MODEL,
    base_url=NVIDIA_BASE_URL,
)

In [3]:
# nemotron_response = nemotron.chat(prompt="Write a limerick about the wonders of GPU computing.",
#                                   system_prompt="You are a helpful assistant designed to generate synthetic data.",
#                                   stream=True, temperature=0.2, top_p=0.7)
# print(nemotron_response)

In [4]:
# indox_response = indox.chat(prompt="Write a limerick about the wonders of GPU computing.",
#                             system_prompt="You are a helpful assistant designed to generate synthetic data.",
#                             stream=True, temperature=0.2, top_p=0.7)
# print(indox_response)

In [5]:
from lsynth.synthetic_data_generation import SyntheticDataGenerator

In [6]:
# Schema and seed records for the simple name/age/occupation demo.
columns = ["name", "age", "occupation"]

# One tuple per seed record, in the same order as `columns`.
example_rows = [
    ("Alice Johnson", 35, "Manager"),
    ("Bob Williams", 42, "Accountant"),
]

# Pair each value with its column name to get one dict per record.
example_data = [dict(zip(columns, row)) for row in example_rows]

In [7]:
# Column schema for the synthetic medical-billing records.
columns_medical = [
    "Patient ID",
    "Patient Name",
    "Diagnosis Code",
    "Procedure Code",
    "Total Charge",
    "Insurance Claim Amount",
]

# Example records handed to the generator as `example_data`.
# Each example is built from adjacent string literals, which Python joins at
# compile time. This keeps the source lines short without leaking a newline and
# source indentation into the example text, as a wrapped triple-quoted string
# would.
examples_medical = [
    {
        "example": (
            "Patient ID: 123456, Patient Name: John Doe, Diagnosis Code: J20.9, "
            "Procedure Code: 99203, Total Charge: $500, Insurance Claim Amount: $350"
        )
    },
    {
        "example": (
            "Patient ID: 789012, Patient Name: Johnson Smith, Diagnosis Code: M54.5, "
            "Procedure Code: 99213, Total Charge: $150, Insurance Claim Amount: $120"
        )
    },
    {
        "example": (
            "Patient ID: 345678, Patient Name: Emily Stone, Diagnosis Code: E11.9, "
            "Procedure Code: 99214, Total Charge: $300, Insurance Claim Amount: $250"
        )
    },
]

In [9]:
# Generator for the medical-billing schema: `nemotron` is passed as the
# generator LLM and `indox` as the judge LLM.
# The instruction names exactly the fields in `columns_medical`, so it does not
# ask for data (such as dates) that no column can hold.
medical_billing_generator = SyntheticDataGenerator(
    generator_llm=nemotron,
    judge_llm=indox,
    columns=columns_medical,
    example_data=examples_medical,
    user_instruction=(
        "Generate realistic medical billing data including Patient ID, Patient Name, "
        "Diagnosis Code, Procedure Code, Total Charge, and Insurance Claim Amount. "
        "Ensure a mix of common and rare procedures and varying charge amounts "
        "for a typical healthcare provider."
    ),
)

In [9]:
# Build the generator for the name/age/occupation schema before using it.
# Without this definition the call below only works if `generator` happens to
# be left over in the kernel, and fails with NameError on a fresh run.
# The keyword arguments mirror the medical-billing generator's construction.
generator = SyntheticDataGenerator(
    generator_llm=nemotron,
    judge_llm=indox,
    columns=columns,
    example_data=example_data,
    user_instruction=(
        "Generate realistic and diverse personal data including name, age, "
        "and occupation."
    ),
)

# Generate data
generated_data = generator.generate_data(num_samples=20)

Generated diverse data point: {'name': 'Dr. Sofia Rodríguez', 'age': 37, 'occupation': 'Data Scientist'}
Generated diverse data point: {'name': "James O'Connor", 'age': 39, 'occupation': 'Engineer'}
Generated diverse data point: {'name': 'Dr. Priya Patel', 'age': 36, 'occupation': 'Physician'}
Generated diverse data point: {'name': 'Capt. Ethan Kim', 'age': 38, 'occupation': 'Pilot'}
Generated diverse data point: {'name': 'María García López', 'age': 37, 'occupation': 'Graphic Designer'}
Generated diverse data point: {'name': 'Prof. Yùkai Wang', 'age': 39, 'occupation': 'Data Scientist'}
Generated diverse data point: {'name': 'Dr. Aisha ben-Jobair', 'age': 40, 'occupation': 'Marine Biologist'}
Generated diverse data point: {'name': 'Jamal Adebayo', 'age': 36, 'occupation': 'Software Engineer'}
Generated diverse data point: {'name': 'Mx. Reverie membrii Zhõu', 'age': 41, 'occupation': 'Aerospace Engineer'}
Generated diverse data point: {'name': "Captain Santiago 'Andromeda' Aguilar", 'a

In [13]:
# Display the name/age/occupation records produced above (rendered as a table
# in the recorded output).
generated_data

Unnamed: 0,name,age,occupation
0,Dr. Sofia Rodríguez,37,Data Scientist
1,James O'Connor,39,Engineer
2,Dr. Priya Patel,36,Physician
3,Capt. Ethan Kim,38,Pilot
4,María García López,37,Graphic Designer
5,Prof. Yùkai Wang,39,Data Scientist
6,Dr. Aisha ben-Jobair,40,Marine Biologist
7,Jamal Adebayo,36,Software Engineer
8,Mx. Reverie membrii Zhõu,41,Aerospace Engineer
9,Captain Santiago 'Andromeda' Aguilar,37,Astronaut and Planetary Scientist


In [10]:
# Request a small batch of medical-billing records. The recorded run logged
# repeated "not diverse" retries while doing so.
medical_sample_count = 10
medical_billing_data = medical_billing_generator.generate_data(
    num_samples=medical_sample_count
)

Generated diverse data point: {'Patient ID': '987654', 'Patient Name': 'Michael Johnson', 'Diagnosis Code': 'G47.33', 'Procedure Code': '92507', 'Total Charge': '$850', 'Insurance Claim Amount': '$650'}
Generated diverse data point: {'Patient ID': '654321', 'Patient Name': 'Olivia Brown', 'Diagnosis Code': 'I10', 'Procedure Code': '36415', 'Total Charge': '$1200', 'Insurance Claim Amount': '$900'}
Generated data is not diverse. Retrying... (Failure count: 1)
Generated data is not diverse. Retrying... (Failure count: 2)
Generated data is not diverse. Retrying... (Failure count: 3)
Generated data is not diverse. Retrying... (Failure count: 4)
Generated data is not diverse. Retrying... (Failure count: 5)
Generated data is not diverse. Retrying... (Failure count: 6)
Generated data is not diverse. Retrying... (Failure count: 7)
Generated data is not diverse. Retrying... (Failure count: 8)
Progress: 2/10 data points generated. Attempts: 10
Generated data is not diverse. Retrying... (Failure 

In [11]:
# Display the generated medical-billing records (rendered as a table in the
# recorded output).
medical_billing_data

Unnamed: 0,Patient ID,Patient Name,Diagnosis Code,Procedure Code,Total Charge,Insurance Claim Amount
0,987654,Michael Johnson,G47.33,92507,$850,$650
1,654321,Olivia Brown,I10,36415,$1200,$900
2,543210,Sophia Garcia,F32.9,90832,$280,$200
3,ABC123,Ava Patel,N30.0,51700,$450,$320
4,XYZ789,Chloe Harris,R53.83,36416,$850,$600
5,QWERTY1,Isabella Taylor,G47.33,95811,$1200,$850
6,LMN456,William Brown,I50.9,33208,$1800,$1300
7,ABC123DEF,Samantha Johnson,K21.9,43235,"$1,500","$1,000"
8,ZYX987,Daniel Lee,M25.552,29888,"$2,200","$1,650"
9,XYZ789QRS,Grace Thompson,G47.33,64561,"$3,500","$2,500"


In [15]:
# Inspect the generator's feedback_history attribute; in the recorded run it
# displayed as an empty list.
medical_billing_generator.feedback_history

[]