In [1]:
import os
from dotenv import load_dotenv

# Load the variables declared in the local api.env file, then pull out the two
# API keys used by later cells. A missing key raises KeyError right here.
load_dotenv("api.env")
INDOX_API_KEY, NVIDIA_API_KEY = (
    os.environ[key_name] for key_name in ("INDOX_API_KEY", "NVIDIA_API_KEY")
)

In [2]:
# Column names for the simple demo schema.
columns = ["name", "age", "occupation"]

# Seed rows for the generator: each tuple is paired with `columns` so every
# row is a dict keyed by column name, in column order.
example_data = [
    dict(zip(columns, row))
    for row in (
        ("Alice Johnson", 35, "Manager"),
        ("Bob Williams", 42, "Accountant"),
    )
]

In [3]:
# Column names for the medical-billing example schema.
columns_medical = [
    "Patient ID",
    "Patient Name",
    "Diagnosis Code",
    "Procedure Code",
    "Total Charge",
    "Insurance Claim Amount",
]

# Example records, one per dict under the "example" key.
# Each record is assembled from adjacent string literals so it stays a single
# line of text. A triple-quoted literal wrapped across source lines would embed
# a newline plus the source indentation inside the record, and could split a
# field label such as "Diagnosis Code" in two.
examples_medical = [
    {
        "example": (
            "Patient ID: 123456, Patient Name: John Doe, Diagnosis Code: J20.9, "
            "Procedure Code: 99203, Total Charge: $500, Insurance Claim Amount: $350"
        )
    },
    {
        "example": (
            "Patient ID: 789012, Patient Name: Johnson Smith, Diagnosis Code: M54.5, "
            "Procedure Code: 99213, Total Charge: $150, Insurance Claim Amount: $120"
        )
    },
    {
        "example": (
            "Patient ID: 345678, Patient Name: Emily Stone, Diagnosis Code: E11.9, "
            "Procedure Code: 99214, Total Charge: $300, Insurance Claim Amount: $250"
        )
    },
]

In [4]:
from models import OpenAi, IndoxApi

# Client for the Indox API, authenticated with the key loaded from the environment.
indox = IndoxApi(api_key=INDOX_API_KEY)

# Nemotron-4 340B Instruct, reached through the OpenAi client class pointed at
# NVIDIA's integrate.api.nvidia.com endpoint.
nemotron = OpenAi(
    api_key=NVIDIA_API_KEY,
    model="nvidia/nemotron-4-340b-instruct",
    base_url="https://integrate.api.nvidia.com/v1",
)

In [5]:
from SynthCore import SyntheticDataGeneratorFeedback
# Build the feedback-driven generator for the name/age/occupation schema:
# `nemotron` is passed as the generating model and `indox` as the judge model.
generator = SyntheticDataGeneratorFeedback(
    generator_llm=nemotron,
    judge_llm=indox,
    columns=columns,
    example_data=example_data,
    # The instruction refers only to the three requested columns. The earlier
    # wording asked for "procedures", "race" and "date ranges", none of which
    # exist in this schema.
    user_instruction=(
        "Generate realistic data including name, age and occupation. "
        "Ensure a mix of common and rare occupations, varied names, "
        "and ages that are plausible for each occupation."
    ),
    verbose=1,
    # NOTE(review): the recorded run reported "Generated data is not diverse"
    # three times with this value; confirm what the threshold means before
    # tuning it.
    diversity_threshold=1,
    # NOTE(review): in the recorded run every row scored 0.8 and ended up in
    # pending_review; presumably a minimum of 1 is meant to force manual
    # review for the demo. TODO confirm.
    feedback_min_score=1,
)

In [6]:
# Ask the generator for three samples and keep whatever generate_data returns.
# NOTE(review): the recorded run printed "Generated data is not diverse.
# Retrying..." three times for this call; check the result before relying on it.
generated_data = generator.generate_data(num_samples=3)

Generated data is not diverse. Retrying... (Failure count: 1)
Generated data is not diverse. Retrying... (Failure count: 2)
Generated data is not diverse. Retrying... (Failure count: 3)


In [7]:
# Show what generate_data returned (no output was recorded for this cell).
generated_data

In [8]:
# Inspect the generator's pending_review attribute; in the recorded output it
# is a table with a `data` column (the row dict) and a `score` column.
generator.pending_review

Unnamed: 0,data,score
0,"{'name': 'Dr. Maya Singh', 'age': '39', 'occup...",0.8
1,"{'name': 'Dr. Maya Singh', 'age': '39', 'occup...",0.8
2,"{'name': 'Dr. Maya Chatterjee', 'age': '39', '...",0.8


In [9]:
# Manual review step: accept no rows, mark 'all' rows for regeneration, and
# pass free-text feedback together with a 0.7 minimum score.
generator.user_review_and_regenerate(
    accepted_rows=[],
    regenerate_rows=['all'],
    regeneration_feedback='change name to another name , also change occupation to another occupation',
    min_score=0.7,
)

Unnamed: 0,name,age,occupation
0,Dr. Amelia Kibo,40,Cardiothoracic Surgeon
1,Dr. Jamal Rodríguez,36,Pediatric Ophthalmologist
2,Dr. Hana Rao,39,Vascular Surgeon


In [10]:
# Re-check pending_review after the review step; the recorded output shows
# only the column headers, i.e. no rows left.
generator.pending_review

Unnamed: 0,data,score
