In [5]:
import os

# Read the NVIDIA API key from the environment instead of committing it to the
# notebook. A key that has been saved in a notebook must be treated as leaked
# and revoked. Provide the key before starting Jupyter, e.g.:
#     export NVIDIA_API_KEY="nvapi-..."
nv_api_key = os.environ.get("NVIDIA_API_KEY", "")
if not nv_api_key:
    # Warn early rather than failing later with an opaque 401 from the API.
    print("NVIDIA_API_KEY is not set; API calls will fail until it is provided.")

In [None]:
import json
from typing import List, Dict, Any, Callable
import requests
import pandas as pd


class SyntheticDataGenerator:
    """
    Generates synthetic records by prompting the NVIDIA-hosted Nemotron chat model.

    Each run renders a prompt through a caller-supplied template, posts it to the
    chat-completions endpoint and parses the JSON contained in the model's reply.
    """

    def __init__(self, api_key: str, num_samples: int = 1, temperature: float = 0.2, top_p: float = 0.7, max_tokens: int = 2048, timeout: float = 60.0):
        """
        Initializes the synthetic data generator with necessary configurations for NVIDIA Nemotron API.

        :param api_key: Bearer token sent in the Authorization header
        :param num_samples: Number of samples requested per prompt (handed to the template)
        :param temperature: Sampling temperature forwarded to the API
        :param top_p: Nucleus-sampling value forwarded to the API
        :param max_tokens: Completion length limit forwarded to the API
        :param timeout: Seconds to wait for each HTTP request before giving up
        """
        self.api_key = api_key
        self.num_samples = num_samples
        self.temperature = temperature
        self.top_p = top_p
        self.max_tokens = max_tokens
        self.timeout = timeout
        self.base_url = "https://integrate.api.nvidia.com/v1"  # NVIDIA API endpoint

    def generate_data(self, subject: str, extra: str, runs: int, prompt_template: Callable[[str, str, int], str]) -> List[Dict[str, Any]]:
        """
        Generate synthetic data based on the given instructions using a prompt template.

        :param subject: Subject of the data, handed to the template
        :param extra: Additional free-text instructions, handed to the template
        :param runs: Number of API calls to make; one parsed result is collected per call
        :param prompt_template: Callable ``(subject, extra, num_samples) -> prompt``
        :return: The parsed JSON payload of every run, in call order
        :raises ValueError: If the API answers with a non-200 status or the reply holds no valid JSON
        """
        # The headers do not change between runs, so build them once.
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}"
        }

        synthetic_results = []
        for _ in range(runs):
            prompt = prompt_template(subject, extra, self.num_samples)

            # Prepare the API call
            data = {
                "model": "nvidia/nemotron-4-340b-instruct",
                "messages": [{"role": "user", "content": prompt}],
                "temperature": self.temperature,
                "top_p": self.top_p,
                "max_tokens": self.max_tokens,
                "stream": False
            }

            # Make the API call to NVIDIA's endpoint. Without a timeout a stalled
            # connection would block the notebook indefinitely.
            response = requests.post(
                f"{self.base_url}/chat/completions",
                headers=headers,
                json=data,
                timeout=self.timeout
            )

            if response.status_code != 200:
                raise ValueError(f"Failed to call the API: {response.status_code} {response.text}")

            # Get the raw response from the model
            raw_output = response.json()['choices'][0]['message']['content']

            # Parse the response and add it to synthetic results
            synthetic_results.append(self._parse_response(raw_output))

        return synthetic_results

    def _parse_response(self, raw_output: str) -> Any:
        """
        Parse the raw LLM output into a structured format.

        The text between the first ``{`` and the last ``}`` is tried first; this
        already discards Markdown code fences and surrounding prose. If that span
        is not valid JSON (e.g. the model returned a top-level array holding
        several objects), the span between the first ``[`` and the last ``]`` is
        tried as a fallback.

        :param raw_output: The raw string response from the model
        :return: The decoded JSON value (a dict, or a list for the array fallback)
        :raises ValueError: If neither span decodes as JSON
        """
        first_error = None
        for opener, closer in (('{', '}'), ('[', ']')):
            start = raw_output.find(opener)
            end = raw_output.rfind(closer)
            if start == -1 or end < start:
                continue  # no span delimited by this bracket pair
            try:
                return json.loads(raw_output[start:end + 1])
            except json.JSONDecodeError as e:
                # Report the error of the first attempt; it is the most specific one.
                if first_error is None:
                    first_error = e

        reason = first_error if first_error is not None else "no JSON object or array found"
        raise ValueError(f"LLM output is not valid JSON: {reason}\nRaw output: {raw_output}")




In [None]:
from pydantic import BaseModel
from typing import List

# Define a MedicalBilling class for structured data
class MedicalBilling(BaseModel):
    """Schema for one synthetic medical-billing record."""
    patient_id: str  # filled from the 'Patient ID' key of the model output
    patient_name: str  # filled from the 'Patient Name' key
    diagnosis_code: str  # filled from the 'Diagnosis Code' key
    procedure_code: str  # filled from the 'Procedure Code' key
    total_charge: float  # filled from the 'Total Charge' key
    insurance_claim_amount: float  # filled from the 'Insurance Claim Amount' key


In [None]:
def my_prompt_template(subject: str, extra: str, num_samples: int) -> str:
    """
    Build the zero-shot prompt asking the model for billing records.

    :param subject: Name of the data set to generate
    :param extra: Additional free-text instructions inserted before the format request
    :param num_samples: Number of samples to ask for
    :return: The prompt as a single line of text
    """
    sentences = [
        f"Generate {num_samples} samples of {subject} data.",
        "Ensure the following fields are present: Patient ID, Patient Name, Diagnosis Code, Procedure Code, Total Charge, Insurance Claim Amount.",
        f"{extra}",
        "Output the data in valid JSON format.",
    ]
    # Sentences are separated by exactly one space.
    return " ".join(sentences)


In [None]:
# Instructions describing what the model should produce
subject, runs = "medical_billing", 1
extra = "The name must be chosen at random. Make it something unusual."

# Generator configured with the notebook's API key, one sample per prompt
api_key = nv_api_key
generator = SyntheticDataGenerator(api_key=api_key, num_samples=1)

# Call the model through the custom prompt template and show the parsed results
synthetic_results = generator.generate_data(
    subject=subject,
    extra=extra,
    runs=runs,
    prompt_template=my_prompt_template,
)
synthetic_results

[{'medical_billing_data': {'Patient ID': 'P12345',
   'Patient Name': 'Zephyr Quicksilver',
   'Diagnosis Code': 'M54.5',
   'Procedure Code': '99213',
   'Total Charge': 150.0,
   'Insurance Claim Amount': 120.0}}]

In [None]:
# Convert the synthetic results into structured MedicalBilling objects
synthetic_data = []
for result in synthetic_results:
    payload = result.get('medical_billing_data') if isinstance(result, dict) else None

    # The model is not consistent about the payload shape: it may hold a single
    # record (dict) or a list of records. Normalise to a list before validating.
    if isinstance(payload, dict):
        entries = [payload]
    elif isinstance(payload, list):
        entries = payload
    else:
        print(f"Unexpected result format: {result}")
        continue

    for entry in entries:
        billing_data = MedicalBilling(
            # IDs and codes may come back as JSON numbers; the schema declares
            # them as strings, so convert explicitly instead of relying on the
            # validator's coercion rules.
            patient_id=str(entry['Patient ID']),
            patient_name=entry['Patient Name'],
            diagnosis_code=str(entry['Diagnosis Code']),
            procedure_code=str(entry['Procedure Code']),
            total_charge=float(entry['Total Charge']),
            insurance_claim_amount=float(entry['Insurance Claim Amount'])
        )
        synthetic_data.append(billing_data)

# Convert the synthetic data to a Pandas DataFrame
synthetic_df = pd.DataFrame([billing.dict() for billing in synthetic_data])

# Display the DataFrame
synthetic_df


Unnamed: 0,patient_id,patient_name,diagnosis_code,procedure_code,total_charge,insurance_claim_amount
0,P12345,Zephyr Quicksilver,M54.5,99213,150.0,120.0


In [1]:
import json
from typing import List, Dict, Any, Callable
import requests
import pandas as pd


class SyntheticDataGenerator:
    """
    Generates synthetic records by prompting the NVIDIA-hosted Nemotron chat model
    with a few-shot prompt.

    Each run renders a prompt from the caller's template and examples, posts it to
    the chat-completions endpoint and parses the JSON contained in the reply.
    """

    def __init__(self, api_key: str, num_samples: int = 1, temperature: float = 0.2, top_p: float = 0.7, max_tokens: int = 2048, timeout: float = 60.0):
        """
        Initializes the synthetic data generator with necessary configurations for NVIDIA Nemotron API.

        :param api_key: Bearer token sent in the Authorization header
        :param num_samples: Number of samples requested per prompt (handed to the template)
        :param temperature: Sampling temperature forwarded to the API
        :param top_p: Nucleus-sampling value forwarded to the API
        :param max_tokens: Completion length limit forwarded to the API
        :param timeout: Seconds to wait for each HTTP request before giving up
        """
        self.api_key = api_key
        self.num_samples = num_samples
        self.temperature = temperature
        self.top_p = top_p
        self.max_tokens = max_tokens
        self.timeout = timeout
        self.base_url = "https://integrate.api.nvidia.com/v1"  # NVIDIA API endpoint

    def generate_data(self, subject: str, extra: str, runs: int, examples: List[Dict[str, str]], prompt_template: Callable[[str, str, List[Dict[str, str]], int], str]) -> List[Dict[str, Any]]:
        """
        Generate synthetic data based on the given instructions using a prompt template and examples.

        :param subject: Subject of the data, handed to the template
        :param extra: Additional free-text instructions, handed to the template
        :param runs: Number of API calls to make; one parsed result is collected per call
        :param examples: Few-shot examples, handed to the template unchanged
        :param prompt_template: Callable ``(subject, extra, examples, num_samples) -> prompt``
        :return: The parsed JSON payload of every run, in call order
        :raises ValueError: If the API answers with a non-200 status or the reply holds no valid JSON
        """
        # The headers do not change between runs, so build them once.
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}"
        }

        synthetic_results = []
        for _ in range(runs):
            prompt = prompt_template(subject, extra, examples, self.num_samples)

            # Prepare the API call
            data = {
                "model": "nvidia/nemotron-4-340b-instruct",
                "messages": [{"role": "user", "content": prompt}],
                "temperature": self.temperature,
                "top_p": self.top_p,
                "max_tokens": self.max_tokens,
                "stream": False
            }

            # Make the API call to NVIDIA's endpoint. Without a timeout a stalled
            # connection would block the notebook indefinitely.
            response = requests.post(
                f"{self.base_url}/chat/completions",
                headers=headers,
                json=data,
                timeout=self.timeout
            )

            if response.status_code != 200:
                raise ValueError(f"Failed to call the API: {response.status_code} {response.text}")

            # Get the raw response from the model
            raw_output = response.json()['choices'][0]['message']['content']

            # Parse the response and add it to synthetic results
            synthetic_results.append(self._parse_response(raw_output))

        return synthetic_results

    def _parse_response(self, raw_output: str) -> Any:
        """
        Parse the raw LLM output into a structured format.

        The text between the first ``{`` and the last ``}`` is tried first; this
        already discards Markdown code fences and surrounding prose. If that span
        is not valid JSON (e.g. the model returned a top-level array holding
        several objects), the span between the first ``[`` and the last ``]`` is
        tried as a fallback.

        :param raw_output: The raw string response from the model
        :return: The decoded JSON value (a dict, or a list for the array fallback)
        :raises ValueError: If neither span decodes as JSON
        """
        first_error = None
        for opener, closer in (('{', '}'), ('[', ']')):
            start = raw_output.find(opener)
            end = raw_output.rfind(closer)
            if start == -1 or end < start:
                continue  # no span delimited by this bracket pair
            try:
                return json.loads(raw_output[start:end + 1])
            except json.JSONDecodeError as e:
                # Report the error of the first attempt; it is the most specific one.
                if first_error is None:
                    first_error = e

        reason = first_error if first_error is not None else "no JSON object or array found"
        raise ValueError(f"LLM output is not valid JSON: {reason}\nRaw output: {raw_output}")





In [2]:
from pydantic import BaseModel


# Define a MedicalBilling class for structured data
class MedicalBilling(BaseModel):
    """Schema for one synthetic medical-billing record."""
    patient_id: str  # filled from the 'Patient ID' key of the model output
    patient_name: str  # filled from the 'Patient Name' key
    diagnosis_code: str  # filled from the 'Diagnosis Code' key
    procedure_code: str  # filled from the 'Procedure Code' key
    total_charge: float  # filled from the 'Total Charge' key
    insurance_claim_amount: float  # filled from the 'Insurance Claim Amount' key

In [3]:
from typing import List
def my_prompt_template(subject: str, extra: str, examples: List[Dict[str, str]], num_samples: int) -> str:
    """
    Assemble a few-shot prompt: the examples first, then the generation request.

    :param subject: Name of the data set to generate
    :param extra: Additional free-text instructions inserted before the format request
    :param examples: Few-shot examples; each dict must provide an 'example' key
    :param num_samples: Number of samples to ask for
    :return: The complete prompt text
    """
    shots = []
    for shot in examples:
        shots.append(f"Example:\n{shot['example']}")
    few_shot_block = "\n".join(shots)

    instruction = " ".join([
        f"Now generate {num_samples} samples of {subject} data.",
        "Ensure the following fields are present: Patient ID, Patient Name, Diagnosis Code, Procedure Code, Total Charge, Insurance Claim Amount.",
        f"{extra}",
        "Output the data in valid JSON format.",
    ])

    # A blank line separates the examples from the instruction.
    return f"{few_shot_block}\n\n{instruction}"

In [4]:
# Define the examples for few-shot learning.
# Each record is written with implicit string concatenation so it stays on one
# logical line: a triple-quoted string would embed the source line break and its
# indentation in the prompt (splitting, for instance, "Diagnosis" from "Code").
examples = [
    {
        "example": (
            "Patient ID: 123456, Patient Name: John Doe, Diagnosis Code: "
            "J20.9, Procedure Code: 99203, Total Charge: $500, Insurance Claim Amount: $350"
        )
    },
    {
        "example": (
            "Patient ID: 789012, Patient Name: Johnson Smith, Diagnosis "
            "Code: M54.5, Procedure Code: 99213, Total Charge: $150, Insurance Claim Amount: $120"
        )
    },
    {
        "example": (
            "Patient ID: 345678, Patient Name: Emily Stone, Diagnosis Code: "
            "E11.9, Procedure Code: 99214, Total Charge: $300, Insurance Claim Amount: $250"
        )
    },
]


In [6]:
# Instructions describing what the model should produce
subject, runs = "medical_billing", 1
extra = "The name must be chosen at random. Make it something unusual."

# Generator configured with the notebook's API key, one sample per prompt
api_key = nv_api_key
generator = SyntheticDataGenerator(api_key=api_key, num_samples=1)

# Call the model with the few-shot examples rendered through the custom template
synthetic_results = generator.generate_data(
    subject=subject,
    extra=extra,
    runs=runs,
    examples=examples,
    prompt_template=my_prompt_template,
)

In [7]:
# Convert the synthetic results into structured MedicalBilling objects
synthetic_data = []
for result in synthetic_results:
    payload = result.get('medical_billing_data') if isinstance(result, dict) else None

    # The model is not consistent about the payload shape: it may hold a single
    # record (dict) or a list of records. Normalise to a list before validating.
    if isinstance(payload, dict):
        entries = [payload]
    elif isinstance(payload, list):
        entries = payload
    else:
        print(f"Unexpected result format: {result}")
        continue

    for entry in entries:
        billing_data = MedicalBilling(
            # IDs and codes may come back as JSON numbers; the schema declares
            # them as strings, so convert explicitly instead of relying on the
            # validator's coercion rules.
            patient_id=str(entry['Patient ID']),
            patient_name=entry['Patient Name'],
            diagnosis_code=str(entry['Diagnosis Code']),
            procedure_code=str(entry['Procedure Code']),
            total_charge=float(entry['Total Charge']),
            insurance_claim_amount=float(entry['Insurance Claim Amount'])
        )
        synthetic_data.append(billing_data)

# Convert the synthetic data to a Pandas DataFrame
synthetic_df = pd.DataFrame([billing.dict() for billing in synthetic_data])

# Display the DataFrame
synthetic_df

Unnamed: 0,patient_id,patient_name,diagnosis_code,procedure_code,total_charge,insurance_claim_amount
0,987654,Zephyr Quicksilver,G47.33,99215,450.0,320.0


In [9]:
import asyncio
from typing import Any, Dict, List, Optional, Callable, Union
import requests
import json


class LLMChain:
    """Chain to handle LLM and prompt interaction.

    ``prompt`` is called as ``prompt(subject, extra, examples, num_samples)`` to
    produce the prompt text, which is then handed to ``llm``.
    """

    def __init__(self, llm: Callable, prompt: Callable):
        """Store the LLM callable and the prompt-building callable."""
        self.llm = llm
        self.prompt = prompt

    def _build_prompt(self, subject: str, extra: str, examples: Optional[List[Dict[str, str]]], num_samples: int) -> str:
        """Render the prompt, substituting a fresh empty list when no examples are given."""
        # A caller-supplied list is passed through as-is (same object); only the
        # "not provided" case gets a new list so that calls never share state.
        if examples is None:
            examples = []
        return self.prompt(subject, extra, examples, num_samples)

    def run(self, subject: str, extra: str = "", examples: Optional[List[Dict[str, str]]] = None, num_samples: int = 1) -> Dict[str, Any]:
        """Run the LLM chain and generate data based on inputs.

        :param subject: Subject handed to the prompt callable
        :param extra: Additional instructions handed to the prompt callable
        :param examples: Few-shot examples; ``None`` means no examples
        :param num_samples: Number of samples handed to the prompt callable
        :return: Whatever the LLM callable returns for the rendered prompt
        """
        prompt_text = self._build_prompt(subject, extra, examples, num_samples)
        return self.llm(prompt_text)

    async def arun(self, subject: str, extra: str = "", examples: Optional[List[Dict[str, str]]] = None, num_samples: int = 1) -> Dict[str, Any]:
        """Asynchronous run method; same parameters and result as :meth:`run`."""
        prompt_text = self._build_prompt(subject, extra, examples, num_samples)
        # Running LLM in a thread for async compatibility
        return await asyncio.to_thread(self.llm, prompt_text)


class SyntheticDataGenerator:
    """Generate synthetic data using an LLM and few-shot template, with dynamic example handling.

    Every generated result is fed back into a small rolling pool of few-shot
    examples that the template receives on the next run.
    """

    def __init__(self, api_key: str, template: Callable, num_samples: int = 1, temperature: float = 0.2, top_p: float = 0.7, max_tokens: int = 2048, timeout: float = 60.0):
        """Initialize the synthetic data generator.

        :param api_key: Bearer token sent in the Authorization header
        :param template: Callable ``(subject, extra, examples, num_samples) -> prompt``
        :param num_samples: Number of samples requested per prompt
        :param temperature: Sampling temperature forwarded to the API
        :param top_p: Nucleus-sampling value forwarded to the API
        :param max_tokens: Completion length limit forwarded to the API
        :param timeout: Seconds to wait for each HTTP request before giving up
        """
        self.api_key = api_key
        self.num_samples = num_samples
        self.temperature = temperature
        self.top_p = top_p
        self.max_tokens = max_tokens
        self.timeout = timeout
        self.base_url = "https://integrate.api.nvidia.com/v1"  # NVIDIA API endpoint

        # Chain and example management
        self.template = template
        self.llm_chain = None
        self.examples = []  # Initial empty list for few-shot examples

    def set_llm_chain(self, llm: Callable):
        """Sets the LLM chain.

        :param llm: Callable that takes the prompt text and returns the parsed result
        :raises ValueError: If ``llm`` is falsy
        """
        if not llm:
            raise ValueError("LLM must be provided to initialize LLM chain.")
        self.llm_chain = LLMChain(llm=llm, prompt=self.template)

    def generate_data(self, prompt: str) -> Dict[str, Any]:
        """Make an API call to the LLM.

        :param prompt: Prompt text sent as the single user message
        :return: The JSON value parsed from the model's reply
        :raises ValueError: If the API answers with a non-200 status or the reply holds no valid JSON
        """
        data = {
            "model": "nvidia/nemotron-4-340b-instruct",
            "messages": [{"role": "user", "content": prompt}],
            "temperature": self.temperature,
            "top_p": self.top_p,
            "max_tokens": self.max_tokens,
            "stream": False
        }

        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}"
        }

        # Without a timeout a stalled connection would block the caller indefinitely.
        response = requests.post(f"{self.base_url}/chat/completions", headers=headers, json=data, timeout=self.timeout)
        if response.status_code != 200:
            raise ValueError(f"API error: {response.status_code} {response.text}")

        raw_output = response.json()['choices'][0]['message']['content']
        return self._parse_response(raw_output)

    def _parse_response(self, raw_output: str) -> Any:
        """Parses the LLM response into JSON.

        The text between the first ``{`` and the last ``}`` is tried first; this
        already discards Markdown code fences and surrounding prose. If that span
        is not valid JSON (e.g. the model returned a top-level array holding
        several objects), the span between the first ``[`` and the last ``]`` is
        tried as a fallback.

        :raises ValueError: If neither span decodes as JSON
        """
        first_error = None
        for opener, closer in (('{', '}'), ('[', ']')):
            start = raw_output.find(opener)
            end = raw_output.rfind(closer)
            if start == -1 or end < start:
                continue  # no span delimited by this bracket pair
            try:
                return json.loads(raw_output[start:end + 1])
            except json.JSONDecodeError as e:
                # Report the error of the first attempt; it is the most specific one.
                if first_error is None:
                    first_error = e

        reason = first_error if first_error is not None else "no JSON object or array found"
        raise ValueError(f"LLM output is not valid JSON: {reason}")

    def _update_examples(self, result: Dict[str, Any]) -> None:
        """Updates the few-shot examples with new generated data."""
        if len(self.examples) >= 3:  # Limit example pool to the last 3 results
            self.examples.pop(0)
        self.examples.append({"example": json.dumps(result)})  # Add new example

    def generate(self, subject: str, runs: int, extra: str = "") -> List[Dict[str, Any]]:
        """Generates synthetic data synchronously, updating examples dynamically.

        :param subject: Subject handed to the template
        :param runs: Number of chain invocations
        :param extra: Additional instructions handed to the template
        :return: One result per run, in run order
        :raises ValueError: If no LLM chain has been set
        """
        if self.llm_chain is None:
            raise ValueError("LLM chain is not set. Use 'set_llm_chain' to initialize the chain.")

        results = []
        for _ in range(runs):
            result = self.llm_chain.run(subject=subject, extra=extra, examples=self.examples, num_samples=self.num_samples)
            self._update_examples(result)  # Update examples with new result
            results.append(result)
        return results

    async def agenerate(self, subject: str, runs: int, extra: str = "") -> List[Dict[str, Any]]:
        """Generates synthetic data asynchronously, updating examples dynamically.

        The runs are awaited one after another. Launching them all at once would
        render every prompt from the same example pool before any result exists,
        so the runs could not build on each other and would tend to return
        duplicate records. Awaiting each run still yields to the event loop, so
        other coroutines keep making progress in the meantime.

        :param subject: Subject handed to the template
        :param runs: Number of chain invocations
        :param extra: Additional instructions handed to the template
        :return: One result per run, in run order
        :raises ValueError: If no LLM chain has been set
        """
        if self.llm_chain is None:
            raise ValueError("LLM chain is not set. Use 'set_llm_chain' to initialize the chain.")

        results = []
        for _ in range(runs):
            result = await self.llm_chain.arun(subject=subject, extra=extra, examples=self.examples, num_samples=self.num_samples)
            self._update_examples(result)
            results.append(result)
        return results


class DatasetGenerator:
    """Generate synthetic datasets using a given LLM and prompt template."""

    def __init__(self, llm: Callable, template: Callable, sentence_preferences: Optional[Dict[str, Any]] = None):
        """Initialize the dataset generator with an LLM and sentence preferences.

        :param llm: Callable that takes the prompt text and returns the parsed result
        :param template: Callable that renders the prompt text
        :param sentence_preferences: Optional settings; the keys "extra" and "runs" are read
        """
        # The inner generator only drives the chain; the API key lives in `llm`.
        self.generator = SyntheticDataGenerator(api_key="", template=template)
        self.generator.set_llm_chain(llm)
        self.sentence_preferences = sentence_preferences or {}

    def _generation_kwargs(self, fields: List[Any]) -> Dict[str, Any]:
        """Build the keyword arguments for one generator call from one fields entry."""
        return {
            # The first field is the subject; an empty entry means an empty subject.
            "subject": fields[0] if fields else "",
            "runs": self.sentence_preferences.get("runs", 1),
            "extra": self.sentence_preferences.get("extra", ""),
        }

    def generate_dataset(self, fields_collection: List[List[Any]]) -> List[Dict[str, Any]]:
        """Generate synthetic dataset synchronously.

        :param fields_collection: One entry per subject; the first element of each entry is the subject
        :return: All generated results, grouped in the order of ``fields_collection``
        """
        results = []
        for fields in fields_collection:
            results.extend(self.generator.generate(**self._generation_kwargs(fields)))
        return results

    async def generate_dataset_async(self, fields_collection: List[List[Any]]) -> List[Dict[str, Any]]:
        """Generate synthetic dataset asynchronously.

        The subjects are processed concurrently, but the results are returned in
        the order of ``fields_collection`` (not in completion order) so that the
        output matches :meth:`generate_dataset`.
        """
        # gather() returns its results in the order the awaitables were passed in.
        batches = await asyncio.gather(
            *(self.generator.agenerate(**self._generation_kwargs(fields)) for fields in fields_collection)
        )
        return [record for batch in batches for record in batch]


# Define the prompt template
def my_prompt_template(subject: str, extra: str, examples: List[Dict[str, str]], num_samples: int) -> str:
    """Render the few-shot prompt: one 'Example:' block per example, a blank line, then the request."""
    example_text = "\n".join("Example:\n" + f"{item['example']}" for item in examples)

    request = f"Now generate {num_samples} samples of {subject} data."
    field_rule = (
        "Ensure the following fields are present: Patient ID, Patient Name, Diagnosis Code, "
        "Procedure Code, Total Charge, Insurance Claim Amount."
    )
    closing = f"{extra} Output the data in valid JSON format."

    return example_text + "\n\n" + request + " " + field_rule + " " + closing


In [10]:
# Usage example
api_key = nv_api_key  # Replace with your actual API key

# The "LLM" handed to the dataset generator is the bound generate_data method of
# a SyntheticDataGenerator that carries the API key.
llm = SyntheticDataGenerator(api_key=api_key, template=my_prompt_template).generate_data

# Preferences applied to every subject: extra instructions and runs per subject
sentence_preferences = dict(extra="Ensure unusual patient names.", runs=2)

dataset_generator = DatasetGenerator(
    llm=llm,
    template=my_prompt_template,
    sentence_preferences=sentence_preferences,
)

# One entry per subject; generate the dataset synchronously
fields_collection = [["medical_billing"]]
synthetic_dataset = dataset_generator.generate_dataset(fields_collection)


In [11]:
# Show the results collected by the synchronous run
synthetic_dataset

[{'medical_billing_data': [{'Patient ID': 'P12345',
    'Patient Name': 'Xylophone Yonder',
    'Diagnosis Code': 'M54.5',
    'Procedure Code': '99213',
    'Total Charge': 150.0,
    'Insurance Claim Amount': 120.0}]},
 {'medical_billing_data': [{'Patient ID': 'P67890',
    'Patient Name': 'Zephyr Zenith',
    'Diagnosis Code': 'R10.1',
    'Procedure Code': '99214',
    'Total Charge': 200.0,
    'Insurance Claim Amount': 160.0}]}]

In [12]:
# Flatten the synthetic dataset into one record per row
flattened_data = []

for entry in synthetic_dataset:
    if not isinstance(entry, dict) or 'medical_billing_data' not in entry:
        continue
    records = entry['medical_billing_data']
    # The payload may be a single record (dict) instead of a list of records;
    # iterating a dict would add its keys as rows, so wrap it first.
    if isinstance(records, dict):
        records = [records]
    flattened_data.extend(records)

# Convert to pandas DataFrame
df = pd.DataFrame(flattened_data)

# Save to CSV
df.to_csv('synthetic_billing_data.csv', index=False)

# Display the DataFrame (optional)
print(df)

  Patient ID      Patient Name Diagnosis Code Procedure Code  Total Charge  \
0     P12345  Xylophone Yonder          M54.5          99213         150.0   
1     P67890     Zephyr Zenith          R10.1          99214         200.0   

   Insurance Claim Amount  
0                   120.0  
1                   160.0  


In [14]:
# Asynchronous dataset generation
async def generate_dataset_async():
    async_synthetic_dataset = await dataset_generator.generate_dataset_async(fields_collection)
    print(async_synthetic_dataset)

# In Jupyter or environments with an active event loop
# Use await directly to call the asynchronous function
await generate_dataset_async()


[{'medical_billing_data': [{'Patient ID': 'P09876', 'Patient Name': 'Quirky Quasar', 'Diagnosis Code': 'G47.33', 'Procedure Code': '99215', 'Total Charge': 250.0, 'Insurance Claim Amount': 200.0}]}, {'medical_billing_data': [{'Patient ID': 'P09876', 'Patient Name': 'Quirky Quasar', 'Diagnosis Code': 'G47.33', 'Procedure Code': '99215', 'Total Charge': 250.0, 'Insurance Claim Amount': 200.0}]}]
