In [None]:
import json
from typing import List, Dict, Any
import requests

class SyntheticDataGenerator:
    """Generate synthetic data by prompting NVIDIA's hosted Nemotron chat-completions API.

    The model is asked to answer with a JSON array; the reply is cleaned of any
    Markdown code fence and parsed with ``json.loads``.
    """

    def __init__(self, api_key: str, num_samples: int = 1, *, timeout: float = 120.0):
        """
        :param api_key: Bearer token sent in the ``Authorization`` header
        :param num_samples: Number of samples requested in the prompt
        :param timeout: Seconds ``requests`` waits for the server before giving up
                        (without it a stalled connection would hang the cell forever)
        """
        self.api_key = api_key
        self.num_samples = num_samples
        self.timeout = timeout
        self.base_url = "https://integrate.api.nvidia.com/v1"  # NVIDIA API endpoint

    def generate_data(self, instructions: str) -> List[Dict[str, Any]]:
        """
        Send the instructions to the model and return the parsed JSON reply.

        :param instructions: Free-text description of the data to generate
        :return: Whatever JSON value the model produced (the prompt asks for an array of dicts)
        :raises ValueError: If the API answers with a non-200 status or the reply is not valid JSON
        """
        prompt = (
            f"Using the following instructions, generate {self.num_samples} samples of synthetic data in JSON format. "
            "Ensure the output is a valid JSON array where each entry is a dictionary. "
            f"Instructions: {instructions}"
        )

        # Prepare the API call with messages, temperature, and other parameters
        data = {
            "model": "nvidia/nemotron-4-340b-instruct",
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0.2,
            "top_p": 0.7,
            "max_tokens": 1024,
            "stream": False
        }

        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}"
        }

        # Make the API call to NVIDIA's endpoint
        response = requests.post(
            f"{self.base_url}/chat/completions",
            headers=headers,
            json=data,
            timeout=self.timeout
        )

        if response.status_code != 200:
            raise ValueError(f"Failed to call the API: {response.status_code} {response.text}")

        # Get the raw response from the model
        raw_output = response.json()['choices'][0]['message']['content']

        # Parse the response
        return self._parse_response(raw_output)

    def _parse_response(self, raw_output: str) -> List[Dict[str, Any]]:
        """
        Parse the raw model reply as JSON after removing Markdown code-fence decoration.

        :param raw_output: The raw string content returned by the model
        :return: The decoded JSON value
        :raises ValueError: If the cleaned text is not valid JSON
        """
        json_output = self._strip_code_fence(raw_output)
        try:
            return json.loads(json_output)
        except json.JSONDecodeError as e:
            # Keep the original decoder error attached for easier debugging
            raise ValueError(f"LLM output is not valid JSON: {e}") from e

    @staticmethod
    def _strip_code_fence(text: str) -> str:
        """Remove surrounding whitespace, backtick fences and a leading ``json`` language tag."""
        # Whitespace has to go first: a trailing newline after the closing fence
        # would otherwise shield the backticks from being stripped.
        cleaned = text.strip().strip("`").strip()
        # A valid JSON document never starts with a bare "json" token, so dropping
        # the language tag (in any letter case) cannot damage real payloads.
        if cleaned[:4].lower() == "json":
            cleaned = cleaned[4:].lstrip()
        return cleaned



In [None]:
# Build a generator with the default settings (one sample per request).
generator = SyntheticDataGenerator(api_key=nv_api_key)

# Request synthetic user profiles and show the parsed reply.
profile_instructions = "Generate synthetic user profiles with fields: name, age, email."
data = generator.generate_data(profile_instructions)
print(data)


[{'name': 'John Doe', 'age': 30, 'email': 'john.doe@example.com'}]


In [None]:
# Instructions for a small labeled sentiment-analysis dataset.
sentiment_instructions = (
    "Generate 10 labeled text samples for sentiment analysis. "
    "Each sample should be a sentence with a label: 'positive', 'negative', or 'neutral'."
)

# Reuse the generator created earlier and show the parsed reply.
synthetic_sentiment_data = generator.generate_data(sentiment_instructions)
print(synthetic_sentiment_data)

[{'text': 'The new restaurant in town serves delicious food and has excellent service.', 'label': 'positive'}, {'text': 'I am disappointed with the quality of the product, it broke after just one use.', 'label': 'negative'}, {'text': 'The weather today is neither too hot nor too cold, just perfect for a walk outside.', 'label': 'neutral'}, {'text': "This book is so inspiring and thought-provoking, I couldn't put it down!", 'label': 'positive'}, {'text': "The hotel room was dirty and noisy, I couldn't sleep at all.", 'label': 'negative'}, {'text': "The movie was okay, not the best I've seen but not the worst either.", 'label': 'neutral'}, {'text': 'I am so grateful for my friends and family, they bring so much joy to my life.', 'label': 'positive'}, {'text': 'The customer service was terrible, they were rude and unhelpful.', 'label': 'negative'}, {'text': 'The concert last night was amazing, the band put on a great show!', 'label': 'positive'}, {'text': 'The article was informative but 

In [None]:
# Instructions for question-answer pairs on world history.
qa_instructions = (
    "Generate 10 question-answer pairs about world history, "
    "with questions ranging from basic to advanced."
)
synthetic_qa_data = generator.generate_data(qa_instructions)

# Show each generated pair on its own line.
for qa_pair in synthetic_qa_data:
    print(qa_pair)

{'question': "Who is known as the 'Father of History' for his work on the Histories, which chronicled the Greco-Persian Wars?", 'answer': 'Herodotus'}
{'question': 'Which ancient civilization is credited with the invention of paper, gunpowder, and the compass?', 'answer': 'China'}
{'question': 'What was the name of the legal code established by Hammurabi, the sixth king of the First Babylonian Dynasty?', 'answer': 'Code of Hammurabi'}
{'question': 'Which empire was led by Attila the Hun during the 5th century?', 'answer': 'Hunnic Empire'}
{'question': 'Who was the first Roman Emperor, marking the end of the Roman Republic?', 'answer': 'Augustus'}
{'question': 'What was the name of the agreement that officially ended World War I?', 'answer': 'Treaty of Versailles'}
{'question': 'Which historical event is associated with the date October 31, 1517, and is considered a key starting point of the Protestant Reformation?', 'answer': "Martin Luther's Ninety-five Theses"}
{'question': 'Who was 

In [None]:
import json
from typing import List, Dict, Any, Callable
import requests


class SyntheticDataGenerator:
    """Generate synthetic data from NVIDIA's hosted Nemotron model using a caller-supplied prompt template."""

    def __init__(self, api_key: str, num_samples: int = 1, temperature: float = 0.2, top_p: float = 0.7,
                 max_tokens: int = 1024, *, timeout: float = 120.0):
        """
        Initializes the synthetic data generator with necessary configurations for NVIDIA Nemotron API.

        :param api_key: Your API key for accessing NVIDIA Nemotron
        :param num_samples: Number of samples to generate (stored on the instance; the
                            prompt template decides whether it is mentioned in the prompt)
        :param temperature: Sampling temperature for the model
        :param top_p: Controls the cumulative probability of token selection
        :param max_tokens: Maximum number of tokens to generate
        :param timeout: Seconds ``requests`` waits for the server before giving up
                        (without it a stalled connection would hang the cell forever)
        """
        self.api_key = api_key
        self.num_samples = num_samples
        self.temperature = temperature
        self.top_p = top_p
        self.max_tokens = max_tokens
        self.timeout = timeout
        self.base_url = "https://integrate.api.nvidia.com/v1"  # NVIDIA API endpoint

    def generate_data(self, instructions: str, prompt_template: Callable[[str], str]) -> List[Dict[str, Any]]:
        """
        Generate synthetic data based on the given instructions using a prompt template.

        :param instructions: The instructions that define the data generation task
        :param prompt_template: A callable function that formats the instructions into a prompt for the model
        :return: The decoded JSON reply (a list of dictionaries when the model follows the prompt)
        :raises ValueError: If the API answers with a non-200 status or the reply is not valid JSON
        """
        prompt = prompt_template(instructions)

        # Prepare the API call with messages, temperature, and other parameters
        data = {
            "model": "nvidia/nemotron-4-340b-instruct",
            "messages": [{"role": "user", "content": prompt}],
            "temperature": self.temperature,
            "top_p": self.top_p,
            "max_tokens": self.max_tokens,
            "stream": False
        }

        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}"
        }

        # Make the API call to NVIDIA's endpoint
        response = requests.post(
            f"{self.base_url}/chat/completions",
            headers=headers,
            json=data,
            timeout=self.timeout
        )

        if response.status_code != 200:
            raise ValueError(f"Failed to call the API: {response.status_code} {response.text}")

        # Get the raw response from the model
        raw_output = response.json()['choices'][0]['message']['content']

        # Parse and return the response
        return self._parse_response(raw_output)

    def _parse_response(self, raw_output: str) -> List[Dict[str, Any]]:
        """
        Parse the raw LLM output into a structured format.

        :param raw_output: The raw string response from the model
        :return: The decoded JSON value
        :raises ValueError: If the cleaned text is not valid JSON
        """
        json_output = self._strip_code_fence(raw_output)
        try:
            # Parse the cleaned-up output as JSON
            return json.loads(json_output)
        except json.JSONDecodeError as e:
            # Keep the original decoder error attached for easier debugging
            raise ValueError(f"LLM output is not valid JSON: {e}") from e

    @staticmethod
    def _strip_code_fence(text: str) -> str:
        """Remove surrounding whitespace, backtick fences and a leading ``json`` language tag."""
        # Whitespace has to go first: a trailing newline after the closing fence
        # would otherwise shield the backticks from being stripped.
        cleaned = text.strip().strip("`").strip()
        # A valid JSON document never starts with a bare "json" token, so dropping
        # the language tag (in any letter case) cannot damage real payloads.
        if cleaned[:4].lower() == "json":
            cleaned = cleaned[4:].lstrip()
        return cleaned

In [None]:
# Example usage of the class:

def my_prompt_template(instructions: str, num_samples: int = 1) -> str:
    """
    Custom prompt template for generating synthetic data.

    :param instructions: The instructions for the data generation
    :param num_samples: How many samples the prompt asks the model for; defaults to 1,
                        which reproduces the prompt this template always produced before
    :return: A formatted prompt string
    """
    return (
        f"Using the following instructions, generate {num_samples} samples of synthetic data in JSON format. "
        "Ensure the output is a valid JSON array where each entry is a dictionary. "
        f"Instructions: {instructions}"
    )

In [None]:
# Describe the data we want the model to produce.
instructions = "Generate synthetic user profiles containing 'name', 'age', and 'location'."

# Create the generator (one sample per request) with the NVIDIA API key.
api_key = nv_api_key
generator = SyntheticDataGenerator(api_key=api_key, num_samples=1)

# The custom template turns the instructions into the final prompt.
generated_data = generator.generate_data(
    instructions=instructions,
    prompt_template=my_prompt_template,
)

# Show the parsed reply.
print(generated_data)

[{'name': 'John Doe', 'age': 30, 'location': 'New York, USA'}]


In [None]:
import json
from typing import List, Dict, Any, Callable
import requests
import pandas as pd


class SyntheticDataGenerator:
    """Generate synthetic data from NVIDIA's hosted Nemotron model over one or more runs."""

    def __init__(self, api_key: str, num_samples: int = 1, temperature: float = 0.2, top_p: float = 0.7,
                 max_tokens: int = 2048, *, timeout: float = 120.0):
        """
        Initializes the synthetic data generator with necessary configurations for NVIDIA Nemotron API.

        :param api_key: Bearer token sent in the ``Authorization`` header
        :param num_samples: Sample count handed to the prompt template on every run
        :param temperature: Sampling temperature for the model
        :param top_p: Nucleus-sampling probability mass
        :param max_tokens: Maximum number of tokens to generate
        :param timeout: Seconds ``requests`` waits for the server before giving up
                        (without it a stalled connection would hang the cell forever)
        """
        self.api_key = api_key
        self.num_samples = num_samples
        self.temperature = temperature
        self.top_p = top_p
        self.max_tokens = max_tokens
        self.timeout = timeout
        self.base_url = "https://integrate.api.nvidia.com/v1"  # NVIDIA API endpoint

    def generate_data(self, subject: str, extra: str, runs: int, prompt_template: Callable[[str, str, int], str]) -> List[Dict[str, Any]]:
        """
        Generate synthetic data based on the given instructions using a prompt template.

        :param subject: Subject passed to the prompt template
        :param extra: Additional guidance passed to the prompt template
        :param runs: Number of API calls to make; one parsed reply is collected per run
        :param prompt_template: Callable building the prompt from ``(subject, extra, num_samples)``
        :return: A list with one parsed JSON value per run
        :raises ValueError: If the API answers with a non-200 status or a reply is not valid JSON
        """
        # These do not change between runs, so build them once.
        url = f"{self.base_url}/chat/completions"
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}"
        }

        synthetic_results = []
        for _ in range(runs):
            # The template is re-evaluated every run, exactly as before.
            prompt = prompt_template(subject, extra, self.num_samples)

            # Prepare the API call
            data = {
                "model": "nvidia/nemotron-4-340b-instruct",
                "messages": [{"role": "user", "content": prompt}],
                "temperature": self.temperature,
                "top_p": self.top_p,
                "max_tokens": self.max_tokens,
                "stream": False
            }

            # Make the API call to NVIDIA's endpoint
            response = requests.post(url, headers=headers, json=data, timeout=self.timeout)

            if response.status_code != 200:
                raise ValueError(f"Failed to call the API: {response.status_code} {response.text}")

            # Get the raw response from the model
            raw_output = response.json()['choices'][0]['message']['content']

            # Parse the response and add it to synthetic results
            synthetic_results.append(self._parse_response(raw_output))

        return synthetic_results

    def _parse_response(self, raw_output: str) -> List[Dict[str, Any]]:
        """
        Parse the raw LLM output into a structured format, ignoring surrounding
        prose and code-fence decoration.

        The text between the first ``{`` and the last ``}`` is tried first, so a
        single object (even one wrapped in a one-element array) is returned as a
        dict. When that span is not valid JSON -- for example a multi-element
        array, where the span would be ``{...}, {...}`` -- the text is scanned
        for the first ``{`` or ``[`` that starts a decodable JSON value.

        :param raw_output: The raw string response from the model
        :return: The decoded JSON value (a dict or a list)
        :raises ValueError: If no JSON object or array can be decoded from the text
        """
        # Braces never occur in fence markers or the "json" tag, so the span can be
        # taken from the raw text directly without stripping anything first.
        candidate = raw_output[raw_output.find('{'):raw_output.rfind('}') + 1]
        try:
            return json.loads(candidate)
        except json.JSONDecodeError as e:
            # The name bound by "except ... as" is cleared after the block; keep a reference.
            first_error = e

        # Fallback: decode the first JSON object/array found anywhere in the text,
        # tolerating trailing content such as a closing fence or commentary.
        decoder = json.JSONDecoder()
        for index, char in enumerate(raw_output):
            if char not in "{[":
                continue
            try:
                parsed, _ = decoder.raw_decode(raw_output[index:])
            except json.JSONDecodeError:
                continue
            return parsed

        raise ValueError(
            f"LLM output is not valid JSON: {first_error}\nRaw output: {raw_output}"
        ) from first_error




In [None]:
from pydantic import BaseModel
from typing import List

# Pydantic model for one structured medical-billing record.
# Instances are built further down in this notebook from the generated
# 'medical_billing_data' entries; the comment on each field names the
# entry key it is filled from there.
class MedicalBilling(BaseModel):
    patient_id: str  # from 'Patient ID'
    patient_name: str  # from 'Patient Name'
    diagnosis_code: str  # from 'Diagnosis Code'
    procedure_code: str  # from 'Procedure Code'
    total_charge: float  # from 'Total Charge' (converted with float())
    insurance_claim_amount: float  # from 'Insurance Claim Amount' (converted with float())


In [None]:
def my_prompt_template(subject: str, extra: str, num_samples: int) -> str:
    """
    Build the medical-billing generation prompt.

    :param subject: Name of the data subject inserted into the request sentence
    :param extra: Additional guidance placed before the closing format instruction
    :param num_samples: Number of samples the prompt asks for
    :return: The prompt as a single space-separated string
    """
    sentences = [
        f"Generate {num_samples} samples of {subject} data.",
        "Ensure the following fields are present:",
        "Patient ID, Patient Name, Diagnosis Code, Procedure Code, Total Charge, Insurance Claim Amount.",
        f"{extra}",
        "Output the data in valid JSON format.",
    ]
    return " ".join(sentences)


In [None]:
# Prompt inputs: the data subject plus extra guidance for the model.
subject = "medical_billing"
extra = "The name must be chosen at random. Make it something unusual."
runs = 1

# Create the generator (one sample per request) with the NVIDIA API key.
api_key = nv_api_key
generator = SyntheticDataGenerator(api_key=api_key, num_samples=1)

# One parsed reply is collected per run.
synthetic_results = generator.generate_data(
    subject=subject,
    extra=extra,
    runs=runs,
    prompt_template=my_prompt_template,
)
synthetic_results

[{'medical_billing_data': {'Patient ID': 'P12345',
   'Patient Name': 'Zephyr Quicksilver',
   'Diagnosis Code': 'M54.5',
   'Procedure Code': '99213',
   'Total Charge': 150.0,
   'Insurance Claim Amount': 120.0}}]

In [None]:
# Turn each raw generator result into a validated MedicalBilling record.
synthetic_data = []
for result in synthetic_results:
    # Anything that is not a dict wrapping 'medical_billing_data' is reported and skipped.
    if not (isinstance(result, dict) and 'medical_billing_data' in result):
        print(f"Unexpected result format: {result}")
        continue

    entry = result['medical_billing_data']
    # Map the human-readable keys of the entry onto the model's field names.
    billing_data = MedicalBilling(
        patient_id=entry['Patient ID'],
        patient_name=entry['Patient Name'],
        diagnosis_code=entry['Diagnosis Code'],
        procedure_code=entry['Procedure Code'],
        total_charge=float(entry['Total Charge']),
        insurance_claim_amount=float(entry['Insurance Claim Amount'])
    )
    synthetic_data.append(billing_data)

# One DataFrame row per validated record.
billing_rows = [billing.dict() for billing in synthetic_data]
synthetic_df = pd.DataFrame(billing_rows)

# Leave the frame as the cell's last expression so the notebook renders it.
synthetic_df


Unnamed: 0,patient_id,patient_name,diagnosis_code,procedure_code,total_charge,insurance_claim_amount
0,P12345,Zephyr Quicksilver,M54.5,99213,150.0,120.0
