In [78]:
from typing import Optional, List, Dict, Any, Callable
import json
import random


class SyntheticDataGenerator:
    def __init__(
            self,
            generator_llm: Callable,
            judge_llm: Callable,
            columns: List[str],
            example_data: List[Dict[str, Any]],
            real_data: Optional[List[Dict[str, Any]]] = None
    ):
        """Initialize the SyntheticDataGenerator with LLMs, columns, and example/real data."""
        self.generator_llm = generator_llm
        self.judge_llm = judge_llm
        self.columns = columns
        self.example_data = example_data
        self.real_data = real_data
        self.generated_data = []
        self.feedback_history = []

    def generate_data(self, num_samples: int) -> List[Dict[str, Any]]:
        """Generate a specified number of synthetic data samples with feedback and improvement."""
        for _ in range(num_samples):
            while True:
                generated = self._generate_single_data_point()
                score = self._judge_data_point(generated)

                if score >= 0.9:  # Perfect score
                    self.generated_data.append(generated)
                    break
                elif score >= 0.5:  # Medium score
                    if self._ask_human_feedback(generated):
                        self.generated_data.append(generated)
                        break
                    else:
                        self._inform_generator(generated, score, "Human rejected")
                else:  # Low score
                    self._inform_generator(generated, score, "Low score")

        return self.generated_data

    def _generate_single_data_point(self) -> Dict[str, Any]:
        """Generate a single data point using the generator LLM."""
        system_prompt = "You are a synthetic data generator. Generate one realistic data based on the given examples and criteria."
        prompt = self._create_generation_prompt()
    
        # Call the generator LLM (Hugging Face in this case)
        generated = self.generator_llm(prompt, system_prompt=system_prompt, temperature=0.7, top_p=0.85)
        
        # Print the raw output for debugging
        # print("Raw output from LLM:", generated)
        
        try:
            # Attempt to parse the generated data as JSON
            return json.loads(generated)
        except json.JSONDecodeError:
            print(f"Failed to parse generated data as JSON: {generated}")
            return {}


    def _judge_data_point(self, data: Dict[str, Any]) -> float:
        """Judge the generated data using the judge LLM."""
        system_prompt = "You are a data quality judge. Evaluate the given data based on the criteria and return a score between 0 and 1. It's important to only send score without any description"
        criteria = self._create_judge_criteria()
        prompt = f"Data to evaluate: {json.dumps(data)}\n\nCriteria:\n{criteria}\n\nProvide only a numeric score between 0 and 1."
        score_str = self.judge_llm(prompt, system_prompt=system_prompt)
    
        try:
            score = float(score_str)
            if score < 0 or score > 1:
                raise ValueError(f"Score out of bounds: {score}")
            return score
        except ValueError:
            print(f"Failed to parse judge score: {score_str}. Defaulting to 0.0")
            return 0.0

    def _ask_human_feedback(self, data: Dict[str, Any]) -> bool:
        """Simulate asking for human feedback on the generated data."""
        print("\nPlease review this generated data point:")
        for col, value in data.items():
            print(f"{col}: {value}")
        return input("Is this data acceptable? (y/n): ").lower() == 'y'

    def _inform_generator(self, data: Dict[str, Any], score: float, reason: str):
        """Provide feedback to the generator based on the judged data."""
        feedback = f"Generated data: {json.dumps(data)}\nScore: {score}\nReason: {reason}"
        self.feedback_history.append(feedback)
        print(f"Feedback for generator: {feedback}")

    def _create_generation_prompt(self) -> str:
        """Create the prompt for the generator LLM to produce a data point."""
        random_variation = random.choice([
            "Please ensure diversity in the generated data.",
            "Generate a new data point with slight variation from the examples.",
            "Ensure the generated data is fresh and distinct from previous examples."
        ])
        
        prompt = f"Generate synthetic data with the following columns: {', '.join(self.columns)}\n"
        prompt += "The data should be similar to the following examples:\n\n"
        for example in self.example_data:
            prompt += json.dumps(example) + "\n"
        if self.real_data:
            prompt += "\nAdditional real data for reference:\n"
            for real in self.real_data:
                prompt += json.dumps(real) + "\n"
        if self.feedback_history:
            prompt += "\nPrevious feedback:\n"
            prompt += "\n".join(self.feedback_history[-3:])  # Include last 3 feedback items
    
        # Add instruction to return valid JSON
        prompt += f"\n{random_variation}\n"
        prompt += "Output the data as a valid JSON object without any additional explanations or text."
        
        return prompt


    def _create_judge_criteria(self) -> str:
        """Create the judging criteria for evaluating the generated data."""
        criteria = """
        Evaluate the generated data based on the following criteria:
        1. Contains all required columns (name, age, occupation)
        2. Data types match the example data (name: string, age: integer, occupation: string)
        3. Values are plausible and coherent (e.g., reasonable age, sensible occupation)
        4. Absence of personally identifiable information beyond what is necessary
        5. Similarity to the example data patterns
    
        Give the data a fair score even if the values are slightly different but still realistic.
    
        Return a score between 0 and 1, where 1 is a perfect match.
        """
        return criteria


In [50]:
from llms import OpenAi, IndoxApi

indox = IndoxApi(api_key=indox_api_key)

# Get api key from NVIDIA (https://build.nvidia.com/nvidia/nemotron-4-340b-instruct?api_key=true&)
nemotron = OpenAi(api_key=nv_api_key, model="nvidia/nemotron-4-340b-instruct",
                  base_url="https://integrate.api.nvidia.com/v1")
nemotron_response = nemotron.chat(prompt="Write a limerick about the wonders of GPU computing.",
                                  system_prompt="You are a helpful assistant designed to generate synthetic data.",
                                  stream=True, temperature=0.2, top_p=0.7)
print(nemotron_response)

In the realm of computing, where speed is a must,
GPU power is something we trust.
Parallel processing, oh what a delight,
Transforms heavy workloads, from day into night.
With teraflops galore, it's a tech lover's lust.

Nvidia, AMD, they're in the game,
Their silicon marvels, far from tame.
Complex calculations, in a flash,
For AI, gaming, or a data mash.
GPU computing, forever we'll acclaim!


In [51]:
indox_response = indox.chat(prompt="Write a limerick about the wonders of GPU computing.",
                            system_prompt="You are a helpful assistant designed to generate synthetic data.",
                            stream=True, temperature=0.2, top_p=0.7)
print(indox_response)

In a world where the pixels collide,  
GPUs work with incredible pride.  
With parallel might,  
They render the light,  
And in data's vast ocean, they glide!


In [68]:
# Set up the generator
columns = ["name", "age", "occupation"]
example_data = [
    {"name": "Alice Johnson", "age": 35, "occupation": "Manager"},
    {"name": "Bob Williams", "age": 42, "occupation": "Accountant"}
]

generator = SyntheticDataGenerator(
    generator_llm=indox.chat,
    judge_llm=nemotron.chat,
    columns=columns,
    example_data=example_data
)
generated_data = generator.generate_data(num_samples=1)
generated_data

[[{'name': 'Catherine Smith', 'age': 29, 'occupation': 'Software Engineer'},
  {'name': 'David Brown', 'age': 38, 'occupation': 'Graphic Designer'},
  {'name': 'Emma Wilson', 'age': 27, 'occupation': 'Data Analyst'},
  {'name': 'Frank Miller', 'age': 45, 'occupation': 'Project Manager'},
  {'name': 'Grace Taylor', 'age': 31, 'occupation': 'Marketing Specialist'},
  {'name': 'Henry Davis', 'age': 50, 'occupation': 'Sales Executive'},
  {'name': 'Isabella Martinez', 'age': 24, 'occupation': 'Content Writer'},
  {'name': 'James Anderson', 'age': 39, 'occupation': 'Financial Advisor'},
  {'name': 'Sophia Thomas', 'age': 34, 'occupation': 'HR Coordinator'},
  {'name': 'Liam Jackson', 'age': 28, 'occupation': 'Web Developer'}]]

In [79]:
from llms import OpenAi,HuggingFaceModel

# Initialize HuggingFaceModel
hf_model = HuggingFaceModel(api_key=hf_api_key, model="mistralai/Mistral-Nemo-Instruct-2407")

# Set up the generator
columns = ["name", "age", "occupation"]
example_data = [
    {"name": "Alice Johnson", "age": 35, "occupation": "Manager"},
    {"name": "Bob Williams", "age": 42, "occupation": "Accountant"}
]

# Get api key from NVIDIA
nemotron = OpenAi(api_key=nv_api_key, model="nvidia/nemotron-4-340b-instruct", base_url="https://integrate.api.nvidia.com/v1")

# Use HuggingFaceModel chat method for generation
generator = SyntheticDataGenerator(
    generator_llm=hf_model.chat,
    judge_llm=nemotron.chat,
    columns=columns,
    example_data=example_data
)

# Generate synthetic data using the HuggingFaceModel as the generator
generated_data = generator.generate_data(num_samples=1)

# Print generated data
print(generated_data)

[32mINFO[0m: [1mInitializing HuggingFaceModel with model: mistralai/Mistral-Nemo-Instruct-2407[0m
[32mINFO[0m: [1mHuggingFaceModel initialized successfully[0m
[32mINFO[0m: [1mGenerating response from Hugging Face model[0m
[32mINFO[0m: [1mSending request to Hugging Face API[0m
[32mINFO[0m: [1mReceived successful response from Hugging Face API[0m
Failed to parse generated data as JSON: You are a synthetic data generator. Generate one realistic data based on the given examples and criteria.
Generate synthetic data with the following columns: name, age, occupation
The data should be similar to the following examples:

{"name": "Alice Johnson", "age": 35, "occupation": "Manager"}
{"name": "Bob Williams", "age": 42, "occupation": "Accountant"}

Generate a new data point with slight variation from the examples.
Output the data as a valid JSON object without any additional explanations or text.
hiiii 0
Feedback for generator: Generated data: {}
Score: 0.0
Reason: Low score


KeyboardInterrupt: 

In [None]:
from llms import Mistral
mistral_api = Mistral(api_key="your-api-key")

synthetic_data_gen = SyntheticDataGenerator(
    generator_llm=mistral_api.chat,  # Pass the chat method
    judge_llm=nemotron.chat,      # Pass the chat method or a custom judge function
    columns=["name", "age", "occupation"],
    example_data=[{"name": "Alice", "age": 30, "occupation": "Engineer"}]
)

# Generate synthetic data
generated_data = synthetic_data_gen.generate_data(num_samples=10)
