In [1]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file (if one is present) into the process environment.
load_dotenv()
# Read the API keys used by this notebook. Indexing os.environ raises KeyError
# on this line if a key is missing, so a misconfigured environment fails early.
# NOTE(review): OPENAI_API_KEY is read but not referenced in the cells visible here - confirm it is still needed.
INDOX_API_KEY = os.environ['INDOX_API_KEY']
OPENAI_API_KEY = os.environ['OPENAI_API_KEY']
NVIDIA_API_KEY = os.environ['NVIDIA_API_KEY']

In [2]:
from models import OpenAi, IndoxApi

# Client for the Indox API, authenticated with the key read from the environment.
indox = IndoxApi(api_key=INDOX_API_KEY)

# Nemotron is accessed through the OpenAi client by pointing base_url at NVIDIA's endpoint.
# Get an API key from NVIDIA (https://build.nvidia.com/nvidia/nemotron-4-340b-instruct?api_key=true&)
nemotron = OpenAi(api_key=NVIDIA_API_KEY, model="nvidia/nemotron-4-340b-instruct",
                  base_url="https://integrate.api.nvidia.com/v1")

In [7]:
# Smoke test: send one prompt to the Nemotron client and print the reply.
# NOTE(review): stream=True is passed, yet the result is printed as a single value -
# confirm that chat() returns the fully assembled text in streaming mode.
nemotron_response = nemotron.chat(prompt="Write a limerick about the wonders of GPU computing.",
                                  system_prompt="You are a helpful assistant designed to generate synthetic data.",
                                  stream=True, temperature=0.2, top_p=0.7)
print(nemotron_response)

In the realm of computing, where speed is a must,
GPU power is something we trust.
Parallel processing, oh what a delight,
Transforms heavy workloads, from day into night.
With teraflops galore, it's a tech lover's lust.

Nvidia, AMD, they're in the game,
Their silicon marvels, far from tame.
Complex calculations, in a flash,
For AI, gaming, or a data mash.
GPU computing, forever we'll acclaim!


In [9]:
# Smoke test: send the same prompt and sampling settings to the Indox client and print the reply.
# NOTE(review): stream=True is passed, yet the result is printed as a single value -
# confirm that chat() returns the fully assembled text in streaming mode.
indox_response = indox.chat(prompt="Write a limerick about the wonders of GPU computing.",
                            system_prompt="You are a helpful assistant designed to generate synthetic data.",
                            stream=True, temperature=0.2, top_p=0.7)
print(indox_response)

In a world where the pixels do dance,  
GPUs give our data a chance.  
With cores all aligned,  
They process combined,  
Making algorithms prance in a trance!


In [10]:
import csv
import json
import math
import random
import re
from io import StringIO
from typing import List, Dict, Any, Optional

class SyntheticDataGenerator:
    """Generate synthetic tabular records with one LLM and vet them with another.

    Each round asks ``generator_llm`` for a single JSON object containing every
    requested column, then asks ``judge_llm`` to score it between 0 and 1:

    * score below ``LOW_SCORE_THRESHOLD``: rejected, and the rejection is
      recorded as feedback for later prompts;
    * score between the two thresholds (inclusive): a human is asked to accept
      or reject the record via ``input()``;
    * score above ``HIGH_SCORE_THRESHOLD``: accepted automatically, provided the
      record is neither a duplicate nor too similar to the most recent records.

    Both LLM objects only need a ``chat(prompt, system_prompt=..., temperature=...)``
    method that returns the reply text.

    Attributes:
        generated_data: Accepted records, in acceptance order.
        feedback_history: Rejection notes that are fed back into later prompts.
    """

    # Score bands used by generate_data().
    LOW_SCORE_THRESHOLD = 0.4
    HIGH_SCORE_THRESHOLD = 0.7
    # Score assumed when the judge's reply cannot be interpreted; it lands in
    # the human-review band so an unreadable reply never auto-accepts or auto-rejects.
    DEFAULT_SCORE = 0.5
    # How many times a single data point is requested before giving up on it.
    MAX_GENERATION_ATTEMPTS = 5
    # Only the most recent accepted rows are listed as "already used" in the
    # prompt, which keeps the prompt size bounded for large runs.
    MAX_USED_VALUE_ROWS = 20
    # Matches plain decimal numbers embedded in free text (e.g. "Score: 0.8").
    _NUMBER_PATTERN = re.compile(r"[-+]?(?:\d+\.\d+|\.\d+|\d+)")

    def __init__(
            self,
            generator_llm,
            judge_llm,
            columns: List[str],
            example_data: List[Dict[str, Any]],
            real_data: Optional[List[Dict[str, Any]]] = None
    ):
        """Store the collaborators and start with empty result/feedback lists.

        Args:
            generator_llm: Object whose ``chat`` method produces candidate records.
            judge_llm: Object whose ``chat`` method scores candidate records.
            columns: Column names every generated record must contain.
            example_data: Example records shown to the generator.
            real_data: Optional additional reference records shown to the generator.
        """
        self.generator_llm = generator_llm
        self.judge_llm = judge_llm
        self.columns = columns
        self.example_data = example_data
        self.real_data = real_data
        self.generated_data = []
        self.feedback_history = []

    def generate_data(self, num_samples: int, max_rounds: Optional[int] = None) -> str:
        """Generate records until ``num_samples`` have been accepted.

        Accepted records accumulate on ``self.generated_data`` across calls, so
        a second call continues from the records that are already there.

        Args:
            num_samples: Total number of accepted records to reach.
            max_rounds: Optional cap on generate/judge rounds. ``None`` (the
                default) keeps trying until enough records are accepted, which
                never terminates if the generator cannot produce acceptable data.

        Returns:
            The accepted records as CSV text (header row included).
        """
        rounds = 0
        while len(self.generated_data) < num_samples:
            if max_rounds is not None and rounds >= max_rounds:
                print(f"Stopping after {max_rounds} rounds with "
                      f"{len(self.generated_data)}/{num_samples} data points.")
                break
            rounds += 1

            generated = self._generate_single_data_point()
            if not generated:
                continue  # Skip empty generations

            score = self._judge_data_point(generated)

            if score < self.LOW_SCORE_THRESHOLD:
                # Inform generator about its mistake and retry
                self._inform_generator(generated, score, "Low score")
            elif score <= self.HIGH_SCORE_THRESHOLD:
                # Borderline quality: let a human decide
                print(f"Generated data for human review: {generated}")
                if self._ask_human_feedback(generated):
                    if not self._is_duplicate(generated):
                        self.generated_data.append(generated)
                        print(f"Human accepted data point: {generated}")
                    else:
                        print("Duplicate data detected. Skipping...")
                else:
                    self._inform_generator(generated, score, "Rejected by human feedback")
            else:  # score > HIGH_SCORE_THRESHOLD
                if not self._is_duplicate(generated) and self._is_diverse(generated):
                    self.generated_data.append(generated)
                    print(f"Generated diverse data point: {generated}")
                else:
                    print("Generated data is not diverse or is a duplicate. Retrying...")

        return self._convert_to_csv()

    def _generate_single_data_point(self) -> Dict[str, Any]:
        """Ask the generator for one record, retrying on unusable replies.

        Returns:
            The parsed record, or an empty dict when every attempt failed.
        """
        system_prompt = "You are a synthetic data generator. Generate diverse and realistic data based on the given examples and criteria. Your response must be a valid JSON object."
        prompt = self._create_generation_prompt()

        max_attempts = self.MAX_GENERATION_ATTEMPTS
        for attempt in range(max_attempts):
            is_last_attempt = attempt == max_attempts - 1
            generated = self.generator_llm.chat(prompt, system_prompt=system_prompt, temperature=1.0)

            data = None
            try:
                data = self._extract_json_object(generated)
            except json.JSONDecodeError as e:
                print(f"Failed to parse generated data (Attempt {attempt + 1}/{max_attempts}): {str(e)}")
            else:
                if data is None:
                    print("No valid JSON object found in the generated data")

            if data is not None:
                missing_columns = set(self.columns) - set(data)
                if missing_columns:
                    print(f"Generated data is missing columns: {missing_columns}")
                elif not self._is_diverse(data) and not is_last_attempt:
                    # Nudge the generator and try again; on the last attempt the
                    # record is returned anyway and the caller decides.
                    print("Generated data is not diverse enough. Modifying the prompt and retrying...")
                    prompt += f"\nAvoid using the exact same values. Try changing {', '.join(self.columns)}.\n"
                    continue
                else:
                    return data

            if not is_last_attempt:
                print(f"Retrying generation (Attempt {attempt + 2}/{max_attempts})...")

        print("Max attempts reached. Skipping this data point.")
        return {}

    @staticmethod
    def _extract_json_object(text: Any) -> Optional[Dict[str, Any]]:
        """Parse the outermost ``{...}`` span of ``text`` as a JSON object.

        Returns:
            The parsed dict, or ``None`` when ``text`` is not a string or holds
            no ``{...}`` span.

        Raises:
            json.JSONDecodeError: If the span exists but is not valid JSON.
        """
        if not isinstance(text, str):
            return None
        json_start = text.find('{')
        json_end = text.rfind('}')
        if json_start == -1 or json_end < json_start:
            return None
        parsed = json.loads(text[json_start:json_end + 1])
        return parsed if isinstance(parsed, dict) else None

    def _create_generation_prompt(self) -> str:
        """Build the generator prompt from examples, used values and feedback."""
        parts = [
            f"Generate diverse synthetic data with the following columns: {', '.join(self.columns)}\n",
            "Ensure each generated data point is significantly different from previous ones.\n",
            "The data should be similar to the following examples, but not identical:\n\n",
        ]
        parts.extend(json.dumps(example) + "\n" for example in self.example_data)

        if self.real_data:
            parts.append("\nAdditional real data for reference:\n")
            parts.extend(json.dumps(real) + "\n" for real in self.real_data)

        if self.generated_data:
            # List values of recently accepted rows, per column, so the
            # generator can steer away from them. dict.fromkeys de-duplicates
            # while keeping a stable order; str() makes any value joinable.
            recent_rows = self.generated_data[-self.MAX_USED_VALUE_ROWS:]
            parts.append("\nValues already used (do not repeat them in the new data point):\n")
            for col in self.columns:
                used = dict.fromkeys(str(row[col]) for row in recent_rows if col in row)
                if used:
                    parts.append(f"{col}: {', '.join(used)}\n")

        if self.feedback_history:
            parts.append("\nPrevious feedback:\n")
            parts.append("\n".join(self.feedback_history[-3:]))  # Include last 3 feedback items

        parts.append("\nGenerate a single data point as a JSON object.")
        return "".join(parts)

    def _convert_to_csv(self) -> str:
        """Serialize the accepted records to CSV text with a header row.

        Keys that are not in ``self.columns`` are dropped instead of raising,
        because the generator may add fields that were not requested.
        """
        output = StringIO()
        writer = csv.DictWriter(output, fieldnames=self.columns, extrasaction="ignore")
        writer.writeheader()
        writer.writerows(self.generated_data)
        return output.getvalue()

    def _judge_data_point(self, data: Dict[str, Any]) -> float:
        """Ask the judge for a quality score for ``data``.

        Returns:
            A score in [0, 1]; ``DEFAULT_SCORE`` when the reply is unreadable.
        """
        system_prompt = "You are a data quality judge. Evaluate the given data based on the criteria and return a score between 0 and 1."
        criteria = self._create_judge_criteria()
        prompt = f"Data to evaluate: {json.dumps(data)}\n\nCriteria:\n{criteria}\n\nProvide a numeric score between 0 and 1."

        score_str = self.judge_llm.chat(prompt, system_prompt=system_prompt, temperature=0.5)
        score = self._parse_score(score_str)
        if score is None:
            print(f"Failed to parse judge score: {score_str}")
            return self.DEFAULT_SCORE  # Mid-range score to avoid rejecting too often
        return score

    @classmethod
    def _parse_score(cls, raw: Any) -> Optional[float]:
        """Turn a judge reply into a score in [0, 1], or ``None`` if impossible.

        A reply that is just a number is clamped into [0, 1]. Otherwise the
        first number embedded in the text that already lies in [0, 1] is used;
        this is a heuristic for replies such as "Score: 0.8".
        """
        text = str(raw).strip()
        try:
            value = float(text)
        except ValueError:
            for match in cls._NUMBER_PATTERN.findall(text):
                candidate = float(match)
                if 0.0 <= candidate <= 1.0:
                    return candidate
            return None
        if math.isnan(value):
            # NaN fails every comparison and would otherwise slip into the
            # auto-accept branch of generate_data().
            return None
        return min(1.0, max(0.0, value))

    def _is_diverse(self, new_data: Dict[str, Any]) -> bool:
        """Check that ``new_data`` differs enough from the most recent records.

        The record must differ from each of the last two accepted records in at
        least a quarter of the columns, and always in at least one column so
        that short schemas do not make the check vacuous.
        """
        required_differences = max(1, len(self.columns) // 4)
        for data in self.generated_data[-2:]:
            differences = sum(1 for col in self.columns if str(new_data.get(col)) != str(data.get(col)))
            if differences < required_differences:
                return False
        return True

    def _ask_human_feedback(self, data: Dict[str, Any]) -> bool:
        """Show ``data`` on stdout and ask the user to accept it via ``input()``.

        Returns:
            True when the answer is "y" or "yes" (case-insensitive).
        """
        print("\nPlease review this generated data point:")
        for col, value in data.items():
            print(f"{col}: {value}")
        response = input("Is this data acceptable? (y/n): ").strip().lower()
        return response in ("y", "yes")

    def _inform_generator(self, data: Dict[str, Any], score: float, reason: str):
        """Record a rejection so later generation prompts can include it."""
        feedback = f"Generated data: {json.dumps(data)}\nScore: {score}\nReason: {reason}"
        self.feedback_history.append(feedback)
        print(f"Feedback for generator: {feedback}")

    def _is_duplicate(self, new_data: Dict[str, Any]) -> bool:
        """Return True when an accepted record matches ``new_data`` on every column.

        Values are compared by their string form.
        """
        return any(
            all(str(new_data.get(col)) == str(existing_data.get(col)) for col in self.columns)
            for existing_data in self.generated_data
        )

    def _create_judge_criteria(self) -> str:
        """Return the fixed list of criteria sent to the judge."""
        criteria = "Evaluate the generated data based on the following relaxed criteria:\n"
        criteria += "1. Contains all required columns.\n"
        criteria += "2. Data types match the example data.\n"
        criteria += "3. Values are generally plausible and make sense.\n"
        criteria += "4. Avoids clear personal information like full names, addresses.\n"
        criteria += "5. Similar to example data patterns but not exact copies.\n"
        criteria += "Return a score between 0 and 1, where 1 is perfect.It is very important to only return score. do not add any description or feedback."
        return criteria

In [11]:
# Set up the generator
# Columns that every generated record must contain.
columns = ["name", "age", "occupation"]
# Example records shown to the generator; each one supplies a value for every column.
example_data = [
    {"name": "Alice Johnson", "age": 35, "occupation": "Manager"},
    {"name": "Bob Williams", "age": 42, "occupation": "Accountant"}
]

# The Indox client produces candidate records and the Nemotron client scores them.
generator = SyntheticDataGenerator(
    generator_llm=indox,
    judge_llm=nemotron,
    columns=columns,
    example_data=example_data
)


In [12]:
# Generate data
# Runs until three records have been accepted and returns them as CSV text.
# Mid-range judge scores trigger an interactive input() prompt, so this cell needs a live user.
generated_data = generator.generate_data(num_samples=3)

Generated data for human review: {'name': 'Emma Garcia', 'age': 29, 'occupation': 'Software Engineer'}

Please review this generated data point:
name: Emma Garcia
age: 29
occupation: Software Engineer
Feedback for generator: Generated data: {"name": "Emma Garcia", "age": 29, "occupation": "Software Engineer"}
Score: 0.6
Reason: Rejected by human feedback
Generated data for human review: {'name': 'Evelyn Thompson', 'age': 51, 'occupation': 'Graphic Designer'}

Please review this generated data point:
name: Evelyn Thompson
age: 51
occupation: Graphic Designer
Human accepted data point: {'name': 'Evelyn Thompson', 'age': 51, 'occupation': 'Graphic Designer'}
Generated data for human review: {'name': 'Ming Zhao', 'age': 54, 'occupation': 'Civil Engineer'}

Please review this generated data point:
name: Ming Zhao
age: 54
occupation: Civil Engineer
Human accepted data point: {'name': 'Ming Zhao', 'age': 54, 'occupation': 'Civil Engineer'}
Generated data for human review: {'name': 'Michael Th

In [13]:
# Display the CSV text returned by generate_data (shown as a string repr, so line breaks appear as \r\n).
generated_data

'name,age,occupation\r\nEvelyn Thompson,51,Graphic Designer\r\nMing Zhao,54,Civil Engineer\r\nMichael Thompson,54,Construction Foreman\r\n'

In [10]:
# Print final generated data
# Dumps every accepted record held on the generator as pretty-printed JSON.
# Relies on `json` being imported by the cell that defines SyntheticDataGenerator.
print("\nFinal generated data:")
for data in generator.generated_data:
    print(json.dumps(data, indent=2))


Final generated data:
{
  "name": "Charlie Davis",
  "age": 39,
  "occupation": "Software Engineer"
}
{
  "name": "Charlie Davis",
  "age": 39,
  "occupation": "Software Engineer"
}
{
  "name": "Charlie Davis",
  "age": 39,
  "occupation": "Software Engineer"
}
