# Project Overview: Synthetic Data Generator

The goal of this project is to
*   Use models that can generate synthetic datasets
*   Create a Gradio UI for the product

Why this is useful
*   Applies to almost any business area
*   Can be used in a day job and in personal projects

# STEP 0: Installs, Imports, API Setup

## Installs
Since we're running this in Google Colab, we first install the extra packages the notebook needs (`bitsandbytes` and `accelerate`).

In [None]:
!pip install -q --upgrade bitsandbytes accelerate

59.1/59.1 MB 18.6 MB/s eta 0:00:00

## Imports

In [None]:
import os
import re
import requests
import threading
import tempfile
import json
import pandas as pd
from dataclasses import dataclass
from typing import Any, List, Dict, Tuple, Optional
from google.colab import drive, userdata
from IPython.display import Markdown, display, update_display

import gradio as gr
from openai import OpenAI
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, TextIteratorStreamer, BitsAndBytesConfig
import torch

## Global Constants & Model Config Class Setup

In [34]:
# Model identifier string; assigned to model_choice below and, through it, used as
# the default for ModelConfig.model_id.
LLAMA = "meta-llama/Llama-3.2-3B-Instruct"
# Default generation budget; used as the default for ModelConfig.max_new_tokens.
max_tokens = 1024

In [35]:
# @TODO: Add some more models to test later - ran out of free Colab compute when making this
# Currently selected model identifier; ModelConfig.model_id defaults to this value.
model_choice = LLAMA

In [36]:
@dataclass
class ModelConfig:
  """Settings used to load the model and to generate text with it.

  The defaults for model_id and max_new_tokens are read from the module-level
  model_choice and max_tokens at the moment this class is defined.
  """
  # Identifier passed to from_pretrained() for both tokenizer and model in load_model.
  model_id: str = model_choice
  # Forwarded to model.generate() as max_new_tokens.
  max_new_tokens: int = max_tokens
  # Forwarded to model.generate() as do_sample.
  do_sample: bool = True
  # Forwarded to model.generate() as temperature.
  temperature: float = 0.7
  # Forwarded to model.generate() as top_p.
  top_p: float = 0.9

## Sign into HuggingFace Hub

In [None]:
# Read the Hugging Face token stored under the name 'HF_TOKEN' via Colab's userdata.
hf_token = userdata.get('HF_TOKEN')
# Sanity check only: a missing token, or one without the "hf_" prefix, prints a hint
# but does not stop the cell.
if hf_token and hf_token.startswith("hf_"):
  print("HF key looks good so far")
else:
  print("HF key is not set - please click the key in the left sidebar")

# NOTE(review): login() runs even when the check above failed (hf_token may be None) -
# confirm that falling through to login() in that case is intended.
login(hf_token, add_to_git_credential=True)

HF key looks good so far


# STEP 1: Set up Tokenizer, Quantization and Model in a function

In [33]:
def load_model(cfg: ModelConfig):
  """Load the tokenizer and causal LM named by cfg.model_id.

  The model is first requested with a 4-bit NF4 quantization config. If that
  attempt raises RuntimeError, a warning is printed and the model is loaded
  again without quantization (float16 when CUDA is available, float32 otherwise).

  Args:
    cfg: configuration whose model_id selects the checkpoint.

  Returns:
    A (tokenizer, model) tuple; the model has been switched to eval mode.
  """
  four_bit = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
  )

  tok = AutoTokenizer.from_pretrained(cfg.model_id)
  # Fall back to the EOS token when the tokenizer ships without a pad token.
  if tok.pad_token is None:
    tok.pad_token = tok.eos_token

  # Arguments common to both loading attempts.
  shared_kwargs = {"device_map": "auto"}

  try:
    lm = AutoModelForCausalLM.from_pretrained(
      cfg.model_id,
      quantization_config=four_bit,
      torch_dtype=torch.float16,
      **shared_kwargs,
    )
  except RuntimeError as err:
    # The unquantized fallback needs more memory than the 4-bit path.
    print(f"WARNING: 4-bit quantized loading failed, falling back to full precision. Error: {err}")
    fallback_dtype = torch.float32
    if torch.cuda.is_available():
      fallback_dtype = torch.float16
    lm = AutoModelForCausalLM.from_pretrained(
      cfg.model_id,
      torch_dtype=fallback_dtype,
      **shared_kwargs,
    )

  lm.eval()
  return tok, lm

# STEP 2: Design the Prompt Engine
Allows us to tell the LLM *what to do*

## System Prompt

In [32]:
# System message for dataset generation: instructs the model to answer with nothing
# but a JSON array of objects matching the user's schema.
system_prompt = """
You are an accurate synthetic data generator.
Your sole task is to output valid data in the format specified by the user.

Hard rules:
- Output ONLY a valid JSON array (list of objects). No markdown, no code fences, no commentary.
- Every object must contain exactly the requested fields (no extras).
- Values must match the requested types.
- Use realistic values and include variability.
- If a field is constrained by the user (e.g., ranges, enums), obey it.

The user will provide:
- Dataset description
- Exact schema (fields + types)
- Number of rows

Return ONLY the JSON array.
"""

# Module-level message list seeded with the system prompt.
# NOTE(review): generate_dataset_json builds its own local `messages` list, so this
# global appears unused in the visible code - confirm before relying on it.
messages = [
    {"role": "system", "content": system_prompt}
  ]

## Repair Prompt
Fun easter egg... we actually use the LLM again to repair the JSON output if needed!

In [None]:
# System message for the second-chance pass: generate_dataset_json sends it together
# with the unparseable model output when the first JSON parse fails.
repair_prompt = """You are a JSON repair tool.

Task:
- You will be given text that should be a JSON array (list of objects), but it may be invalid JSON.
- Produce ONLY a valid JSON array that preserves the intended data.
- No markdown, no commentary, no extra keys.
- If any values are missing quotes or invalid, fix them minimally.

Return ONLY the JSON array.
"""

## Text Generation Function using HuggingFace

In [31]:
def _chat_completion_hf(tokenizer, model, messages: List[Dict[str, str]], cfg: ModelConfig):
  """Run one chat exchange through a Hugging Face causal LM and return the reply.

  Args:
    tokenizer: tokenizer providing apply_chat_template() and decode().
    model: model providing parameters() and generate().
    messages: chat history as a list of {"role": ..., "content": ...} dicts.
    cfg: generation settings (max_new_tokens, do_sample, temperature, top_p).

  Returns:
    The decoded text of the newly generated tokens only. The prompt is not
    echoed back, so brackets in the prompt (or in text sent for repair) cannot
    leak into downstream JSON extraction.
  """
  # add_generation_prompt=True appends the assistant header so the model starts
  # a fresh assistant turn instead of continuing the user's message.
  enc = tokenizer.apply_chat_template(
      messages,
      add_generation_prompt=True,
      return_tensors="pt",
      padding=True,
      truncation=True,
  )

  # Put the inputs on the same device as the model weights.
  device = next(model.parameters()).device
  if isinstance(enc, torch.Tensor):
    # A bare tensor holds only the token ids; every position is real input,
    # so the attention mask is all ones.
    input_ids = enc.to(device)
    inputs = {
        "input_ids": input_ids,
        "attention_mask": torch.ones_like(input_ids, device=device),
    }
  else:
    # BatchEncoding / dict-like: move each tensor individually.
    inputs = {k: v.to(device) for k, v in enc.items()}

  # generate() returns prompt tokens followed by new tokens for a causal LM;
  # remember where the prompt ends so only the reply gets decoded.
  prompt_len = inputs["input_ids"].shape[-1]

  gen_kwargs = {
      "max_new_tokens": cfg.max_new_tokens,
      "do_sample": cfg.do_sample,
      "eos_token_id": tokenizer.eos_token_id,
      "pad_token_id": tokenizer.pad_token_id,
  }
  # Sampling knobs are only meaningful (and only accepted quietly) when sampling.
  if cfg.do_sample:
    gen_kwargs["temperature"] = cfg.temperature
    gen_kwargs["top_p"] = cfg.top_p

  # Inference only - no gradients needed.
  with torch.no_grad():
    out = model.generate(**inputs, **gen_kwargs)

  reply_ids = out[0][prompt_len:]
  return tokenizer.decode(reply_ids, skip_special_tokens=True)

# STEP 3: Core Data Generation Logic

## JSON Helpers

In [30]:
def _extract_json_array(text: str) -> str:
  """Return the slice of text from its first '[' through its last ']'.

  The slice is stripped of surrounding whitespace. Raises ValueError when
  either bracket is absent or the last ']' does not come after the first '['.
  """
  open_at, close_at = text.find("["), text.rfind("]")
  found_both = open_at != -1 and close_at != -1
  if not (found_both and open_at < close_at):
    raise ValueError("Could not find a JSON array in the LLM output")
  span = text[open_at:close_at + 1]
  return span.strip()

In [29]:
def _loads_json_array(text: str) -> List[Dict[str, Any]]:
  """Parse text as JSON and require a list whose items are all objects.

  Invalid JSON propagates the error raised by json.loads; a non-list result or
  a list holding anything other than dicts raises ValueError.
  """
  parsed = json.loads(text)

  if not isinstance(parsed, list):
    raise ValueError("Parsed JSON isn't a list")

  every_item_is_object = all(isinstance(item, dict) for item in parsed)
  if not every_item_is_object:
    raise ValueError("JSON array doesn't contain objects")

  return parsed

In [28]:
def _validate_schema(rows: List[Dict[str, Any]], fields: List[Tuple[str, str]]) -> None:
  """Check that every row has exactly the field names listed in fields.

  Only key names are compared; value types are not inspected. The first
  offending row raises ValueError, with missing keys reported before extras.
  """
  wanted = {name for name, _ in fields}

  for idx, row in enumerate(rows):
    present = set(row)

    absent = wanted.difference(present)
    if absent:
      raise ValueError(f"Row {idx} is missing keys: {sorted(absent)}")

    surplus = present.difference(wanted)
    if surplus:
      raise ValueError(f"Row {idx} has extra keys: {sorted(surplus)}")

In [27]:
def parse_schema(schema_text: str) -> List[Tuple[str, str]]:
  """Parse a schema given as one `field:type` entry per line.

  Blank lines are skipped. Each remaining line is split on its first colon;
  the field name is stripped and the type is stripped and lower-cased.

  Args:
    schema_text: the raw multi-line schema.

  Returns:
    A list of (field, type) tuples in the order they appear.

  Raises:
    ValueError: if a line has no colon, has an empty field name or type,
      repeats an earlier field name, or if no fields are found at all.
  """
  fields: List[Tuple[str, str]] = []
  seen = set()

  for raw_line in schema_text.strip().splitlines():
    line = raw_line.strip()
    if not line:
      continue
    if ":" not in line:
      raise ValueError(f"Invalid schema line (expected field:type): '{line}'")

    # Split on the first colon only, so a type may itself contain colons.
    name, _, type_str = line.partition(":")
    name = name.strip()
    type_str = type_str.strip().lower()

    # ":int" or "name:" has a colon but still isn't a usable field:type pair.
    if not name or not type_str:
      raise ValueError(f"Invalid schema line (expected field:type): '{line}'")
    # A JSON object cannot carry the same key twice, so a repeated field is a user error.
    if name in seen:
      raise ValueError(f"Duplicate field in schema: '{name}'")

    seen.add(name)
    fields.append((name, type_str))

  if not fields:
    raise ValueError("No fields found in schema")
  return fields

## Core Generation Function

In [37]:
def generate_dataset_json(
    description: str,
    schema_text: str,
    num_rows: int,
    tokenizer,
    model,
    cfg: ModelConfig
) -> Tuple[Any, Optional[pd.DataFrame], Optional[str]]:
  """Generate a synthetic dataset with the model and export it as CSV.

  Steps: parse the schema, build the prompt, query the model, extract and parse
  the JSON array (with one model-driven repair attempt if parsing fails), trim
  to num_rows, validate the keys, and write a CSV into a fresh temp directory.

  Args:
    description: free-text description of the dataset.
    schema_text: one `field:type` entry per line.
    num_rows: requested number of rows; must be between 1 and 200.
    tokenizer: tokenizer passed through to _chat_completion_hf.
    model: model passed through to _chat_completion_hf.
    cfg: generation settings passed through to _chat_completion_hf.

  Returns:
    On success, (rows, dataframe, csv_path) where rows is the list of row dicts.
    On any failure, ({"Error": message}, None, None); this function does not raise.
  """
  try:
    fields = parse_schema(schema_text)
    if num_rows < 1 or num_rows > 200:
      raise ValueError("Number of rows must be between 1 and 200")

    schema_lines = "\n".join(f"- {f}: {t}" for f, t in fields)

    # Crafting the user prompt with all the inputs
    user_prompt = f"""Dataset description:
{description}

Schema (field:type):
{schema_lines}

Number of rows: {num_rows}

Return ONLY a JSON array of {num_rows} objects.
"""

    # Stateless messages per generation: each dataset request is independent,
    # so no chat history is carried over between calls.
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    # Get the raw output from the model and then extract the JSON array
    raw = _chat_completion_hf(tokenizer, model, messages, cfg)
    candidate = _extract_json_array(raw)

    try:
      rows = _loads_json_array(candidate)
    except ValueError:
      # Parsing failed (json.JSONDecodeError is a ValueError too): ask the model
      # once to repair its own output, then parse again. A second failure
      # propagates to the outer handler.
      repair_messages = [
          {"role": "system", "content": repair_prompt},
          {"role": "user", "content": candidate},
      ]
      repaired_raw = _chat_completion_hf(tokenizer, model, repair_messages, cfg)
      rows = _loads_json_array(_extract_json_array(repaired_raw))

    # An empty array would otherwise pass validation and yield an empty CSV.
    if not rows:
      raise ValueError("Model returned no rows")

    # Enforce length (@TODO: future work - handle too-few rows): surplus rows are
    # trimmed; a shorter result is returned as-is.
    rows = rows[:num_rows]

    # Validate the schema
    _validate_schema(rows, fields)

    df = pd.DataFrame(rows)

    # Write the CSV into its own temp directory; the file is left on disk so the
    # caller can hand the path on (e.g. for download).
    tmpdir = tempfile.mkdtemp()
    csv_path = os.path.join(tmpdir, "synthetic_data.csv")
    df.to_csv(csv_path, index=False)

    return rows, df, csv_path
  except Exception as e:
    # Top-level boundary: report every failure as an error payload.
    return {"Error": str(e)}, None, None

# STEP 4: Gradio User Interface
Wrap everything in Gradio for publication



## Examples
*   Dataset Description
*   Schema Definition
*   Number of rows


In [None]:
# Each example is [dataset description, schema text (one field:type per line,
# newline-separated), number of rows] - the same order as the inputs list given to
# gr.Examples in build_app.
nfl_example = [
    "Synthetic dataset for NFL quarterback game-by-game passing yards",
    "full_name:str\nteam_abbreviation:str\nopponent_team_abbreviation:str\ngame_date:date\npassing_yards:int\npass_attempts:int\npass_completions:int\npassing_touchdowns:int\npassing_interceptions:int",
    3
]

nba_example = [
    "Synthetic dataset for NBA player game-by-game stat lines",
    "full_name:str\nteam_abbreviation:str\nopponent_team_abbreviation:str\ngame_date:date\npoints:int\nrebounds:int\nassists:int\nsteals:int\npersonal_fouls:int",
    4
]

stock_example = [
    "Synthetic dataset for 5 different stocks",
    "ticker:str\nopen:float\nhigh:float\nlow:float\nclose:float\nvolume:int\nmarket_cap:float",
    5
]

## Gradio App Function

In [38]:
def build_app(tokenizer, model, cfg: ModelConfig):
  """Build the Gradio Blocks UI for the synthetic data generator.

  Args:
    tokenizer: tokenizer forwarded to generate_dataset_json on each click.
    model: model forwarded to generate_dataset_json on each click.
    cfg: generation settings forwarded to generate_dataset_json on each click.

  Returns:
    The gr.Blocks object; the caller is responsible for calling launch() on it.
  """
  with gr.Blocks(title = "Nikhil Gavini's Synthetic Data Generator (Open-Source LLM)") as demo:
    ## Basic Title and Description
    gr.Markdown("## Nikhil Gavini's Synthetic Data Generator using HuggingFace LLMs")
    gr.Markdown(
        "Enter a dataset description, a schema (one `field:type` per line), and row count. \n"
        "The model returns JSON and a downloadable CSV."
    )


    ## First row: the two free-form text inputs, side by side
    with gr.Row():
      description = gr.Textbox(
          label = "Dataset Description",
          lines = 4,
          placeholder = "Describe the dataset you want to generate",
      )
      schema = gr.Textbox(
          label = "Schema (field:type per line)",
          lines = 6,
          placeholder = "Enter one field:type per line",
      )


    ## Lets the user pick how many rows they want; the 1-200 range matches the
    ## bounds enforced in generate_dataset_json
    num_rows = gr.Slider(
        minimum = 1,
        maximum = 200,
        value = 5,
        step = 1,
        label = "Number of rows",
    )

    ## Button; its click handler is wired up further down
    generate_button = gr.Button("Generate")


    ## Outputs (JSON, DataFrame Preview, CSV Download)
    json_output = gr.JSON(
        label = "JSON Output"
    )

    df_output = gr.Dataframe(
        label = "CSV Preview (up to 5 rows)"
    )

    download_file = gr.File(
        label = "Download CSV"
    )


    ## Generate the data in all forms
    def on_generate(desc, sch, nr):
      """Click handler: run generation and map the result onto the three outputs."""
      # The slider value is cast to int before it is passed on.
      json_text, df, csv_path = generate_dataset_json(
          desc, sch, int(nr), tokenizer, model, cfg
      )
      # df is None on failure; json_text then carries the {"Error": ...} payload.
      if df is None:
        return json_text, None, None
      # Only the first five rows are previewed; csv_path points at the full CSV.
      return json_text, df.head(5), csv_path


    ## Link functionality to button
    generate_button.click(
        fn = on_generate,
        inputs = [description, schema, num_rows],
        outputs = [json_output, df_output, download_file]
    )


    ## Examples for user to pick from; each fills description, schema and row count
    gr.Examples(
        examples = [
            nfl_example,
            nba_example,
            stock_example
        ],
        inputs = [description, schema, num_rows]
    )

  return demo

# Final Step: Call the App!

In [39]:
# Wire everything together: default config -> tokenizer and model -> Gradio UI.
cfg = ModelConfig()
tokenizer, model = load_model(cfg)
demo = build_app(tokenizer, model, cfg)
# share = False: no public link is requested.
demo.launch(share = False)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Note: opening Chrome Inspector may crash demo inside Colab notebooks.
* To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>

