<a href="https://colab.research.google.com/github/mightyoctopus/mockup-data-generator/blob/main/w3_d5_mockup_data_generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q transformers torch requests bitsandbytes accelerate anthropic

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m120.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m94.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m59.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m38.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import os
import requests
from typing import List, Dict, Tuple
from datetime import datetime
from IPython.display import Markdown, display, update_display
from google.colab import drive, userdata
from huggingface_hub import login, snapshot_download
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig
)
import torch
from anthropic import Anthropic

In [None]:
# Secrets are read from the Colab `userdata` store; nothing is hard-coded here.
HF_TOKEN = userdata.get("HF_TOKEN")
ANTHROPIC_API_KEY = userdata.get("ANTHROPIC_API_KEY")

# Log in to the Hugging Face Hub only when a token was returned.
if HF_TOKEN:
    login(HF_TOKEN, add_to_git_credential=True)

# Model identifiers: QWEN_MODEL is the Hub repo downloaded and run locally,
# CLAUDE_MODEL is the model name sent to the Anthropic API.
QWEN_MODEL = "Qwen/Qwen3-4B-Instruct-2507"
CLAUDE_MODEL = "claude-3-5-haiku-latest"

# Shared Anthropic client used by launch_claude_api().
claude = Anthropic(api_key=ANTHROPIC_API_KEY)


In [None]:
# drive.flush_and_unmount()

### Mount Google Drive FIRST and use this ONE mount point consistently:
### the model cache directory used by the snapshot_download cell below
### lives under "/content/drive", so the mount must exist before that cell runs.
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [None]:
# Keep downloaded model files on the mounted Google Drive
# (presumably so they survive across Colab sessions).
cache_path = "/content/drive/MyDrive/models/huggingface_cache"
os.makedirs(cache_path, exist_ok=True)


# Folder for this model's files: the repo id with "/" replaced by "--".
expected_model_dir = os.path.join(cache_path, QWEN_MODEL.replace("/", "--"))

# Download only when the folder is missing; otherwise reuse it as-is.
# NOTE(review): this only checks that the directory exists, not that it holds a
# complete snapshot -- an interrupted download would presumably be treated as
# cached. Confirm, and delete the folder to force a fresh download if needed.
if not os.path.exists(expected_model_dir):
    print("Downloading the model...")
    model_path = snapshot_download(
        repo_id=QWEN_MODEL,
        cache_dir=cache_path,
        local_dir=expected_model_dir,
        local_dir_use_symlinks=False
    )
else:
    print("Model already exists in cache.")
    # model_path is what enable_model() passes to from_pretrained().
    model_path = expected_model_dir

Model already exists in cache.


In [None]:
def invoke_messages(
        rows_num: int,
        business_category: str,
        columns: str,
        instruction: str,
        ) -> List[Dict[str, str]]:
    """Build the chat messages that ask the local model for mockup data.

    Args:
        rows_num: Number of data rows to request.
        business_category: Business area the data should belong to.
        columns: Column names, as free text (e.g. comma separated).
        instruction: Extra free-text instruction; blank or None is sent as "None".

    Returns:
        A two-element list: a ``system`` message followed by a ``user``
        message, each a dict with ``role`` and ``content`` keys.
    """
    system_message = """
        You are a helpful assistant generating synthetic mockup dataset as per
        user's request across all types of businesses and sorts.

        User's specific request for the data niche, data column types, and all
        other details and your job is to create wonderful mockup data for them
        to use for their demo apps or develop in a testing environment.
    """.strip()

    sql_note = """
        In the case of sql file selection as an output, make sure to
        contain the full sql file format, including CREATE TABLE command.
    """.strip()

    # Both parts are stripped, so join them with an explicit blank line;
    # plain concatenation would fuse "...environment." and "In the case...".
    system_message = system_message + "\n\n" + sql_note

    # An empty "Other instruction" bullet is ambiguous; state it explicitly.
    extra_instruction = (instruction or "").strip() or "None"

    user_prompt = f"""
        Generate a synthetic mockup data that fits the following instruction:
        - Number of rows: {rows_num}
        - Business area: {business_category}
        - Columns: {columns}
        - Other instruction: {extra_instruction}
        - Make sure to deliver only the markdown content without any additional comments
    """.strip()

    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_prompt}
    ]

    return messages


def pass_claude_msg(format: str, content: str) -> Tuple[str, str]:
    """Build the prompt pair that asks Claude to reformat generated data.

    Args:
        format: Name of the target format inserted into the request
            (the system prompt lists csv, sql and json as examples).
        content: The text to be converted.

    Returns:
        ``(system_prompt, user_prompt)``; both have leading and trailing
        whitespace stripped.
    """
    conversion_request = """
        Convert the output into the {target} format for the following content:
        ----------------------------------------------------------------------
        {payload}
    """.format(target=format, payload=content).strip()

    converter_role = """
        You are a helpful assistant, converting generated outputs (done by other model)
        into the format of chosen type:

        example: csv, sql, or json format.

        NOTE: generate the result output that only includes the markdown content
        without any addtional comments!
    """.strip()

    return converter_role, conversion_request



In [None]:
### Lazy loader
# Module-level singletons, populated on first use by enable_model().
model = None
tokenizer = None

def enable_model():
    """Load the tokenizer and the 4-bit quantized model once, then reuse them.

    Populates the module-level ``tokenizer`` and ``model`` globals from
    ``model_path``. Later calls skip any part that is already loaded.

    Returns:
        The loaded model (the same object on every call).
    """
    global model, tokenizer

    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
        # Fall back to the EOS token so padding works when no pad token is set.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

    if model is None:
        # 4-bit NF4 weights with double quantization; compute runs in float16.
        bnb = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4"
        )

        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            device_map={"": 0},  # map the whole model to device 0
            torch_dtype="auto",
            low_cpu_mem_usage=True,
            quantization_config=bnb,
        ).eval()

    # Return the model on the first call too, not only on later calls.
    return model


In [None]:

def generate_output(messages, max_new_tokens=400, temperature=0.2):
    """Generate a completion for ``messages`` with the lazily loaded local model.

    Args:
        messages: Chat messages (dicts with ``role`` and ``content``) rendered
            through the tokenizer's chat template.
        max_new_tokens: Upper bound on the number of generated tokens
            (default 400). Raise it for larger requests; generation stops at
            this limit.
        temperature: Temperature forwarded to ``model.generate`` (default 0.2).

    Returns:
        The generated token ids of the first sequence with the prompt tokens
        removed. The ids are not decoded; callers decode them with the tokenizer.
    """
    enable_model()

    inputs = tokenizer.apply_chat_template(
        messages,
        return_tensors="pt",
        return_dict=True, ### IMPORTANT: to get a mapping
        tokenize=True,
        add_generation_prompt=True,
        padding=True,
        return_attention_mask=True
    ).to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        temperature=temperature
    )

    ### Get the length (number of tokens) of the input prompt
    prompt_len = inputs["input_ids"].shape[1]

    ### Slice the generated sequence to skip the prompt tokens
    gen_tokens = outputs[0][prompt_len:]

    return gen_tokens



In [None]:
def launch_claude_api(sys_msg, user_msg, max_tokens=400, temperature=0.1):
    """Send one system/user prompt pair to Claude and return the reply text.

    Args:
        sys_msg: System prompt.
        user_msg: Content of the single user message.
        max_tokens: Upper bound on reply tokens (default 400). Raise it for
            larger conversions; the reply stops at this limit.
        temperature: Sampling temperature for the request (default 0.1).

    Returns:
        The text of the first content block of the response.
    """
    response = claude.messages.create(
        model=CLAUDE_MODEL,
        system=sys_msg,
        max_tokens=max_tokens,
        temperature=temperature,
        messages=[
            {"role": "user", "content": user_msg}
        ]
    )
    return response.content[0].text

In [None]:
# Gradio provides the web UI built in the cells below; it is installed here,
# separately from the first install cell.
!pip install -q gradio

import gradio as gr
import time

In [None]:
###============= Gradio Functions =============###

def generate_mockup_data(category, num_data_rows, columns, a_instruction,
                         progress=gr.Progress(track_tqdm=True)):
    """Gradio callback: generate mockup data with the local model.

    Args:
        category: Business area/category entered in the UI.
        num_data_rows: Number of rows requested in the UI.
        columns: Column names entered in the UI.
        a_instruction: Additional free-text instruction from the UI.
        progress: Gradio progress tracker used to report status.

    Returns:
        The decoded text generated by the model.
    """
    progress(0, desc="Loading prompts...")
    # Pass by keyword: invoke_messages takes the row count first, so the UI
    # order (category, rows, ...) must not be forwarded positionally.
    msg = invoke_messages(
        rows_num=num_data_rows,
        business_category=category,
        columns=columns,
        instruction=a_instruction,
    )

    # Generation blocks until it finishes, so report a single "in progress"
    # step rather than sleeping through simulated progress beforehand.
    progress(0.1, desc="Generating output...")
    gen_tokens = generate_output(msg)
    progress(1.0, desc="Initial output generated.")

    return tokenizer.decode(gen_tokens, skip_special_tokens=True)

def show_hidden_row():
    """Return a Gradio update that makes the output component visible."""
    reveal = gr.update(visible=True)
    return reveal

def _safe_filename_part(text, fallback="data", max_len=50):
    '''Reduce free-form text to characters that are safe inside a file name.'''
    cleaned = "".join(
        ch if ch.isalnum() or ch in "-_" else "_" for ch in str(text or "")
    )
    return cleaned.strip("_")[:max_len] or fallback


def _strip_code_fence(text):
    '''Drop one surrounding Markdown code fence (```lang ... ```), if present.'''
    lines = text.strip().splitlines()
    if len(lines) >= 2 and lines[0].startswith("```") and lines[-1].strip() == "```":
        return "\n".join(lines[1:-1]) + "\n"
    return text


def make_file(btn_sort: str, category: str, content: str):
    '''
    Convert generated content with Claude and write it to a downloadable file.

    btn_sort: one of the 3 download buttons -- csv, sql, json download
    category: Business category or area that the data is associated with.
    content: LLM generated text output to write in a file

    Returns the path of the file that was written under /tmp.
    '''

    sys_msg, user_msg = pass_claude_msg(btn_sort, content)
    claude_output = launch_claude_api(sys_msg, user_msg)

    # The reply may be wrapped in a Markdown code fence; the downloaded file
    # should contain only the data itself.
    file_body = _strip_code_fence(claude_output)

    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
    # category is free text from the UI: keep "/" and ".." out of the path.
    safe_category = _safe_filename_part(category)
    filepath = f"/tmp/{safe_category}_mockup_{ts}.{btn_sort}"

    # Explicit encoding so non-ASCII data is written the same way everywhere.
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(file_body)

    return filepath


In [None]:
from logging import PlaceHolder
###============= Gradio UI =============###

def render_interface():
    """Build the Gradio Blocks UI for the mockup data generator.

    The form collects the business category, row count, columns and an
    optional instruction. "Generate" runs generate_mockup_data and then
    reveals a row of download buttons; each button calls make_file with its
    own format and serves the resulting file.

    Returns:
        The constructed ``gr.Blocks`` app (not yet launched).
    """

    def _download_handler(file_format):
        """Create a click callback that writes the result as ``file_format``."""
        def _handler(category, data):
            return make_file(file_format, category, data)
        return _handler

    with gr.Blocks(title="Mockup Data Generator", css="footer {visibility:hidden}") as demo:
        category = gr.Textbox(
            label="Business Area/Category",
            placeholder="e.g. HR, Sales, Hospitality, Senior Care, E-commerce, Finance",
        )
        num_data_rows = gr.Number(
            label="Number of Rows",
            placeholder="Type number...",
            minimum=10,
            maximum=50,
            step=10,
            precision=0
        )
        columns = gr.Textbox(
            label="Insert Columns",
            placeholder="Comma, separated..."
        )
        a_instruction = gr.Textbox(
            label="Additional Instruction",
            placeholder="Any additional instruction. Leave blank if none.",
            lines=5
        )
        btn = gr.Button(
            value="Generate"
        )
        out = gr.Textbox(label="Result shown here.")

        # Hidden until the first result has been generated.
        buttons_row = gr.Row(visible=False)

        with buttons_row:
            # One download button per supported format, in display order.
            download_buttons = {
                file_format: gr.DownloadButton(
                    label=f"Download {file_format}",
                    size="md",
                    elem_classes=["download-btn"],
                )
                for file_format in ("csv", "sql", "json")
            }

        # Generate first, then reveal the download buttons.
        btn.click(
            fn=generate_mockup_data,
            inputs=[category, num_data_rows, columns, a_instruction],
            outputs=out,
            queue=True
        ).then(
            fn=show_hidden_row,
            inputs=None,
            outputs=buttons_row,
        )

        # Each button converts the current result and receives the file path
        # back as its own value.
        for file_format, download_btn in download_buttons.items():
            download_btn.click(
                _download_handler(file_format),
                inputs=[category, out],
                outputs=download_btn
            )

    return demo

# Build the UI, enable request queuing, then launch it.
if __name__ == "__main__":
    app = render_interface()
    app.queue()
    # share=True requests a public link; debug=True keeps the cell running so
    # errors and logs stay visible; inline=True renders the app in the notebook.
    app.launch(share=True, debug=True, inline=True)



Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://963cc27fba3affe0da.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://963cc27fba3affe0da.gradio.live


To-Do

1. Revise: Eliminate the file type input and let the 3 download buttons alone be the deciding factor for which file type the user downloads -- DONE


2. Deal with the unresponsive-looking widgets (all buttons)

