In [None]:
!pip install -q --upgrade bitsandbytes accelerate

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import json
import uuid
import datetime as dt
import pandas as pd
import torch
import gradio as gr
from google.colab import userdata
from huggingface_hub import login
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Read the Hugging Face access token from the Colab secret named 'HF_TOKEN'.
hf_token = userdata.get('HF_TOKEN')
# Authenticate this session with the Hugging Face Hub; add_to_git_credential=True
# additionally asks huggingface_hub to store the token as a git credential.
login(hf_token, add_to_git_credential=True)

In [None]:
# Arena names the generator is allowed to assign to a match.
ARENAS = [
    "Training Camp",
    "Goblin Stadium",
    "Bone Pit",
    "Spell Valley",
    "Royal Arena",
    "Legendary Arena",
]

# Deck archetype labels the generator is allowed to use.
ARCHETYPES = [
    "Hog Cycle",
    "LavaLoon",
    "Golem Beatdown",
    "X-Bow Siege",
    "P.E.K.K.A Bridge Spam",
    "Royal Giant Control",
]

# Card pool from which both decks are drawn.
CARDS = [
    "Knight", "Archers", "Goblins", "Giant", "Prince",
    "Baby Dragon", "Witch", "Skeleton Army", "Hog Rider", "Freeze",
    "Zap", "Fireball", "Arrows", "Tesla", "Musketeer",
    "P.E.K.K.A", "Cannon", "Miner", "Mega Minion", "Lava Hound",
    "Balloon", "Inferno Tower", "Bats", "Ice Spirit", "Valkyrie",
    "Wizard", "Royal Giant", "Goblin Barrel", "Log", "Tornado",
    "Electro Wizard", "Bandit", "Magic Archer", "Ram Rider", "Mega Knight",
    "Mortar", "X-Bow", "Rascals", "Graveyard", "Poison",
]

# Subset of CARDS that are spells.
SPELLS = {
    "Zap",
    "Fireball",
    "Arrows",
    "Poison",
    "Freeze",
    "Tornado",
    "Log",
}

# JSON-Schema-style description of one generated match record.
JSON_SCHEMA = {
    "type": "object",
    "properties": {
        "match_id": {"type": "string"},
        "date": {"type": "string"},
        "arena": {"type": "string"},
        "player_trophies": {"type": "integer"},
        "opponent_trophies": {"type": "integer"},
        "deck_archetype": {"type": "string"},
        "deck": {
            "type": "array",
            "items": {"type": "string"},
        },
        "opponent_deck": {
            "type": "array",
            "items": {"type": "string"},
        },
        "crown_diff": {"type": "integer"},
        "result": {"type": "string"},
        "duration_sec": {"type": "integer"},
        "elixir_leak": {"type": "number"},
        "spells_used": {"type": "integer"},
    },
    # deck_archetype, elixir_leak and spells_used are not listed as required.
    "required": [
        "match_id",
        "date",
        "arena",
        "player_trophies",
        "opponent_trophies",
        "deck",
        "opponent_deck",
        "crown_diff",
        "result",
        "duration_sec",
    ],
}

In [None]:
# Hub identifier of the instruct checkpoint used for generation.
LLAMA = "meta-llama/Llama-3.2-3B-Instruct"

# 4-bit NF4 weights with double quantization; compute dtype is bfloat16.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(LLAMA)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# device_map="auto" leaves device placement of the weights to the loader.
model = AutoModelForCausalLM.from_pretrained(
    LLAMA,
    quantization_config=quant_config,
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [None]:
def _build_prompt(n: int) -> str:
    """Render the user prompt that asks the model for ``n`` JSON Lines match records."""
    # The model has no clock, so spell out the concrete window behind "last 60 days".
    today = dt.date.today()
    earliest = today - dt.timedelta(days=60)
    return f"""
You are a JSON data generator.

TASK
Produce {n} lines of JSON (JSON Lines). Each line must be a single, valid JSON object.

OUTPUT RULES
- Output JSON ONLY. No prose, no code fences, no comments.
- Every line MUST parse with a JSON parser.
- Arrays must not contain duplicates.

SCHEMA (types)
{{
  "match_id": string,
  "date": string,
  "arena": string,
  "player_trophies": integer,
  "opponent_trophies": integer,
  "deck_archetype": string,
  "deck": string[8],
  "opponent_deck": string[8],
  "crown_diff": integer,
  "result": string,
  "duration_sec": integer,
  "elixir_leak": number,
  "spells_used": integer
}}

CONSTRAINTS
- "arena" ∈ [{", ".join(f'"{a}"' for a in ARENAS)}]
- "deck_archetype" ∈ [{", ".join(f'"{a}"' for a in ARCHETYPES)}]
- "deck" and "opponent_deck" must each contain 8 DISTINCT cards from:
  [{", ".join(f'"{c}"' for c in CARDS)}]
- "spells_used" = count(deck ∩ {{"Zap","Fireball","Arrows","Poison","Freeze","Tornado","Log"}}).
- "result" must align with crown_diff: >0→"win", <0→"loss", 0→"win"|"loss"|"draw".
- "date" must be an ISO date (YYYY-MM-DD) between {earliest.isoformat()} and {today.isoformat()} (the last 60 days).
- "player_trophies": 3000-7500.
- "opponent_trophies": 3000-8000 and within ±400 of player_trophies.
- "duration_sec": 120-300.
- "elixir_leak": 0.0-5.0 with ONE decimal (e.g. 1.7). Winners tend lower (1.0-2.2), losers higher (1.6-3.0).

GOLDEN EXAMPLE
{{"match_id":"a1b2c3d4","date":"2025-08-02","arena":"Royal Arena","player_trophies":6210,"opponent_trophies":6401,"deck_archetype":"Hog Cycle","deck":["Hog Rider","Ice Spirit","Musketeer","Fireball","Log","Cannon","Archers","Knight"],"opponent_deck":["P.E.K.K.A","Poison","Bandit","Magic Archer","Electro Wizard","Ram Rider","Zap","Bats"],"crown_diff":1,"result":"win","duration_sec":173,"elixir_leak":1.7,"spells_used":2}}

NOW PRODUCE {n} MORE LINES IN THE SAME FORMAT (WITHOUT REPEATING THE EXAMPLE).
""".strip()


def _extract_json_objects(text: str) -> list:
    """Return every top-level JSON object (dict) that can be decoded from ``text``.

    The whole string is scanned rather than split per line, so objects that are
    surrounded by prose or spread over several lines are still recovered.
    """
    decoder = json.JSONDecoder()
    found = []
    pos = 0
    while True:
        start = text.find("{", pos)
        if start == -1:
            break
        try:
            obj, pos = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            # Not a valid object at this brace; resume right after it.
            pos = start + 1
            continue
        if isinstance(obj, dict):
            found.append(obj)
    return found


def _normalize_row(obj: dict) -> dict:
    """Clean one decoded record in place and return it.

    - fills a missing/empty ``match_id`` with 8 random hex characters;
    - turns list-valued ``deck`` / ``opponent_deck`` into comma-joined strings;
    - forces ``result`` to agree with the sign of ``crown_diff``;
    - rounds ``elixir_leak`` to one decimal.
    Values that cannot be converted are left untouched.
    """
    obj["match_id"] = obj.get("match_id") or uuid.uuid4().hex[:8]

    for key in ("deck", "opponent_deck"):
        cards = obj.get(key, [])
        if isinstance(cards, list):
            # str() guards against the model emitting non-string entries,
            # which would otherwise raise TypeError and lose the whole batch.
            cards = ",".join(str(card) for card in cards)
        obj[key] = cards

    try:
        crown_diff = int(obj.get("crown_diff", 0))
    except (TypeError, ValueError, OverflowError):
        crown_diff = None  # unusable crown_diff: keep whatever result the model gave
    if crown_diff is not None:
        if crown_diff > 0:
            obj["result"] = "win"
        elif crown_diff < 0:
            obj["result"] = "loss"
        else:
            obj["result"] = obj.get("result", "draw")

    if "elixir_leak" in obj:
        try:
            obj["elixir_leak"] = float(f"{float(obj['elixir_leak']):.1f}")
        except (TypeError, ValueError):
            pass  # best effort: leave the raw value in place

    return obj


def llm_rows(n: int):
    """Ask the chat model for up to ``n`` synthetic match records.

    Builds the prompt, samples a completion from the module-level ``model`` /
    ``tokenizer``, decodes only the newly generated tokens, extracts every JSON
    object from the text and normalizes it.

    Args:
        n: number of rows requested; values <= 0 yield an empty list without
           calling the model.

    Returns:
        A list of at most ``n`` dicts (possibly fewer if the model produced
        less valid JSON). The first two rows are also printed as a sanity peek.
    """
    if n <= 0:
        return []

    prompt = _build_prompt(n)

    # Send inputs to wherever the model already lives instead of hard-coding
    # "cuda": the model is placed at load time, and moving a bitsandbytes
    # quantized model with `.to()` is unnecessary and not always supported.
    device = model.device

    if getattr(tokenizer, "chat_template", None):
        messages = [
            {"role": "system", "content": "You are a precise JSON generator. Output JSON Lines only."},
            {"role": "user", "content": prompt},
        ]
        # add_generation_prompt=True appends the assistant header so the model
        # answers the request instead of continuing the user turn.
        encoded = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
        )
        enc = {
            "input_ids": encoded["input_ids"],
            "attention_mask": encoded["attention_mask"],
        }
    else:
        enc = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)
    enc = {key: value.to(device) for key, value in enc.items()}

    eos_id = tokenizer.eos_token_id
    # Explicit None check: a pad id of 0 is valid and must not fall back to EOS.
    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else eos_id

    with torch.no_grad():
        gen_ids = model.generate(
            **enc,
            max_new_tokens=min(350 * n, 4096),
            do_sample=True,
            temperature=0.2,
            top_p=0.9,
            eos_token_id=eos_id,
            pad_token_id=pad_id,
            repetition_penalty=1.05,
        )

    # Decode only what was generated after the prompt.
    input_len = enc["input_ids"].shape[1]
    text = tokenizer.decode(gen_ids[0, input_len:], skip_special_tokens=True)

    rows = [_normalize_row(obj) for obj in _extract_json_objects(text)[:n]]

    # Optional: peek a couple for sanity
    for ex in rows[:2]:
        print("[SAMPLE ROW]\n", json.dumps(ex, indent=2)[:600], "\n")

    return rows

In [None]:
def generate_dataset(n_rows: int, batch_size: int = 100, max_batches: int = 20) -> pd.DataFrame:
    """
    Collect up to ``n_rows`` LLM-generated records into a DataFrame.

    ``llm_rows`` is called repeatedly, each time asking for the smaller of
    ``batch_size`` and the number of rows still missing, until enough rows
    have been gathered or ``max_batches`` calls have been made. There is no
    rule-based fallback: if the model under-delivers, a warning is printed and
    the shorter result is returned. Surplus rows are truncated to ``n_rows``.
    """
    collected = []
    attempts = 0
    while attempts < max_batches and len(collected) < n_rows:
        still_missing = n_rows - len(collected)
        collected.extend(llm_rows(min(batch_size, still_missing)))
        attempts += 1

    if len(collected) < n_rows:
        # Caller asked for LLM-only data, so just report the shortfall.
        print(f"[warning] Requested {n_rows} rows, LLM produced {len(collected)} valid JSON rows.")

    return pd.DataFrame(collected[:n_rows])

In [None]:
def gradio_generate(n_rows: int):
    """Gradio callback: build ``n_rows`` rows and save them as a CSV.

    The slider value is coerced to ``int``; the batch size passed to
    ``generate_dataset`` is the row count capped at 200. Returns the DataFrame
    together with the path of the CSV file written to the working directory.
    """
    row_count = int(n_rows)
    out_file = "clash_royale_llm_only.csv"
    frame = generate_dataset(row_count, batch_size=min(row_count, 200))
    frame.to_csv(out_file, index=False)
    return frame, out_file

In [None]:
# Smoke test: request 4 rows end to end. This writes the CSV and, as the last
# expression of the cell, displays the returned (DataFrame, csv_path) tuple.
gradio_generate(4)

[SAMPLE ROW]
 {
  "match_id": "a4b5c6d7e",
  "date": "2025-09-01",
  "arena": "Royal Arena",
  "player_trophies": 6201,
  "opponent_trophies": 6402,
  "deck_archetype": "LavaLoon",
  "deck": "Hog Rider,Ice Spirit,Musketeer,Fireball,Log,Cannon,Archers,Knight",
  "opponent_deck": "P.E.K.K.A,Poison,Bandit,Magic Archer,Electro Wizard,Ram Rider,Zap,Bats",
  "crown_diff": -1,
  "result": "loss",
  "duration_sec": 141,
  "elixir_leak": 2.1,
  "spells_used": 1
} 

[SAMPLE ROW]
 {
  "match_id": "f1g2h3i",
  "date": "2025-07-25",
  "arena": "Bone Pit",
  "player_trophies": 6300,
  "opponent_trophies": 6100,
  "deck_archetype": "X-Bow Siege",
  "deck": "Goblin Barrel,Giant,Witch,Zap,Fireball,Archers,Knight,Musketeer",
  "opponent_deck": "P.E.K.K.A,Poison,Bandit,Magic Archer,Electro Wizard,Ram Rider,Cannon,Hog Rider",
  "crown_diff": 1,
  "result": "win",
  "duration_sec": 179,
  "elixir_leak": 1.9,
  "spells_used": 4
} 

[SAMPLE ROW]
 {
  "match_id": "a5b6c7d8",
  "date": "2025-09-01",
  "arena":

(    match_id        date           arena  player_trophies  opponent_trophies  \
 0   a5b6c7d8  2025-09-01  Goblin Stadium             6200               6300   
 1   a9b2c3d4  2025-07-25        Bone Pit             6500               6800   
 2   a3b4c5d6  2025-10-15     Royal Arena             7000               7200   
 3  e5f6g7h8i  2025-09-15  Goblin Stadium             6201               6601   
 
         deck_archetype                                               deck  \
 0             LavaLoon  Giant,Witch,Musketeer,Fireball,Log,Cannon,Arch...   
 1          X-Bow Siege  Hog Rider,Ice Spirit,Musketeer,Fireball,Log,Ca...   
 2  Royal Giant Control  Royal Giant,Witch,Musketeer,Fireball,Log,Canno...   
 3             LavaLoon  Giant,Witch,Hog Rider,Fireball,Log,Cannon,Arch...   
 
                                        opponent_deck  crown_diff result  \
 0  P.E.K.K.A,Poison,Bandit,Magic Archer,Electro W...          -1   loss   
 1  P.E.K.K.A,Poison,Bandit,Magic Archer,Electro 

In [None]:


# Minimal Gradio front end: a row-count slider and a button that runs
# gradio_generate, showing the resulting table and offering the CSV for download.
with gr.Blocks(title="Clash Royale Synthetic CSV (LLM only)") as demo:
    # The notebook authenticates with the `HF_TOKEN` secret; there is no
    # `HF_MODEL` setting, so the help text names the secret that is really needed.
    gr.Markdown(
        "## Clash Royale Synthetic CSV (LLM only)\n"
        "Set a row count and click **Generate**. Requires `HF_TOKEN`."
    )
    n = gr.Slider(1, 100, value=3, step=1, label="Number of rows")
    btn = gr.Button("Generate")
    df_out = gr.Dataframe(label="Preview")
    file_out = gr.File(label="Download CSV")
    # gradio_generate returns (DataFrame, csv_path), matching the two outputs.
    btn.click(gradio_generate, inputs=[n], outputs=[df_out, file_out])

demo.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://81b6f34af6664d8556.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


