# Synthetic data generation

In [1]:
import requests
import json
import os

In [2]:
from huggingface_hub import InferenceClient

  from .autonotebook import tqdm as notebook_tqdm


In [24]:
import os
import requests

# OpenAI-compatible chat-completions endpoint of the Hugging Face router.
API_URL = "https://router.huggingface.co/v1/chat/completions"
# The token is read from the environment when this cell runs; a missing
# HF_TOKEN raises KeyError here rather than at request time.
headers = {
    "Authorization": f"Bearer {os.environ['HF_TOKEN']}",
}

def query(payload, timeout=30):
    """POST ``payload`` as JSON to ``API_URL`` and return the decoded JSON body.

    Args:
        payload: JSON-serialisable request body (e.g. ``messages`` and ``model``).
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block forever.

    Returns:
        The response body parsed by ``response.json()``. The HTTP status is
        not checked, so an error body is returned as-is.
    """
    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    return response.json()

# Smoke test: send a single user message and keep the decoded reply in `response`.
response = query(
    dict(
        messages=[
            dict(role="user", content="What is the capital of France?"),
        ],
        model="Qwen/Qwen2.5-7B-Instruct:together",
    )
)

# print(response["choices"][0]["message"])

In [25]:
response

{'id': 'oRhktF2-66dFFu-9b7bda2af88cc979',
 'object': 'chat.completion',
 'created': 1767373641,
 'model': 'Qwen/Qwen2.5-7B-Instruct-Turbo',
 'prompt': [],
 'choices': [{'finish_reason': 'stop',
   'seed': 11771503340345810000,
   'index': 0,
   'logprobs': None,
   'message': {'role': 'assistant',
    'content': 'The capital of France is Paris.',
    'tool_calls': []}}],
 'usage': {'prompt_tokens': 36,
  'completion_tokens': 8,
  'total_tokens': 44,
  'cached_tokens': 0}}

In [27]:
# System prompt describing the in-app assistant persona (padel/tennis/squash).
# NOTE(review): no cell visible in this notebook references SYSTEM_PROMPT, and
# convert_to_training_format hard-codes different system texts — confirm which
# one is meant to end up in the training data.
SYSTEM_PROMPT = """You are an expert AI assistant for a mobile app that connects padel, tennis, and squash players. 
Your role is to: 
- Help users find game partners and book courts
- Answer questions about rules, techniques, and equipment
- Provide training tips and improvement advice
- Assist with app navigation and features
- Be friendly, concise, and actionable

Keep responses natural, conversational, and mobile-friendly (not too long)."""

In [28]:
# Scenario templates for synthetic conversation generation, grouped by topic.
# Each entry is a str.format template; generate_prompt() fills the placeholders
# {sport}, {sport1}, {sport2}, {skill_level} and {time_context}. Templates that
# contain no placeholder are used verbatim.
PROMPT_TEMPLATES = [
    # Booking & Matchmaking
    """Generate a realistic conversation where a user wants to book a {sport} court for {time_context}. 
    The user should ask about availability, pricing, or location. Assistant helps with booking process.""",
    
    """Create a dialogue where a user is looking for a {sport} partner at {skill_level} level. 
    The assistant asks clarifying questions and suggests how to find matches.""",
    
    """Generate a conversation about finding nearby {sport} courts/clubs in a specific city. 
    User asks for recommendations, assistant provides helpful filtering options.""",
    
    # Rules & Game Knowledge
    """Create a dialogue where a beginner asks about basic {sport} rules, scoring, or court layout. 
    Assistant explains clearly and concisely.""",
    
    """Generate a conversation where user asks about the differences between {sport1} and {sport2}. 
    Assistant compares them objectively.""",
    
    """Create a dialogue about specific {sport} terminology or advanced rules (e.g., 'What is a let?', 
    'What are the walls rules in padel?'). Assistant explains with examples.""",
    
    # Equipment & Gear
    """Generate a conversation where user asks for {sport} equipment recommendations for {skill_level} players. 
    Assistant suggests specific gear with reasoning.""",
    
    """Create a dialogue about {sport} racket/paddle selection - shape, weight, materials. 
    User has specific needs, assistant guides the choice.""",
    
    """Generate a conversation about what to bring for first {sport} training session. 
    Assistant provides practical checklist.""",
    
    # Technique & Training
    """Create a dialogue where user wants to improve specific {sport} technique (serve, volley, backhand, etc.). 
    Assistant provides actionable drills and tips.""",
    
    """Generate a conversation about common {sport} mistakes for beginners and how to fix them. 
    Assistant is encouraging and specific.""",
    
    """Create a dialogue where user asks how to train {sport} skills solo/at home. 
    Assistant suggests practical exercises.""",
    
    # Strategy & Tactics
    """Generate a conversation about {sport} doubles strategy and positioning. 
    User wants to improve teamwork, assistant explains tactics.""",
    
    """Create a dialogue about shot selection in {sport} - when to attack, defend, or lob. 
    Assistant provides strategic thinking.""",
    
    # App Features & Navigation
    """Generate a conversation where user is confused about how to use a feature in the app 
    (e.g., 'How do I set my availability?', 'How do I filter players by level?'). 
    Assistant guides them step-by-step.""",
    
    """Create a dialogue where user wants to know how notifications work, how to join tournaments, 
    or how to rate players after a match. Assistant explains app functionality.""",
    
    # Social & Community
    """Generate a conversation where user asks about {sport} etiquette, what to expect on first game, 
    or how to behave in club. Assistant gives social tips.""",
    
    """Create a dialogue where user wants to join a {sport} community, league, or regular group. 
    Assistant suggests how to get involved.""",
    
    # Multi-turn Complex
    """Generate a MULTI-TURN conversation (4-6 exchanges) where user initially has vague request 
    like 'I want to start playing {sport}' and assistant asks clarifying questions about 
    level, location, goals, then provides personalized recommendations.""",
    
    """Create a MULTI-TURN dialogue where user reports a problem or bug in the app. 
    Assistant troubleshoots step-by-step and escalates if needed.""",
]

# Value pools sampled when filling the placeholders of the prompt templates.
SPORTS = ["padel", "squash", "tennis"]

# Every unordered pairing of two different sports.
SPORT_PAIRS = [
    ("padel", "tennis"),
    ("padel", "squash"),
    ("tennis", "squash"),
]

SKILL_LEVELS = [
    "beginner",
    "intermediate",
    "advanced",
    "professional",
]

# Free-text time phrases for booking scenarios.
TIME_CONTEXTS = [
    "tomorrow morning",
    "this weekend",
    "next week",
    "tonight",
    "Friday evening",
    "weekday afternoon",
    "recurring weekly",
]

In [29]:
import random

In [30]:
def generate_prompt(template_idx=None):
    """Build a complete data-generation prompt from one scenario template.

    Args:
        template_idx: Index into ``PROMPT_TEMPLATES``. It is wrapped modulo the
            number of templates, so any integer is accepted. ``None`` picks a
            template at random.

    Returns:
        The instruction prompt as a string: the filled-in scenario plus the
        JSON output format and the style requirements.
    """
    if template_idx is None:
        template = random.choice(PROMPT_TEMPLATES)
    else:
        template = PROMPT_TEMPLATES[template_idx % len(PROMPT_TEMPLATES)]

    # Comparison templates need two *different* sports. The previous filter
    # compared each sport with the template text preceding "{sport1}", which
    # never matches a sport name, so both placeholders could get the same sport.
    sport1, sport2 = random.sample(SPORTS, 2)

    # str.format ignores keyword arguments the template does not use, so every
    # placeholder value can be supplied unconditionally.
    scenario = template.format(
        sport=random.choice(SPORTS),
        sport1=sport1,
        sport2=sport2,
        skill_level=random.choice(SKILL_LEVELS),
        time_context=random.choice(TIME_CONTEXTS),
    )

    # NOTE(review): the MULTI-TURN templates ask for 4-6 exchanges, but the JSON
    # schema below only has one "user"/"assistant" pair — confirm how multi-turn
    # conversations are meant to be encoded.
    full_prompt = f"""You are a data generation expert. Create a realistic, natural conversation for a mobile app assistant. 

CONTEXT:  {scenario}

Generate the conversation in this EXACT JSON format (no markdown, no extra text):
{{
  "user":  "user's message or question",
  "assistant": "helpful, concise assistant response",
  "context": "brief scenario description",
  "intent": "primary intent (e.g., booking, matchmaking, rules_question, equipment_advice, technique_help, app_navigation)",
  "entities": {{"sport":  "padel/squash/tennis", "skill_level": "if mentioned", "time":  "if mentioned"}}
}}

Requirements:
- Natural, conversational language
- Mobile-friendly length (assistant response:  2-4 sentences max, unless explaining complex concept)
- Specific and actionable
- Diverse vocabulary and phrasing
- Include realistic user errors, typos occasionally (10% of time)

Generate ONE conversation now: """

    return full_prompt

In [33]:
prompt = generate_prompt(0)

In [32]:
import time

In [39]:
def call_hf_api(prompt, max_retries=3, timeout=30):
    """Send ``prompt`` as a single user message to the HF router, with retries.

    Connection errors, timeouts, HTTP 429 and HTTP 5xx responses are retried
    with exponential backoff (1s, 2s, 4s, ...). Other HTTP errors are raised
    immediately because repeating the same request would not help.

    Args:
        prompt: Text placed in the ``content`` of the user message.
        max_retries: Maximum number of attempts; values below 1 are treated as 1.
        timeout: Per-request timeout in seconds passed to ``requests.post``.

    Returns:
        The decoded JSON body of the chat-completions response.

    Raises:
        KeyError: If the ``HF_TOKEN`` environment variable is not set.
        requests.RequestException: If the last attempt fails to connect, times
            out, or returns an HTTP error status.
    """
    api_url = "https://router.huggingface.co/v1/chat/completions"
    request_headers = {
        "Authorization": f"Bearer {os.environ['HF_TOKEN']}",
    }
    # Sampling settings were previously built in a text-generation style
    # "inputs"/"parameters" dict that was never sent; they are forwarded here
    # as their chat-completions equivalents instead.
    payload = {
        "messages": [
            {
                "role": "user",
                "content": prompt,
            }
        ],
        "model": "Qwen/Qwen2.5-7B-Instruct:together",
        "max_tokens": 800,
        "temperature": 0.85,
        "top_p": 0.92,
    }

    attempts = max(1, max_retries)
    for attempt in range(attempts):
        is_last = attempt == attempts - 1
        try:
            response = requests.post(
                api_url, headers=request_headers, json=payload, timeout=timeout
            )
        except (requests.ConnectionError, requests.Timeout):
            if is_last:
                raise
        else:
            retryable = response.status_code == 429 or response.status_code >= 500
            if not retryable or is_last:
                # Surface HTTP errors instead of returning an error body that
                # would only fail later on response["choices"].
                response.raise_for_status()
                return response.json()
        # Back off before the next attempt: 1s, 2s, 4s, ...
        time.sleep(2 ** attempt)

In [41]:
response = call_hf_api(prompt)

In [None]:
response

{'id': 'oRhrUmV-62bZhn-9b7bf4d21b0e4e62',
 'object': 'chat.completion',
 'created': 1767374733,
 'model': 'Qwen/Qwen2.5-7B-Instruct-Turbo',
 'prompt': [],
 'choices': [{'finish_reason': 'stop',
   'seed': 9222958021129199000,
   'index': 0,
   'logprobs': None,
   'message': {'role': 'assistant',
    'content': '{\n  "user": "Hey, can I book a squash court for next week?",\n  "assistant": "Sure thing! When exactly next week are you thinking about playing?",\n  "context": "User wants to book a squash court for next week.",\n  "intent": "booking",\n  "entities": {"sport": "squash", "time": "next week"}\n}',
    'tool_calls': []}}],
 'usage': {'prompt_tokens': 268,
  'completion_tokens': 79,
  'total_tokens': 347,
  'cached_tokens': 0}}

In [54]:
x = response["choices"][0]["message"]['content']
x

'{\n  "user": "Hey, can I book a squash court for next week?",\n  "assistant": "Sure thing! When exactly next week are you thinking about playing?",\n  "context": "User wants to book a squash court for next week.",\n  "intent": "booking",\n  "entities": {"sport": "squash", "time": "next week"}\n}'

In [57]:
print(x)

{
  "user": "Hey, can I book a squash court for next week?",
  "assistant": "Sure thing! When exactly next week are you thinking about playing?",
  "context": "User wants to book a squash court for next week.",
  "intent": "booking",
  "entities": {"sport": "squash", "time": "next week"}
}


In [59]:
x = json.loads(x)

In [None]:
import json

def convert_to_training_format(raw_example):
    """Render one generated example in three fine-tuning layouts.

    Args:
        raw_example: Mapping with at least the keys ``"user"`` and
            ``"assistant"``.

    Returns:
        A 3-tuple ``(chatml_format, instruction_format, text_format)``:
        a ``{"messages": [...]}`` dict with system/user/assistant turns, an
        ``instruction``/``input``/``output`` dict, and a ``{"text": ...}`` dict
        holding the turns wrapped in ``<|im_start|>`` / ``<|im_end|>`` markers.
    """
    user_text = raw_example["user"]
    assistant_text = raw_example["assistant"]

    # Chat-style layout: one dict per turn.
    turns = [
        {
            "role": "system",
            "content": "You are a helpful assistant for a sports booking app. Help users book courts, find partners, and answer questions about padel, squash, and tennis.",
        },
        {"role": "user", "content": user_text},
        {"role": "assistant", "content": assistant_text},
    ]
    chatml_format = {"messages": turns}

    # Instruction-tuning layout.
    instruction_format = dict(
        instruction="You are a helpful assistant for a sports booking app.",
        input=user_text,
        output=assistant_text,
    )

    # Flat text layout: one line per marker/turn, joined with newlines.
    text_lines = [
        "<|im_start|>system",
        "You are a helpful assistant for a sports booking app. <|im_end|>",
        "<|im_start|>user",
        f"{user_text}<|im_end|>",
        "<|im_start|>assistant",
        f"{assistant_text}<|im_end|>",
    ]
    text_format = {"text": "\n".join(text_lines)}

    return chatml_format, instruction_format, text_format

In [61]:
convert_to_training_format(x)

({'messages': [{'role': 'system',
    'content': 'You are a helpful assistant for a sports booking app. Help users book courts, find partners, and answer questions about padel, squash, and tennis.'},
   {'role': 'user',
    'content': 'Hey, can I book a squash court for next week?'},
   {'role': 'assistant',
    'content': 'Sure thing! When exactly next week are you thinking about playing?'}]},
 {'instruction': 'You are a helpful assistant for a sports booking app.',
  'input': 'Hey, can I book a squash court for next week?',
  'output': 'Sure thing! When exactly next week are you thinking about playing?'},
 {'text': '<|im_start|>system\nYou are a helpful assistant for a sports booking app. <|im_end|>\n<|im_start|>user\nHey, can I book a squash court for next week?<|im_end|>\n<|im_start|>assistant\nSure thing! When exactly next week are you thinking about playing?<|im_end|>'})