In [1]:
from datasets import Dataset, DatasetDict 
import openai
import json
import os  
import pydantic
from typing import Optional

In [2]:
import getpass

# Prompt for the OpenAI API key interactively so the secret is never hardcoded in the notebook source.
openai_api_key = getpass.getpass('Enter your API KEY')

Enter your API KEY ········


In [28]:
# Export the key through the environment. No key is passed explicitly when the chat model is
# created later, so it is presumably read from OPENAI_API_KEY — confirm against the client docs.
os.environ["OPENAI_API_KEY"] = openai_api_key


In [55]:
# One generated example for the "clerk" assistant: what the clerk observes (UI state plus either
# a user query or an action result) and what it is expected to produce in response.
# NOTE: documented with `#` comments rather than a docstring, because pydantic copies a model's
# docstring into its JSON schema and that schema feeds the prompt's format instructions.
class ClerkSituationModel(pydantic.BaseModel):
    # --- Inputs observed by the clerk ---
    ui_state: str = pydantic.Field(description="A description of the current UI state, in human language")
    # The next two are described as mutually exclusive, so each must be allowed to be absent.
    # An explicit default=None (as on the output fields below) keeps them optional under
    # pydantic v2, where Optional[...] without a default is a *required* field.
    user_query: Optional[str] = pydantic.Field(default=None, description="The query from the user, mutually exclusive with action_response")
    action_response: Optional[str] = pydantic.Field(default=None, description="The result of the action, mutually exclusive with user_query")

    # --- Outputs expected from the clerk ---
    action_request: Optional[str] = pydantic.Field(default=None, description="The backend action the clerk will take, this usually results in a 'Please hold, Please wait' style response_to_human message")
    response_to_human: Optional[str] = pydantic.Field(default=None, description="What the Clerk is expected to 'say' to the user, if there is an action to be taken, this should be 'Please wait' or some variation on that")
    ui_change_request: Optional[str] = pydantic.Field(default=None, description="Changes to the UI/Presentation if any, should be in a human language, and should provide details/urls if any, this should be 'Show loading' or some variation, if there is an action to be taken")

# A batch of generated situations; this is the top-level object the output parser is configured to produce.
class ClerkSituations(pydantic.BaseModel):
    # Uses the built-in generic `list[...]` (like `dict[...]` in ClerkSituationsCache) instead of
    # `typing.List`, which is only imported in a later cell and made this cell raise NameError
    # when the notebook is run top to bottom on a fresh kernel.
    situations: list[ClerkSituationModel]

# Serializable cache of generated situations, keyed by scenario text
# (the main loop stores entries under each stripped line of the scenarios file).
class ClerkSituationsCache(pydantic.BaseModel):
    # scenario string -> situations generated for it; starts out empty.
    cache: dict[str, ClerkSituations] = {}
    

In [37]:
from typing import List

from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field, validator
from langchain_openai import ChatOpenAI

# Chat model with temperature pinned to 0.
model = ChatOpenAI(temperature=0)

# The parser both validates the model's reply against ClerkSituations and
# supplies the format instructions that get baked into the prompt.
parser = PydanticOutputParser(pydantic_object=ClerkSituations)

prompt = PromptTemplate(
    input_variables=["query", "num_situations"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
    template=(
        "You are an expert data creator, generate {num_situations} situations for the following scenario.\n"
        "{query}\n"
        "{format_instructions}\n"
    ),
)

# Pipeline: fill the prompt -> call the chat model -> parse the reply.
chain = prompt | model | parser



In [38]:
# Smoke-test the chain on one hand-written scenario (this invokes the chat model).
response = chain.invoke({"num_situations": "5", "query": 'A user is using a weather app , the clerk is a weather assistant'})

In [39]:
# Display the parsed output of the chain for a quick visual check.
response



In [56]:

def load_cache(cache_file):
    """Load the situations cache stored as JSON at ``cache_file``.

    A missing file is not an error: an empty ``ClerkSituationsCache`` is
    returned instead, so a first run simply starts with nothing cached.
    """
    if not os.path.exists(cache_file):
        return ClerkSituationsCache()

    with open(cache_file, 'r') as cache_fh:
        stored = json.load(cache_fh)
        # Re-validate the raw JSON into the pydantic cache model.
        return ClerkSituationsCache(**stored)

def save_cache(cache_file, cache_data):
    """Write ``cache_data`` to ``cache_file`` as JSON, replacing any existing content.

    ``cache_data`` must expose a ``dict()`` method returning a JSON-serializable mapping.
    """
    with open(cache_file, 'w') as cache_fh:
        serialized = json.dumps(cache_data.dict())
        cache_fh.write(serialized)


In [65]:

# Main script logic
# Main script logic: build the dataset from the scenarios file (one scenario per line),
# reusing cached generations so each scenario is only sent to the model once.
scenarios_file = "scenarios.txt"
output_dataset_path = "datasets/"
cache_file = "scenario_cache.json"

cache = load_cache(cache_file)
dataset = []

with open(scenarios_file, 'r') as f:
    try:
        for scenario_line in f:
            scenario = scenario_line.strip()
            if not scenario:
                # A blank line would otherwise trigger a model call with an empty query.
                continue

            # The lookup table is the model's `cache` field. `scenario in cache` iterates the
            # pydantic model's (field, value) pairs, so it never matched and every scenario was
            # regenerated; `cache[scenario]` would have raised TypeError had it ever been reached.
            if scenario in cache.cache:
                dataset.extend(cache.cache[scenario].situations)  # Load from cache
            else:
                scenario_data = chain.invoke({"num_situations": "5", "query": scenario})
                dataset.extend(scenario_data.situations)  # Process and add to dataset
                cache.cache[scenario] = scenario_data  # Update the cache
    finally:
        # Persist even when a generation fails part-way, so scenarios that already
        # completed are not regenerated on the next run.
        save_cache(cache_file, cache)



In [66]:
# Display the collected situations (a list of ClerkSituationModel records).
dataset

[ClerkSituationModel(ui_state='The user opens the weather app for the first time', user_query='', action_response='', action_request='Fetch current location and weather data', response_to_human='Please wait while we fetch the current weather for your location', ui_change_request='Display loading spinner on screen'),
 ClerkSituationModel(ui_state='The user searches for a specific location', user_query='New York City weather', action_response='', action_request='Fetch weather data for New York City', response_to_human='Please wait while we fetch the weather for New York City', ui_change_request='Display loading spinner on screen'),
 ClerkSituationModel(ui_state='The user switches from Celsius to Fahrenheit', user_query='Switch to Fahrenheit', action_response='Temperature units changed to Fahrenheit', action_request='', response_to_human='Temperature units changed to Fahrenheit', ui_change_request='Update temperature display to show Fahrenheit'),
 ClerkSituationModel(ui_state='The user cl

In [70]:
# Convert each pydantic record to a plain dict and build a Hugging Face Dataset from the list.
hf_dataset = Dataset.from_list([r.dict() for r in dataset])  

In [72]:
from huggingface_hub import notebook_login
# Interactive Hugging Face Hub login (renders a widget); done before the push_to_hub call below.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [74]:
# Upload the dataset to the Hub under this repo id; private=True keeps the repository non-public.
hf_dataset.push_to_hub("m0ejay/aiui-clerk-small-set", private=True)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/m0ejay/aiui-clerk-small-set/commit/2fdcdf8f16eefa731ffcf879e6aecaa907e6953f', commit_message='Upload dataset', commit_description='', oid='2fdcdf8f16eefa731ffcf879e6aecaa907e6953f', pr_url=None, pr_revision=None, pr_num=None)

In [78]:
 # Spot-check a single record (index 2) of hf_dataset.
 hf_dataset[2]

{'ui_state': 'The user receives a severe weather alert',
 'user_query': '',
 'action_request': '',
 'ui_change_request': 'Display emergency alert notification on screen'}