# generate user requests

In [None]:
## data_generation
from langchain_community.chat_models import ChatFireworks
from langchain_core.output_parsers import NumberedListOutputParser


# Prompt sent verbatim to the generator model. It asks for 20 synthetic
# hotel-booking requests per call and shows a numbered example list, which
# matches the NumberedListOutputParser used on the model's reply in this file.
system_generate_prompt = """
You create a dataset of calls to a chatbot that can book hotels

Generate 20 different requests. 
Each request may or may not include different content: destination, check-in and check-out time, number of adults and children or just guests, various additional information, wishes about the rating of the hotel (according to a 10-point system) and about the stars of the hotel, about the price, etc.

a few examples:
1. Find me cheap hotels in Prague for the period from July 23 to July 27
2. Find four-star hotels in Paris
3. book me the cheapest hotel in the center of Moscow for the period from August 12th to 19th, I'm just one guest
"""


# First generator model: the qwen2-72b-instruct endpoint on Fireworks.
llm = ChatFireworks(model="accounts/fireworks/models/qwen2-72b-instruct")

In [None]:
# Compose the model with a parser for numbered-list replies.
list_parser = NumberedListOutputParser()
chain = llm | list_parser

# Single trial call; as the last expression of the cell its result is shown.
chain.invoke(system_generate_prompt)

In [None]:
import time

# Accumulates the generated requests from every generator call.
request_dataset = []

# Collect samples from the 72b model: 20 calls with a half-second pause
# after each one.
for _round in range(20):
    generated = chain.invoke(system_generate_prompt)
    request_dataset.extend(generated)
    time.sleep(.5)

In [None]:
# Repeat the collection with the 7b model; `llm` and `chain` are rebound,
# and the new samples are added to the same request_dataset list.
llm = ChatFireworks(model="accounts/fireworks/models/qwen2-7b-instruct")
chain = llm | NumberedListOutputParser()

for _round in range(20):
    generated = chain.invoke(system_generate_prompt)
    request_dataset.extend(generated)
    time.sleep(.5)

In [None]:
import random
import json

# Sampling the full length without replacement yields a shuffled copy;
# request_dataset itself keeps its original order.
shuffled_request_dataset = random.sample(request_dataset, k=len(request_dataset))

# Persist the shuffled requests.
with open("shuffled_request_dataset.json", "w") as out_file:
    out_file.write(json.dumps(shuffled_request_dataset, indent=4))

# Generate model answers (tool-call JSON) for each request

In [None]:
from langchain_core.prompts import ChatPromptTemplate

# System prompt for the tool-calling step.
# - {rendered_tools} is a template slot; the tool descriptions are supplied
#   when the chain is invoked.
# - The "Current time" line is a fixed literal, not the real clock, so any
#   relative dates in requests are resolved against 2024-06-18.
# - The model is told to answer with a JSON blob holding 'name' and
#   'arguments', and not to invent argument values.
search_hotels_system_prompt = """\
Current time: 2024-06-18 17:45:12
You are an assistant who can search and book a hotel for a user.
Hotel search and reservations are made through the use of the following tools:

{rendered_tools}

 Here is a list of possible parameters and their values:
-location: str -> often just the name of the city
-checkin_date: str -> format: YYYY-MM-DD
-checkout_date: str -> format: YYYY-MM-DD
-adults_number: int
-children_number: int
-min_rating: int -> must be in the range from 0 to 10
-min_price: int
-max_price: int
-order_by: str -> possible values: popularity, price, rating
-id: int -> it is used exclusively when booking, when the search has already been called

Given the user input, return the name and input of the tool to use. Return your response as a JSON blob with 'name' and 'arguments' keys.

The `arguments` should be a dictionary, with keys corresponding to the argument names and the values corresponding to the requested values.
If the user has not provided some important information, you still need to send a json blob.
Don't make up the argument values yourself! Take only what the user specified!
"""

# Two-message chat template: the system message carries the tool
# instructions, the user message carries the raw request via {input}.
prompt = ChatPromptTemplate.from_messages([
    ("system", search_hotels_system_prompt),
    ("user", "{input}"),
])



In [None]:
from tools import render_text_description, book_hotel_tool, search_hotels_tool

tools = [book_hotel_tool, search_hotels_tool]

# Answers from the 72b model.
llm = ChatFireworks(model="accounts/fireworks/models/qwen2-72b-instruct")
chain = prompt | llm

# Text description of the tools for the prompt's {rendered_tools} slot;
# the "data_loader" parameter is excluded from the description.
rendered_tools = render_text_description(
    tools, ignored_parameters=["data_loader"]
)

# Raw reply text, one entry per request, in shuffled_request_dataset order.
# Appending inside a plain loop keeps the partial results if a call fails.
gpt_outputs_72b = []

for request in shuffled_request_dataset:
    payload = {"input": request, "rendered_tools": rendered_tools}
    reply = chain.invoke(payload)
    gpt_outputs_72b.append(reply.content)


# Same pass with the 7b model; `llm` and `chain` are rebound here.
llm = ChatFireworks(model="accounts/fireworks/models/qwen2-7b-instruct")
chain = prompt | llm

# Raw reply text, one entry per request, in shuffled_request_dataset order.
gpt_outputs_7b = []

for request in shuffled_request_dataset:
    payload = {"input": request, "rendered_tools": rendered_tools}
    reply = chain.invoke(payload)
    gpt_outputs_7b.append(reply.content)



In [None]:
from utils import str_to_json

# Run every raw model reply through the project's str_to_json helper
# (presumably it returns the parsed JSON object — confirm against utils).
gpt_outputs_7b_json = list(map(str_to_json, gpt_outputs_7b))
gpt_outputs_72b_json = list(map(str_to_json, gpt_outputs_72b))

In [None]:
# Prompt for the judging step: the model sees one user request plus two
# candidate answers and must write a corrected JSON blob.
# Template slots: {user_input}, {answer_1}, {answer_2}.
# The parameter list here has no `id` entry, unlike the list in
# search_hotels_system_prompt. As in that prompt, the "Current time" line
# is a fixed literal.
system_judgement_prompt = """
Current time: 2024-06-18 17:45:12
You will be presented with a user request and two responses from different gpt.

You need to write a new one based on these answers, either taking the best one or refining the best one.

The responses will be in the form of a json blob.

A common problem with current gpt responses is when different argument values are thought out.
Examples:
-The user did not specify the exact date, but gpt prescribed it.
-The user did not specify the exact number of guests, but gpt thought of it and prescribed this value.
, and so on..

You must correct these errors, if they exist, and write a new, perfect answer.

Here is a list of possible parameters and their values:
-location: str -> often just the name of the city
-checkin_date: str -> format: YYYY-MM-DD
-checkout_date: str -> format: YYYY-MM-DD
-adults_number: int
-children_number: int
-min_rating: int -> must be in the range from 0 to 10
-min_price: int
-max_price: int
-order_by: str -> possible values: popularity, price, rating

User request: {user_input}

Answer 1: {answer_1}

Answer 2: {answer_2}

Your corrected version of the answer in json blob:
"""

In [None]:
import re


# Single-message template: despite its name, system_judgement_prompt is sent
# as a user message.
prompt = ChatPromptTemplate.from_messages(
    [
        ("user", system_judgement_prompt),
    ]
)

# NOTE(review): `llm` is whatever was bound last; in this file that is the
# 7b model. Confirm the smaller model is really the intended judge.
chain = prompt | llm

# One entry per request, in shuffled_request_dataset order. A failed
# judgement is stored as None so the list stays aligned with the requests.
judge_answers = []


for d1, d2, request in zip(gpt_outputs_7b_json, gpt_outputs_72b_json, shuffled_request_dataset):
    try:
        judge_answer = chain.invoke(
            {
                "user_input": request,
                "answer_1": d1,
                "answer_2": d2
            }
        ).content

        judge_answers.append(str_to_json(judge_answer))
    except Exception as e:
        # Best-effort batch run: report the failure and keep going.
        print(e)
        # Fixed: the placeholder goes into the result list `judge_answers`.
        # The original appended to `judge_answer` (a str, or unbound on the
        # first iteration), which raised inside the handler itself.
        judge_answers.append(None)

In [None]:

# Save the judged answers to disk.
with open("judge_answers.json", "w") as out_file:
    out_file.write(json.dumps(judge_answers, indent=4))

In [None]:
import inspect

# Introspect the call signature of the search tool.
sig = inspect.signature(search_hotels_tool)
# Mapping of parameter name -> inspect.Parameter; used in this file as the
# set of argument names accepted in a judged answer.
search_parameters = sig.parameters

# Bare expression: displays the mapping when run as a notebook cell.
search_parameters

In [None]:
import copy

# Normalised copies of the judged answers, index-aligned with judge_answers:
# - 'name' is forced to the search tool's name,
# - 'arguments' keeps only keys that are parameters of the search tool.
# Entries that are not dicts (e.g. a None placeholder for a failed
# judgement) are carried over as None, so zipping this list with the
# requests still pairs each request with its own answer.
corrected_judge_answers = []

for judge_answer in judge_answers:
    if not isinstance(judge_answer, dict):
        corrected_judge_answers.append(None)
        continue

    # Deep copy so judge_answers itself is never modified.
    corrected = copy.deepcopy(judge_answer)
    corrected['name'] = search_hotels_tool.__name__

    # A missing or malformed 'arguments' value is treated as "no arguments"
    # instead of raising.
    arguments = corrected.get('arguments')
    if not isinstance(arguments, dict):
        arguments = {}
    corrected['arguments'] = {
        key: value
        for key, value in arguments.items()
        if key in search_parameters
    }

    corrected_judge_answers.append(corrected)

# Bare expression: displays the result when run as a notebook cell.
corrected_judge_answers

In [None]:
# Instruction-tuning records: one {"instruction", "input", "output"} dict
# per request that has a usable judged answer.
booking_dataset = []
for request, judge_answer in zip(shuffled_request_dataset, corrected_judge_answers):
    # Skip requests with no judged answer; otherwise the literal text
    # "None" would be written as a training target.
    if judge_answer is None:
        continue
    booking_dataset.append(
        {
            "instruction": request,
            "input": "",
            # Serialise as real JSON. str(dict) yields a Python repr (single
            # quotes, None/True/False), which is not the JSON blob the
            # prompts ask the model to produce. ensure_ascii=False keeps
            # non-ASCII text (e.g. city names) readable in the target.
            "output": json.dumps(judge_answer, ensure_ascii=False),
        }
    )

with open("booking_dataset.json", "w") as f:
    json.dump(booking_dataset, f, indent=4)

In [None]:
# Positional 80/20 split: the first 80% of records (rounded down) go to
# training, the rest to validation.
split_at = int(len(booking_dataset) * .8)
train_data, val_data = booking_dataset[:split_at], booking_dataset[split_at:]

In [None]:
# Write each split to its own JSON file, train first and then validation.
for file_name, subset in (("train_data.json", train_data), ("val_data.json", val_data)):
    with open(file_name, "w") as out_file:
        json.dump(subset, out_file, indent=4)