In [None]:
from chat_models.chat_unsloth import ChatUnsloth

# Instantiate the project's ChatUnsloth wrapper for the checkpoint named below and
# bind it to `llm`, the name the inference loop calls `.invoke` on.
# NOTE(review): the model id suggests a 4-bit, hotel-booking fine-tune of Qwen 7B
# Instruct — confirm against the model card.
llm = ChatUnsloth(model_path="pashaprokaz/qwen-7b-instruct-hotel-booking-4bit-v2")


In [None]:
import json

# Load the validation examples. The encoding is pinned to UTF-8 so the file is
# decoded the same way on every platform instead of with the locale default.
with open("val_data.json", "r", encoding="utf-8") as f:
    val_data = json.load(f)

# Displayed as the cell output: number of validation examples.
len(val_data)

In [None]:
# Prompt template sent to the model for every validation example. The only
# placeholder is `{user_input}`, filled with `prompt.format(user_input=...)`.
# NOTE(review): the `book_hotel_tool` description refers to `search_hotels_mock`,
# while the tool listed above it is `search_hotels_tool` — confirm which name is
# intended before touching it; the text is part of what the model sees.
# NOTE(review): "Current time" is a fixed literal, so relative dates in user
# requests are resolved against 2024-06-18 — confirm this matches val_data.json.
prompt = """
Current time: 2024-06-18 17:45:12 UTC (Tuesday)
You are an assistant who can search and book a hotel for a user.
Hotel search and reservations are made through the use of the following tools:

search_hotels_tool(location: str, checkin_date: str, checkout_date: str, adults_number: int, children_number: int = 0, order_by: str = 'popularity', min_rating=None, min_price=None, max_price=None) -> List - Search for hotels in a given parameters.
book_hotel_tool(id: int) -> str - Book hotel by a provided id. Id can be obtained from search_hotels_mock.

 Here is a list of possible parameters and their values:
-location: str -> often just the name of the city
-checkin_date: str -> format: YYYY-MM-DD
-checkout_date: str -> format: YYYY-MM-DD
-adults_number: int
-children_number: int
-min_rating: int -> must be in the range from 0 to 10
-min_price: int
-max_price: int
-order_by: str -> possible values: popularity, price, rating
-id: int -> it is used exclusively when booking, when the search has already been called

Given the user input, return the name and input of the tool to use. Return your response as a JSON blob with 'name' and 'arguments' keys.

The `arguments` should be a dictionary, with keys corresponding to the argument names and the values corresponding to the requested values.
If the user has not provided some important information, you still need to send a json blob.
Don't make up the argument values yourself! Take only what the user specified!

{user_input}
"""


In [None]:
from tqdm import tqdm
from utils import str_to_json

# Query whichever model `llm` currently refers to with each validation
# instruction and store the reply after passing it through `str_to_json`.
results_finetuned = []
for item in tqdm(val_data):
    filled_prompt = prompt.format(user_input=item['instruction'])
    raw_reply = llm.invoke(filled_prompt)
    results_finetuned.append(str_to_json(raw_reply))

In [None]:
# Write the collected `results_finetuned` list to disk as indented JSON.
with open('val_results_qwen7b-hotel-instruct-v2.json', mode='w') as results_file:
    json.dump(results_finetuned, fp=results_file, indent=4)

In [None]:
# Rebind `llm` to the checkpoint named below; any cell executed after this one
# that calls `llm.invoke` uses this model rather than the one bound earlier.
# NOTE(review): presumably the un-fine-tuned baseline used for comparison — confirm.
llm = ChatUnsloth(model_path="unsloth/Qwen2-7B-Instruct-bnb-4bit")

In [None]:
# Query whichever model `llm` currently refers to with each validation
# instruction and store the reply after passing it through `str_to_json`.
results_qwen = []
for item in tqdm(val_data):
    filled_prompt = prompt.format(user_input=item['instruction'])
    raw_reply = llm.invoke(filled_prompt)
    results_qwen.append(str_to_json(raw_reply))

In [None]:
# Write the collected `results_qwen` list to disk as indented JSON.
with open('val_results_qwen7b.json', mode='w') as results_file:
    json.dump(results_qwen, fp=results_file, indent=4)

In [2]:
import json
from utils import str_to_json

def _read_json(path):
    """Parse the JSON file at ``path`` and close the file handle afterwards."""
    # `with` guarantees the handle is released; the encoding is pinned so the
    # read does not depend on the platform's locale default.
    with open(path, "r", encoding="utf-8") as json_file:
        return json.load(json_file)


# Saved model outputs for both models plus the validation set itself.
qwen7b_results = _read_json("val_results_qwen7b.json")
qwen7b_finetuned = _read_json("val_results_qwen7b-hotel-instruct-v2.json")
val_data = _read_json("val_data.json")

# The metrics function works on strings: `str()` turns each saved entry into its
# Python repr, so an entry saved as null becomes the literal string "None".
qwen7b_results_str = [str(d) for d in qwen7b_results]
qwen7b_finetuned_str = [str(d) for d in qwen7b_finetuned]
# Reference answers come from the "output" field of each validation example.
val_data_outputs = [d["output"] for d in val_data]

In [3]:
def _argument_match_score(pred, ref):
    """Return the share of reference arguments that one prediction reproduces.

    Args:
        pred: Result of ``str_to_json`` for a model output. May be ``None`` or
            any other shape, since it comes from free-form model text.
        ref: Parsed reference; indexed with ``"name"`` and ``"arguments"``.

    Returns:
        0.0 when the prediction is not a dict, lacks a ``"name"`` equal to the
        reference's, or its ``"arguments"`` is not a dict. Otherwise the number
        of reference arguments whose key and value both appear in the prediction
        divided by the number of reference arguments. Extra predicted arguments
        are ignored. A reference without arguments scores 1.0 on a name match.
    """
    if not isinstance(pred, dict) or "name" not in pred or pred["name"] != ref["name"]:
        return 0.0

    pred_args = pred.get("arguments")
    if not isinstance(pred_args, dict):
        # Malformed blob: right tool name but no usable argument mapping.
        return 0.0

    ref_args = ref["arguments"]
    if not ref_args:
        # Nothing to reproduce; avoid dividing by zero.
        return 1.0

    matched = sum(
        1
        for key, expected in ref_args.items()
        if key in pred_args and pred_args[key] == expected
    )
    return matched / len(ref_args)


def calculate_metrics(predictions, references):
    """Compare predicted tool-call strings against reference strings.

    Args:
        predictions: Sequence of strings, one per example. The literal string
            ``"None"`` is counted as a prediction that failed to parse.
        references: Sequence of reference strings, same length as
            ``predictions``.

    Returns:
        A dict with keys ``"rouge"`` and ``"bleu"`` (results of the respective
        ``datasets`` metrics; BLEU is computed on whitespace-split tokens),
        ``"nulls"`` (``nulls_ratio`` and ``nulls_count`` of ``"None"``
        predictions) and ``"precision__on_json"`` (mean per-example share of
        reference arguments matched, see ``_argument_match_score``).

    Raises:
        ValueError: If the inputs are empty or differ in length.
    """
    # NOTE(review): `datasets.load_metric` is deprecated in recent `datasets`
    # releases in favour of the separate `evaluate` package — confirm the pinned
    # version still provides it.
    from datasets import load_metric

    if len(predictions) == 0:
        raise ValueError("calculate_metrics needs at least one prediction")
    if len(predictions) != len(references):
        raise ValueError(
            f"predictions ({len(predictions)}) and references ({len(references)}) "
            "must have the same length"
        )

    # ROUGE
    rouge = load_metric("rouge", trust_remote_code=True)
    rouge_result = rouge.compute(predictions=predictions, references=references)

    # BLEU expects token lists, with a list of candidate references per example.
    bleu = load_metric("bleu", trust_remote_code=True)
    references_bleu = [[ref.split()] for ref in references]
    predictions_bleu = [pred.split() for pred in predictions]
    bleu_result = bleu.compute(predictions=predictions_bleu, references=references_bleu)

    # nulls (errors while parsing json blob)
    nulls_count = sum(item == "None" for item in predictions)
    nulls_ratio = nulls_count / len(predictions)

    # precision on json arguments
    predictions_jsons = [str_to_json(pred) for pred in predictions]
    references_jsons = [str_to_json(ref) for ref in references]

    precision_sum = sum(
        _argument_match_score(pred, ref)
        for pred, ref in zip(predictions_jsons, references_jsons)
    )
    precision_on_json = precision_sum / len(predictions_jsons)

    return {
        "rouge": rouge_result,
        "bleu": bleu_result,
        "nulls": {"nulls_ratio": nulls_ratio, "nulls_count": nulls_count},
        "precision__on_json": precision_on_json,
    }


In [5]:
# Score the predictions loaded into `qwen7b_finetuned_str` against the reference outputs.
calculate_metrics(predictions=qwen7b_finetuned_str, references=val_data_outputs)

Downloading builder script: 6.06kB [00:00, 12.1MB/s]                   
Downloading extra modules: 4.07kB [00:00, 8.82MB/s]                   


{'rouge': {'rouge1': AggregateScore(low=Score(precision=0.9314757949873719, recall=0.7869516898164721, fmeasure=0.8401867769029531), mid=Score(precision=0.9457516581142829, recall=0.8152974714404551, fmeasure=0.8624235833105862), high=Score(precision=0.9592311020044126, recall=0.8444324472442732, fmeasure=0.8825403598956094)),
  'rouge2': AggregateScore(low=Score(precision=0.869927013173685, recall=0.745929342077654, fmeasure=0.7955970002657351), mid=Score(precision=0.8937775854449153, recall=0.7756688569481561, fmeasure=0.8194466193846981), high=Score(precision=0.9150814071478635, recall=0.80572531766845, fmeasure=0.8440823984054586)),
  'rougeL': AggregateScore(low=Score(precision=0.912896708613124, recall=0.7697938592045381, fmeasure=0.8237513707290158), mid=Score(precision=0.929850303886506, recall=0.8013452328731068, fmeasure=0.8482072887920368), high=Score(precision=0.9451985923965911, recall=0.8279592537261158, fmeasure=0.8689162468450834)),
  'rougeLsum': AggregateScore(low=Sco

In [6]:
# Score the predictions loaded into `qwen7b_results_str` against the reference outputs.
calculate_metrics(predictions=qwen7b_results_str, references=val_data_outputs)

{'rouge': {'rouge1': AggregateScore(low=Score(precision=0.8071773741279352, recall=0.7426246663940608, fmeasure=0.756527201325115), mid=Score(precision=0.8389705433466654, recall=0.7789693304690096, fmeasure=0.7887927278596703), high=Score(precision=0.8690185537697652, recall=0.8103621867618795, fmeasure=0.8175269270378688)),
  'rouge2': AggregateScore(low=Score(precision=0.692019229679848, recall=0.6509876426333121, fmeasure=0.661182815737192), mid=Score(precision=0.7289221309555962, recall=0.6884418245087005, fmeasure=0.6965315432744363), high=Score(precision=0.7617281826261251, recall=0.7191282121571464, fmeasure=0.7254679296619316)),
  'rougeL': AggregateScore(low=Score(precision=0.785782217974349, recall=0.7226537463419145, fmeasure=0.7352121357940978), mid=Score(precision=0.8212541943118195, recall=0.7596278721370863, fmeasure=0.7705980672034578), high=Score(precision=0.8524457948912171, recall=0.7922360431587748, fmeasure=0.8007684949960256)),
  'rougeLsum': AggregateScore(low=S