In [45]:
import pandas as pd
import os
from groq import Groq
client = Groq(

    api_key=os.environ.get("GROQ_API_KEY"),

)

In [63]:
df = pd.read_csv('../data/travel_data.csv')

In [64]:

# Insert ID column at the first position
df.insert(0, 'id', range(1, len(df) + 1))
# df['id'] = range(1, len(df) + 1)
df.head()
# 

Unnamed: 0,id,destination,user_review,travel_tip,rating,best_time_to_visit,local_cuisine_highlights,location_coordinates,popular_attractions,transportation_options,language_spoken,safety_rating,activities_available,cultural_highlights
0,1,"paris, france",Visiting the Eiffel Tower was breathtaking! Th...,Book tickets online to avoid long lines!,5,"April to June, September","Croissants, Macarons","48.8584° N, 2.2945° E","Louvre, Notre-Dame, Montmartre","Metro, Buses, Walking",French,4,"Seine River Cruises, Art Tours","Fashion, Art, Romantic Spots"
1,2,"tokyo, japan",Shibuya Crossing is a must-see. It's an incred...,Visit in the evening for the best lights!,4,"March to May, October","Sushi, Ramen","35.6586° N, 139.7012° E","Senso-ji, Tokyo Tower","Train, Metro, Buses",Japanese,5,"Temple Visits, Food Tours","Technology, Anime Culture"
2,3,"new_york, usa",Central Park is a great escape from the city h...,Rent a bike to explore more of the park.,5,Spring and Fall,"Bagels, Cheesecake","40.7851° N, 73.9683° W","Statue of Liberty, Times Square","Subway, Buses, Taxis",English,4,"Broadway Shows, City Tours","Diverse Cultures, Art Scene"
3,4,"rome, italy",The Colosseum is stunning! Make sure to take a...,Visit early in the morning to avoid crowds.,5,"April to June, September","Pasta Carbonara, Gelato","41.8902° N, 12.4923° E","Vatican City, Pantheon","Metro, Buses, Walking",Italian,4,"Historical Tours, Cooking Classes","Ancient History, Religion"
4,5,"cape_town, south_africa",Table Mountain offers the best views of the ci...,Take the cable car if you’re short on time.,4,November to February,"Bobotie, Biltong","-33.9625° S, 18.4095° E","Cape of Good Hope, Robben Island","Car, Taxi, Walking","Afrikaans, English",4,"Wine Tasting, Hiking","Cultural Heritage, Nature"


In [65]:


documents = df.to_dict(orient='records')

In [66]:
documents[0]

{'id': 1,
 'destination': 'paris, france',
 'user_review': 'Visiting the Eiffel Tower was breathtaking! The views from the top are unforgettable.',
 'travel_tip': 'Book tickets online to avoid long lines!',
 'rating': 5,
 'best_time_to_visit': 'April to June, September',
 'local_cuisine_highlights': 'Croissants, Macarons',
 'location_coordinates': '48.8584° N, 2.2945° E',
 'popular_attractions': 'Louvre, Notre-Dame, Montmartre',
 'transportation_options': 'Metro, Buses, Walking',
 'language_spoken': 'French',
 'safety_rating': 4,
 'activities_available': 'Seine River Cruises, Art Tours',
 'cultural_highlights': 'Fashion, Art, Romantic Spots'}

In [67]:
prompt_template = """
You emulate a user of our travel assistant application.
Formulate 5 questions this user might ask based on a provided exercise.
Make the questions specific to this exercise.
The record should contain the answer to the questions, and the questions should
be complete and not too short. Use as fewer words as possible from the record. 

The record:

destination: {destination}
user_review: {user_review}
travel_tip: {travel_tip}
best_time_to_visit: {best_time_to_visit}
local_cuisine_highlights: {local_cuisine_highlights}
location_coordinates: {location_coordinates}
popular_attractions: {popular_attractions}
transportation_options: {transportation_options}
language_spoken: {language_spoken}
activities_available: {activities_available}
cultural_highlights: {cultural_highlights}
Provide the output in parsable JSON without using code blocks:

{{"questions": ["question1", "question2", ..., "question5"]}}
""".strip()


In [68]:
prompt = prompt_template.format(**documents[0])
print(prompt)

You emulate a user of our travel assistant application.
Formulate 5 questions this user might ask based on a provided exercise.
Make the questions specific to this exercise.
The record should contain the answer to the questions, and the questions should
be complete and not too short. Use as fewer words as possible from the record. 

The record:

destination: paris, france
user_review: Visiting the Eiffel Tower was breathtaking! The views from the top are unforgettable.
travel_tip: Book tickets online to avoid long lines!
best_time_to_visit: April to June, September
local_cuisine_highlights: Croissants, Macarons
location_coordinates: 48.8584° N, 2.2945° E
popular_attractions: Louvre, Notre-Dame, Montmartre
transportation_options: Metro, Buses, Walking
language_spoken: French
activities_available: Seine River Cruises, Art Tours
cultural_highlights: Fashion, Art, Romantic Spots
Provide the output in parsable JSON without using code blocks:

{"questions": ["question1", "question2", ..., "q

In [69]:
def llm(prompt):
    response = client.chat.completions.create(
        model='llama-3.1-70b-versatile',
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content


In [70]:
questions = llm(prompt)

In [71]:
print(questions)

{"questions": [
"What is the best time of the year to plan my trip to Paris?",
"How do I avoid long lines when visiting popular attractions in Paris?",
"Are there any popular river-based activities available in Paris that I should consider?",
"Can I easily get around Paris on foot, or what other transportation options are available?",
"What is a must-try local food item when visiting Paris that I should not miss?"
]}


In [72]:
import json

In [73]:
json.loads(questions)

{'questions': ['What is the best time of the year to plan my trip to Paris?',
  'How do I avoid long lines when visiting popular attractions in Paris?',
  'Are there any popular river-based activities available in Paris that I should consider?',
  'Can I easily get around Paris on foot, or what other transportation options are available?',
  'What is a must-try local food item when visiting Paris that I should not miss?']}

In [74]:
def generate_questions(doc):
    prompt = prompt_template.format(**doc)

    response = client.chat.completions.create(
        model='llama-3.1-70b-versatile',
        messages=[{"role": "user", "content": prompt}]
    )

    json_response = response.choices[0].message.content
    return json_response

In [75]:
from tqdm.auto import tqdm


In [76]:
results = {}

In [87]:
for doc in tqdm(documents): 
    doc_id = doc['id']
    if doc_id in results:
        continue

    questions_raw = generate_questions(doc)
    questions = json.loads(questions_raw)
    results[doc_id] = questions['questions']

100%|██████████| 215/215 [00:03<00:00, 70.61it/s] 


In [88]:
final_results = []

for doc_id, questions in results.items():
    for q in questions:
        final_results.append((doc_id, q))

In [89]:
final_results[0]

(1,
 'What are some of the must-visit museums in Paris that house famous works of art?')

In [90]:
df_results = pd.DataFrame(final_results, columns=['id', 'question'])

In [91]:
df_results.to_csv('../data/ground-truth-retrieval.csv', index=False)

In [92]:
!head ../data/ground-truth-retrieval.csv

id,question
1,What are some of the must-visit museums in Paris that house famous works of art?
1,Is it more convenient to explore Paris on foot or rely on public transportation?
1,Are there specific seasons or months when the city's scenery is particularly beautiful?
1,What are some sweet and delicious treats that I should try when visiting Paris?
1,What are some tips for getting a good view of the city from a high vantage point?
2,What are the geographical coordinates of the destination I am planning to visit?
2,Can you recommend the best months of the year to travel to my destination?
2,What are the options I have for getting around my destination?
2,What are the must-try foods when I visit my destination?
