In [2]:
import pandas as pd

In [3]:
!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

--2024-10-28 20:04:52--  https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8001::154, 2606:50c0:8000::154, 2606:50c0:8002::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8001::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3832 (3.7K) [text/plain]
Saving to: ‘minsearch.py.5’


2024-10-28 20:04:52 (13.2 MB/s) - ‘minsearch.py.5’ saved [3832/3832]



In [4]:
df = pd.read_csv("../data/travel_data.csv")

In [5]:
df.insert(0, 'id', range(1, len(df) + 1))
df.head()

Unnamed: 0,id,destination,user_review,travel_tip,rating,best_time_to_visit,local_cuisine_highlights,location_coordinates,popular_attractions,transportation_options,language_spoken,safety_rating,activities_available,cultural_highlights
0,1,"paris, france",Visiting the Eiffel Tower was breathtaking! Th...,Book tickets online to avoid long lines!,5,"April to June, September","Croissants, Macarons","48.8584° N, 2.2945° E","Louvre, Notre-Dame, Montmartre","Metro, Buses, Walking",French,4,"Seine River Cruises, Art Tours","Fashion, Art, Romantic Spots"
1,2,"tokyo, japan",Shibuya Crossing is a must-see. It's an incred...,Visit in the evening for the best lights!,4,"March to May, October","Sushi, Ramen","35.6586° N, 139.7012° E","Senso-ji, Tokyo Tower","Train, Metro, Buses",Japanese,5,"Temple Visits, Food Tours","Technology, Anime Culture"
2,3,"new_york, usa",Central Park is a great escape from the city h...,Rent a bike to explore more of the park.,5,Spring and Fall,"Bagels, Cheesecake","40.7851° N, 73.9683° W","Statue of Liberty, Times Square","Subway, Buses, Taxis",English,4,"Broadway Shows, City Tours","Diverse Cultures, Art Scene"
3,4,"rome, italy",The Colosseum is stunning! Make sure to take a...,Visit early in the morning to avoid crowds.,5,"April to June, September","Pasta Carbonara, Gelato","41.8902° N, 12.4923° E","Vatican City, Pantheon","Metro, Buses, Walking",Italian,4,"Historical Tours, Cooking Classes","Ancient History, Religion"
4,5,"cape_town, south_africa",Table Mountain offers the best views of the ci...,Take the cable car if you’re short on time.,4,November to February,"Bobotie, Biltong","-33.9625° S, 18.4095° E","Cape of Good Hope, Robben Island","Car, Taxi, Walking","Afrikaans, English",4,"Wine Tasting, Hiking","Cultural Heritage, Nature"


In [6]:
documents = df.to_dict(orient='records')
documents[0]

{'id': 1,
 'destination': 'paris, france',
 'user_review': 'Visiting the Eiffel Tower was breathtaking! The views from the top are unforgettable.',
 'travel_tip': 'Book tickets online to avoid long lines!',
 'rating': 5,
 'best_time_to_visit': 'April to June, September',
 'local_cuisine_highlights': 'Croissants, Macarons',
 'location_coordinates': '48.8584° N, 2.2945° E',
 'popular_attractions': 'Louvre, Notre-Dame, Montmartre',
 'transportation_options': 'Metro, Buses, Walking',
 'language_spoken': 'French',
 'safety_rating': 4,
 'activities_available': 'Seine River Cruises, Art Tours',
 'cultural_highlights': 'Fashion, Art, Romantic Spots'}

In [7]:
import minsearch

In [8]:
df.columns

Index(['id', 'destination', 'user_review', 'travel_tip', 'rating',
       'best_time_to_visit', 'local_cuisine_highlights',
       'location_coordinates', 'popular_attractions', 'transportation_options',
       'language_spoken', 'safety_rating', 'activities_available',
       'cultural_highlights'],
      dtype='object')

In [9]:

print("Before conversion:")
print(df.dtypes)


# # Convert integer columns to string
# int_columns = ['rating', 'safety_rating']  # specify the integer columns
# df[int_columns] = df[int_columns].astype(str)

# # Display the DataFrame after conversion (optional)
# print("\nAfter conversion:")
# print(df.dtypes)

Before conversion:
id                           int64
destination                 object
user_review                 object
travel_tip                  object
rating                       int64
best_time_to_visit          object
local_cuisine_highlights    object
location_coordinates        object
popular_attractions         object
transportation_options      object
language_spoken             object
safety_rating                int64
activities_available        object
cultural_highlights         object
dtype: object


In [10]:
index = minsearch.Index(
    text_fields=['destination', 'user_review', 'travel_tip',
       'best_time_to_visit', 'local_cuisine_highlights',
       'location_coordinates', 'popular_attractions', 'transportation_options',
       'language_spoken', 'activities_available',
       'cultural_highlights'],
    keyword_fields=['id']
)

In [11]:
index.fit(documents)

<minsearch.Index at 0x7dd4f835c5c0>

In [12]:
import os
from groq import Groq

In [13]:
client = Groq(

    api_key=os.environ.get("GROQ_API_KEY"),

)

In [14]:
def search(query):
    boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=5
    )

    return results

In [15]:
prompt_template = """
You're a travel expert. Answer the QUESTION based on the CONTEXT from our exercises database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

entry_template = """
destination: {destination}
user_review: {user_review}
travel_tip: {travel_tip}
best_time_to_visit: {best_time_to_visit}
local_cuisine_highlights: {local_cuisine_highlights}
location_coordinates: {location_coordinates}
popular_attractions: {popular_attractions}
transportation_options: {transportation_options}
language_spoken: {language_spoken}
activities_available: {activities_available}
cultural_highlights: {cultural_highlights}


""".strip()

def build_prompt(query, search_results):
    context = ""
    
    for doc in search_results:
        context = context + entry_template.format(**doc) + "\n\n"

    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt


In [16]:
def llm(prompt, model='llama3-8b-8192'):
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

In [16]:
def rag(query, model='llama-3.1-70b-versatile'):
    search_results = search(query)
    # print(search_results)
    prompt = build_prompt(query, search_results)
    # print(prompt)
    answer = llm(prompt, model=model)
    return answer


In [24]:
question = 'where can i travel in sydney'
answer = rag(question)
print(answer)


You can travel to the following places in Sydney:

1. Bondi Beach: It's a beautiful beach and a must-visit attraction in Sydney. You can also take a coastal walk from Bondi to Coogee, which is a highlight of the city.
2. Sydney Opera House: This iconic building is one of the most popular attractions in Sydney.
3. Harbour Bridge: Another iconic landmark in Sydney, the Harbour Bridge is a must-visit attraction.
4. Coogee: This beach is a great destination for swimming and taking in the scenic views.

You can use the following transportation options to get around Sydney:

1. Train
2. Bus
3. Ferry

Some popular activities to do in Sydney include:

1. Surfing: Sydney has some of the best surfing spots in Australia.
2. Wildlife Tours: You can take a guided tour to explore the wildlife in and around Sydney.

Don't forget to try some of the local cuisine highlights, including:

1. Meat Pies
2. Lamingtons

And remember to bring sunscreen, as the sun can be intense in Sydney!


Retrieval evaluation

In [25]:
df_question = pd.read_csv('../data/ground-truth-retrieval.csv')

In [26]:
df_question.head()

Unnamed: 0,id,question
0,1,What are some of the must-visit museums in Par...
1,1,Is it more convenient to explore Paris on foot...
2,1,Are there specific seasons or months when the ...
3,1,What are some sweet and delicious treats that ...
4,1,What are some tips for getting a good view of ...


In [27]:
ground_truth = df_question.to_dict(orient='records')

In [28]:
ground_truth[0]

{'id': 1,
 'question': 'What are some of the must-visit museums in Paris that house famous works of art?'}

In [29]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [30]:
def minsearch_search(query):
    boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=5
    )

    return results


In [31]:
def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['id']
        results = search_function(q)
        # print(results)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }


In [32]:
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [33]:
evaluate(ground_truth, lambda q: minsearch_search(q['question']))

  0%|          | 0/1075 [00:00<?, ?it/s]

100%|██████████| 1075/1075 [01:36<00:00, 11.18it/s]


{'hit_rate': 0.7730232558139535, 'mrr': 0.5328062015503875}

### Finding the best parameters

In [34]:
df_validation = df_question[:100]
df_test = df_question[100:]

In [35]:
import random

def simple_optimize(param_ranges, objective_function, n_iterations=10):
    best_params = None
    best_score = float('-inf')  # Assuming we're minimizing. Use float('-inf') if maximizing.

    for _ in range(n_iterations):
        # Generate random parameters
        current_params = {}
        for param, (min_val, max_val) in param_ranges.items():
            if isinstance(min_val, int) and isinstance(max_val, int):
                current_params[param] = random.randint(min_val, max_val)
            else:
                current_params[param] = random.uniform(min_val, max_val)
        
        # Evaluate the objective function
        current_score = objective_function(current_params)
        
        # Update best if current is better
        if current_score > best_score:  # Change to > if maximizing
            best_score = current_score
            best_params = current_params
    
    return best_params, best_score

In [36]:
gt_val = df_validation.to_dict(orient='records')


In [37]:
def minsearch_search(query, boost=None):
    if boost is None:
        boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results


In [38]:
param_ranges = {
   'destination': (0.0, 3.0), 'user_review': (0.0, 3.0), 'travel_tip': (0.0, 3.0), 'best_time_to_visit': (0.0, 3.0), 'local_cuisine_highlights': (0.0, 3.0), 'location_coordinates': (0.0, 3.0), 'popular_attractions': (0.0, 3.0), 'transportation_options': (0.0, 3.0), 'language_spoken': (0.0, 3.0), 'activities_available': (0.0, 3.0), 'cultural_highlights': (0.0, 3.0)}
def objective(boost_params):
    def search_function(q):
        return minsearch_search(q['question'], boost_params)

    results = evaluate(gt_val, search_function)
    return results['mrr']

In [39]:
simple_optimize(param_ranges, objective, n_iterations=20)

100%|██████████| 100/100 [00:06<00:00, 14.67it/s]
100%|██████████| 100/100 [00:06<00:00, 15.75it/s]
100%|██████████| 100/100 [00:06<00:00, 16.15it/s]
100%|██████████| 100/100 [00:06<00:00, 15.55it/s]
100%|██████████| 100/100 [00:06<00:00, 15.66it/s]
100%|██████████| 100/100 [00:06<00:00, 15.77it/s]
100%|██████████| 100/100 [00:06<00:00, 15.16it/s]
100%|██████████| 100/100 [00:06<00:00, 16.21it/s]
100%|██████████| 100/100 [00:06<00:00, 15.71it/s]
100%|██████████| 100/100 [00:06<00:00, 15.49it/s]
100%|██████████| 100/100 [00:06<00:00, 15.67it/s]
100%|██████████| 100/100 [00:06<00:00, 15.88it/s]
100%|██████████| 100/100 [00:06<00:00, 15.54it/s]
100%|██████████| 100/100 [00:06<00:00, 16.11it/s]
100%|██████████| 100/100 [00:06<00:00, 15.74it/s]
100%|██████████| 100/100 [00:06<00:00, 15.48it/s]
100%|██████████| 100/100 [00:06<00:00, 16.06it/s]
100%|██████████| 100/100 [00:06<00:00, 15.73it/s]
100%|██████████| 100/100 [00:06<00:00, 15.44it/s]
100%|██████████| 100/100 [00:06<00:00, 15.48it/s]


({'destination': 2.485673359952678,
  'user_review': 2.9196237991381584,
  'travel_tip': 1.9014614654735604,
  'best_time_to_visit': 0.2946106215502525,
  'local_cuisine_highlights': 1.6224280634698078,
  'location_coordinates': 0.7452008126129724,
  'popular_attractions': 2.5540130530716985,
  'transportation_options': 1.4757377423401572,
  'language_spoken': 0.05435765500930245,
  'activities_available': 1.210048239340701,
  'cultural_highlights': 2.886085497384669},
 0.580511904761905)

In [40]:
def minsearch_improved(query):
    boost = {'destination': 2.485673359952678,
  'user_review': 2.9196237991381584,
  'travel_tip': 1.9014614654735604,
  'best_time_to_visit': 0.2946106215502525,
  'local_cuisine_highlights': 1.6224280634698078,
  'location_coordinates': 0.7452008126129724,
  'popular_attractions': 2.5540130530716985,
  'transportation_options': 1.4757377423401572,
  'language_spoken': 0.05435765500930245,
  'activities_available': 1.210048239340701,
  'cultural_highlights': 2.886085497384669}
    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

evaluate(ground_truth, lambda q: minsearch_improved(q['question']))

100%|██████████| 1075/1075 [01:12<00:00, 14.74it/s]


{'hit_rate': 0.8455813953488373, 'mrr': 0.5796936138796608}

RAG evaluation

In [41]:
prompt2_template = """
You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()


In [42]:
len(ground_truth)

1075

In [43]:
record = ground_truth[0]

In [44]:
prompt = prompt2_template.format(question=question, answer_llm=answer_llm)
print(prompt)

NameError: name 'answer_llm' is not defined

In [45]:
import json


In [46]:
df_sample = df_question.sample(n=200, random_state=1)

In [47]:
sample = df_sample.to_dict(orient='records')

In [157]:
sample

[{'id': 198,
  'question': 'What kind of attractions can I typically find in or near Stone Town?'},
 {'id': 137,
  'question': 'What are some of the top attractions to visit in Tbilisi for historical and cultural significance?'},
 {'id': 184,
  'question': 'What are the other official language spoken in Auckland besides English?'},
 {'id': 44,
  'question': 'What is the most recommended time to visit to make the most out of the trip to Naples?'},
 {'id': 213,
  'question': 'What are some must-visit places to see when I travel to Athens, Greece?'},
 {'id': 213,
  'question': 'What options do I have for getting around Athens, especially from one popular attraction to another?'},
 {'id': 41,
  'question': 'What are the most ideal months to plan a trip to Casablanca?'},
 {'id': 195,
  'question': 'Can you please suggest some local dishes to try when I visit Accra, Ghana?'},
 {'id': 144,
  'question': 'What are some types of guided tour options available in Algiers?'},
 {'id': 188,
  'quest

In [48]:
evaluations = []

for record in tqdm(sample):
    question = record['question']
    answer_llm = rag(question) 

    prompt = prompt2_template.format(
        question=question,
        answer_llm=answer_llm
    )

    evaluation = llm(prompt)
    # print(evaluation)
    # evaluation = json.loads(evaluation)

    evaluations.append((record, answer_llm, evaluation))

100%|██████████| 200/200 [24:26<00:00,  7.33s/it]


In [49]:
df_eval = pd.DataFrame(evaluations, columns=['record', 'answer', 'evaluation'])

In [50]:
df_eval.head()
df_eval['id'] = df_eval.record.apply(lambda d: d['id'])
df_eval['question'] = df_eval.record.apply(lambda d: d['question'])


In [51]:
df_eval

Unnamed: 0,record,answer,evaluation,id,question
0,"{'id': 198, 'question': 'What kind of attracti...",You can typically find cultural and historical...,"{\n ""Relevance"": ""RELEVANT"",\n ""Explanation""...",198,What kind of attractions can I typically find ...
1,"{'id': 137, 'question': 'What are some of the ...","Based on the context provided, some of the top...","{\n ""Relevance"": ""RELEVANT"",\n ""Explanation""...",137,What are some of the top attractions to visit ...
2,"{'id': 184, 'question': 'What are the other of...",The other official language spoken in Auckland...,"{\n ""Relevance"": ""RELEVANT"",\n ""Explanation""...",184,What are the other official language spoken in...
3,"{'id': 44, 'question': 'What is the most recom...",The most recommended time to visit Naples and ...,"{\n ""Relevance"": ""RELEVANT"",\n ""Explanation""...",44,What is the most recommended time to visit to ...
4,"{'id': 213, 'question': 'What are some must-vi...","Athens, Greece is a city rich in history and c...","{\n ""Relevance"": ""RELEVANT"",\n ""Explanation""...",213,What are some must-visit places to see when I ...
...,...,...,...,...,...
195,"{'id': 74, 'question': 'Can I get by without s...",You can likely get by without speaking the loc...,"{\n ""Relevance"": ""PARTLY_RELEVANT"",\n ""Expla...",74,Can I get by without speaking the local langua...
196,"{'id': 84, 'question': 'What are some of the l...",You should try the following local Icelandic d...,"{\n ""Relevance"": ""RELEVANT"",\n ""Explanation""...",84,What are some of the local Icelandic dishes I ...
197,"{'id': 159, 'question': 'What are the months d...",Brasilia has the best weather for visiting dur...,"{\n ""Relevance"": ""RELEVANT"",\n ""Explanation""...",159,What are the months during which Brasilia has ...
198,"{'id': 13, 'question': 'What are some of the m...","Based on the context, one of the most famous m...","{\n ""Relevance"": ""RELEVANT"",\n ""Explanation""...",13,What are some of the most famous museums in Ma...


In [52]:
# Function to parse JSON strings
def parse_json(json_str):
    try:
        return json.loads(json_str)
    except json.JSONDecodeError:
        return None
# 

In [53]:
# Apply the function to parse the JSON strings in the evaluation column
df_eval['parsed_evaluation'] = df_eval['evaluation'].apply(parse_json)


In [54]:
df_eval.head()

Unnamed: 0,record,answer,evaluation,id,question,parsed_evaluation
0,"{'id': 198, 'question': 'What kind of attracti...",You can typically find cultural and historical...,"{\n ""Relevance"": ""RELEVANT"",\n ""Explanation""...",198,What kind of attractions can I typically find ...,"{'Relevance': 'RELEVANT', 'Explanation': 'The ..."
1,"{'id': 137, 'question': 'What are some of the ...","Based on the context provided, some of the top...","{\n ""Relevance"": ""RELEVANT"",\n ""Explanation""...",137,What are some of the top attractions to visit ...,"{'Relevance': 'RELEVANT', 'Explanation': 'The ..."
2,"{'id': 184, 'question': 'What are the other of...",The other official language spoken in Auckland...,"{\n ""Relevance"": ""RELEVANT"",\n ""Explanation""...",184,What are the other official language spoken in...,
3,"{'id': 44, 'question': 'What is the most recom...",The most recommended time to visit Naples and ...,"{\n ""Relevance"": ""RELEVANT"",\n ""Explanation""...",44,What is the most recommended time to visit to ...,
4,"{'id': 213, 'question': 'What are some must-vi...","Athens, Greece is a city rich in history and c...","{\n ""Relevance"": ""RELEVANT"",\n ""Explanation""...",213,What are some must-visit places to see when I ...,"{'Relevance': 'RELEVANT', 'Explanation': 'The ..."


In [55]:
# Now you can safely access the 'Relevance' key
df_eval['relevance'] = df_eval['parsed_evaluation'].apply(lambda d: d['Relevance'] if d else None)

In [56]:
df_eval.head()

Unnamed: 0,record,answer,evaluation,id,question,parsed_evaluation,relevance
0,"{'id': 198, 'question': 'What kind of attracti...",You can typically find cultural and historical...,"{\n ""Relevance"": ""RELEVANT"",\n ""Explanation""...",198,What kind of attractions can I typically find ...,"{'Relevance': 'RELEVANT', 'Explanation': 'The ...",RELEVANT
1,"{'id': 137, 'question': 'What are some of the ...","Based on the context provided, some of the top...","{\n ""Relevance"": ""RELEVANT"",\n ""Explanation""...",137,What are some of the top attractions to visit ...,"{'Relevance': 'RELEVANT', 'Explanation': 'The ...",RELEVANT
2,"{'id': 184, 'question': 'What are the other of...",The other official language spoken in Auckland...,"{\n ""Relevance"": ""RELEVANT"",\n ""Explanation""...",184,What are the other official language spoken in...,,
3,"{'id': 44, 'question': 'What is the most recom...",The most recommended time to visit Naples and ...,"{\n ""Relevance"": ""RELEVANT"",\n ""Explanation""...",44,What is the most recommended time to visit to ...,,
4,"{'id': 213, 'question': 'What are some must-vi...","Athens, Greece is a city rich in history and c...","{\n ""Relevance"": ""RELEVANT"",\n ""Explanation""...",213,What are some must-visit places to see when I ...,"{'Relevance': 'RELEVANT', 'Explanation': 'The ...",RELEVANT


In [57]:
df_eval.drop(columns=['parsed_evaluation'], inplace=True)

In [58]:
df_eval.head()

Unnamed: 0,record,answer,evaluation,id,question,relevance
0,"{'id': 198, 'question': 'What kind of attracti...",You can typically find cultural and historical...,"{\n ""Relevance"": ""RELEVANT"",\n ""Explanation""...",198,What kind of attractions can I typically find ...,RELEVANT
1,"{'id': 137, 'question': 'What are some of the ...","Based on the context provided, some of the top...","{\n ""Relevance"": ""RELEVANT"",\n ""Explanation""...",137,What are some of the top attractions to visit ...,RELEVANT
2,"{'id': 184, 'question': 'What are the other of...",The other official language spoken in Auckland...,"{\n ""Relevance"": ""RELEVANT"",\n ""Explanation""...",184,What are the other official language spoken in...,
3,"{'id': 44, 'question': 'What is the most recom...",The most recommended time to visit Naples and ...,"{\n ""Relevance"": ""RELEVANT"",\n ""Explanation""...",44,What is the most recommended time to visit to ...,
4,"{'id': 213, 'question': 'What are some must-vi...","Athens, Greece is a city rich in history and c...","{\n ""Relevance"": ""RELEVANT"",\n ""Explanation""...",213,What are some must-visit places to see when I ...,RELEVANT


In [59]:
# df_eval['explanation'] = df_eval.evaluation.apply(lambda d: d['Explanation'])
# Apply the function to parse the JSON strings in the evaluation column
df_eval['parsed_evaluation'] = df_eval['evaluation'].apply(parse_json)

In [60]:
df_eval.head()

Unnamed: 0,record,answer,evaluation,id,question,relevance,parsed_evaluation
0,"{'id': 198, 'question': 'What kind of attracti...",You can typically find cultural and historical...,"{\n ""Relevance"": ""RELEVANT"",\n ""Explanation""...",198,What kind of attractions can I typically find ...,RELEVANT,"{'Relevance': 'RELEVANT', 'Explanation': 'The ..."
1,"{'id': 137, 'question': 'What are some of the ...","Based on the context provided, some of the top...","{\n ""Relevance"": ""RELEVANT"",\n ""Explanation""...",137,What are some of the top attractions to visit ...,RELEVANT,"{'Relevance': 'RELEVANT', 'Explanation': 'The ..."
2,"{'id': 184, 'question': 'What are the other of...",The other official language spoken in Auckland...,"{\n ""Relevance"": ""RELEVANT"",\n ""Explanation""...",184,What are the other official language spoken in...,,
3,"{'id': 44, 'question': 'What is the most recom...",The most recommended time to visit Naples and ...,"{\n ""Relevance"": ""RELEVANT"",\n ""Explanation""...",44,What is the most recommended time to visit to ...,,
4,"{'id': 213, 'question': 'What are some must-vi...","Athens, Greece is a city rich in history and c...","{\n ""Relevance"": ""RELEVANT"",\n ""Explanation""...",213,What are some must-visit places to see when I ...,RELEVANT,"{'Relevance': 'RELEVANT', 'Explanation': 'The ..."


In [61]:
df_eval.relevance.value_counts(normalize=True)

relevance
RELEVANT           0.887324
PARTLY_RELEVANT    0.098592
NON_RELEVANT       0.014085
Name: proportion, dtype: float64

In [62]:
df_eval.to_csv('../data/rag-eval-llama70b.csv', index=False)

In [63]:
df_eval[df_eval.relevance == 'NON_RELEVANT']

Unnamed: 0,record,answer,evaluation,id,question,relevance,parsed_evaluation
53,"{'id': 152, 'question': 'What are some popular...",There is no information in the given context a...,"{\n ""Relevance"": ""NON_RELEVANT"",\n ""Explanat...",152,What are some popular food items that reflect ...,NON_RELEVANT,"{'Relevance': 'NON_RELEVANT', 'Explanation': '..."


### ElasticSearch

In [64]:
from elasticsearch import Elasticsearch

In [65]:
es_client = Elasticsearch('http://localhost:9200') 

In [66]:
df.dtypes

id                           int64
destination                 object
user_review                 object
travel_tip                  object
rating                       int64
best_time_to_visit          object
local_cuisine_highlights    object
location_coordinates        object
popular_attractions         object
transportation_options      object
language_spoken             object
safety_rating                int64
activities_available        object
cultural_highlights         object
dtype: object

In [67]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
          "id": {"type": "integer"},
            "destination": {"type": "text"},
            "user_review": {"type": "text"},
            "travel_tip": {"type": "text"},
            "rating": {"type": "integer"},
            "best_time_to_visit": {"type": "text"},
            "local_cuisine_highlights": {"type": "text"},
            "location_coordinates": {"type": "text"},
            "popular_attractions": {"type": "text"},
            "transportation_options": {"type": "text"},
            "language_spoken": {"type": "text"},
            "safety_rating": {"type": "integer"},
            "activities_available": {"type": "text"},
            "cultural_highlights": {"type": "text"}
        }
    }
}

index_name = "travel-questions"

es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'travel-questions'})

In [68]:
documents[0]

{'id': 1,
 'destination': 'paris, france',
 'user_review': 'Visiting the Eiffel Tower was breathtaking! The views from the top are unforgettable.',
 'travel_tip': 'Book tickets online to avoid long lines!',
 'rating': 5,
 'best_time_to_visit': 'April to June, September',
 'local_cuisine_highlights': 'Croissants, Macarons',
 'location_coordinates': '48.8584° N, 2.2945° E',
 'popular_attractions': 'Louvre, Notre-Dame, Montmartre',
 'transportation_options': 'Metro, Buses, Walking',
 'language_spoken': 'French',
 'safety_rating': 4,
 'activities_available': 'Seine River Cruises, Art Tours',
 'cultural_highlights': 'Fashion, Art, Romantic Spots'}

In [69]:
from tqdm.auto import tqdm

In [70]:
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|██████████| 215/215 [00:19<00:00, 11.06it/s]


In [81]:
query = 'where can i travel in barcelona, spain'

In [76]:
def elastic_search(query):
    search_query = {
    "size": 5,
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": [
                        "destination",
                        "user_review",
                        "travel_tip",
                        "best_time_to_visit",
                        "local_cuisine_highlights",
                        "location_coordinates",
                        "popular_attractions",
                        "transportation_options",
                        "language_spoken",
                        "activities_available",
                        "cultural_highlights"
                    ],
                    "type": "best_fields"
                }
            },
            "filter": [
                {"range": {"rating": {"gte": 3}}},  # Example filter for ratings
                {"range": {"safety_rating": {"gte": 4}}}  # Example filter for safety
            ]
        }
    }
}


    response = es_client.search(index=index_name, body=search_query)
    
    result_docs = []
    
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    
    return result_docs


In [77]:
def rag(query):
    search_results = elastic_search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [83]:
print(rag(query))

Based on the context, here are some travel recommendations for Barcelona, Spain:

* Must-visit attractions: La Sagrada Familia, Park Güell, and the Gothic Quarter.
* Transportation options: Metro, buses, and walking.
* Local cuisine highlights: Tapas and Paella.
* Best time to visit: May to September.
* Activities available: Flamenco shows and bike tours.
* Cultural highlights: Architecture and festivals.

These are all based on the specific text provided in the context about Barcelona, Spain.
