# Homework 3
Our goal is to collect the information from the *Michelin Guide* in order to help users to find a restaurant that reflects their unique tastes.

- In order to do that, we firstly gather the data from the *Michelin Guide* [website](https://guide.michelin.com/en/it/restaurants).

- Then, we build two types of search engines that allow users to retrieve restaurants according to their query.

# Collecting the data

## Crawling

We gather all restaurant URLs from the guide and store them in a text file for further data processing.

In [None]:
import requests
from bs4 import BeautifulSoup

# Header for the request (optional, can help prevent the server from blocking the request)
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
}

# Open a file to write restaurant URLs
with open("michelin_restaurants_urls.txt", "w") as file:
    # Iterate through all pages (from 1 to 100)
    for page in range(1, 101):
        # Base URL with page number
        url = f"https://guide.michelin.com/en/it/restaurants/page/{page}"

        # Make the GET request
        response = requests.get(url, headers=headers)

        # Check if the request was successful
        if response.status_code == 200:
            # Create the BeautifulSoup object to parse the HTML content
            soup = BeautifulSoup(response.text, "html.parser")

            # Find all restaurant links
            restaurant_links = soup.find_all("a", class_="link")

            # Iterate over restaurant links and write URLs to the file
            for link in restaurant_links:
                href = link.get("href")
                if href and "/restaurant/" in href:
                    full_url = f"https://guide.michelin.com{href}"
                    file.write(full_url + "\n")
                    print(f"URL added: {full_url}")
        else:
            print(f"Error in request to page {page}: {response.status_code}")

We download the HTML content of Michelin restaurant pages concurrently, organizing each batch of 20 URLs into separate folders with the restaurant's name as the filename.

In [None]:
import requests
import os
from urllib.parse import urlparse
from concurrent.futures import ThreadPoolExecutor, as_completed
from time import sleep

# Header for the request (optional)
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
}

# Load URLs from the file
with open("michelin_restaurants_urls.txt", "r") as file:
    urls = [line.strip() for line in file.readlines()]

# Define the number of URLs per folder and max concurrent threads
url_per_folder = 20
max_threads = 5  # We can adjust the number of threads based on our network and system capacity

def download_html(url, folder_name):
    """Function to download HTML of a given URL and save it in the specified folder with the restaurant name."""
    try:
        # Extract restaurant name from the URL
        restaurant_name = urlparse(url).path.split('/')[-1]
        
        # Make the GET request
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            # Save the HTML content with the restaurant's name
            file_path = os.path.join(folder_name, f"{restaurant_name}.html")
            with open(file_path, "w", encoding="utf-8") as html_file:
                html_file.write(response.text)
            print(f"Saved: {file_path}")
        else:
            print(f"Error fetching {url}: {response.status_code}")
        sleep(0.1)  # Slight delay to avoid overwhelming the server

    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")

# Split URLs into groups of 20 and download concurrently
for i in range(0, len(urls), url_per_folder):
    folder_num = i // url_per_folder + 1
    folder_name = f"page_{folder_num}"
    
    # Create folder if it doesn't exist
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)
    
    # Get the next 20 URLs
    url_subset = urls[i:i + url_per_folder]

    # Use ThreadPoolExecutor to download concurrently
    with ThreadPoolExecutor(max_workers=max_threads) as executor:
        # Start a thread for each URL in the subset
        futures = [executor.submit(download_html, url, folder_name) for url in url_subset]
        
        # Wait for all threads in the subset to complete
        for future in as_completed(futures):
            future.result()  # Retrieve any exceptions if occurred

Saved: page_1\da-mo.html
Saved: page_1\o-me-o-il-mare.html
Saved: page_1\donevandro.html
Saved: page_1\da-bob-cook-fish.html
Saved: page_1\ape-vino-e-cucina.html
Saved: page_1\charleston.html
Saved: page_1\sa-domu-sarda.html
Saved: page_1\alessandro-feo.html
Saved: page_1\il-tirabuscio262517.html
Saved: page_1\la-buca130947.html
Saved: page_1\il-ristorante-alain-ducasse-napoli.html
Saved: page_1\dama-1213583.html
Saved: page_1\palazzo-utini.html
Saved: page_1\etra.html
Saved: page_1\soul-fish.html
Saved: page_1\la-trattoria-enrico-bartolini.html
Saved: page_1\loro.html
Saved: page_1\20tre.html
Saved: page_1\menage.html
Saved: page_1\procaccini.html
Saved: page_2\metodo-1213628.html
Saved: page_2\gimmy-s.html
Saved: page_2\osteria-dell-accademia.html
Saved: page_2\fratelli-bruzzone.html
Saved: page_2\serrae-villa-fiesole.html
Saved: page_2\locanda-perbellini-ai-beati.html
Saved: page_2\innesti.html
Saved: page_2\salvo.html
Saved: page_2\arnolfo.htmlSaved: page_2\osteria-del-teatro.html


## Parsing

We parse HTML files for Michelin restaurants to extract key information (e.g., name, address, cuisine, facilities), and save each restaurant's data as an individual .tsv file.

In [1]:
from bs4 import BeautifulSoup
import os
import csv

def parse_restaurant_html(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")

    # Initialize dictionary with default empty strings
    data = {
        "restaurantName": '',
        "address": '',
        "city": '',
        "postalCode": '',
        "country": "Italy",
        "priceRange": '',
        "cuisineType": '',
        "description": '',
        "facilitiesServices": '',
        "creditCards": '',
        "phoneNumber": '',
        "website": ''
    }

    # Extract restaurant name
    name_tag = soup.find("h1", class_="data-sheet__title")
    if name_tag:
        data["restaurantName"] = name_tag.get_text(strip=True)

    # Extract address information
    address_tag = soup.find_all("div", class_="data-sheet__block--text")
    if address_tag:
        full_address = address_tag[0].get_text(strip=True)
        address_parts = full_address.split(', ')
        if len(address_parts) >= 3:
            data["country"] = address_parts[-1]
            data["postalCode"] = address_parts[-2]
            data["city"] = address_parts[-3]
            data["address"] = ', '.join(address_parts[:-3])
        elif len(address_parts) == 2:
            data["address"], data["city"] = address_parts
        else:
            data["address"] = full_address

    # Extract price range and cuisine type
    if len(address_tag) > 1:
        full_price_type = address_tag[1].get_text(strip=True)
        price_type_list = full_price_type.split()
        if price_type_list:
            data["priceRange"] = price_type_list[0]
            data["cuisineType"] = ' '.join(price_type_list[2:])

    # Extract description
    description_tag = soup.find("div", class_="data-sheet__description")
    if description_tag:
        data["description"] = description_tag.get_text(strip=True)

    # Extract facilities and services
    facilities_section = soup.find('div', class_="restaurant-details__services")
    if facilities_section:
        services = [li.get_text(strip=True) for li in facilities_section.find_all('li')]
        data["facilitiesServices"] = ', '.join(services)

    # Extract accepted credit cards
    credit_card_tags = soup.find_all("div", class_="restaurant-details__services--info")
    creditCards = []
    for tag in credit_card_tags:
        for img in tag.find_all("img"):
            if 'data-src' in img.attrs:
                credit_card_name = os.path.basename(img['data-src']).split('-')[0]
                creditCards.append(credit_card_name.title())
    data["creditCards"] = ', '.join(creditCards) if creditCards else ''

    # Extract phone number
    phone_tag = soup.find("div", class_="collapse__block-item")
    if phone_tag:
        data["phoneNumber"] = phone_tag.get_text(strip=True)

    # Extract restaurant website
    div_web = soup.find('div', class_='collapse__block-item link-item')
    if div_web:
        web_tag = div_web.find('a', class_='link js-dtm-link')
        if web_tag and web_tag.get('href'):
            data["website"] = web_tag.get('href')

    # Save to .tsv file
    output_file = file_path.replace(".html", ".tsv")
    with open(output_file, mode='w', encoding='utf-8') as file:
        keys = list(data.keys())
        dict_writer = csv.DictWriter(file, fieldnames=keys, delimiter='\t')
        dict_writer.writeheader()
        dict_writer.writerow(data)

    return data

Now, we process each restaurant's HTML file to extract data using concurrent threads, making the data collection process more efficient. Once all files are processed, we gather the generated .tsv files from each folder, read them into individual DataFrames, and then concatenate them into a single, unified DataFrame. Finally, we save this combined dataset to a single .tsv file, which consolidates all restaurant information for easy analysis and access.

In [2]:
import os
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed

# Base directory containing the HTML files
base_directory = r"C:\Users\Utente\OneDrive - uniroma1.it\Esami\ADM\Homework 3"


# Define the function to process each HTML file
def process_restaurant_file(file_info):
    filename, folder_path = file_info
    file_path = os.path.join(folder_path, filename)
    restaurant_data = parse_restaurant_html(file_path)
    return restaurant_data

# Collect all files to process
files_to_process = []
for folder in os.listdir(base_directory):
    folder_path = os.path.join(base_directory, folder)
    if os.path.isdir(folder_path):
        for filename in os.listdir(folder_path):
            if filename.endswith(".html"):
                files_to_process.append((filename, folder_path))

# Set up concurrent processing
with ThreadPoolExecutor() as executor:
    # Schedule each file to be processed concurrently
    futures = {executor.submit(process_restaurant_file, file_info): file_info for file_info in files_to_process}

# Combine all created .tsv files into a single DataFrame

# List to collect each DataFrame
all_data = []

# Iterate over each folder in the base directory
for folder in os.listdir(base_directory):
    folder_path = os.path.join(base_directory, folder)
    if os.path.isdir(folder_path):
        # Iterate over each .tsv file in the folder
        for filename in os.listdir(folder_path):
            if filename.endswith(".tsv"):
                file_path = os.path.join(folder_path, filename)
                # Read each .tsv file and append to the list
                df = pd.read_csv(file_path, sep='\t',dtype={'postalCode': str}, keep_default_na=False)
                all_data.append(df)

# Concatenate all individual DataFrames into one combined DataFrame
data = pd.concat(all_data, ignore_index=True)

We notice that there are some missing values regarding the restaurants' websites, credit cards accepted and available services. We decide to *fill* these missing elements with an empty string. 

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1982 entries, 0 to 1981
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   restaurantName      1982 non-null   object
 1   address             1982 non-null   object
 2   city                1982 non-null   object
 3   postalCode          1982 non-null   object
 4   country             1982 non-null   object
 5   priceRange          1982 non-null   object
 6   cuisineType         1982 non-null   object
 7   description         1982 non-null   object
 8   facilitiesServices  1982 non-null   object
 9   creditCards         1982 non-null   object
 10  phoneNumber         1982 non-null   object
 11  website             1982 non-null   object
dtypes: object(12)
memory usage: 185.9+ KB


We check that now no missing values are present.

In [5]:
# Count the number of missing values in each column
missing_values = data.isnull().sum()
missing_values

restaurantName        0
address               0
city                  0
postalCode            0
country               0
priceRange            0
cuisineType           0
description           0
facilitiesServices    0
creditCards           0
phoneNumber           0
website               0
dtype: int64

Then, we save in a tsv file the dataset just created.

In [6]:
# Export data to a tsv file
output_file = "michelin_restaurants_data.tsv"
data.to_csv(output_file, sep='\t', index=False)

In [7]:
import pandas as pd
# export data to xlsx
data.to_excel("all_restaurants_data.xlsx", index=False)

# Search Engines

## Conjunctive Search Engine

+ A conjunctive search engine retrieves documents that contain all specified search terms, meaning it only returns results where every query term appears.

First of all, we define a new column containing the text that will be compared to the user query. We define it as the union of restaurant type of cuisine and its description. This approach ensures that both cuisine and descriptive details are considered in matches, increasing the chances of accurate results. We then convert all text in the restaurant descriptions to lowercase to ensure that our search engine is case-insensitive. In other words, by standardizing the text format, we eliminate any discrepancies caused by different letter cases, allowing users to find relevant results regardless of capitalization. This step enhances the accuracy and usability of the search engine by treating terms like "Cuisine" and "cuisine" as identical.

In [8]:
# join the cuisine type and the description of the restaurant
data = pd.read_csv("michelin_restaurants_data.tsv", sep='\t', dtype={'postalCode': str})
data['text_to_compare'] = data['cuisineType'] + ' ' + data['description']
data['text_to_compare'].str.lower()

0       farm to table, modern cuisine situated in the ...
1       campanian, seafood in a beautiful stone-vaulte...
2       piedmontese, contemporary this attractive rest...
3       modern cuisine, creative before it became famo...
4       seafood working in partnership with the nearby...
                              ...                        
1977    japanese, asian one of the most popular restau...
1978    italian, creative villa aretusi is a pleasant ...
1979    modern cuisine a young chef with experience in...
1980    contemporary, piedmontese at this restaurant, ...
1981    innovative a young chef of undoubted talent is...
Name: text_to_compare, Length: 1982, dtype: object

In this cell, we import essential libraries for text processing, including the Natural Language Toolkit (NLTK) to support our search engine’s linguistic capabilities. We download key resources like stopwords, tokenizers, and WordNet, which will help us standardize and clean the text by removing irrelevant words and lemmatizing terms to their base forms. Additionally, we include modules for handling regular expressions and JSON data, ensuring that our search engine can efficiently parse, process, and organize restaurant information.

In [10]:
data['postalCode']
all([len(data['postalCode'][j]) == 5 for j in range(len(data['postalCode']))])

True

In [11]:
# Imports for Search Engine

import nltk  # Natural Language Toolkit for text processing

# Download necessary NLTK resources
nltk.download('stopwords')       # Stopwords list to filter out common words (e.g., "the", "is")
nltk.download('punkt')           # Punkt tokenizer for sentence splitting and word tokenization
nltk.download('wordnet')         # WordNet lexical database for lemmatization

# Importing specific modules for text processing
import re                         # Regular expressions for text pattern matching
from nltk.corpus import stopwords # Stopwords list for filtering out common, irrelevant words
from nltk.stem import WordNetLemmatizer # WordNet-based lemmatizer for reducing words to base form
from nltk.tokenize import word_tokenize # Word tokenizer to split text into individual words
import json                       # JSON module for handling JSON data structures

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Utente\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Utente\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Utente\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Here, we define a function to preprocess text by cleaning, tokenizing, and normalizing it for our search engine. First, we convert text to lowercase, remove non-alphanumeric characters, and split it into individual words. We then remove stopwords and apply lemmatization to reduce each word to its base form, ensuring that our search engine captures the essential meaning of each term while discarding irrelevant details.

In [13]:
def preprocess_text(text):
    text = text.lower()  # Convert text to lowercase
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove non-alphanumeric characters
    token = word_tokenize(text)  # Tokenize the text into words
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    token = [word for word in token if word not in stop_words]  # Keep only words not in stopwords
    
    # Choose between stemming or lemmatization for standardization
    # We can uncomment the following lines to use stemming:
    # stemmer = nltk.PorterStemmer()
    # token = [stemmer.stem(word) for word in token]  # Apply stemming to each word (choose based on preference)
    
    # Apply lemmatization to each word
    lemmatizer = WordNetLemmatizer()
    token = [lemmatizer.lemmatize(word) for word in token]
    
    return token

Now, we build a vocabulary and an inverted index to efficiently manage and search terms across restaurant descriptions. First, we assign a unique term ID to each word encountered, storing it in the vocabulary, and then we map each term ID to the document IDs where the term appears, creating the inverted index. Finally, we save the vocabulary as a CSV file and the inverted index as a JSON file, providing a structured and searchable representation of the text data for our search engine.

In [14]:
def create_vocabulary_and_inverted_index(descriptions):
    vocabulary = {}         # Dictionary to store unique words with their term IDs
    inverted_index = {}      # Dictionary to store the inverted index
    term_id = 0

    # Loop through each description, assigning a unique doc_id to each
    for doc_id, text in enumerate(descriptions):
        tokens = preprocess_text(text)  # Preprocess and tokenize the text

        for token in tokens:
            # If the word is not in the vocabulary, add it with a new term_id
            if token not in vocabulary:
                vocabulary[token] = term_id
                term_id += 1

            # Get the term_id of the word
            tid = vocabulary[token]

            # Add doc_id to the inverted index for this term_id
            if tid not in inverted_index:
                inverted_index[tid] = []
            if doc_id not in inverted_index[tid]:
                inverted_index[tid].append(doc_id)

    # Save the vocabulary to a CSV file
    with open('vocabulary.csv', 'w') as vocab_file:
        for word, tid in vocabulary.items():
            vocab_file.write(f"{word},{tid}\n")

    # Save the inverted index to a JSON file
    with open('inverted_index.json', 'w') as index_file:
        json.dump(inverted_index, index_file)

    return vocabulary, inverted_index

We implement a function to handle conjunctive queries, allowing us to search for restaurants that contain all specified terms in their descriptions. We process the query by mapping each search term to its unique term ID, then use the inverted index to retrieve only the documents (restaurants) containing all query terms. This approach ensures that our search engine provides precise results by returning only the restaurant data that fully matches the user’s search criteria.

In [15]:
def conjunctive_query(query, vocabulary, inverted_index, restaurant_data):
    # Preprocess the query text and convert it to term IDs using the vocabulary
    query_tokens = preprocess_text(query)
    query_term_ids = [vocabulary.get(token) for token in query_tokens if token in vocabulary]

    # If no terms from the query are found in the vocabulary, return an empty result
    if not query_term_ids:
        return []

    # Find documents containing all term IDs from the query
    doc_lists = [set(inverted_index[tid]) for tid in query_term_ids if tid in inverted_index]

    # Intersect document lists to get documents that contain all query terms
    if doc_lists:
        matching_docs = set.intersection(*doc_lists)
    else:
        matching_docs = set()

    # Return the corresponding restaurant data for matching documents
    results_df = restaurant_data[restaurant_data.index.isin(matching_docs)]

    # Return just the key information (restaurant name, address, description and website)
    return results_df[['restaurantName', 'address', 'description', 'website']]

We now test our search engine by running a sample query for "modern seasonal cuisine" to see if it accurately returns matching restaurants. We generate the vocabulary and inverted index from our dataset and use them to process the query, displaying the names and descriptions of any restaurants that meet the criteria. Finally, we summarize the results by indicating the total number of matches, which helps verify the effectiveness of our search engine in retrieving relevant data.

In [16]:
# Define the query to test the search engine
query = 'modern roman cuisine'

# Build the vocabulary and inverted index from the descriptions
vocabulary, inverted_index = create_vocabulary_and_inverted_index(data['text_to_compare'])

# Run the conjunctive query
results = conjunctive_query(query, vocabulary, inverted_index, data)

# Display the results
print(f"Results for the query '{query}':")
results

Results for the query 'modern roman cuisine':


Unnamed: 0,restaurantName,address,description,website
198,Scrigno del Duomo,piazza Duomo 29,This restaurant boasts a stunning setting that...,https://www.scrignodelduomo.com/
480,Ponte Pietra,via Ponte Pietra 34,An old building next to the Roman bridge Ponte...,https://ristorantepontepietra.com
568,Gellius,calle Pretoria 6,Gellius boasts a unique and atmospheric settin...,https://www.ristorantegellius.it/
828,Gina,via Croce di Città 25,"In the old town, a simple and informal locatio...",http://www.ginacasaconcucina.com
1694,La Locanda del Cardinale,piazza del Vescovado 8,"A medieval house with stone arches, built over...",https://www.lalocandadelcardinale.com/


## Ranked Search Engine
The engine evaluates the importance of each term in each document (restaurant information) using TF-IDF (Term Frequency - Inverse Document Frequency). TF measures how often a term appears in a document, while IDF reduces the weights (the importance) of terms that are present in many documents, prioritizing unique and informative terms.

In this cell, we calculate the Term Frequency-Inverse Document Frequency (TF-IDF) for each word across all restaurant descriptions to measure the relevance of each term within each document. First, we compute Term Frequency (TF) and Document Frequency (DF) for each term, then use these values to calculate TF-IDF scores, which help identify the most significant words in each description. Finally, we create an inverted index with TF-IDF weights, enabling efficient retrieval of relevant documents based on keyword importance in our search engine.

In [33]:
import numpy as np
import pandas as pd
from collections import defaultdict
import math

def calculate_tf_idf(data, vocabulary):
    N = len(data)  # Total number of documents
    tf = defaultdict(lambda: defaultdict(int))  # Term Frequency (TF) for each document and term
    df = defaultdict(int)  # Document Frequency (DF) for each term
    tf_idf = defaultdict(lambda: defaultdict(float))  # TF-IDF scores for each document and term

    # Calculate TF and DF
    for doc_id, description in enumerate(data['text_to_compare']):
        tokens = preprocess_text(description)
        token_counts = defaultdict(int)
        
        for token in tokens:
            if token in vocabulary:
                token_id = vocabulary[token]
                token_counts[token_id] += 1
                tf[doc_id][token_id] = token_counts[token_id]

        # Increment DF for each unique token in the document
        for token_id in set(token_counts.keys()):
            df[token_id] += 1
    
    # Calculate TF-IDF
    for doc_id in tf.keys():
        for token_id, count in tf[doc_id].items():
            term_frequency = count
            epsilon = 1e-10  # Small constant to avoid division by zero
            inverse_document_frequency = math.log(N / (df[token_id] + epsilon))
            tf_idf[doc_id][token_id] = term_frequency * inverse_document_frequency

    # Normalize each document's TF-IDF vector and create an inverted index with normalized TF-IDF weights
    inverted_index = defaultdict(list)
    for doc_id, token_weights in tf_idf.items():
        doc_vector_norm = math.sqrt(sum(weight ** 2 for weight in token_weights.values()))
        for token_id, weight in token_weights.items():
            normalized_weight = weight / (doc_vector_norm + epsilon)  # Normalize each term's weight
            inverted_index[token_id].append((doc_id, normalized_weight))

    return inverted_index

Here, we implement a ranked search function that allows us to retrieve the top results based on similarity to a user query. We calculate TF-IDF scores for the query terms, then compute cosine similarity scores between the query and each document to rank the results. By selecting the highest-ranking matches, we ensure that our search engine returns the most relevant restaurant descriptions for each query.

In [34]:
def ranked_query(query, vocabulary, inverted_index, data, top_k=5):
    # Preprocess the query and convert terms to their IDs using the vocabulary
    query_tokens = preprocess_text(query)
    query_term_ids = [vocabulary.get(token) for token in query_tokens if token in vocabulary]
    
    # Calculate TF-IDF scores for the query
    query_tf_idf = defaultdict(float)
    for term_id in query_term_ids:
        if term_id:
            query_tf_idf[term_id] += 1  # Term Frequency (TF) for the query terms

    # Normalize the query TF-IDF vector
    query_vector_norm = math.sqrt(sum(weight ** 2 for weight in query_tf_idf.values()))
    query_tf_idf = {term_id: weight / (query_vector_norm + 1e-10) for term_id, weight in query_tf_idf.items()}

    # Calculate cosine similarity between query and document vectors
    doc_scores = defaultdict(float)
    for term_id, query_weight in query_tf_idf.items():
        if term_id in inverted_index:
            for doc_id, doc_weight in inverted_index[term_id]:
                doc_scores[doc_id] += query_weight * doc_weight

    # Sort documents by similarity score and select the top-k results
    ranked_docs = sorted(doc_scores.items(), key=lambda x: x[1], reverse=True)[:top_k]

    # Retrieve and format the top-k results
    results = []
    for doc_id, score in ranked_docs:
        row = data.iloc[doc_id]
        results.append({
            "restaurantName": row['restaurantName'],
            "address": row['address'],
            "description": row['description'],
            "website": row['website'],
            "similarity": round(score, 4)  # Similarity score should now be in the range [0, 1]
        })

    return pd.DataFrame(results)

In this cell, we calculate the TF-IDF-based inverted index, which allows us to weight terms by their importance across all restaurant descriptions. We then test our search engine by running a ranked query for "modern seasonal cuisine," retrieving and displaying the top five most relevant restaurant results. This step helps us evaluate the search engine’s effectiveness in returning high-quality, relevant matches based on user input.

In [35]:
# Calculate the inverted index with TF-IDF weights
inverted_index_tf_idf = calculate_tf_idf(data, vocabulary)

# Perform a ranked query
query = 'modern roman cuisine'
results = ranked_query(query, vocabulary, inverted_index_tf_idf, data, top_k=5)

# Display the results for the query
print(f"Results for the ranked query '{query}':")
results

Results for the ranked query 'modern roman cuisine':


Unnamed: 0,restaurantName,address,description,website,similarity
0,Domenico dal 1968,via Satrico 23,"Situated away from the tourist trail, this aut...",https://www.domenicodal1968.it/,0.266
1,Poldo e Gianna Osteria,vicolo Rosini 6/7,This cheerful and attractive contemporary rest...,http://www.poldoegianna.it,0.2612
2,CiPASSO,via Metastasio 21,This contemporary bistro with a hint of vintag...,http://www.cipassoitalia.it,0.2321
3,Roscioli,via dei Giubbonari 21,This restaurant is part of one of the best foo...,https://www.salumeriaroscioli.com/,0.2001
4,Armando al Pantheon,salita de' Crescenzi 31,This small restaurant just a few metres from t...,https://www.armandoalpantheon.it/,0.1986


# Question 5


In [47]:
def create_vocabularies_and_indexes(data):
    vocabularies = {
        'restaurantName': {},
        'city': {},
        'cuisineType': {}
    }
    inverted_indexes = {
        'restaurantName': defaultdict(set),
        'city': defaultdict(set),
        'cuisineType': defaultdict(set)
    }
    term_ids = {
        'restaurantName': 0,
        'city': 0,
        'cuisineType': 0
    }

    # Loop through each restaurant document
    for doc_id, row in data.iterrows():
        for field in ['restaurantName', 'city', 'cuisineType']:
            tokens = preprocess_text(row[field])

            for token in tokens:
                # Assign a new term ID if token not in vocabulary for the field
                if token not in vocabularies[field]:
                    vocabularies[field][token] = term_ids[field]
                    term_ids[field] += 1

                # Add the doc_id to the inverted index for this token in the field
                term_id = vocabularies[field][token]
                inverted_indexes[field][term_id].add(doc_id)

    return vocabularies, inverted_indexes

In [48]:
def advanced_query(query, vocabularies, inverted_indexes, restaurant_data, filters=None): 
    # Tokenize the query terms for each field in vocabularies
    query_tokens = preprocess_text(query)
    results = set(restaurant_data.index)  # Start with all restaurants as candidates

    # Process query for each field: name, city, cuisine
    for field in ['restaurantName', 'city', 'cuisineType']:
        query_term_ids = [vocabularies[field].get(token) for token in query_tokens if token in vocabularies[field]]
        doc_lists = [inverted_indexes[field][tid] for tid in query_term_ids if tid in inverted_indexes[field]]
        
        # Intersect document lists for the field if terms were found
        if doc_lists:
            results.intersection_update(set.intersection(*doc_lists))

    # Apply filters
    if filters:
        if 'priceRange' in filters:
            results = results.intersection(set(restaurant_data[restaurant_data['priceRange'].isin(filters['priceRange'])].index))

        if 'regions' in filters:
            results = results.intersection(set(restaurant_data[restaurant_data['region'].isin(filters['regions'])].index))
        
        if 'creditCards' in filters:
            # Replace NaN values with empty strings in the 'creditCards' column for safe filtering
            restaurant_data['creditCards'] = restaurant_data['creditCards'].fillna('')
            results = results.intersection(set(
                restaurant_data[restaurant_data['creditCards'].apply(lambda x: any(card in x for card in filters['creditCards']))].index
            ))

        if 'facilities' in filters:
            # Replace NaN values with empty strings in the 'facilitiesServices' column for safe filtering
            restaurant_data['facilitiesServices'] = restaurant_data['facilitiesServices'].fillna('')
            results = results.intersection(set(
                restaurant_data[restaurant_data['facilitiesServices'].apply(lambda x: all(facility in x for facility in filters['facilities']))].index
            ))

    # Extract and format the results
    results_df = restaurant_data.loc[:, ['restaurantName', 'address', 'cuisineType', 'priceRange', 'website']]
    return results_df


In [50]:
filters = {
    'priceRange': ['€', '€€'],
    'regions': ['Lazio', 'Tuscany'],
    'creditCards': ['Visa', 'MasterCard'],
    'facilities': ['Wi-Fi', 'Terrace']
}
vocabularies = create_vocabularies_and_indexes(df)[0]
inverted_indexes = create_vocabularies_and_indexes(df)[1]
# Run the advanced query
query = 'seafood Italian'
results = advanced_query(query = query, vocabularies = vocabularies, inverted_indexes = inverted_indexes, restaurant_data = df, filters=filters)
print(vocabularies)
inverted_indexes


{'restaurantName': {'20tre': 0, 'alessandro': 1, 'feo': 2, 'ape': 3, 'vino': 4, 'e': 5, 'cucina': 6, 'charleston': 7, 'da': 8, 'bob': 9, 'cook': 10, 'fish': 11, 'dam': 12, 'dama': 13, 'donevandro': 14, 'etra': 15, 'il': 16, 'ristorante': 17, 'alain': 18, 'ducasse': 19, 'napoli': 20, 'tirabusci': 21, 'la': 22, 'buca': 23, 'trattoria': 24, 'enrico': 25, 'bartolini': 26, 'loro': 27, 'mnage': 28, 'mare': 29, 'palazzo': 30, 'utini': 31, 'procaccini': 32, 'sa': 33, 'domu': 34, 'sarda': 35, 'soul': 36, 'bcaro': 37, 'gusto': 38, 'casa': 39, 'rispoli': 40, 'castello': 41, 'di': 42, 'fighine': 43, 'dolada': 44, 'eea': 45, 'gasthofstube': 46, 'stafler': 47, 'luogo': 48, 'aimo': 49, 'nadia': 50, 'niko': 51, 'romito': 52, 'larcangelo': 53, 'brughiera': 54, 'tavola': 55, 'locanda': 56, 'delle': 57, 'tre': 58, 'chiavi': 59, 'luminist': 60, 'caf': 61, 'bistrot': 62, 'musciora': 63, 'osteria': 64, 'le': 65, 'panzanelle': 66, 'mercato': 67, 'quintogusto': 68, 'raie': 69, 'torre': 70, 'del': 71, 'saracin

{'restaurantName': defaultdict(set,
             {0: {0},
              1: {1, 184},
              2: {1},
              3: {2},
              4: {2, 124, 597, 729, 781, 1115, 1192},
              5: {2,
               26,
               68,
               131,
               137,
               150,
               250,
               382,
               455,
               479,
               523,
               549,
               597,
               685,
               729,
               817,
               820,
               890,
               945,
               958,
               994,
               1019,
               1063,
               1115,
               1156,
               1192,
               1222,
               1281,
               1570,
               1614,
               1677,
               1678,
               1784,
               1836,
               1976},
              6: {2,
               91,
               204,
               256,
               287,
   

In [None]:
import pandas as pd
from collections import defaultdict

# Carica il dataset Michelin dal file .tsv
dataset_path =  r"C:\Users\Utente\OneDrive - uniroma1.it\Esami\ADM\Homework 3\michelin_restaurants_data.tsv"
df = pd.read_csv(dataset_path, sep='\t')

# Aggiunge la colonna 'region' con valore di default "unknown" per ogni riga, in modo dinamico
df["region"] = ["unknown"] * len(df)
# Aggiunge un ID univoco
df["id"] = df.index

# Debug per verificare la presenza delle colonne
print("Colonne del DataFrame:", df.columns)
print("Esempio di righe:", df.head())

def build_inverted_index(df, field):
    inverted_index = defaultdict(list)
    for idx, row in df.iterrows():
        terms = str(row[field]).lower().split()
        for term in terms:
            inverted_index[term].append(row["id"])
    return dict(inverted_index)

# Creiamo gli indici invertiti per i campi specifici
restaurant_name_index = build_inverted_index(df, "restaurantName")
city_index = build_inverted_index(df, "city")
cuisine_type_index = build_inverted_index(df, "cuisineType")

def filter_by_price_range(df, min_price, max_price):
    price_levels = {"€": 1, "€€": 2, "€€€": 3, "€€€€": 4}
    min_level = price_levels.get(min_price, 1)
    max_level = price_levels.get(max_price, 4)
    return df[df["priceRange"].apply(lambda x: min_level <= price_levels.get(x, 0) <= max_level)]

def filter_by_region(df, regions):
    print("Debug - Esecuzione filter_by_region")  # Debug per sapere quando entriamo nella funzione
    print("Regioni richieste:", regions)         # Stampa le regioni che stiamo cercando
    print("Valori unici nella colonna 'region':", df["region"].unique())  # Controllo sui valori presenti

    regions = [region.lower() for region in regions]
    return df[df["region"].str.lower().isin(regions)]

def filter_by_credit_cards(df, accepted_cards):
    return df[df["creditCards"].apply(lambda x: any(card in str(x) for card in accepted_cards))]

def filter_by_services(df, required_services):
    return df[df["facilitiesServices"].apply(lambda x: all(service in str(x) for service in required_services))]

def advanced_search_2(df, queries=None, min_price=None, max_price=None, regions=None, accepted_cards=None, required_services=None):
    results = df
    if queries:
        if "restaurantName" in queries:
            name_terms = queries["restaurantName"].lower().split()
            name_matches = set()
            for term in name_terms:
                name_matches.update(restaurant_name_index.get(term, []))
            results = results[results["id"].isin(name_matches)]
        
        if "city" in queries:
            city_terms = queries["city"].lower().split()
            city_matches = set()
            for term in city_terms:
                city_matches.update(city_index.get(term, []))
            results = results[results["id"].isin(city_matches)]
        
        if "cuisineType" in queries:
            cuisine_terms = queries["cuisineType"].lower().split()
            cuisine_matches = set()
            for term in cuisine_terms:
                cuisine_matches.update(cuisine_type_index.get(term, []))
            results = results[results["id"].isin(cuisine_matches)]
    
    if min_price and max_price:
        results = filter_by_price_range(results, min_price, max_price)
    
    if regions:
        print("Debug - Chiamata a filter_by_region con:", regions)  # Debug per la funzione di filtro regione
        results = filter_by_region(results, regions)
    
    if accepted_cards:
        results = filter_by_credit_cards(results, accepted_cards)
    
    if required_services:
        results = filter_by_services(results, required_services)

    return results[["restaurantName", "address", "cuisineType", "priceRange", "website"]]

queries = {
    "restaurantName": "Al Camin",
    "city": "Cortina",
    "cuisineType": "Italian"
}
min_price = "€"
max_price = "€€€"
regions = ["known"]
accepted_cards = ["Visa", "MasterCard"]
required_services = ["Wi-Fi", "Terrace"]

results = advanced_search_2(
    df,
    queries=queries,
    min_price=min_price,
    max_price=max_price,
    regions=regions,
    accepted_cards=accepted_cards,
    required_services=required_services
)

print(results)

Colonne del DataFrame: Index(['restaurantName', 'address', 'city', 'postalCode', 'country',
       'priceRange', 'cuisineType', 'description', 'facilitiesServices',
       'creditCards', 'phoneNumber', 'website', 'region', 'id'],
      dtype='object')
Esempio di righe:       restaurantName                    address                    city  \
0              20Tre   via David Chiossone 20 r                   Genoa   
1     Alessandro Feo        via Angelo Lista 24  Marina di Casal Velino   
2  Ape Vino e Cucina      Piazza Risorgimento 3                    Alba   
3         Charleston  via Generale Magliocco 19                 Palermo   
4   Da Bob Cook Fish   largo Parsano vecchio 16                Sorrento   

   postalCode country priceRange                    cuisineType  \
0       16123   Italy         €€  Farm to table, Modern Cuisine   
1       84040   Italy         €€             Campanian, Seafood   
2       12051   Italy         €€      Piedmontese, Contemporary   
3       901

KeyError: 'region'

In [2]:
len(df)

1982