### Collecting the data

We gather all restaurant URLs from the guide and store them in a text file for further data processing.

In [None]:
import requests
from bs4 import BeautifulSoup

# Header for the request (optional, can help prevent the server from blocking the request)
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
}

# Seconds to wait for the server on each request. Without a timeout,
# requests.get can block indefinitely on a stalled connection.
request_timeout = 30

# Open a file to write restaurant URLs (one URL per line)
with open("michelin_restaurants_urls.txt", "w") as file:
    # Iterate through all listing pages (from 1 to 100)
    for page in range(1, 101):
        # Listing URL for this page number
        url = f"https://guide.michelin.com/en/it/restaurants/page/{page}"

        # Make the GET request; a network failure on one page is reported
        # and skipped instead of aborting the whole crawl.
        try:
            response = requests.get(url, headers=headers, timeout=request_timeout)
        except requests.RequestException as e:
            print(f"Error in request to page {page}: {e}")
            continue

        # Skip pages the server did not deliver successfully
        if response.status_code != 200:
            print(f"Error in request to page {page}: {response.status_code}")
            continue

        # Parse the HTML and look at every anchor carrying the "link" class
        soup = BeautifulSoup(response.text, "html.parser")
        for link in soup.find_all("a", class_="link"):
            href = link.get("href")
            # Keep only anchors that point at a restaurant detail page
            if href and "/restaurant/" in href:
                full_url = f"https://guide.michelin.com{href}"
                file.write(full_url + "\n")
                print(f"URL added: {full_url}")

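We download the HTML content of the Michelin restaurant pages concurrently. The URLs are processed in batches of 20: each batch is stored in its own folder (`page_1`, `page_2`, ...), and each page is saved as an `.html` file named after the restaurant, i.e. the last segment of its URL.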

In [2]:
import requests
import os
from urllib.parse import urlparse
from concurrent.futures import ThreadPoolExecutor, as_completed
from time import sleep

# Browser-like User-Agent sent with every download request (optional)
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
}

# How many pages go into each folder, and how many downloads run at once
url_per_folder = 20
max_threads = 5  # Adjust the number of threads based on your network and system capacity

# Read the previously collected restaurant URLs, one per line
with open("michelin_restaurants_urls.txt", "r") as url_file:
    urls = [raw_line.strip() for raw_line in url_file]

def download_html(url, folder_name, timeout=30):
    """Download one restaurant page and save it as ``<folder_name>/<name>.html``.

    The file name is the last path segment of ``url``. Non-200 responses and
    network errors are printed and otherwise ignored, so one bad URL does not
    stop the rest of the batch.

    Args:
        url: Address of the restaurant page to fetch.
        folder_name: Existing directory in which the HTML file is written.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block this worker thread
            forever and the surrounding batch would never complete.
    """
    try:
        # Extract restaurant name from the URL (last path segment)
        restaurant_name = urlparse(url).path.split('/')[-1]

        # Make the GET request; a timeout surfaces as requests.RequestException
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code == 200:
            # Save the HTML content with the restaurant's name
            file_path = os.path.join(folder_name, f"{restaurant_name}.html")
            with open(file_path, "w", encoding="utf-8") as html_file:
                html_file.write(response.text)
            print(f"Saved: {file_path}")
        else:
            print(f"Error fetching {url}: {response.status_code}")
        sleep(0.1)  # Slight delay to avoid overwhelming the server

    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")

# Walk the URL list one batch at a time; batch k is stored in folder "page_k"
for folder_num, batch_start in enumerate(range(0, len(urls), url_per_folder), start=1):
    folder_name = f"page_{folder_num}"

    # Make sure the target folder is there before any thread writes into it
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)

    # URLs belonging to this batch
    batch_urls = urls[batch_start:batch_start + url_per_folder]

    # A fresh pool per batch: leaving the `with` block guarantees the batch is done
    with ThreadPoolExecutor(max_workers=max_threads) as executor:
        pending = [
            executor.submit(download_html, page_url, folder_name)
            for page_url in batch_urls
        ]

        # Drain the futures as they finish; result() re-raises anything
        # download_html did not handle itself
        for finished in as_completed(pending):
            finished.result()

Saved: page_1\da-mo.html
Saved: page_1\o-me-o-il-mare.html
Saved: page_1\donevandro.html
Saved: page_1\da-bob-cook-fish.html
Saved: page_1\ape-vino-e-cucina.html
Saved: page_1\charleston.html
Saved: page_1\sa-domu-sarda.html
Saved: page_1\alessandro-feo.html
Saved: page_1\il-tirabuscio262517.html
Saved: page_1\la-buca130947.html
Saved: page_1\il-ristorante-alain-ducasse-napoli.html
Saved: page_1\dama-1213583.html
Saved: page_1\palazzo-utini.html
Saved: page_1\etra.html
Saved: page_1\soul-fish.html
Saved: page_1\la-trattoria-enrico-bartolini.html
Saved: page_1\loro.html
Saved: page_1\20tre.html
Saved: page_1\menage.html
Saved: page_1\procaccini.html
Saved: page_2\metodo-1213628.html
Saved: page_2\gimmy-s.html
Saved: page_2\osteria-dell-accademia.html
Saved: page_2\fratelli-bruzzone.html
Saved: page_2\serrae-villa-fiesole.html
Saved: page_2\locanda-perbellini-ai-beati.html
Saved: page_2\innesti.html
Saved: page_2\salvo.html
Saved: page_2\arnolfo.htmlSaved: page_2\osteria-del-teatro.html


We parse HTML files for Michelin restaurants to extract key information (e.g., name, address, cuisine, facilities), and save each restaurant's data as an individual .tsv file.

In [3]:
from bs4 import BeautifulSoup
import os
import csv

def parse_restaurant_html(file_path, url):
    """Extract restaurant fields from a saved HTML page and write them to a .tsv file.

    The page is parsed with BeautifulSoup. Every field that cannot be found
    keeps its default, so a page with an unexpected layout yields a sparse
    record instead of raising.

    Args:
        file_path: Path of the saved ``.html`` file (read as UTF-8).
        url: Source URL of the page; stored unchanged in the ``website`` field.

    Returns:
        dict with the keys restaurantName, address, city, postalCode, country,
        priceRange, cuisineType, description, facilitiesServices, creditCards,
        phoneNumber and website.

    Side effects:
        Writes a one-row, tab-separated file with a header line next to the
        input file, using the same base name and a ``.tsv`` extension.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")

    # Initialize dictionary to store restaurant data
    data = {
        "restaurantName": None,
        "address": None,
        "city": None,
        "postalCode": None,
        "country": "Italy",
        "priceRange": None,
        "cuisineType": None,
        "description": None,
        "facilitiesServices": [],
        "creditCards": [],
        "phoneNumber": None,
        'website': url
    }

    # Extract restaurant name
    name_tag = soup.find("h1", class_="data-sheet__title")
    if name_tag:
        data["restaurantName"] = name_tag.get_text(strip=True)

    # The first "data-sheet__block--text" div holds the address and the second
    # the price/cuisine line. Look them up once and check the count before
    # indexing, so pages without these blocks do not raise IndexError.
    text_blocks = soup.find_all("div", class_="data-sheet__block--text")

    # Extract complete address
    if len(text_blocks) >= 1:
        full_address = text_blocks[0].get_text(strip=True)
        address_parts = full_address.split(', ')

        # Parse city, postal code, and country from the trailing parts
        if len(address_parts) >= 3:
            country = address_parts[-1]
            postal_code = address_parts[-2]
            city = address_parts[-3]
            address = ', '.join(address_parts[:-3])
        elif len(address_parts) == 2:
            address, city = address_parts
            postal_code = country = ""
        else:
            address = full_address
            city = postal_code = country = ""

        data["address"] = address
        data["city"] = city
        data["postalCode"] = postal_code
        # Only replace the "Italy" default when a country was actually parsed
        if country:
            data["country"] = country

    # Extract price range and cuisine type
    if len(text_blocks) >= 2:
        price_type_list = text_blocks[1].get_text(strip=True).split()
        # First token is the price; the second token is skipped, as in the
        # page layout this code was written for (presumably a separator -
        # verify against a sample page); the rest is the cuisine type.
        if price_type_list:
            data["priceRange"] = price_type_list[0]
            data["cuisineType"] = ' '.join(price_type_list[2:])

    # Extract description
    description_tag = soup.find("div", class_="data-sheet__description")
    if description_tag:
        data["description"] = description_tag.get_text(strip=True)

    # Extract facilities and services
    facilities_section = soup.find('div', class_="restaurant-details__services")
    if facilities_section:
        services = [li.get_text(strip=True) for li in facilities_section.find_all('li')]
        data["facilitiesServices"] = services

    # Extract accepted credit cards: the card name is the part of the image
    # file name before the first '-'
    credit_card_tags = soup.find_all("div", class_="restaurant-details__services--info")
    creditCards = []
    for tag in credit_card_tags:
        for img in tag.find_all("img"):
            if 'data-src' in img.attrs:
                credit_card_name = os.path.basename(img['data-src']).split('-')[0]
                creditCards.append(credit_card_name.title())
    data["creditCards"] = creditCards if creditCards else None

    # Extract phone number
    phone_tag = soup.find("div", class_="collapse__block-item")
    if phone_tag:
        data["phoneNumber"] = phone_tag.get_text(strip=True)

    # Save each restaurant's data to a .tsv file.
    # splitext swaps only the extension, whereas str.replace would also touch
    # ".html" occurring elsewhere in the path.
    output_file = os.path.splitext(file_path)[0] + ".tsv"
    # newline='' is required by the csv module; without it every row is
    # followed by an empty line on Windows.
    with open(output_file, mode='w', encoding='utf-8', newline='') as file:
        keys = list(data.keys())
        dict_writer = csv.DictWriter(file, fieldnames=keys, delimiter='\t')
        dict_writer.writeheader()
        dict_writer.writerow(data)

    return data

Now, we process each restaurant's HTML file to extract data using concurrent threads, making the data collection process more efficient. Once all files are processed, we gather the generated .tsv files from each folder, read them into individual DataFrames, and then concatenate them into a single, unified DataFrame. Finally, we save this combined dataset to a single .tsv file, which consolidates all restaurant information for easy analysis and access.

In [4]:
import os
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed

# Base directory containing the page_N folders with the HTML files.
# The download cell creates those folders relative to the working directory,
# and the URL list below is read from it too, so use the working directory
# instead of a machine-specific absolute path.
base_directory = os.getcwd()

# Load URLs from the file
with open("michelin_restaurants_urls.txt", "r") as file:
    urls = [line.strip() for line in file.readlines()]

# Map each restaurant name (last segment of its URL) to the full URL
url_dict = {url.split('/')[-1]: url for url in urls}

# Worker run once per HTML file
def process_restaurant_file(file_info):
    """Parse one saved restaurant page.

    Args:
        file_info: ``(filename, folder_path)`` pair identifying the HTML file.

    Returns:
        The dict produced by ``parse_restaurant_html`` when a restaurant name
        was found, otherwise ``None``.
    """
    html_name, parent_dir = file_info

    # The file name without ".html" is the key used in url_dict;
    # an unknown name falls back to an empty URL
    slug = html_name.replace(".html", "")
    source_url = url_dict.get(slug, "")

    parsed = parse_restaurant_html(os.path.join(parent_dir, html_name), source_url)
    if parsed["restaurantName"]:
        return parsed
    return None

# Collect all files to process: every .html file in every sub-folder of base_directory
files_to_process = []
for folder in os.listdir(base_directory):
    folder_path = os.path.join(base_directory, folder)
    if os.path.isdir(folder_path):
        for filename in os.listdir(folder_path):
            if filename.endswith(".html"):
                files_to_process.append((filename, folder_path))

# Set up concurrent processing
with ThreadPoolExecutor() as executor:
    # Schedule each file to be processed concurrently
    futures = {executor.submit(process_restaurant_file, file_info): file_info for file_info in files_to_process}

    # Inspect every future. An exception raised in a worker is stored on its
    # future and is lost unless result() is called, so report failures per
    # file and keep going with the remaining pages.
    for future in as_completed(futures):
        failed_name, failed_folder = futures[future]
        try:
            future.result()
        except Exception as e:
            print(f"Error parsing {os.path.join(failed_folder, failed_name)}: {e}")

# Combine all created .tsv files into a single DataFrame

# List to collect each DataFrame
all_data = []

# Iterate over each folder in the base directory
for folder in os.listdir(base_directory):
    folder_path = os.path.join(base_directory, folder)
    if os.path.isdir(folder_path):
        # Iterate over each .tsv file in the folder
        for filename in os.listdir(folder_path):
            if filename.endswith(".tsv"):
                file_path = os.path.join(folder_path, filename)
                # Read each .tsv file and append to the list
                df = pd.read_csv(file_path, sep='\t')
                all_data.append(df)

# Concatenate all individual DataFrames into one combined DataFrame
# (pd.concat raises ValueError if no .tsv file was found)
data = pd.concat(all_data, ignore_index=True)

# Save the combined DataFrame to a single .tsv file
output_file = os.path.join(base_directory, "all_restaurants_data.tsv")
data.to_csv(output_file, sep='\t', index=False, encoding="utf-8")

###  Conjunctive Search Engine

In this cell, we import essential libraries for text processing, including the Natural Language Toolkit (NLTK) to support our search engine’s linguistic capabilities. We download key resources like stopwords, tokenizers, and WordNet, which will help us standardize and clean the text by removing irrelevant words and lemmatizing terms to their base forms. Additionally, we include modules for handling regular expressions and JSON data, ensuring that our search engine can efficiently parse, process, and organize restaurant information.

In [5]:
# Imports for Search Engine

import nltk  # Natural Language Toolkit for text processing

# Fetch the NLTK resources the text-processing cells rely on
for nltk_resource in (
    'stopwords',  # Stopwords list to filter out common words (e.g., "the", "is")
    'punkt',      # Punkt tokenizer for sentence splitting and word tokenization
    'wordnet',    # WordNet lexical database for lemmatization
):
    nltk.download(nltk_resource)

# Importing specific modules for text processing
import re                         # Regular expressions for text pattern matching
from nltk.corpus import stopwords # Stopwords list for filtering out common, irrelevant words
from nltk.stem import WordNetLemmatizer # WordNet-based lemmatizer for reducing words to base form
from nltk.tokenize import word_tokenize # Word tokenizer to split text into individual words
import json                       # JSON module for handling JSON data structures

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Utente\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Utente\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Utente\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In this cell, we convert all text in the restaurant descriptions to lowercase to ensure that our search engine is case-insensitive. By standardizing the text format, we eliminate any discrepancies caused by different letter cases, allowing users to find relevant results regardless of capitalization. This step enhances the accuracy and usability of the search engine by treating terms like "Cuisine" and "cuisine" as identical.

In [6]:
# Lowercase the restaurant descriptions so that building the search engine is case-insensitive.
# Note: this overwrites the 'description' column of `data`, so the original
# capitalization is no longer available to the cells that follow.
data['description']= data['description'].str.lower()

Here, we define a function to preprocess text by cleaning, tokenizing, and normalizing it for our search engine. First, we convert text to lowercase, remove non-alphanumeric characters, and split it into individual words. We then remove stopwords and apply lemmatization to reduce each word to its base form, ensuring that our search engine captures the essential meaning of each term while discarding irrelevant details.

In [7]:
def preprocess_text(text):
    """Turn raw text into a list of normalized tokens.

    Steps: lowercase, drop every character that is not a letter, digit or
    whitespace, tokenize, remove English stopwords, lemmatize with WordNet.

    Args:
        text: The string to process. Anything that is not a string (for
            example ``None``, or the NaN pandas uses for a missing
            description) is treated as empty text.

    Returns:
        List of lemmatized tokens; empty for non-string input.
    """
    # A missing description comes back from pandas as a float NaN, which has
    # no .lower(); treat it as a document without terms instead of crashing.
    if not isinstance(text, str):
        return []

    text = text.lower()  # Convert text to lowercase
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove non-alphanumeric characters
    token = word_tokenize(text)  # Tokenize the text into words

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    token = [word for word in token if word not in stop_words]  # Keep only words not in stopwords

    # Lemmatization is used for standardization; stemming
    # (e.g. nltk.PorterStemmer) would be the alternative.
    lemmatizer = WordNetLemmatizer()
    token = [lemmatizer.lemmatize(word) for word in token]

    return token

Now, we build a vocabulary and an inverted index to efficiently manage and search terms across restaurant descriptions. First, we assign a unique term ID to each word encountered, storing it in the vocabulary, and then we map each term ID to the document IDs where the term appears, creating the inverted index. Finally, we save the vocabulary as a CSV file and the inverted index as a JSON file, providing a structured and searchable representation of the text data for our search engine.

In [8]:
def create_vocabulary_and_inverted_index(descriptions):
    """Build the term vocabulary and the inverted index for a collection of texts.

    Each description is passed through ``preprocess_text``; its position in
    ``descriptions`` is used as the document ID.

    Args:
        descriptions: Iterable of description texts.

    Returns:
        ``(vocabulary, inverted_index)`` where ``vocabulary`` maps each term
        to an integer term ID (assigned in order of first appearance) and
        ``inverted_index`` maps each term ID to the list of document IDs
        containing that term, in ascending order and without duplicates.

    Side effects:
        Writes ``vocabulary.csv`` (one ``term,term_id`` line per term) and
        ``inverted_index.json`` to the working directory. JSON object keys
        are strings, so the term IDs are stored as strings in that file.
    """
    vocabulary = {}         # term -> term ID
    inverted_index = {}     # term ID -> list of document IDs
    term_id = 0

    # Loop through each description, assigning a unique doc_id to each
    for doc_id, text in enumerate(descriptions):
        tokens = preprocess_text(text)  # Preprocess and tokenize the text

        for token in tokens:
            # If the word is not in the vocabulary, add it with a new term_id
            if token not in vocabulary:
                vocabulary[token] = term_id
                term_id += 1

            postings = inverted_index.setdefault(vocabulary[token], [])

            # Documents are visited in increasing doc_id order, so a doc_id that
            # is already recorded can only be the last entry. Checking that one
            # entry avoids scanning the whole posting list for every token.
            if not postings or postings[-1] != doc_id:
                postings.append(doc_id)

    # Save the vocabulary to a CSV file
    with open('vocabulary.csv', 'w') as vocab_file:
        for word, tid in vocabulary.items():
            vocab_file.write(f"{word},{tid}\n")

    # Save the inverted index to a JSON file
    with open('inverted_index.json', 'w') as index_file:
        json.dump(inverted_index, index_file)

    return vocabulary, inverted_index

We implement a function to handle conjunctive queries, allowing us to search for restaurants that contain all specified terms in their descriptions. We process the query by mapping each search term to its unique term ID, then use the inverted index to retrieve only the documents (restaurants) containing all query terms. This approach ensures that our search engine provides precise results by returning only the restaurant data that fully matches the user’s search criteria.

In [9]:
def conjunctive_query(query, vocabulary, inverted_index, restaurant_data):
    """Return the restaurants whose description contains every term of the query.

    Args:
        query: Free-text query; processed with ``preprocess_text``.
        vocabulary: Mapping term -> term ID.
        inverted_index: Mapping term ID -> iterable of document IDs, where a
            document ID is the row position of the restaurant.
        restaurant_data: DataFrame with at least the columns restaurantName,
            address, description and website, in the row order the index was
            built from.

    Returns:
        A DataFrame with the columns restaurantName, address, description and
        website for the matching rows, in row order. It is empty when no
        restaurant matches. When none of the query terms is in the vocabulary
        (including an empty query) an empty list is returned, which is what
        callers of this function have always received in that case.
    """
    columns = ['restaurantName', 'address', 'description', 'website']

    # Preprocess the query text and keep the terms the vocabulary knows
    query_tokens = preprocess_text(query)
    known_tokens = [token for token in query_tokens if token in vocabulary]

    # If no terms from the query are found in the vocabulary, return an empty result
    if not known_tokens:
        return []

    # A term missing from the vocabulary occurs in no description, so no
    # restaurant can contain *all* query terms. Dropping such terms would
    # answer a different, less restrictive query.
    if len(known_tokens) < len(query_tokens):
        return restaurant_data.iloc[0:0][columns]

    # Posting set of every query term; a term ID absent from the index matches nothing
    doc_sets = [set(inverted_index.get(vocabulary[token], ())) for token in known_tokens]

    # Intersect the posting sets to get documents that contain all query terms
    matching_docs = set.intersection(*doc_sets)

    # Document IDs are row positions, so select by position (as ranked_query does)
    results_df = restaurant_data.iloc[sorted(matching_docs)]

    # Return just the key information (restaurant name, address, description and website)
    return results_df[columns]

We now test our search engine by running a sample query for "modern seasonal cuisine" to see if it accurately returns matching restaurants. We generate the vocabulary and inverted index from our dataset and use them to process the query, displaying the name, address, description and website of every restaurant that meets the criteria. The number of rows in the printed table is the total number of matches, which helps verify the effectiveness of our search engine in retrieving relevant data.

In [10]:
# Index the descriptions: term vocabulary plus term ID -> documents mapping
vocabulary, inverted_index = create_vocabulary_and_inverted_index(data['description'])

# Sample query used to exercise the conjunctive search
query = 'modern seasonal cuisine'
results = conjunctive_query(query, vocabulary, inverted_index, data)

# Show a heading followed by the matching restaurants
print(f"Results for the query '{query}':", results, sep="\n")

Results for the query 'modern seasonal cuisine':
                       restaurantName  \
26              Il Luogo Aimo e Nadia   
144                      Ca' Del Moro   
165                         Contrasto   
178                              Saur   
278                       San Michele   
308                         Chichibio   
509                         Esplanade   
513                          La Valle   
581                         Zum Löwen   
708            Degusteria del Gigante   
793                       La Bandiera   
838                     Secondo Tempo   
996                      Piccolo Lord   
1077                        Ronchi Rò   
1159                            Razzo   
1228                           Flurin   
1258                   Quadri Bistrot   
1408    Gallery Bistrot Contemporaneo   
1502                         [àbitat]   
1506                          Babette   
1525  Cappuccini Cucina San Francesco   
1679                     Retrobottega   
1716    

### Ranked Search Engine

In this cell, we calculate the Term Frequency-Inverse Document Frequency (TF-IDF) for each word across all restaurant descriptions to measure the relevance of each term within each document. First, we compute Term Frequency (TF) and Document Frequency (DF) for each term, then use these values to calculate TF-IDF scores, which help identify the most significant words in each description. Finally, we create an inverted index with TF-IDF weights, enabling efficient retrieval of relevant documents based on keyword importance in our search engine.

In [12]:
import numpy as np
import pandas as pd
from collections import defaultdict
import math

def calculate_tf_idf(data, vocabulary):
    """Build an inverted index whose postings carry normalized TF-IDF weights.

    Every entry of ``data['description']`` is tokenized with
    ``preprocess_text``; its position is the document ID. Tokens missing from
    ``vocabulary`` are ignored. A term's weight in a document is its raw count
    times ``log(N / (df + 1e-10))``, divided by the Euclidean norm of the
    document's weight vector (plus the same small constant).

    Args:
        data: Object whose ``'description'`` entry iterates over the texts and
            whose ``len`` is the number of documents ``N``.
        vocabulary: Mapping term -> term ID.

    Returns:
        ``defaultdict(list)`` mapping term ID -> list of
        ``(doc_id, normalized_weight)`` pairs in ascending doc_id order.
    """
    smoothing = 1e-10  # Small constant to avoid division by zero
    total_docs = len(data)

    # Pass 1: raw term counts per document and document frequency per term
    counts_by_doc = {}
    docs_with_term = defaultdict(int)
    for doc_id, description in enumerate(data['description']):
        term_counts = {}
        for token in preprocess_text(description):
            if token not in vocabulary:
                continue
            term = vocabulary[token]
            term_counts[term] = term_counts.get(term, 0) + 1

        # Documents without any known term contribute nothing to the index
        if term_counts:
            counts_by_doc[doc_id] = term_counts
        for term in term_counts:
            docs_with_term[term] += 1

    # Pass 2: weight, normalize and file each document under its terms
    weighted_index = defaultdict(list)
    for doc_id, term_counts in counts_by_doc.items():
        weights = {
            term: count * math.log(total_docs / (docs_with_term[term] + smoothing))
            for term, count in term_counts.items()
        }
        vector_length = math.sqrt(sum(value ** 2 for value in weights.values()))
        for term, value in weights.items():
            weighted_index[term].append((doc_id, value / (vector_length + smoothing)))

    return weighted_index

Here, we implement a ranked search function that allows us to retrieve the top results based on similarity to a user query. We build a normalized term-frequency vector for the query terms, then compute cosine similarity scores between the query and the TF-IDF vector of each document to rank the results. By selecting the highest-ranking matches, we ensure that our search engine returns the most relevant restaurant descriptions for each query.

In [13]:
def ranked_query(query, vocabulary, inverted_index, data, top_k=5):
    """Rank restaurants by similarity between the query and their descriptions.

    The query is turned into a normalized term-frequency vector. A document's
    score is the dot product of that vector with the document's weights from
    ``inverted_index``.

    Args:
        query: Free-text query; processed with ``preprocess_text``.
        vocabulary: Mapping term -> term ID.
        inverted_index: Mapping term ID -> iterable of ``(doc_id, weight)``
            pairs, where ``doc_id`` is a row position in ``data``.
        data: DataFrame with the columns restaurantName, address, description
            and website.
        top_k: Maximum number of results to return.

    Returns:
        DataFrame with one row per result, best match first, with the columns
        restaurantName, address, description, website and similarity (rounded
        to 4 decimals). Empty when no query term is known or nothing matches.
    """
    # Preprocess the query and count how often each known term occurs in it.
    # Term IDs start at 0, so a known term must be detected with
    # `is not None`; a plain truthiness test would drop the term with ID 0.
    query_tokens = preprocess_text(query)
    query_tf_idf = defaultdict(float)
    for token in query_tokens:
        term_id = vocabulary.get(token)
        if term_id is not None:
            query_tf_idf[term_id] += 1  # Term Frequency (TF) for the query terms

    # Normalize the query vector to unit length
    query_vector_norm = math.sqrt(sum(weight ** 2 for weight in query_tf_idf.values()))
    query_tf_idf = {term_id: weight / (query_vector_norm + 1e-10) for term_id, weight in query_tf_idf.items()}

    # Accumulate the dot product between the query and each document vector
    doc_scores = defaultdict(float)
    for term_id, query_weight in query_tf_idf.items():
        if term_id in inverted_index:
            for doc_id, doc_weight in inverted_index[term_id]:
                doc_scores[doc_id] += query_weight * doc_weight

    # Sort documents by similarity score and select the top-k results
    ranked_docs = sorted(doc_scores.items(), key=lambda x: x[1], reverse=True)[:top_k]

    # Retrieve and format the top-k results
    results = []
    for doc_id, score in ranked_docs:
        row = data.iloc[doc_id]
        results.append({
            "restaurantName": row['restaurantName'],
            "address": row['address'],
            "description": row['description'],
            "website": row['website'],
            "similarity": round(score, 4)
        })

    return pd.DataFrame(results)

In this cell, we calculate the TF-IDF-based inverted index, which allows us to weight terms by their importance across all restaurant descriptions. We then test our search engine by running a ranked query for "modern seasonal cuisine," retrieving and displaying the top five most relevant restaurant results. This step helps us evaluate the search engine’s effectiveness in returning high-quality, relevant matches based on user input.

In [14]:
# Sample query used to exercise the ranked search
query = 'modern seasonal cuisine'

# Weighted index: term ID -> (document, normalized TF-IDF weight) postings
inverted_index_tf_idf = calculate_tf_idf(data, vocabulary)

# Ask for the five best-scoring restaurants
results = ranked_query(query, vocabulary, inverted_index_tf_idf, data, top_k=5)

# Show a heading followed by the ranked results
print(f"Results for the ranked query '{query}':", results, sep="\n")

Results for the ranked query 'modern seasonal cuisine':
  restaurantName                                   address  \
0           Saur                      via Filippo Turati 8   
1       La Botte                  via Giuseppe Garibaldi 8   
2   Piccolo Lord               corso San Maurizio 69 bis/g   
3          Razzo                     via Andrea Doria 17/f   
4       La Valle  via Umberto I 25, località Valle Sauglio   

                                         description  \
0  in a tiny rural village, this contemporary, al...   
1  a modern and welcoming contemporary bistro sit...   
2  professional service in a welcoming, modern re...   
3  a quiet restaurant with a relaxed, young and m...   
4  a well - run restaurant in a quiet area just o...   

                                             website  similarity  
0  https://guide.michelin.com/en/lombardia/barco/...      0.2329  
1  https://guide.michelin.com/en/piemonte/stresa/...      0.2084  
2  https://guide.michelin.com/en/