In [9]:
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [10]:
# Michelin Guide restaurant listing URL; the `{}` placeholder is filled with
# the page number via `url.format(page)`.
url = "https://guide.michelin.com/en/restaurants/page/{}"


# Function to find the total number of pages
def get_total_pages(timeout=30):
    """Return the highest page number shown in the listing's pagination bar.

    Fetches the first listing page, collects the pagination buttons and
    returns the largest numeric label found among them.

    Args:
        timeout: Seconds to wait for the HTTP response. Without a timeout,
            ``requests`` may block indefinitely on an unresponsive server.

    Returns:
        int: The largest numeric pagination label on the first page.

    Raises:
        requests.RequestException: On network failure, timeout, or a non-2xx
            HTTP status.
        ValueError: If the page contains no numeric pagination button.
    """
    response = requests.get(url.format(1), timeout=timeout)
    # Fail loudly on an error page instead of parsing it as if it were a listing.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")
    pagination_buttons = soup.find_all(
        "a",
        class_=[
            "btn btn-outline-secondary btn-sm active",
            "btn btn-outline-secondary btn-sm",
        ],
    )
    # Strip whitespace so labels such as " 12 " are still recognised as numbers.
    labels = (button.get_text(strip=True) for button in pagination_buttons)
    page_numbers = [int(label) for label in labels if label.isdigit()]
    if not page_numbers:
        # max() on an empty list raises an unhelpful ValueError; keep the
        # exception type but explain what actually went wrong.
        raise ValueError(
            "No numeric pagination buttons found; the page layout may have changed."
        )
    return max(page_numbers)

In [11]:
# Scrape all restaurant links
def find_all_links(timeout=30):
    """Collect restaurant links from every listing page.

    Walks pages ``1..get_total_pages()`` and gathers every anchor whose
    ``href`` contains ``"/restaurant/"``. Duplicates are kept, in page order.

    Args:
        timeout: Seconds to wait for each page response before giving up.

    Returns:
        list[str]: Absolute restaurant URLs, in the order they were found.

    Raises:
        requests.RequestException: On network failure or timeout.
    """
    base_url = "https://guide.michelin.com"
    total_pages = get_total_pages()
    links = []
    # One Session reuses the underlying connection across all page requests.
    with requests.Session() as session:
        for page in range(1, total_pages + 1):
            response = session.get(url.format(page), timeout=timeout)
            if not response.ok:
                # Keep going on a bad page, but make the gap visible instead of
                # silently parsing an error page that contains no links.
                print(f"Skipping page {page}: HTTP {response.status_code}")
                continue
            soup = BeautifulSoup(response.content, "html.parser")
            for anchor in soup.find_all("a", href=True):
                href = anchor["href"]
                if "/restaurant/" in href:
                    # urljoin handles root-relative, relative and already
                    # absolute hrefs; plain concatenation breaks on the latter.
                    links.append(urljoin(base_url, href))
    return links

In [12]:
# Create the dataframe with the links
def create_links_dataframe():
    """Scrape the restaurant links and wrap them in a one-column DataFrame.

    Returns:
        pandas.DataFrame with a single ``Links`` column, or ``None`` (after
        printing a notice) when the scrape returned no links.
    """
    scraped_links = find_all_links()

    # Guard clause: nothing scraped means there is no table to build.
    if not scraped_links:
        print("No links found.")
        return None

    # Build the DataFrame with a single 'Links' column.
    return pd.DataFrame(scraped_links, columns=["Links"])
    
# Build the links table and print either the table or an error notice.
df_links = create_links_dataframe()
print("Error creating DataFrame." if df_links is None else df_links)

                                                   Links
0      https://guide.michelin.com/en/vaud/vevey/resta...
1      https://guide.michelin.com/en/vaud/vevey/resta...
2      https://guide.michelin.com/en/vaud/vevey/resta...
3      https://guide.michelin.com/en/geneve-region/ge...
4      https://guide.michelin.com/en/geneve-region/ge...
...                                                  ...
56686  https://guide.michelin.com/en/prague/prague/re...
56687  https://guide.michelin.com/en/prague/prague/re...
56688  https://guide.michelin.com/en/prague/prague/re...
56689  https://guide.michelin.com/en/prague/prague/re...
56690  https://guide.michelin.com/en/prague/prague/re...

[56691 rows x 1 columns]


In [13]:
# Remove duplicates and save dataframe
# create_links_dataframe() returns None when nothing was scraped; guard so this
# cell does not fail with AttributeError when calling .drop_duplicates() on None.
if df_links is not None:
    df_links = df_links.drop_duplicates()
    df_links.to_csv("links.csv", index=False)
else:
    print("No links to save.")