In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from time import sleep

## Restaurant listing scraping

In [4]:
def _tag_text(tag):
    """Return the stripped text of a BeautifulSoup tag, or None when the tag is missing."""
    return tag.text.strip() if tag is not None else None


def _parse_merchant_card(card):
    """Build one restaurant record (dict keyed by output column) from a merchant-card tag.

    Any field whose element is absent from the card is stored as None instead of
    raising, so a single incomplete card cannot abort the whole page.
    """
    name = _tag_text(card.find('h3'))

    location = _tag_text(card.find(attrs={"data-qa": "merchant-location"}))
    if location is not None:
        # Remove the literal "Located at " / "area" wording, then the whitespace
        # that removal leaves behind (e.g. "Kamppi " -> "Kamppi").
        location = location.replace('Located at ', '').replace('area', '').strip()

    cuisine = _tag_text(card.find(attrs={"data-qa": "merchant-card-cuisine"}))
    if cuisine is not None:
        cuisine = cuisine.replace(" Restaurant", "")

    meals = _tag_text(card.find(attrs={"data-qa": "merchant-meal"}))
    if meals is not None:
        meals = re.sub(r'^\s*Meals:\s*', '', meals)

    # The score text is split on "/" and the leading part parsed, e.g. "5.4/6" -> 5.4.
    rating_text = _tag_text(card.find(attrs={"data-qa": "reviews-score"}))
    rating = float(rating_text.split('/')[0].strip()) if rating_text is not None else None

    # NOTE(review): this class name looks build-generated; the saved sample outputs in
    # this notebook show an empty "Review Count" column, so the selector may be stale.
    review_text = _tag_text(card.find(class_="sc-1atis9w-3 dfyExP"))
    review_count = int(review_text.split()[0]) if review_text else None

    # NOTE(review): the saved sample outputs show 0 for every price level, so the
    # `oGCHK` class pattern presumably no longer matches the page — verify.
    price_box = card.find(class_=re.compile(r'.*price-indicator'))
    if price_box is not None:
        price_level = len(price_box.find_all(class_=re.compile(r'.*oGCHK')))
    else:
        price_level = None

    link = card.find('a')
    page_url = link.get('href') if link is not None else None

    return {
        'Name': name,
        'Location': location,
        'Cuisine': cuisine,
        'Meals': meals,
        'Price Level (out of 4)': price_level,
        'Rating (out of 6)': rating,
        'Review Count': review_count,
        'Page URL': page_url,
    }


def scrape_page(page_n, timeout=30):
    """Scrape one page of the quandoo.fi Helsinki listing into a DataFrame.

    Args:
        page_n: Value substituted into the ``page`` query parameter of the listing URL.
        timeout: Seconds passed to ``requests.get``; without it a stalled
            connection would block the notebook indefinitely.

    Returns:
        A DataFrame with one row per element marked ``data-qa="merchant-card"`` and
        the columns Name, Location, Cuisine, Meals, Price Level (out of 4),
        Rating (out of 6), Review Count and Page URL. Fields missing from a card
        are None/NaN. A page without cards yields an empty frame with the same columns.

    Raises:
        requests.exceptions.RequestException: on connection errors or timeout.
    """
    url = f"https://www.quandoo.fi/en/helsinki?districtFilter=3637&bookable=true&onlySpecialOffers=false&page={page_n}"
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')
    rest_cards = soup.find_all(attrs={"data-qa": "merchant-card"})

    records = [_parse_merchant_card(card) for card in rest_cards]

    return pd.DataFrame(records, columns=['Name',
                                          'Location',
                                          'Cuisine',
                                          'Meals',
                                          'Price Level (out of 4)',
                                          'Rating (out of 6)',
                                          'Review Count',
                                          'Page URL'])

In [4]:
# Gather the per-page frames first and concatenate once: calling pd.concat inside
# the loop re-copies every previously scraped row on each iteration and seeds the
# result from an empty frame.
page_frames = [scrape_page(page_n) for page_n in range(1, 15)]  # listing pages 1..14
restaurant_data = pd.concat(page_frames, ignore_index=True)

restaurant_data

NameError: name 'scrape_page' is not defined

In [6]:
# Persist the listing as tab-separated text; the row index is written as the first
# column (index=True is the to_csv default).
restaurant_data.to_csv(path_or_buf="restaurant_data.csv", sep="\t")

## Review Scraping

In [2]:
# Reload the saved listing so the review-scraping section can start from the CSV;
# the file's first column is used as the row index.
data = pd.read_csv("restaurant_data.csv", index_col=0, sep="\t")
data.sample(n=10)

Unnamed: 0,Name,Location,Cuisine,Meals,Price Level (out of 4),Rating (out of 6),Review Count,Page URL
109,Ravintola Piilo,Hietalahti,Italian,"Breakfast, Lunch, Brunch, Dinner",0,4.7,,/en/place/ravintola-piilo-91693/about
335,Bistro Telakka,Lauttasaari,International,"Lunch, Dessert, Dinner",0,5.1,,/en/place/bistro-telakka-92514/menu
64,Kissakahvila Helkatti,Kamppi,Eat & Drink,"Lunch, Cake & Coffee",0,5.0,,/en/place/kissakahvila-helkatti-106475/menu
239,Vietologie,Töölö,Vietnamese,"Lunch, Dinner",0,,,/en/place/vietologie-109409/menu
21,Ravintola Muru,City Centre,French,Dinner,0,5.4,,/en/place/ravintola-muru-9646/menu
156,Bistro Palo,Malmi,International,"Lunch, Dinner",0,5.0,,/en/place/bistro-palo-96188/menu
217,Aito Fresh,City Centre,Asian Fusion,"Lunch, Dinner",0,4.7,,/en/place/aito-fresh-104403/menu
277,Ravintola Rara,Pikku Huopalahti,Nepalese,"Lunch, Dinner",0,5.7,,/en/place/ravintola-rara-97770/menu
233,Chao Phraya Helsinki - Thai Restaurant,Hietalahti,Thai,"Lunch, Dinner",0,4.0,,/en/place/chao-phraya-helsinki-thai-restaurant...
232,Amex Exclusive Lunch: Pastis,Kaartinkaupunki,French,,0,5.0,,/en/place/amex-exclusive-lunch-pastis-104118/a...


In [4]:
def get_reviews_per_rest(rest_name, rel_path, timeout=30):
    """Collect every review score and text for one restaurant.

    Args:
        rest_name: Restaurant name, copied into the ``Restaurant`` column and the
            progress line.
        rel_path: Site-relative page path (e.g. ``/en/place/<slug>/menu``); its
            last segment is replaced by ``reviews``.
        timeout: Seconds passed to every ``requests.get`` call.

    Returns:
        DataFrame with columns ``Restaurant``, ``Review Score`` and ``Review Text``,
        one row per element marked ``data-name="shared-review"``. A review without
        a score or description element gets None in that column instead of
        raising, which also keeps scores and texts aligned row by row.

    Raises:
        requests.exceptions.RequestException: on connection errors or timeout.
    """
    review_texts, review_scores = [], []
    page_url = "https://www.quandoo.fi" + '/'.join(rel_path.split('/')[:-1]) + "/reviews"
    response = requests.get(page_url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    # The second-to-last pagination button is read as the page count; with three or
    # fewer buttons a single page is assumed.
    pagination_btns = soup.find_all('button', attrs={"data-qa": "horizontal-filter-button"})
    n_pages = 1
    if len(pagination_btns) > 3:
        try:
            n_pages = int(pagination_btns[-2].text.strip())
        except ValueError:
            # Empty or non-numeric label (e.g. an ellipsis): fall back to one page.
            n_pages = 1

    page_url += "?reviewPage="
    for i in range(1, n_pages + 1):
        response = requests.get(page_url + str(i), timeout=timeout)
        soup = BeautifulSoup(response.content, 'html.parser')
        for block in soup.find_all('div', attrs={"data-name": "shared-review"}):
            score_tag = block.find('span', attrs={"data-qa": "review-score"})
            text_tag = block.find('p', attrs={"data-qa": "review-description"})
            # Score text is split on "/" and the leading part parsed, e.g. "5/6" -> 5.
            score = int(score_tag.text.split('/')[0]) if score_tag is not None else None
            text = text_tag.text if text_tag is not None else None
            review_scores.append(score)
            review_texts.append(text)

    print(rest_name, "–", len(review_scores))
    return pd.DataFrame({'Restaurant': rest_name,
                         'Review Score': review_scores,
                         'Review Text': review_texts})

In [5]:
# Collect one frame per restaurant and concatenate once at the end; concatenating
# inside the loop re-copies all previously gathered reviews on every iteration.
review_frames = [
    get_reviews_per_rest(row['Name'], row['Page URL'])
    for _, row in data.iterrows()
]

# pd.concat rejects an empty list, so keep the empty-frame result for an empty input.
review_data = pd.concat(review_frames, ignore_index=True) if review_frames else pd.DataFrame()

review_data.to_csv("review_data.csv", sep="\t")

Luovuus kukkii kaaoksesta – 478


KeyboardInterrupt: 

## Menu highlights scraping

In [11]:
# Reload the saved listing so the menu-scraping section can start from the CSV;
# the file's first column is used as the row index.
data = pd.read_csv("restaurant_data.csv", index_col=0, sep="\t")
data.sample(n=10)

Unnamed: 0,Name,Location,Cuisine,Meals,Price Level (out of 4),Rating (out of 6),Review Count,Page URL
164,Pho Nokis,Kamppi,Vietnamese,"Lunch, Dinner",0,5.5,,/en/place/pho-nokis-100857/menu
236,Mashiro Töölö,Töölö,Sushi,"Buffet, Dinner",0,5.0,,/en/place/mashiro-toolo-87853/about
286,Satama Bar & Bistro Herttoniemi,Herttoniemi,International,"Lunch, Dessert, Dinner",0,4.5,,/en/place/satama-bar-bistro-herttoniemi-108342...
122,Relove Freda,Punavuori,European,"Lunch, Dessert, Brunch",0,5.4,,/en/place/relove-freda-95229/menu
289,Akhanda Nepalilainen Ravintola,Pitäjänmäki,Nepalese,"Lunch, Dinner",0,5.0,,/en/place/akhanda-nepalilainen-ravintola-10388...
72,Mamma Rosa,Töölö,International,"Lunch, Dinner",0,5.0,,/en/place/mamma-rosa-24808/menu
171,Ravintola Lukla,Töölö,Nepalese,"Lunch, Dinner",0,5.5,,/en/place/ravintola-lukla-96005/menu
166,Lopez Tacos Kamppi,Kamppi,Mexican,"Lunch, Dinner",0,4.8,,/en/place/lopez-tacos-kamppi-105479/menu
55,The Tart,Kaartinkaupunki,International,"Lunch, Dessert, Dinner",0,5.2,,/en/place/the-tart-108248/menu
337,Black Sea Kitchen,Kaartinkaupunki,Georgian,"Lunch, Dinner",0,,,/en/place/black-sea-kitchen-109077/about


In [84]:
def get_menu_highlights(rest_name, rel_path, timeout=30):
    """Scrape the menu page of one restaurant into one row per named dish.

    Args:
        rest_name: Restaurant name, copied into the ``Restaurant`` column.
        rel_path: Site-relative page path; its last segment is replaced by ``menu``.
        timeout: Seconds passed to ``requests.get``.

    Returns:
        DataFrame with columns ``Restaurant``, ``Food restrictions`` (the list
        returned by ``extract_food_restriction_tags`` for the page) and ``Menu``
        (``"<category> | <dish>"`` plus ``" : <description>"`` when present).
        The columns are present even when no dish is found, so downstream
        column lookups keep working on an empty result.

    Raises:
        requests.exceptions.RequestException: on connection errors or timeout.
    """
    page_url = "https://www.quandoo.fi" + '/'.join(rel_path.split('/')[:-1]) + "/menu"
    response = requests.get(page_url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')
    food_tags = extract_food_restriction_tags(soup)

    items = []

    for section in soup.find_all("div", attrs={"data-name": "menu-section"}):
        cat_tag = section.select_one('h4[data-qa^="menu-category-name"]')
        category = cat_tag.get_text(strip=True) if cat_tag else ""

        for dish_div in section.select('div[data-qa*="item-"]'):
            name_tag = dish_div.select_one("h5")
            desc_tag = dish_div.select_one("p[data-qa*='item-description']")

            name = name_tag.get_text(strip=True) if name_tag else ""
            desc = desc_tag.get_text(strip=True) if desc_tag else ""

            # Entries without a dish name are skipped.
            if not name:
                continue

            menu_text = f"{category} | {name}"
            if desc:
                menu_text += f" : {desc}"

            items.append({
                "Restaurant": rest_name,
                # Copy so rows do not all share one mutable list object.
                "Food restrictions": list(food_tags),
                "Menu": menu_text
            })

    return pd.DataFrame(items, columns=["Restaurant", "Food restrictions", "Menu"])

def extract_food_restriction_tags(soup):
    """Return the food-restriction labels listed on a parsed restaurant page.

    Looks up the ``div`` marked ``data-qa="food-restriction-tags"`` and collects the
    stripped text of each ``<p>`` inside it, leaving out blank entries and entries
    whose text starts with "includes" (case-insensitive). Returns an empty list
    when the container is not found.
    """
    container = soup.find("div", attrs={"data-qa": "food-restriction-tags"})
    if not container:
        return []

    labels = (paragraph.text.strip() for paragraph in container.find_all("p"))
    return [
        label
        for label in labels
        if label and not label.lower().startswith("includes")
    ]

In [None]:
# One menu frame per restaurant in the listing.
all_menus = [
    get_menu_highlights(row["Name"], row["Page URL"])
    for _, row in data.iterrows()
]

menu_df = pd.concat(all_menus, ignore_index=True)

# True on the first row of each consecutive run of the same restaurant name.
mask = menu_df['Restaurant'].ne(menu_df['Restaurant'].shift(1))

# For the exported view, blank the repeated restaurant name and tag cells so each
# restaurant is labelled only on its first row.
clean_view = menu_df.copy()
clean_view.loc[~mask, ['Restaurant', 'Food restrictions']] = ''

clean_view.to_csv("menu_highlights.csv", sep="\t", index=False)



## Restaurants' pictures scraping

In [5]:
# Reload the saved listing so the picture-scraping section can start from the CSV;
# the file's first column is used as the row index.
data = pd.read_csv("restaurant_data.csv", index_col=0, sep="\t")
data.sample(n=10)

Unnamed: 0,Name,Location,Cuisine,Meals,Price Level (out of 4),Rating (out of 6),Review Count,Page URL
27,Ravintola Lehtovaara,Töölö,International,"Lunch, Dinner",0,5.4,,/en/place/ravintola-lehtovaara-11619/menu
42,Alfons Pizza,Ullanlinna,Pizza,Dinner,0,5.3,,/en/place/alfons-pizza-102482/menu
265,Ravintola Veturitallit,Pasila,European,"Dessert, Dinner",0,4.3,,/en/place/ravintola-veturitallit-106147/about
112,Bröd Punavuori,Punavuori,Scandinavian,"Breakfast, Lunch, Dessert, Dinner",0,4.9,,/en/place/brod-punavuori-63308/menu
180,La Galleria,Kruununhaka,Pizza,Dinner,0,5.0,,/en/place/la-galleria-109402/about
29,Piccola Trattoria Kalasatama,Kalasatama,Italian,"Lunch, Dinner",0,5.5,,/en/place/piccola-trattoria-kalasatama-100346/...
0,Luovuus kukkii kaaoksesta,Kaartinkaupunki,International,Dinner,0,5.8,,/en/place/luovuus-kukkii-kaaoksesta-90397/menu
313,Ravintola Makasiini - Grand Marina,Katajanokka,Scandinavian,"Breakfast, Dessert, Dinner",0,4.5,,/en/place/ravintola-makasiini-scandic-grand-ma...
58,Ravintola Santa Fé Helsinki,City Centre,Mexican,"Lunch, Dinner",0,5.1,,/en/place/ravintola-santa-fe-30462/menu
171,Ravintola Lukla,Töölö,Nepalese,"Lunch, Dinner",0,5.5,,/en/place/ravintola-lukla-96005/menu


In [None]:
from urllib.parse import urlparse, urljoin
import json
    
def extract_restaurant_images(rest_name, rel_path, timeout=30):
    """Collect the image URLs published in the JSON-LD of a restaurant's photos page.

    Args:
        rest_name: Restaurant name, echoed back in the result.
        rel_path: Site-relative page path; its last segment is replaced by ``photos``.
        timeout: Seconds passed to ``requests.get``.

    Returns:
        ``{"restaurant": rest_name, "images": [...]}`` where ``images`` holds the
        distinct URLs containing ``qul.imgix.net``, with any query string removed,
        in first-seen order.

    Raises:
        requests.exceptions.RequestException: on connection errors or timeout.
    """
    page_url = "https://www.quandoo.fi" + '/'.join(rel_path.split('/')[:-1]) + "/photos#content"
    response = requests.get(page_url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')
    images = []

    for script in soup.find_all("script", type="application/ld+json"):
        raw = script.string
        if not raw:
            # Nothing to parse in this script element.
            continue

        try:
            payload = json.loads(raw)
        except (json.JSONDecodeError, TypeError) as e:
            print(f"JSON parsing issue in one script: {e}")
            continue

        if isinstance(payload, dict):
            entries = payload.get("itemListElement", [])
        elif isinstance(payload, list):
            entries = payload
        else:
            continue

        if isinstance(entries, dict):
            # A single element given as an object rather than a one-item list.
            entries = [entries]
        elif not isinstance(entries, list):
            continue

        for entry in entries:
            if not isinstance(entry, dict):
                continue

            # Pattern 1: {"@type": "ListItem", "item": {"@type": "ImageObject", ...}}
            # Pattern 2: the ImageObject sits directly in the list.
            candidate = entry["item"] if "item" in entry else entry
            if not isinstance(candidate, dict) or candidate.get("@type") != "ImageObject":
                continue

            # schema.org spells the property "contentUrl"; "contentURL" is still
            # accepted because that is the spelling this notebook originally read.
            url = candidate.get("contentUrl") or candidate.get("contentURL")
            if isinstance(url, str) and "qul.imgix.net" in url:
                images.append(url.split("?")[0])

    # dict.fromkeys drops duplicates while keeping first-seen order.
    unique_images = list(dict.fromkeys(images))

    return {
        "restaurant": rest_name,
        "images": unique_images,
    }


In [None]:
# One output record per restaurant: its name plus a comma-separated string of image URLs.
all_results = []

for _, row in data.iterrows():
    rest_name, page_url = row["Name"], row["Page URL"]
    result = extract_restaurant_images(rest_name, page_url)

    # An empty (or missing) image list becomes an empty string.
    images_str = ", ".join(result["images"] or [])

    all_results.append({"Restaurant": rest_name, "Images": images_str})

df_pictures = pd.DataFrame(all_results)
df_pictures.to_csv("restaurant_pictures.csv", sep="\t", index=False)

## Restaurants' addresses scraping

In [45]:
# Reload the saved listing so the address-scraping section can start from the CSV;
# the file's first column is used as the row index.
data = pd.read_csv("restaurant_data.csv", index_col=0, sep="\t")
data.sample(n=10)

Unnamed: 0,Name,Location,Cuisine,Meals,Price Level (out of 4),Rating (out of 6),Review Count,Page URL
110,The Tower - Wine & Craft Beer,Pasila,Scandinavian,"Lunch, Dinner",0,4.7,,/en/place/the-tower-wine-craft-beer-85892/menu
30,Oishi 18 Katajanokka,Katajanokka,Sushi,"Lunch, Dinner",0,5.3,,/en/place/oishi-18-katajanokka-95412/menu
215,Harbour Tap & Taste,Kalasatama,Finnish,"Lunch, Dinner, Sunday lunch",0,5.8,,/en/place/harbour-tap-taste-108467/menu
24,Lappi Ravintola,City Centre,Finnish,"Dessert, Dinner",0,5.3,,/en/place/lappi-ravintola-9753/menu
45,Lie Mi Kallio,Kallio,Vietnamese,"Lunch, Dinner",0,5.2,,/en/place/lie-mi-kallio-64811/about
88,Relove Stockmann Helsinki,City Centre,European,"Breakfast, Lunch, Dessert, Brunch",0,4.9,,/en/place/relove-stockmann-helsinki-95230/menu
330,Wave Of Flavors,Vallila,Portuguese,"Lunch, Dessert, Dinner",0,5.1,,/en/place/waves-of-flavors-105468/menu
338,Mad Finn Brewing Co. Taproom Helsinki,Sompasaari,Pizza,Dinner,0,6.0,,/en/place/mad-finn-brewing-co-taproom-helsinki...
341,m/s King – Royal Line,Vuosaari,Scandinavian,"Buffet, Dinner",0,5.0,,/en/place/ms-king-royal-line-102408/menu
131,Casa Haga,Haaga,Spanish,"Tapas, Dessert, Dinner",0,4.7,,/en/place/casa-haga-100321/menu


In [None]:
def get_restaurant_address(rest_name, rel_path, timeout=30):
    """Look up the street address shown on a restaurant's "about" page.

    Args:
        rest_name: Restaurant name, echoed back in the result.
        rel_path: Site-relative page path; its last segment is replaced by ``about``.
        timeout: Seconds passed to ``requests.get``.

    Returns:
        ``{"Restaurant": rest_name, "Address": <str or None>}``. The address is the
        ``<span>`` texts of the first matching paragraph joined with ", ". When no
        paragraph matches, ``Address`` is None, so the caller always receives a
        record rather than an implicit ``None``.

    Raises:
        requests.exceptions.RequestException: on connection errors or timeout.
    """
    page_url = "https://www.quandoo.fi" + '/'.join(rel_path.split('/')[:-1]) + "/about#content"
    response = requests.get(page_url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    full_address = None
    # NOTE(review): the class string looks build-generated and may change with the
    # site's next deployment — verify the selector if every address comes back None.
    for p_tag in soup.find_all('p', class_='sc-bdnylu jqzuJW'):
        text_content = p_tag.get_text()
        # A paragraph counts as the address when it mentions the city or the
        # 00100 postal code.
        if '00100' in text_content or 'Helsinki' in text_content:
            full_address = ', '.join(
                span.get_text(strip=True) for span in p_tag.find_all('span')
            )
            break

    return {
        "Restaurant": rest_name,
        "Address": full_address
    }

  

In [None]:
# One address record per restaurant in the listing.
all_addresses = []

for _, row in data.iterrows():
    rest_name = row["Name"]
    page_url  = row["Page URL"]
    result = get_restaurant_address(rest_name, page_url)
    if result is None:
        # No address could be found for this restaurant: keep a placeholder row so
        # the record list never contains None, which pd.DataFrame cannot turn
        # into a row.
        result = {"Restaurant": rest_name, "Address": None}
    all_addresses.append(result)

# Explicit columns keep the output schema stable even when no records were collected.
df = pd.DataFrame(all_addresses, columns=["Restaurant", "Address"])
df.to_csv("restaurant_addresses.csv", sep="\t", index=False)