In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from time import sleep

## Scraping

In [4]:
# Column order of the frame returned by scrape_page.
_RESTAURANT_COLUMNS = [
    'Name',
    'Location',
    'Cuisine',
    'Meals',
    'Price Level (out of 4)',
    'Rating (out of 6)',
    'Review Count',
    'Page URL',
]


def _card_text(tag):
    """Return the stripped text of ``tag``, or None when the element was not found."""
    return tag.text.strip() if tag is not None else None


def _parse_rating(text):
    """Return the number before the first '/' in ``text`` as a float, or None if it is not numeric."""
    try:
        return float(text.split('/')[0].strip())
    except ValueError:
        return None


def _parse_review_count(text):
    """Return the digits of the first whitespace-separated token of ``text`` as an int, or None."""
    tokens = text.split()
    if not tokens:
        return None
    # Drop separators/brackets so a token such as "1,234" does not break int().
    digits = re.sub(r'\D', '', tokens[0])
    return int(digits) if digits else None


def scrape_page(page_n, timeout=30):
    """Scrape one page of the Quandoo Helsinki restaurant listing.

    Parameters
    ----------
    page_n : int
        Page number, interpolated into the ``page`` query parameter.
    timeout : float, optional
        Seconds to wait for the HTTP response before ``requests`` raises a
        timeout error. Without it a stalled connection blocks forever.

    Returns
    -------
    pandas.DataFrame
        One row per element carrying ``data-qa="merchant-card"``, with the
        columns listed in ``_RESTAURANT_COLUMNS``. A field whose element is
        missing from a card is stored as None instead of raising.

    Notes
    -----
    The HTTP status code is not checked: an error page simply yields a frame
    with no rows.
    """
    url = f"https://www.quandoo.fi/en/helsinki?districtFilter=3637&bookable=true&onlySpecialOffers=false&page={page_n}"
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    records = []
    for card in soup.find_all(attrs={"data-qa": "merchant-card"}):
        # Remove the "Located at " prefix and the word "area"; the final strip
        # drops the whitespace that is left behind once "area" is removed.
        location = _card_text(card.find(attrs={"data-qa": "merchant-location"}))
        if location is not None:
            location = location.replace('Located at ', '').replace('area', '').strip()

        # Drop the " Restaurant" suffix from the cuisine label.
        cuisine = _card_text(card.find(attrs={"data-qa": "merchant-card-cuisine"}))
        if cuisine is not None:
            cuisine = cuisine.replace(" Restaurant", "")

        # Drop the leading "Meals:" label.
        meals = _card_text(card.find(attrs={"data-qa": "merchant-meal"}))
        if meals is not None:
            meals = re.sub(r'^\s*Meals:\s*', '', meals)

        rating_box = card.find(attrs={"data-qa": "reviews-score"})

        # NOTE(review): the two selectors below depend on generated class names.
        # The output saved with this notebook shows an empty Review Count and a
        # Price Level of 0 for every row, which suggests they no longer match
        # the page markup - confirm against the live HTML.
        review_box = card.find(class_="sc-1atis9w-3 dfyExP")
        price_box = card.find(class_=re.compile(r'.*price-indicator'))

        link = card.find('a')

        records.append({
            'Name': _card_text(card.find('h3')),
            'Location': location,
            'Cuisine': cuisine,
            'Meals': meals,
            # The price level is the number of matching marker elements in the box.
            'Price Level (out of 4)': (
                len(price_box.find_all(class_=re.compile(r'.*oGCHK')))
                if price_box is not None else None
            ),
            'Rating (out of 6)': _parse_rating(rating_box.text) if rating_box is not None else None,
            'Review Count': _parse_review_count(review_box.text) if review_box is not None else None,
            'Page URL': link.get('href') if link is not None else None,
        })

    # Passing the columns explicitly keeps the schema stable for a page without cards.
    return pd.DataFrame(records, columns=_RESTAURANT_COLUMNS)

In [5]:
# Last listing page to fetch (pages are numbered from 1).
LAST_LISTING_PAGE = 14
# Pause between page requests so the site is not hit in a tight loop.
LISTING_REQUEST_DELAY_S = 1

# Collect the per-page frames and concatenate once: concatenating inside the
# loop re-copies every earlier row on each iteration.
page_frames = []

for page_n in range(1, LAST_LISTING_PAGE + 1):
    page_frames.append(scrape_page(page_n))
    sleep(LISTING_REQUEST_DELAY_S)

restaurant_data = pd.concat(page_frames, ignore_index=True)

restaurant_data

Unnamed: 0,Name,Location,Cuisine,Meals,Price Level (out of 4),Rating (out of 6),Review Count,Page URL
0,Luovuus kukkii kaaoksesta,Kaartinkaupunki,International,Dinner,0,5.8,,/en/place/luovuus-kukkii-kaaoksesta-90397/menu
1,Ravintola MyStuu,Punavuori,Swiss,"Lunch, Dessert, Dinner",0,5.6,,/en/place/ravintola-my-stuu-98898/menu
2,Gaucho,City Centre,Brazilian,Dinner,0,5.6,,/en/place/gaucho-105125/menu
3,Finlandia Caviar,City Centre,Gourmet,"Lunch, Dinner",0,5.6,,/en/place/finlandia-caviar-15896/menu
4,Restaurant Armenian House,Kamppi,International,"Lunch, Dinner",0,5.6,,/en/place/armenian-house-55148/menu
...,...,...,...,...,...,...,...,...
337,Black Sea Kitchen,Kaartinkaupunki,Georgian,"Lunch, Dinner",0,,,/en/place/black-sea-kitchen-109077/about
338,Mad Finn Brewing Co. Taproom Helsinki,Sompasaari,Pizza,Dinner,0,6.0,,/en/place/mad-finn-brewing-co-taproom-helsinki...
339,Kahvila Mutteri,Lauttasaari,Dessert,Cake & Coffee,0,,,/en/place/kahvila-mutteri-100773/menu
340,Merisali - Hilton Kalastajatorppa,Munkkiniemi,Scandinavian,"Buffet, Dinner",0,5.0,,/en/place/merisali-hilton-kalastajatorppa-9200...


In [6]:
# Save the scraped listing as a tab-separated file (despite the .csv extension).
# The row index is written as the first column; later cells read it back with index_col=0.
restaurant_data.to_csv("restaurant_data.csv", sep="\t")

## Review Scraping

In [3]:
# Reload the saved restaurant listing (tab-separated, first column is the row index)
# so this section can run without repeating the listing scrape.
data = pd.read_csv("restaurant_data.csv", sep="\t", index_col=0)
# Spot-check 10 random rows; no random_state is passed, so the rows differ between runs.
data.sample(10)

Unnamed: 0,Name,Location,Cuisine,Meals,Price Level (out of 4),Rating (out of 6),Review Count,Page URL
92,Hesperia Restaurant & Bar,Töölö,Scandinavian,"Dessert, Dinner",0,4.8,,/en/place/hesperia-restaurant-bar-63302/menu
237,Bibimbap Töölö,Töölö,Korean,"Lunch, Dinner",0,5.0,,/en/place/bibimbap-toolo-108488/menu
32,Lie Mi Töölö,Töölö,Vietnamese,"Lunch, Dinner",0,5.3,,/en/place/lie-mi-toolo-85623/menu
172,Annan Kartano,Tuomarinkylä,Finnish,"Lunch, Sunday lunch",0,4.9,,/en/place/annan-kartano-94514/about
156,Bistro Palo,Malmi,International,"Lunch, Dinner",0,5.0,,/en/place/bistro-palo-96188/menu
225,GTC Café,Kamppi,Eat & Drink,"Lunch, Dessert, Dinner, Cake & Coffee",0,6.0,,/en/place/gtc-cafe-100566/menu
287,Seurahuone Pub Herttoniemi,Herttoniemi,Drinks,Dinner,0,1.0,,/en/place/seurahuone-pub-herttoniemi-100445/menu
230,Annapurna,Punavuori,Nepalese,"Lunch, Dinner, Sunday lunch",0,5.5,,/en/place/annapurna-108526/menu
221,Camp Mount,Jätkäsaari,Nepalese,"Dessert, Dinner",0,,,/en/place/camp-mount-108129/menu
76,Alice Italian,Vallila,Italian,"Breakfast, Lunch, Dinner",0,5.0,,/en/place/alice-italian-48550/about


In [4]:
def _parse_review_score(score_text):
    """Return the integer before the first '/' in ``score_text``, or None if it is not numeric."""
    try:
        return int(score_text.split('/')[0])
    except ValueError:
        return None


def get_reviews_per_rest(rest_name, rel_path, timeout=30):
    """Collect every review score and text published for one restaurant.

    Parameters
    ----------
    rest_name : str
        Restaurant name; copied into the ``Restaurant`` column of every row.
    rel_path : str
        Site-relative page path. Its last path segment is replaced with
        ``reviews`` to build the review URL.
    timeout : float, optional
        Seconds to wait for each HTTP response before ``requests`` raises a
        timeout error. Without it a stalled connection blocks forever.

    Returns
    -------
    pandas.DataFrame
        Columns ``Restaurant``, ``Review Score`` and ``Review Text``, one row
        per ``shared-review`` element found across all review pages. A score
        or text that is missing (or a score that is not numeric) is stored as
        None.

    Notes
    -----
    Prints ``<name> - <number of reviews>`` as a progress line. The HTTP status
    code is not checked: an error page simply yields no rows.
    """
    review_texts, review_scores = [], []
    page_url = "https://www.quandoo.fi" + '/'.join(rel_path.split('/')[:-1]) + "/reviews"

    # The first request is only used to read the page count from the pagination buttons.
    response = requests.get(page_url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')
    pagination_btns = soup.find_all('button', attrs={"data-qa": "horizontal-filter-button"})

    # With more than three buttons, the second-to-last one is read as the last page
    # number; a label that is not a plain number falls back to a single page.
    n_pages = 1
    if len(pagination_btns) > 3:
        last_page_label = pagination_btns[-2].text.strip()
        if last_page_label.isdigit():
            n_pages = int(last_page_label)

    page_url += "?reviewPage="
    for i in range(1, n_pages + 1):
        response = requests.get(page_url + str(i), timeout=timeout)
        soup = BeautifulSoup(response.content, 'html.parser')
        # Read score and text from the same block so both lists stay aligned
        # even when a review lacks one of the two elements.
        for block in soup.find_all('div', attrs={"data-name": "shared-review"}):
            score_tag = block.find('span', attrs={"data-qa": "review-score"})
            text_tag = block.find('p', attrs={"data-qa": "review-description"})
            review_scores.append(_parse_review_score(score_tag.text) if score_tag is not None else None)
            review_texts.append(text_tag.text if text_tag is not None else None)

    print(rest_name, "–", len(review_scores))
    return pd.DataFrame({'Restaurant': rest_name,
                         'Review Score': review_scores,
                         'Review Text': review_texts})

In [None]:
# Pause between restaurants so the site is not hit in a tight loop.
REVIEW_REQUEST_DELAY_S = 0.5

# Collect the per-restaurant frames and concatenate once: concatenating inside
# the loop re-copies every earlier review on each iteration.
review_frames = []

for _, row in data.iterrows():
    review_frames.append(get_reviews_per_rest(row['Name'], row['Page URL']))
    sleep(REVIEW_REQUEST_DELAY_S)

# pd.concat raises ValueError on an empty list, so an empty restaurant table
# still produces an (empty) frame.
review_data = pd.concat(review_frames, ignore_index=True) if review_frames else pd.DataFrame()

review_data.to_csv("review_data.csv", sep="\t")

## Menu Highlight Scraping

In [2]:
# Reload the saved restaurant listing (tab-separated, first column is the row index)
# so this section can run without repeating the listing scrape.
data = pd.read_csv("restaurant_data.csv", sep="\t", index_col=0)
# Spot-check 10 random rows; no random_state is passed, so the rows differ between runs.
data.sample(10)

Unnamed: 0,Name,Location,Cuisine,Meals,Price Level (out of 4),Rating (out of 6),Review Count,Page URL
183,Suski Bar & Kitchen,Malmi,International,"Lunch, Dinner",0,4.9,,/en/place/suski-bar-kitchen-70479/menu
98,Southpark,City Centre,International,"Lunch, Brunch, Dinner",0,4.8,,/en/place/southpark-18882/about
243,Stadin Poseidon,Katajanokka,Drinks,Dinner,0,,,/en/place/stadin-poseidon-100325/menu
43,Itsudemo Helsinki,Kamppi,Sushi,"Buffet, Lunch, Dessert, Dinner",0,5.2,,/en/place/itsudemo-helsinki-55099/menu
77,Tinku Café Bar,Töölö,Latin American,"Lunch, Dinner, Cake & Coffee",0,5.0,,/en/place/tinku-cafe-bar-103705/menu
250,Ravintola T49,Töölö,Asian,"Buffet, Lunch",0,,,/en/place/toolo-49-108288/menu
135,The Schnitzel Tripla,Pasila,Korean,"Lunch, Dinner",0,4.4,,/en/place/the-schnitzel-tripla-90435/menu
333,Ravintola Töölö Juhlasali,Töölö,International,"Buffet, Dessert",0,4.5,,/en/place/ravintola-toolo-juhlasali-88616/about
128,Nepalilainen Ravintola Mountain,Töölö,Nepalese,"Lunch, Dinner",0,5.3,,/en/place/nepalilainen-ravintola-mountain-1032...
138,Ravintola Mestaritalli,Töölö,Scandinavian,Dinner,0,4.2,,/en/place/ravintola-mestaritalli-61082/about


In [3]:
def get_menu_highlights(rest_name, rel_path, timeout=30):
    """Scrape the dishes listed on one restaurant's menu page.

    Parameters
    ----------
    rest_name : str
        Restaurant name; copied into the ``Restaurant`` column of every row.
    rel_path : str
        Site-relative page path. Its last path segment is replaced with
        ``menu`` to build the menu URL.
    timeout : float, optional
        Seconds to wait for the HTTP response before ``requests`` raises a
        timeout error. Without it a stalled connection blocks forever.

    Returns
    -------
    pandas.DataFrame
        One row per dish block, with columns ``Restaurant``, ``Category``,
        ``Dish``, ``Description``, ``Food restrictions`` and ``Source``. The
        columns are present even when no dish is found. ``Food restrictions``
        holds the page-level tag list, repeated on every row.

    Notes
    -----
    The HTTP status code is not checked: an error page simply yields no rows.
    """
    columns = ["Restaurant", "Category", "Dish", "Description", "Food restrictions", "Source"]

    page_url = "https://www.quandoo.fi" + '/'.join(rel_path.split('/')[:-1]) + "/menu"
    response = requests.get(page_url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')
    food_tags = extract_food_restriction_tags(soup)

    items = []

    sections = soup.find_all("div", attrs={"data-name": "menu-section"})
    for section in sections:
        # The data-qa values carry a varying suffix, so they are matched by prefix/substring.
        category_tag = section.find("h4", attrs={"data-qa": lambda x: x and x.startswith("menu-category-name")})
        category = category_tag.text.strip() if category_tag else None

        dish_blocks = section.find_all("div", attrs={"data-qa": lambda x: x and "-item-" in x})

        for dish in dish_blocks:
            name_tag = dish.find("h5", attrs={"data-qa": lambda x: x and "item-name" in x})
            desc_tag = dish.find("p", attrs={"data-qa": lambda x: x and "item-description" in x})

            items.append({
                "Restaurant": rest_name,
                "Category": category,
                "Dish": name_tag.text.strip() if name_tag else None,
                "Description": desc_tag.text.strip() if desc_tag else None,
                # Copy the list so rows do not share one mutable object.
                "Food restrictions": list(food_tags),
                "Source": "menu_highlights"
            })

    # Explicit columns keep the schema stable for a page without dishes.
    return pd.DataFrame(items, columns=columns)

def extract_food_restriction_tags(soup):
    """Return the food-restriction labels shown on a parsed page.

    Looks up the ``div`` whose ``data-qa`` is ``food-restriction-tags`` and
    reads the stripped text of each ``p`` inside it. Empty paragraphs and
    paragraphs whose text starts with "includes" (case-insensitive) are left
    out. Returns an empty list when the container is absent.
    """
    container = soup.find("div", attrs={"data-qa": "food-restriction-tags"})
    if not container:
        return []

    labels = (paragraph.text.strip() for paragraph in container.find_all("p"))
    return [
        label
        for label in labels
        if label and not label.lower().startswith("includes")
    ]

In [5]:
# Pause between restaurants so the site is not hit in a tight loop.
MENU_REQUEST_DELAY_S = 0.5

# One frame of menu highlights per restaurant, concatenated once at the end.
all_menus = []

for _, row in data.iterrows():
    all_menus.append(get_menu_highlights(row["Name"], row["Page URL"]))
    sleep(MENU_REQUEST_DELAY_S)

# pd.concat raises ValueError on an empty list, so an empty restaurant table
# still produces an (empty) frame.
menu_df = pd.concat(all_menus, ignore_index=True) if all_menus else pd.DataFrame()
menu_df.to_csv("menu_highlights.csv", sep="\t", index=False)

## Translations

In [None]:
from data_processing import translate_batch

# Load the restaurant listing and the scraped reviews (both tab-separated,
# first column is the row index).
data = pd.read_csv("restaurant_data.csv", sep="\t", index_col=0)
reviews = pd.read_csv("review_data.csv", sep="\t", index_col=0)

# Maps each restaurant name to its translated reviews joined into one
# newline-separated string.
translated_review_data = {}

for rest_name in data["Name"]:
    name_matches = reviews["Restaurant"] == rest_name
    translated = translate_batch(reviews.loc[name_matches, "Review Text"])
    translated_review_data[rest_name] = '\n'.join(translated)

In [None]:
# Write one row per restaurant (name plus its joined, translated reviews) as a
# tab-separated file.
translated_table = pd.DataFrame(
    list(translated_review_data.items()),
    columns=["Restaurant", "Reviews"],
)
translated_table.to_csv("translated_review_data.csv", sep="\t")