In [6]:
import json
import os
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium_stealth import stealth

In [7]:
# Chrome launch flags; get_detail_content passes this object to webdriver.Chrome.
options = webdriver.ChromeOptions()
for chrome_flag in (
    "--disable-notifications",
    "--disable-infobars",
    "--disable-extensions",
):
    options.add_argument(chrome_flag)

In [9]:
# Upper bound on the number of reviews waited for and stored per page.
_MAX_REVIEWS = 15


def _element_text(root, selector: str, warning=None):
    """Return the text of the first element under ``root`` matching ``selector``.

    Any exception raised by the lookup or by reading ``.text`` is swallowed:
    ``warning`` is printed when given and ``None`` is returned, so a missing
    field never aborts the scrape of the whole page.
    """
    try:
        return root.find_element(by=By.CSS_SELECTOR, value=selector).text
    except Exception:
        if warning is not None:
            print(warning)
        return None


def _scrape_page(driver, url: str) -> dict:
    """Load ``url`` in ``driver`` and collect shop name, address, menu and reviews."""
    driver.set_page_load_timeout(120)
    driver.get(url)

    # Both waits are best effort: on failure a warning is printed and scraping
    # continues with whatever is in the page. ``except Exception`` (rather than
    # a bare ``except``) lets KeyboardInterrupt/SystemExit stop the run.
    try:
        WebDriverWait(driver, 60).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, "div.lists.list-reviews")
            )
        )
    except Exception:
        print(f"[WARNING] Not found list reviews at {url}")

    # Scroll to the bottom of the page (presumably to trigger lazy loading of
    # further reviews -- TODO confirm against the site).
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    try:
        WebDriverWait(driver, 30).until(
            lambda d: len(
                d.find_elements(By.CSS_SELECTOR, "li.review-item.fd-clearbox.ng-scope")
            ) >= _MAX_REVIEWS
        )
    except Exception:
        print("[WARNING] Not found enough review-item!!!")

    # Shop name: fall back to the last path segment of the URL when the
    # heading cannot be read.
    shop_name = _element_text(
        driver, "div.main-information.disableSection div.main-info-title h1"
    )
    if shop_name is None:
        shop_name = url.strip().rsplit(sep="/", maxsplit=1)[-1]

    # Address: join whichever of the two parts could be read.
    address_parts = [
        _element_text(
            driver,
            'div.main-information.disableSection span[itemprop="streetAddress"]',
            f"[WARNING] Cannot get street address information of {url}",
        ),
        _element_text(
            driver,
            'div.main-information.disableSection span[itemprop="addressLocality"]',
            f"[WARNING] Cannot get address locality information of {url}",
        ),
    ]
    address = ", ".join(part for part in address_parts if part is not None)

    menu = [
        item.text
        for item in driver.find_elements(
            by=By.CSS_SELECTOR,
            value="div.delivery-dishes-group div.delivery-dishes-item-right div.title-name.ng-binding.ng-isolate-scope",
        )
    ]

    review_items = driver.find_elements(
        by=By.CSS_SELECTOR,
        value="div.lists.list-reviews li.review-item.fd-clearbox.ng-scope",
    )
    feedback = []
    for review in review_items[:_MAX_REVIEWS]:
        point = _element_text(
            review,
            "div.review-user div.review-points span",
            f"[WARNING] Cannot get one of review points information of {url}",
        )
        username = _element_text(
            review,
            "div.review-user div.ru-row a.ru-username",
            f"[WARNING] Cannot get one of username information of {url}",
        )
        description = _element_text(
            review,
            "div.rd-des span",
            f"[WARNING] Cannot get one of describle feedback of {url}",
        )
        feedback.append({
            "review_point": "0" if point is None else point,
            "username": "Unknown" if username is None else username,
            "describle": "No describle" if description is None else description,
        })

    # Key order is kept stable because it determines the layout of the JSON file.
    return {
        "shop_name": shop_name,
        "address": address,
        "menu": menu,
        "feedback": feedback,
    }


def get_detail_content(url: str, save_path: str, waiting: int = 10):
    """Scrape one detail page with Chrome and save the result as JSON.

    A fresh Chrome driver is started (using the notebook-level ``options``),
    patched with ``selenium_stealth``, used to load ``url`` and always quit
    afterwards. The JSON written to ``save_path`` has the keys ``shop_name``
    (str), ``address`` (str), ``menu`` (list of str) and ``feedback`` (list of
    at most 15 dicts with ``review_point``, ``username`` and ``describle``).

    Args:
        url: Page to scrape.
        save_path: Destination JSON file; missing parent directories are created.
        waiting: Currently unused -- the waits use fixed 60 s and 30 s
            timeouts. Kept so existing callers keep working.

    Returns:
        None. Failures while scraping or writing are printed, not raised; note
        that the second error line says "Time out" for every kind of failure.
        KeyboardInterrupt/SystemExit and errors from starting or patching the
        driver do propagate.
    """
    driver = webdriver.Chrome(options=options)
    try:
        # Inside the try/finally so a failure here cannot leak the browser process.
        stealth(driver,
                languages=["en-US", "en"],
                vendor="Google Inc.",
                platform="Win32",
                webgl_vendor="Intel Inc.",
                renderer="Intel Iris OpenGL Engine",
                fix_hairline=True)
        try:
            information_dict = _scrape_page(driver, url)

            # dirname is "" for a bare file name, which makedirs rejects.
            parent_dir = os.path.dirname(save_path)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)
            with open(save_path, "w", encoding="utf-8") as f:
                json.dump(information_dict, f, indent=4, ensure_ascii=False)
        except Exception as e:
            print(f"[ERROR] {e}")
            print(f"[ERROR] Time out as {url}")
    finally:
        driver.quit()

In [10]:
PROVINCE = ["ha-noi"]

LINK_STORAGE_PATH = "link"
SAVE_ROOT = "detail"

# For each province, read its link file and scrape every URL listed in it.
for province in PROVINCE:
    file_path = f"{LINK_STORAGE_PATH}/{province}.txt"
    with open(file_path, "r", encoding="utf-8") as file:
        # Strip each line once and drop blank ones: an empty line has no "/"
        # to split on and would otherwise abort the whole run.
        all_links = [line.strip() for line in file if line.strip()]

    for link in all_links:
        print(f"[INFO] Processing {link}")
        # The last path segment names the output file; a trailing "/" is
        # ignored so it cannot produce an empty file name.
        shop_slug = link.rstrip("/").rsplit(sep="/", maxsplit=1)[-1]
        get_detail_content(url=link, save_path=f"{SAVE_ROOT}/{province}/{shop_slug}.json")
        # Pause between pages (presumably to limit the request rate -- TODO confirm).
        time.sleep(5)

[INFO] Processing https://www.foody.vn/ha-noi/dung-hanh-banh-mi-gio-cha-le-dai-hanh

[INFO] Processing https://www.foody.vn/ha-noi/bami-king-banh-mi-bo-nuong-com-tho-an-trach

[INFO] Processing https://www.foody.vn/ha-noi/hong-xoi-xeo-ngoc-lam

[INFO] Processing https://www.foody.vn/ha-noi/my-cay-do-an-vat-va-tra-hoa-qua-huyen-linh-food-ngo-169-doan-ke-thien

[INFO] Processing https://www.foody.vn/ha-noi/hanh-beo-vit-nuong

[INFO] Processing https://www.foody.vn/ha-noi/pho-chien-gion-kham-thien

[INFO] Processing https://www.foody.vn/ha-noi/chops-hops-wheat-meat

[INFO] Processing https://www.foody.vn/ha-noi/run-s-house-hoa-qua-got-san-hoa-qua-sach-288-nguyen-xien

[INFO] Processing https://www.foody.vn/ha-noi/banh-goi-ly-quoc-su

[INFO] Processing https://www.foody.vn/ha-noi/hey-bro-buger-pizza-spaghetti-fast-food

[INFO] Processing https://www.foody.vn/ha-noi/phuong-dung-com-ga-chim-quay-tong-duy-tan

[INFO] Processing https://www.foody.vn/ha-noi/homey-bbq-suon-nuong-bbq

[INFO] Proc

In [4]:
# Example: scrape a single page into sample.json (uncomment to run).
# get_detail_content(url="https://www.foody.vn/ho-chi-minh/kebab-sai-gon-nguyen-thuong-hien", save_path = "sample.json")