In [1]:
import json
from selenium import webdriver
from selenium.webdriver.common.by import By
from tqdm import tqdm
from typing import List
import time
import glob

In [3]:
options = webdriver.ChromeOptions()
# options.add_argument("--start-maximized")
options.add_argument("--disable-notifications")
options.add_argument("--disable-infobars")
options.add_argument("--disable-extensions")

In [4]:
def get_detail_content(url: str, save_path: str, waiting: int = 10):
    driver = webdriver.Chrome(options=options)
    driver.get(url)
    driver.set_page_load_timeout(350)
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(waiting)
    
    information_dict = {}
    
    # Get shop_name
    try:
        item_shop_name = driver.find_element(by = By.CSS_SELECTOR, value="div.main-information.disableSection div.main-info-title h1")
        information_dict["shop_name"] = item_shop_name.text
    except Exception:
        information_dict["shop_name"] = url.strip().rsplit(sep="/", maxsplit=1)[1]
        
    # Get address
    information_dict["address"] = []
    try:
        item_street_address = driver.find_element(by = By.CSS_SELECTOR, value='div.main-information.disableSection span[itemprop="streetAddress"]')
        information_dict["address"].append(item_street_address.text)
    except Exception:
        print(f"[WARNING] Cannot get street address information of {url}")
        
    try:
        item_address_locality = driver.find_element(by = By.CSS_SELECTOR, value='div.main-information.disableSection span[itemprop="addressLocality"]')
        information_dict["address"].append(item_address_locality.text)
    except Exception:
        print(f"[WARNING] Cannot get address locality information of {url}")
    
    information_dict["address"] = ", ".join(information_dict["address"])
    
    # Get menu
    information_dict["menu"] = []
    item_list_menu = driver.find_elements(by = By.CSS_SELECTOR, value="div.delivery-dishes-group div.delivery-dishes-item-right div.title-name.ng-binding.ng-isolate-scope")
    for item in item_list_menu:
        information_dict["menu"].append(item.text)
        
    # Get feedback
    information_dict["feedback"] = []
    item_list_feedbacks = driver.find_elements(by = By.CSS_SELECTOR, value="div.lists.list-reviews li.review-item.fd-clearbox.ng-scope")
    
    for each_feedback in item_list_feedbacks:
        feed_dict = {}
        try:
            review_point = each_feedback.find_element(by = By.CSS_SELECTOR, value="div.review-user div.review-points span")
            feed_dict["review_point"] = review_point.text
        except Exception:
            feed_dict["review_point"] = "0"
            print(f"[WARNING] Cannot get one of review points information of {url}")
        
        try:
            username = each_feedback.find_element(by = By.CSS_SELECTOR, value="div.review-user div.ru-row a.ru-username")
            feed_dict["username"] = username.text
        except Exception:
            feed_dict["username"] = "Unknown"
            print(f"[WARNING] Cannot get one of username information of {url}")
            
        try:
            describle = each_feedback.find_element(by = By.CSS_SELECTOR, value="div.rd-des span")
            feed_dict["describle"] = describle.text
        except Exception:
            feed_dict["describle"] = "No describle"
            print(f"[WARNING] Cannot get one of describle feedback of {url}")
            
        information_dict["feedback"].append(feed_dict)
        
    with open(save_path, "w", encoding ="utf-8") as f:
        json.dump(information_dict, f, indent=4, ensure_ascii=False)
        
    driver.quit()

In [5]:
PROVINCE = ["ha-noi"]

LINK_STORAGE_PATH = "link"
SAVE_ROOT = "detail"

for i in PROVINCE:
    file_path = f"{LINK_STORAGE_PATH}/{i}.txt"
    with open(file_path, "r", encoding="utf-8") as file:
        all_links = file.readlines()
    
    for link in all_links:
        _, save_path = link.strip().rsplit(sep="/", maxsplit=1)
        get_detail_content(url=link.strip(), save_path=f"{SAVE_ROOT}/{i}/{save_path}.json")
        time.sleep(5)
    

ReadTimeoutError: HTTPConnectionPool(host='localhost', port=60061): Read timed out. (read timeout=120)

In [None]:
# get_detail_content(url="https://www.foody.vn/ho-chi-minh/kfc-co-op-mart-nguyen-anh-thu", save_path = "sample.json")