In [1]:
pip install webdriver-manager

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install selenium

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install bs4

Note: you may need to restart the kernel to use updated packages.


In [4]:
import pandas as pd
import requests
import re
from bs4 import BeautifulSoup
from time import sleep
import datetime
from webdriver_manager.chrome import ChromeDriverManager
import pickle

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options  
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import UnexpectedAlertPresentException

In [5]:
def identification(location_data_set):
    """Ask who is running the notebook and hand back that person's share of the work.

    The prompt is repeated until one of the known names is typed (the
    comparison is case-insensitive because the answer is lower-cased).

    Parameters
    ----------
    location_data_set : sequence
        Full list of locations; it is split into three fixed slices.

    Returns
    -------
    tuple
        ``(person, locations)`` - the lower-cased name that was entered and the
        slice of ``location_data_set`` assigned to that person.
    """
    # Every known user owns one fixed, contiguous portion of the location list.
    assignments = {
        'jacky': slice(None, 1323),
        'nicholas': slice(1323, 2646),
        'zhou wei': slice(2646, None),
    }

    person = input("Please input your name: ").lower()
    while person not in assignments:
        print("Unknown Person")
        person = input("Please input your name: ").lower()

    return person, location_data_set[assignments[person]]

In [6]:
def get_containers(driver, current_set):
    """Return the set of restaurant cards present in the driver's current page.

    The page source held by ``driver`` is parsed and every ``<article>``
    element with class ``"item restaurant_item"`` is collected.

    ``current_set`` is accepted for the benefit of existing callers but is not
    used here.
    """
    parsed_page = BeautifulSoup(driver.page_source, "html.parser")
    return set(parsed_page.find_all("article", class_ = "item restaurant_item"))

In [7]:
def _start_headless_chrome(chrome_options):
    """Start a Chrome driver using the chromedriver resolved by webdriver-manager."""
    return webdriver.Chrome(ChromeDriverManager().install(), options = chrome_options)


def _ask_how_many_to_scrape():
    """Prompt until the user types a non-negative whole number and return it."""
    while True:
        try:
            requested = int(input("Please indicate how many locations would you like to scrape now: "))
        except ValueError:
            # Only bad numbers are retried; an interrupt must still stop the prompt.
            continue
        # A negative number would make the slices in scraping() split from the end.
        if requested >= 0:
            return requested


def _read_result_count(driver):
    """Return ``(label_text, count)`` read from the result label of the current page.

    ``label_text`` is None when the label element is missing, and ``count`` is
    None whenever no number could be read from the label.
    """
    label = BeautifulSoup(driver.page_source, "html.parser").find("span", class_="result")
    if label is None:
        return None, None
    label_text = label.text
    # Thousands separators are tolerated in case the site prints them - TODO confirm format.
    number = re.search(r"\d[\d,]*", label_text)
    if number is None:
        return label_text, None
    return label_text, int(number.group().replace(",", ""))


def scraping(current_list_of_locations, max_results = 400, scroll_delay = 5):
    """Interactively scrape restaurant cards for the next batch of locations.

    The user is asked how many locations to process. That many are taken from
    the front of ``current_list_of_locations`` and each one is searched on
    hungrygowhere.com in a headless Chrome, scrolling to the bottom until the
    page height stops growing.

    Parameters
    ----------
    current_list_of_locations : list
        Locations that have not been scraped yet.
    max_results : int, optional
        Locations reporting more results than this are skipped.
    scroll_delay : int or float, optional
        Seconds to wait after every scroll so new results can load.

    Returns
    -------
    tuple
        ``(list_of_restaurants, remaining_locations)`` - a set with every card
        returned by ``get_containers`` and the locations that were not part of
        this batch. Locations that were skipped or failed are part of the batch
        and are therefore not returned as remaining.
    """
    # User Experience
    print(f"You have {len(current_list_of_locations)} left to scrape", end = "\n")
    number_of_scrapes = _ask_how_many_to_scrape()

    # Getting the locations to be scraped
    list_of_locations = list(current_list_of_locations[:number_of_scrapes])
    remaining_locations = current_list_of_locations[number_of_scrapes:]
    list_of_restaurants = set()

    # Setting the parameters for the driver
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    driver = _start_headless_chrome(chrome_options)

    # The finally clause guarantees the browser is closed even when a location crashes.
    try:
        while list_of_locations:
            # Locations are taken from the end of the batch.
            location = list_of_locations.pop()
            driver.get(f"https://www.hungrygowhere.com/search-results/{location}/?general=1")

            sleep(1)

            # Getting the result count
            result_count, total = _read_result_count(driver)
            if total is None:
                # Without a readable count the size check below cannot be trusted.
                print(f"Could not read the result count for {location}... Moving on")
                continue
            if total == 0:
                print(f"No results for {location}")
                continue

            # Sanity Check
            print(f"{result_count} found!")

            if total > max_results:
                print(f"{location} has too many results! Passing..")
                continue

            print(f"Currently Scraping: {location}")

            # Get scroll height
            last_height = driver.execute_script("return document.body.scrollHeight")

            try:
                while True:
                    # Scroll down to bottom
                    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

                    # Wait to load page
                    sleep(scroll_delay)

                    # Collect after every scroll so partial results survive a later failure.
                    list_of_restaurants |= get_containers(driver, list_of_restaurants)

                    # Stop once scrolling no longer makes the page any taller.
                    new_height = driver.execute_script("return document.body.scrollHeight")
                    if new_height == last_height:
                        break
                    last_height = new_height
            # If the browser fails (e.g. a time out), restart the driver and just move on
            except Exception:
                print(f"Failed to scape all locations for {location}... Moving on")
                driver.quit()
                # Cleared first so a failed restart does not quit the old driver twice.
                driver = None
                driver = _start_headless_chrome(chrome_options)

        # Sanity Check
        print("Scraping done!")
    finally:
        if driver is not None:
            driver.quit()

    return list_of_restaurants, remaining_locations

In [8]:
#To extract the shop name, the shop address, cuisine, type of shop(eg. hawker/cafe/restaurant)
def _child_text(parent, tag_name, default, **attrs):
    """Return the text of the first matching child of ``parent``.

    ``default`` is returned when ``parent`` is None or has no matching child.
    """
    if parent is None:
        return default
    child = parent.find(tag_name, **attrs)
    return default if child is None else child.text


# To extract the shop name, the shop address, cuisine, type of shop (eg. hawker/cafe/restaurant)
def data_extractor(aset):
    """Turn scraped restaurant cards into a DataFrame with one row per card.

    Parameters
    ----------
    aset : iterable
        Parsed card elements; each must offer ``find`` and the matched
        elements must offer ``text`` / ``find_all``.

    Returns
    -------
    pandas.DataFrame
        Columns ``"Stall Name"``, ``"Cuisine"``, ``"Shop Type"``, ``"Address"``.
        ``Cuisine`` is a list of names, or ``[None]`` when the card has no
        cuisine element. ``Stall Name`` and ``Shop Type`` are None when the
        element is missing. ``Address`` joins the address parts that are
        present with single spaces and is ``""`` when none are present.
    """
    rows = []

    for container in aset:
        # A card without a name link yields None instead of aborting the whole batch.
        heading = container.find("h2", class_ = "hneue-bold-mm")
        name = _child_text(heading, "a", None)

        # Extracting type of cuisine
        cuisine_holder = container.find("span", class_ = "cuisine")
        if cuisine_holder is None:
            cuisines = [None]
        else:
            cuisines = [link.text for link in cuisine_holder.find_all("a")]

        # Extracting stall type
        stall_type = _child_text(container, "span", None, class_ = "category-name")

        # Extracting address
        address_container = container.find("span", class_ = "location")
        address_parts = [
            _child_text(address_container, "span", "", itemprop = "streetAddress"),
            _child_text(address_container, "a", "", itemprop = "addressLocality"),
            _child_text(address_container, "a", "", itemprop = "addressRegion"),
            _child_text(address_container, "span", "", itemprop = "postalCode"),
        ]
        # Joining only the parts that exist keeps region and postal code apart
        # and avoids doubled spaces where a part is missing.
        address = " ".join(part for part in address_parts if part)

        rows.append([name, cuisines, stall_type, address])

    return pd.DataFrame(rows, columns = ["Stall Name", "Cuisine", "Shop Type", "Address"])

In [9]:
def _save_progress(scraped_frame, remaining):
    """Write the scraped DataFrame and the not-yet-scraped locations to the save files."""
    scraped_frame.to_pickle("scraped_data")
    # The with-block closes the file even if pickling fails.
    with open("remaining locations", "wb") as save_file:
        pickle.dump(remaining, save_file)


def main():
    """Run one scraping session and persist its results.

    If both save files ("remaining locations" and "scraped_data") can be
    opened, the session continues from them and the new rows are added to the
    previously scraped data. If either file is missing, the first-time setup
    runs instead: street names are read from "Street Names.xlsx", the user is
    identified, and the save files are created from this session's results.
    """
    try:
        # Attempt to open a save file. If no save file is found create a new backup.
        # NOTE: unpickling can execute arbitrary code - only load save files
        # that were written by this notebook.
        with open("remaining locations", "rb") as save_file:
            remaining = pickle.load(save_file)

        scraped = pd.read_pickle("scraped_data")

    except FileNotFoundError:
        print("Initialising first time user setup...")

        # Extracting the new user's tasks to scrape (first column of the sheet)
        df = pd.read_excel("Street Names.xlsx")
        list_of_singapore_streets = list(df.values[0:,0])
        name, locations = identification(list_of_singapore_streets)

        print()
        print(f"Welcome {name}!")

        # Begin scraping, then convert the cards to a DataFrame so they can be pickled
        scraped_data, remaining = scraping(locations)
        scraped_data = data_extractor(scraped_data)

        _save_progress(scraped_data, remaining)

    else:
        print("Welcome back!")

        # Continue scraping
        scraped_data, remaining = scraping(remaining)
        scraped_data = data_extractor(scraped_data)

        # Merging both scraped dataframes (past and now) together.
        # pd.concat is used because DataFrame.append no longer exists in pandas 2.x.
        df = pd.concat([scraped, scraped_data], ignore_index = True)

        _save_progress(df, remaining)

In [10]:
# Run one interactive scraping session; main() prompts for input and updates the pickle save files.
main()

Welcome back!
You have 1323 left to scrape
Please indicate how many locations would you like to scrape now: 0








[WDM] - Current google-chrome version is 90.0.4430
[WDM] - Get LATEST driver version for 90.0.4430
[WDM] - Driver [/Users/jacky/.wdm/drivers/chromedriver/mac64/90.0.4430.24/chromedriver] found in cache


Scraping done!


In [11]:
# The CSV was saved together with its row index; use that first column as the
# index again instead of loading it as a spurious "Unnamed: 0" data column.
pd.read_csv("Jacky1.csv", index_col = 0)

Unnamed: 0,Stall Name,Cuisine,Shop Type,Address
0,Adam Chicken,[None],Hawker,#01-29 Ayer Rajah Food Centre West Coast Drive...
1,NO.1 Adam's Nasi Lemak,"['Asian', 'Halal', 'Malaysian']",Kiosk or Stall,#01-01 Adam Road Food Centre Adam Road 289876
2,Nur Adam’s Delights,['Indian'],Kiosk or Stall,Stall No.3 Sembawang Road 758504
3,Selamat Datang Warong Pak Sapari,"['Chinese', 'Malaysian']",Kiosk or Stall,#01-09 Adam Road Food Centre Adam Road 289876
4,Apit Drinks Stall,['Drinks'],Kiosk or Stall,#01-06 Adam Road Food Centre Adam Road 289876
...,...,...,...,...
5658,Zai Xin Traditional Snacks,['Snacks'],Kiosk or Stall,#01-40 Ghim Moh Road Market & Cooked Food Cent...
5659,Summer Pavilion,"['Asian', 'Cantonese', 'Chinese', 'Dim Sum', '...",Restaurant,"The Ritz-Carlton, Millenia Singapore Raffles ..."
5660,Teochew Mushroom Minced Meat Noodle,"['Asian', 'Chinese', 'Teochew']",Kiosk or Stall,#01-20 Ci Yuan Hawker Centre Hougang Avenue 9 ...
5661,Grove Cafe (Canberra Plaza),[None],Restaurant,#01-18A Canberra Plaza Canberra View 750133
