In [1]:
import requests, time, argparse, traceback, threading
from bs4 import BeautifulSoup 
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
import numpy as np

import logging

# logging.basicConfig(level=logging.DEBUG)
logging.basicConfig(filename='app.log', filemode='w', format='%(name)s - %(levelname)s - %(message)s')
logging.warning('This will get logged to a file')

In [2]:
urls = {
    "loblaw" : "https://www.google.co.in/maps/place/Loblaws/@44.411419,-79.709587,15z/data=!3m1!5s0x882aa2ee96e05995:0xad0170a78a2eea2c!4m7!3m6!1s0x0:0x2d502f30b7091262!8m2!3d44.411419!4d-79.709587!9m1!1b1",
    "costco" : "https://www.google.co.in/maps/place/Costco+Wholesale/@44.3346476,-79.6836209,17z/data=!4m7!3m6!1s0x882abc4525737595:0x4c5e8a4030c6c21c!8m2!3d44.3346476!4d-79.6814322!9m1!1b1",
    "walmart" : "https://www.google.co.in/maps/place/Walmart+Supercentre/@44.3715951,-79.7345296,13z/data=!4m10!1m2!2m1!1swalmart+barrie!3m6!1s0x882aa2eeba0d9e93:0x9bea7405337a0837!8m2!3d44.4103836!4d-79.7119763!9m1!1b1",
    "zehrs" : "https://www.google.co.in/maps/place/Zehrs/@44.3856202,-79.7075219,13z/data=!4m10!1m2!2m1!1szehrs+barrie!3m6!1s0x882abb6242f50799:0x48515248c8d91db0!8m2!3d44.355263!4d-79.648647!9m1!1b1",
    "no_frills" : "https://www.google.co.in/maps/place/Joe's+No+Frills/@44.3761364,-79.7052004,14.32z/data=!4m10!1m2!2m1!1swalmart+barrie!3m6!1s0x0:0x8139a99b0cc48df9!8m2!3d44.386979!4d-79.7051615!9m1!1b1"
}

In [3]:
class GoogleMapReviewScrapper:
    def __init__(self, EXE_PATH, URL, required_count):
        '''
        EXE_PATH - path of the chrome driver in the desktop
        URL - url of the google maps to be scraped
        required_count - The number of reviews to be scraped
        '''
        self.driver = self.__get_driver(EXE_PATH, URL)
        self.total_count = self.get_total_count()
        if self.total_count < required_count:
            self.required_count = self.get_total_count()
        else:
            self.required_count = required_count
        self.__sort_by_newest()
    
    def __enter__(self):
        '''
        Executed once the object is instanciated 
        '''
        return self

    def __exit__(self, exc_type, exc_value, tb):
        '''
        Executed once the object is exited
        '''
        if exc_type is not None:
            traceback.print_exception(exc_type, exc_value, tb)

        self.driver.close()
        self.driver.quit()

        return True
        
    def __get_driver(self, EXE_PATH, URL):
        '''
        Create and returns a chrome driver object
        '''
        options = Options()
        
        options.add_argument("--window-size=1366,768")

        options.add_argument("--disable-notifications")
        options.add_argument("--lang=en-GB")

        input_driver = webdriver.Chrome(EXE_PATH, options=options)
        input_driver.get(URL)
        
        return input_driver
    
    def __wait(self, cond):
        '''
        __wait method waits for a part of the web page to be loaded
        '''
        try:
            return WebDriverWait(self.driver, 10).until(cond)
        except:
            print('Element load failed')
    
    def __sort_by_newest(self):
        '''
        __sort_by_newest method sorts the reviews listed by the newest review posted
        '''
        wait = WebDriverWait(self.driver, 10)
        menu_bt = self.__wait(EC.element_to_be_clickable((By.XPATH, "//button[@data-value='Sort']")))
        menu_bt.click()
        li_newest = self.__wait(EC.presence_of_element_located((By.XPATH, "//li[@data-index='1']")))
        li_newest.click()
        time.sleep(10)
    
    def __scroll(self):
        '''
        __scroll method scrolls the review list to fetch more reviews.
        '''
        scrollable_div = self.__wait(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.section-layout.section-scrollbox.scrollable-y.scrollable-show')))
        self.driver.execute_script('arguments[0].scrollTop = arguments[0].scrollHeight', scrollable_div)
        time.sleep(4)
    
    def __expand_reviews(self):
        '''
        __expand_reviews method expands a review text div
        '''
        links = self.driver.find_elements_by_xpath("//button[@class='section-expand-review mapsConsumerUiCommonButton__blue-link']")
        for l in links:
            l.click()
        time.sleep(2)
        
    def get_reviews(self):
        '''
        get_reviews methods fetches reviews per the required count
        '''
        logging.warning('get_reviews')
        while self.get_review_text_count() < self.required_count and self.get_loaded_count() < self.total_count:
            self.__scroll()
        self.__expand_reviews()
        return self.parse_page_source()
            
        
    def parse_page_source(self):
        '''
        parse_page_source method scrapes the review data from the chrome driver's page source and returns it as a 
        pandas dataframe.
        '''
        soup = BeautifulSoup(self.driver.page_source, 'html.parser')
        contents = soup.find_all('div', {'class': 'section-review-content'})
        reviews = []
        for content in contents:
            reviews.append({
                'public_date': content.find('span', {'class': 'section-review-publish-date'}).text.strip(),
                'user_name': content.find('div', {'class': 'section-review-title'}).text.strip(),
                'review_text': content.find('span', {'class': 'section-review-text'}).text.strip()
            })
        df = pd.DataFrame(reviews) 
        df['review_text'].replace('', np.nan, inplace=True)
        df = df[df['review_text'].notna()] 
        return df.reset_index(drop=True)
    
    def get_review_text_count(self):
        '''
        get_review_text_count returns the number of text reviews loaded into the page view.
        '''
        soup = BeautifulSoup(self.driver.page_source, 'html.parser')
        contents = soup.find_all('span', {'class': 'section-review-text'})
        contents_clean = [content.text.strip() for content in contents if len(content.text.strip()) > 0]
        print('review_text_count', len(contents_clean))
        return len(contents_clean)
    
    def get_total_count(self):
        '''
        get_total_count returns the total number of reviews in the web page.
        '''
        self.__wait(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.gm2-caption')))
        soup = BeautifulSoup(self.driver.page_source, 'html.parser')
        count_str = soup.find('div', {'class': 'gm2-caption'}).text.strip().split(' ')[0]
        return int(''.join(count_str.split(',')))
        
    def get_loaded_count(self):
        '''
        get_loaded_count returns total number of loaded reviews.
        '''
        soup = BeautifulSoup(self.driver.page_source, 'html.parser')
        count = len(soup.find_all('div', {'class': 'section-review-content'}))
        return count

In [4]:
def scrape_from_url(url, name, required_count):
    '''
    scrape_from_url method creates an instance of the GoogleMapReviewScrapper class to scrape the review data from the url
    and writes the scraped data into a csv file.
    '''
    with GoogleMapReviewScrapper('E:\\chromedriver\\chromedriver.exe', str(url), required_count) as scraper:
        logging.warning('url '+ url)
        start_time = time.time()
        df = scraper.get_reviews()
        print(name, 'size', len(df))
        df.to_csv(name+'_scrape.csv', index=False)
        end_time = time.time()
        print(end_time - start_time)

In [7]:
'''
Uses the url list to loop through the scrape_from_url method and scrape the data from all urls in the list and 
write the scraped data in a csv file.
'''
for name, url in urls.items():
    scrape_from_url(url, name, 100)

review_text_count 8
review_text_count 11
review_text_count 15
review_text_count 19
review_text_count 22
review_text_count 25
review_text_count 29
review_text_count 34
review_text_count 38
review_text_count 44
review_text_count 51
review_text_count 57
review_text_count 62
review_text_count 66
review_text_count 71
review_text_count 72
review_text_count 78
review_text_count 82
review_text_count 90
review_text_count 96
review_text_count 100
loblaw size 100
163.97135066986084
Element load failed


AttributeError: 'NoneType' object has no attribute 'text'

In [9]:
'''
Reads the scraped data in the csv files
'''
loblaws_df = pd.read_csv('scraped_data/loblaw_scrape.csv')
no_frills_df = pd.read_csv('scraped_data/no_frills_scrape.csv')
walmart_df = pd.read_csv('scraped_data/walmart_scrape.csv')
costco_df = pd.read_csv('scraped_data/costco_scrape.csv')
zehrs_df = pd.read_csv('scraped_data/zehrs_scrape.csv')

In [10]:
'''
Creating a new column retailer in the dataframes
'''
loblaws_df.insert(2, "retailer", ['loblaws']*len(loblaws_df), True)
no_frills_df.insert(2, "retailer", ['no_frills']*len(no_frills_df), True)
walmart_df.insert(2, "retailer", ['walmart']*len(walmart_df), True)
costco_df.insert(2, "retailer", ['costco']*len(costco_df), True)
zehrs_df.insert(2, "retailer", ['zehrs']*len(zehrs_df), True)

In [12]:
'''
Merge the data frames into a single dataframe and write the result to a csv file/ 
'''
dfs = [loblaws_df, no_frills_df, walmart_df, costco_df, zehrs_df]
merged_df = pd.concat(dfs)
merged_df.to_csv('retailers_merged.csv', index=False)