In [1]:
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium import webdriver
from pathlib import Path
import pandas as pd
import requests
import random
import time
import os
import re

# Absolute path of the current working directory; the ChromeDriver binary is
# looked up relative to it.
BASE_DIR = Path().resolve()
CHROME_DRIVER = BASE_DIR / "chromedriver-win64" / "chromedriver.exe"
# Default timeout passed to WebDriverWait by wait_for_element.
WAIT_TIME = 15
# Sentinel returned/stored whenever an element or value cannot be found.
MISSING = "N/A"

# Regular expressions applied (through validate_data) to text scraped from a result card.
ratings_pattern = r'\b\d\.\d\b'  # one digit, a dot, one digit, e.g. "4.5"
reviews_pattern = r'\(\d+\)'  # digits wrapped in parentheses, e.g. "(123)"
categories_pattern = r'\b[a-zA-Z][a-zA-Z\s]*[a-zA-Z]\b'  # letters and whitespace, at least two letters
phone_pattern = r'\+\d(?:\s*\d)*\d'  # "+" followed by digits that may be separated by whitespace
time_pattern = r'\b\d{1,2}:\d{2}\b'  # H:MM or HH:MM; not referenced in the visible cells


In [2]:
def open_webpage(driver, url):
  """Navigate ``driver`` to ``url`` by calling ``driver.get(url)``."""
  driver.get(url)

def wait_for_element(driver, by, element_identifier, timeout=WAIT_TIME, all_elements=False):
  """Wait until a matching element is present in the DOM and return it.

  Args:
    driver: Selenium WebDriver to poll.
    by: Locator strategy (e.g. ``By.CSS_SELECTOR``).
    element_identifier: Locator value interpreted according to ``by``.
    timeout: Maximum time to wait, forwarded to ``WebDriverWait``.
    all_elements: When True, return the list of every matching element
      instead of only the first one.

  Returns:
    The located element (or a non-empty list of elements when
    ``all_elements`` is True). ``MISSING`` is returned when nothing is found
    within ``timeout`` or the lookup fails for any other reason.
  """
  locator = (by, element_identifier)
  if all_elements:
    condition = EC.presence_of_all_elements_located(locator)
  else:
    condition = EC.presence_of_element_located(locator)
  try:
    # until() hands back the value produced by the condition, so the element
    # does not have to be looked up a second time after the wait succeeds.
    return WebDriverWait(driver, timeout).until(condition)
  except Exception:
    # Deliberately best-effort: callers compare the result against MISSING.
    return MISSING


In [3]:
def cars_search(driver, country):
  """Search for car dealers in ``country`` and open the "More places" list.

  Types ``CAR DEALERS in <country>`` into the search box of the page currently
  loaded in ``driver``, submits it, clicks the "More places" element and then
  sleeps for 5 seconds so the places list can load.

  Args:
    driver: Selenium WebDriver already showing the search page.
    country: Country name interpolated into the search query.

  Raises:
    RuntimeError: If the search box or the "More places" element cannot be
      located.
  """
  search_field = wait_for_element(driver, By.CSS_SELECTOR, "textarea[title='Search']")
  # wait_for_element reports failure with the MISSING sentinel, which is a
  # non-empty (truthy) string, so a plain truthiness test cannot detect it.
  if search_field == MISSING:
    raise RuntimeError("Search box not found on the page")
  search_field.clear()
  search_field.send_keys(f"CAR DEALERS in {country}")
  search_field.send_keys(Keys.ENTER)

  places = wait_for_element(driver, By.XPATH, "//div[span[text()='More places']]")
  if places == MISSING:
    raise RuntimeError("'More places' element not found in the search results")
  places.click()
  time.sleep(5)

def validate_data(pattern, target_div, num_tries=3):
  """Return the first match of ``pattern`` in ``target_div``, or ``MISSING``.

  Args:
    pattern: Regular expression to search for.
    target_div: Text to search in.
    num_tries: Kept only for backward compatibility. Matching is
      deterministic, so it is attempted exactly once regardless of this value.

  Returns:
    The first match with surrounding whitespace removed. A match that is
    fully wrapped in parentheses (e.g. ``"(123)"``) is returned without them.
    ``MISSING`` is returned when there is no match, when ``target_div`` is not
    searchable text, or when ``pattern`` is not a valid regular expression.
  """
  try:
    matches = re.findall(pattern, target_div)
  except (re.error, TypeError):
    # Invalid pattern or non-text input: treated the same as "no match".
    return MISSING

  # With several capture groups findall yields tuples, which carry no single
  # text value to return.
  if not matches or not isinstance(matches[0], str):
    return MISSING

  text = matches[0].strip()
  # Only unwrap when the parentheses really enclose the whole match.
  if len(text) >= 2 and text.startswith('(') and text.endswith(')'):
    return text[1:-1]
  return text

def get_text(target_div, by, element_identifier):
  """Return the text of a child element of ``target_div``, or ``MISSING``.

  Args:
    target_div: Object exposing ``find_element(by, value)``.
    by: Locator strategy passed to ``find_element``.
    element_identifier: Locator value passed to ``find_element``.

  Returns:
    The ``.text`` of the located child, or ``MISSING`` when the lookup fails.
  """
  try:
    return target_div.find_element(by, element_identifier).text
  except Exception:
    # Best-effort lookup; KeyboardInterrupt/SystemExit are intentionally not
    # caught so the scrape can still be interrupted.
    return MISSING

In [4]:
def _scrape_listing(driver, div):
    """Click one result card and return its scraped fields as a dict."""
    # click on the search result and wait for 3 seconds to let content load
    div.click()
    time.sleep(3)

    heading_sibling = ".//div[@role='heading']/following-sibling::div[{}]"
    name = get_text(div, By.CSS_SELECTOR, "div[role='heading']")
    meta_div = get_text(div, By.XPATH, heading_sibling.format(1))
    location = get_text(div, By.XPATH, heading_sibling.format(2))
    phone_div = get_text(div, By.XPATH, heading_sibling.format(3))

    rating_stars = validate_data(ratings_pattern, meta_div)
    reviews = validate_data(reviews_pattern, meta_div)
    category = validate_data(categories_pattern, meta_div)
    contact_number = validate_data(phone_pattern, phone_div)

    website = wait_for_element(driver, By.LINK_TEXT, "Website", timeout=2)
    if website != MISSING:
        # Read the href immediately, in case the "show more" click below
        # changes the page and invalidates this element reference.
        website = website.get_attribute("href")

    show_more_button = wait_for_element(driver, By.CSS_SELECTOR, "a[aria-label='show more']", timeout=2)
    if show_more_button != MISSING:
        show_more_button.click()

    description = wait_for_element(driver, By.XPATH, "//div[@class='PQbOE']/following-sibling::div[@data-long-text]", timeout=2)
    if description != MISSING:
        description = description.text

    return {
        'Name': name,
        'Rating': rating_stars,
        'Reviews': reviews,
        'Category': category,
        'Location': location,
        'Contact Number': contact_number,
        'Website': website,
        'Description': description,
        'Meta': meta_div
    }


def get_cars_data(driver, data):
    """Scrape every result card on every results page into ``data``.

    For each page, every element with class ``cXedhc`` is clicked and its
    details are appended to ``data`` as a dict. The next-page link is then
    followed until none can be found or clicked.

    Args:
        driver: Selenium WebDriver showing the places list.
        data: List that receives one dict per listing (mutated in place).

    Returns:
        The same ``data`` list.
    """
    while True:
        divs = wait_for_element(driver, By.CLASS_NAME, "cXedhc", all_elements=True)
        if divs == MISSING:
            # No result cards were found. Stop here instead of iterating over
            # the characters of the sentinel string, and keep what was
            # collected so far.
            break
        for div in divs:
            data.append(_scrape_listing(driver, div))

        next_page_link = wait_for_element(driver, By.XPATH, "//table[1]//td[last()]//a", timeout=3)
        if next_page_link == MISSING:
            # No more pages
            break
        try:
            next_page_link.click()
        except Exception:
            # The link could not be clicked; treat this as the last page.
            break
        time.sleep(5)
    return data

In [5]:
def main():
  """Run the scrape for New Zealand and write the results to CSV.

  Starts a headless Chrome session, performs the search, collects every
  listing and writes two files: ``cars_meta_data.csv`` (all columns) and
  ``cars_data.csv`` (without the raw ``Meta`` column). Errors are reported on
  stdout rather than raised; the browser is always closed if it was started.
  """
  # Defined before the try block so the finally clause can tell whether the
  # browser was ever started.
  driver = None
  try:
    service = Service(executable_path=CHROME_DRIVER)
    options = Options()
    options.add_argument("--start-maximized")
    options.add_argument("--headless")

    driver = webdriver.Chrome(service=service, options=options)

    data = []
    open_webpage(driver, "https://www.google.com")
    cars_search(driver, "New Zealand")
    data = get_cars_data(driver, data)
    df = pd.DataFrame(data)
    df.to_csv("cars_meta_data.csv")
    # errors="ignore": an empty result set has no 'Meta' column to drop.
    df.drop(columns=['Meta'], errors="ignore").to_csv("cars_data.csv")
  except Exception as exc:
    # Include the exception so a failure can actually be diagnosed.
    print(f"An Error Occured: {exc!r}")
  finally:
    if driver is not None:
      driver.quit()

# Start the scrape only when this code is executed as the main module.
if __name__ == "__main__":
  main()

In [6]:
# pd.set_option('display.max_colwidth', None)
# pd.read_csv("cars_meta_data.csv")["Description"]