### Libraries

In [None]:
import re
import time
import asyncio
import pandas as pd
from bs4 import BeautifulSoup
from requests_html import AsyncHTMLSession

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, WebDriverException, StaleElementReferenceException

### Global Variables

In [None]:
# Maps the full name of each US state, the District of Columbia and the US
# territories to its two-letter abbreviation.  main() looks a state's name up
# here to build the "/<ABBREV>/" segment it searches for in CitySearch city links.
us_state_to_abbrev = {
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
    "District of Columbia": "DC",
    "American Samoa": "AS",
    "Guam": "GU",
    "Northern Mariana Islands": "MP",
    "Puerto Rico": "PR",
    "United States Minor Outlying Islands": "UM",
    "U.S. Virgin Islands": "VI",
}

# Industries to search for.  Each entry is a single-key dict mapping an
# industry name to a list of sub-categories.  main() uses only the key (the
# industry name) as the search term; the sub-category lists are not read by
# any code visible in this notebook section.
industries = [
    {"Construction": ["Carpentry", "Plumbing", "Electrical work"]},
    {"Manufacturing": ["Welding", "Machine operation", "Assembly line work"]},
    {"Transportation": ["Truck driving", "Warehouse operations", "Forklift operation"]},
    {"Logistics": []},
    {"Automotive": ["Automotive repair", "Auto maintenance", "Auto Bodywork", "Tire services"]},
    {"Maintenance and Repair": ["HVAC", "Appliance repair", "General maintenance"]},
    {"Retail": ["Boutiques", "Specialty stores", "Online shops"]},
    {"Food and Beverage": ["Restaurants", "Cafes", "Food trucks"]},
    {"Personal Services": ["Hair salons", "Barber shops"]}
]

# Hand-picked subset of states.  Only referenced by the commented-out
# alternative `for state in states_of_interest:` loop in main().
states_of_interest = ["California", "New Jersey", "New York", "Texas"]

### Functions Used

#### There's a bit going on, so I'll explain what each function does in the order the functions are called. I hope this makes the main implementation easier to follow.

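The `switch` function is a small quality-of-life helper: it takes a two-part path such as `CA/Los_Angeles` and swaps the parts, returning `Los_Angeles,%20CA` — the form used for the `where` parameter of the search URL.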

In [None]:
def switch(el):
    """Swap the two "/"-separated parts of *el* and join them with ",%20".

    Example: "CA/Los_Angeles" becomes "Los_Angeles,%20CA".

    Raises:
        ValueError: if *el* does not contain exactly one "/".
    """
    # Tuple unpacking enforces the "exactly two parts" requirement.
    leading, trailing = el.split("/")
    return f"{trailing},%20{leading}"

In CitySearch, each industry and location pair has a list of companies. Here we locate and save the links to each company profile. Handling exceptions at this stage wasn't much of an issue, but on rare occasions a search returned no companies; in that case the wait times out and an empty list is returned.

In [None]:
def get_job_cards_links(driver):
    """Return the profile links listed on the results page loaded in *driver*.

    Waits up to 10 seconds for the card anchors
    (``div.list-container > div.card > a``) to be present and returns the
    ``href`` attribute of each one.  If the wait times out (or the element is
    reported as missing), a message is printed and an empty list is returned.
    """
    try:
        print('looking for job card list')

        card_locator = (By.CSS_SELECTOR, "div.list-container > div.card > a")
        anchors = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located(card_locator))

        return [anchor.get_attribute("href") for anchor in anchors]

    except (NoSuchElementException, TimeoutException):
        # An empty category never renders any cards, so the wait simply expires.
        print("Error: Timed out waiting for page to load. Most likely no job listing in this category")

        return []

<img src="./assets/job_list.png" width="200" height="100">

In [None]:
business details to dict

In [None]:
get email

In [None]:
html to string

In [None]:
get email from contact

In [None]:
save to csv

In [None]:
########################
# iterating over all the states, then cities, then industries and scraping business information
########################
async def main():
    """Scrape CitySearch business profiles for every US state in the keyword sheet.

    Steps:
      1. Read ``assets/google_maps_keywords.xlsx`` and take the distinct
         ``State`` values of the rows whose ``Country`` is "United States".
      2. Open the CitySearch front page in headless Chrome and collect the
         city links.
      3. For each state, city and industry name (the keys of ``industries``),
         load the results page, visit every profile link not yet visited for
         that state, and build one dict per profile.
      4. After all industries of a city are done, pass the collected dicts and
         the city's ``where`` parameter to ``save_to_csv``.

    States missing from ``us_state_to_abbrev`` are reported and skipped.
    The browser is closed in a ``finally`` block, so it is shut down even when
    scraping aborts with an exception.
    """
    # Open the sheet and forward-fill gaps in the Country/State columns
    # (presumably left by merged cells in the spreadsheet — verify).
    df = pd.read_excel("assets/google_maps_keywords.xlsx")
    df.loc[:, ["Country", "State"]] = df.loc[:, ["Country", "State"]].ffill()

    # Group by country, keep the United States rows, then group those by state.
    grouped_df = df.groupby("Country")
    grouped_countries = grouped_df.get_group("United States")
    grouped_states = grouped_countries.groupby("State")
    states = grouped_states.groups.keys()

    # Each entry of `industries` is a single-key dict; the key is the search term.
    industry_names = [next(iter(entry)) for entry in industries]

    chrome_options = webdriver.ChromeOptions()
    for argument in (
        "--window-size=1920,1080",
        "--disable-extensions",
        "--proxy-server='direct://'",
        "--proxy-bypass-list=*",
        "--start-maximized",
        "--headless",
        "--disable-gpu",
        "--disable-dev-shm-usage",
        "--no-sandbox",
        "--ignore-certificate-errors",
    ):
        chrome_options.add_argument(argument)

    driver = webdriver.Chrome(options=chrome_options)
    try:
        # Navigate to the front page of CitySearch.
        driver.get("https://www.citysearch.com/")

        # Extract the links to individual cities (state header items are excluded).
        container = driver.find_element(By.CSS_SELECTOR, "div.cities-container")
        cities = container.find_elements(By.CSS_SELECTOR, "li:not([class*='state']) > a")
        city_links = [city.get_attribute("href") for city in cities]

        for state in states:  # using this loop to continue from where code stopped, if it encounters an error
        # for state in states_of_interest:
            abbrev = us_state_to_abbrev.get(state)
            if abbrev is None:
                # A state name the mapping does not know must not abort the whole run.
                print("Unknown state name, skipping:", state)
                continue

            # Profile links already scraped for this state.
            visited = set()

            # Keep only the city links that contain this state's "/<ABBREV>/" segment
            # and turn each into the `where` query value via switch().
            pattern = re.compile(f".*/{abbrev}/.*", re.IGNORECASE)
            where_params = [
                switch(link.replace("https://www.citysearch.com/", ""))
                for link in city_links
                if pattern.match(link)
            ]

            for where_param in where_params:
                business_list = []

                for industry in industry_names:
                    url = f"https://www.citysearch.com/results?term={industry.strip().replace(' ', '%20')}&where={where_param}"

                    print("--------------------state, params, industry--------------------")
                    print("-------------------------", url, "----------------------------")
                    print(state, where_param, industry)

                    driver.get(url)
                    job_cards_links = get_job_cards_links(driver)

                    if not job_cards_links:
                        continue

                    # Visit each profile link for the current industry and scrape it.
                    for job_cards_link in job_cards_links:
                        if job_cards_link in visited:
                            print('already visited skipping')
                            continue

                        visited.add(job_cards_link)

                        print("------visiting profile: ", job_cards_link, "------")
                        driver.get(job_cards_link)

                        business_details_dict = business_details_to_dict(driver, industry)

                        try:
                            print('looking for additional details')
                            additional_info = WebDriverWait(driver, 5).until(
                                EC.presence_of_element_located(
                                    (By.CSS_SELECTOR, 'div.panel-container > div.panel-details')))

                            business_details_dict['additional_info'] = additional_info.text

                        except (NoSuchElementException, TimeoutException):
                            print("No additional info container")
                            print("stopped at: ", business_details_dict)
                            business_details_dict['additional_info'] = ''

                        emails = set()

                        external_link = business_details_dict['external-links-container']

                        if external_link != '':
                            # Both helpers receive the same set; presumably they add
                            # the addresses they find to it — verify against their code.
                            await get_email(external_link, emails)
                            await get_email_from_contact(driver, external_link, emails)
                            print("emails have been updated these are emails", emails)

                        business_details_dict['emails'] = str(list(emails))

                        print(business_details_dict)
                        business_list.append(business_details_dict)
                        # Short pause between profiles; asyncio.sleep keeps the
                        # event loop responsive, unlike the blocking time.sleep.
                        await asyncio.sleep(1)

                save_to_csv(business_list, where_param)
    finally:
        # Always release the browser, even if scraping failed part-way.
        driver.quit()

In [None]:
# Entry point: fetch the current event loop, run main() until it finishes,
# then close the loop.
# NOTE(review): run_until_complete() raises RuntimeError when the loop is
# already running (as it is in many notebook kernels) — confirm how this cell
# is executed.
loop = asyncio.get_event_loop()
loop.run_until_complete(main())
loop.close()