In [1]:
import re
import time
import os
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
def _random_user_agent():
    """
    A helper function to generate a random header to 
    avoid getting blocked by the website

    Parameters
    ----------
    None

    Returns
    -------
    str
    a random user agent 

    >>> _random_user_agent()
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) \
                AppleWebKit/537.36 (KHTML, like Gecko) \
                Chrome/58.0.3029.110 Safari/537.36'
    """
    try:
        ua = UserAgent()
        return ua.random
    except:
        default_ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) \
                AppleWebKit/537.36 (KHTML, like Gecko) \
                Chrome/58.0.3029.110 Safari/537.36'
        return default_ua

def _get_soup(url):
    """
    This is a helper function that will automatically generate a 
    BeautifulSoup object based on the given URL of the apartment 
    webpage

    Parameters
    ----------
    url : str
        the URL of a specific apartment or a general website 

    Returns
    -------
    soup : bs4.BeautifulSoup
        a scraper for a specified webpage
    """

    # generate a random header 
    headers = {'User-Agent': _random_user_agent()}
    # send a request and get the soup
    response = requests.get(url, headers=headers)
    results = response.content
    if not response.status_code == 404:
        soup = BeautifulSoup(results, 'lxml')
    return soup

def _soup_attempts(url, total_attempts=5):

    """
    A helper function that will make several attempts
    to obtain a soup to avoid getting blocked

    Parameters
    ----------
    url : str
        the URL of a specific apartment or a general website 

    total_attempts: int
        the number of attempts you want to try to obtain the 
        soup before you already give up. Default is 5 attempts

    Returns
    -------
    soup : bs4.BeautifulSoup
        a scraper for a specified webpage        

    """

    soup = _get_soup(url)

    # if we get the soup with the first attempt
    if soup:
        return soup
    # if we don't get the soup during our first
    # attempt
    else:
        attempts = 0
        while attempts < total_attempts:
            # put the program idle to avoid detection
            time.sleep(3)
            soup = self._get_soup(url)
            if soup:
                return soup
        # time to give up, try to find what's going on 
        raise ValueError(f'FAILED to get soup for apt url {url}')

In [3]:
police_url = 'https://www.cityprotect.com/map/list/incidents?toUpdateDate=12%2F18%2F2019&fromUpdateDate=11%2F18%2F2019&pageSize=2000&parentIncidentTypeIds=149,150,148,8,97,104,165,98,100,179,178,180,101,99,103,163,168,166,12&zoomLevel=16&latitude=39.94761343841498&longitude=-75.15636979615388&days=1,2,3,4,5,6,7&startHour=0&endHour=24&timezone=-05:00'

In [4]:
def _build_chrome_options():
        chrome_options = webdriver.ChromeOptions()
        chrome_options.accept_untrusted_certs = True
        chrome_options.assume_untrusted_cert_issuer = True
        
        # chrome configuration
        # More: https://github.com/SeleniumHQ/docker-selenium/issues/89
        # And: https://github.com/SeleniumHQ/docker-selenium/issues/87
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-impl-side-painting")
        chrome_options.add_argument("--disable-setuid-sandbox")
        chrome_options.add_argument("--disable-seccomp-filter-sandbox")
        chrome_options.add_argument("--disable-breakpad")
        chrome_options.add_argument("--disable-client-side-phishing-detection")
        chrome_options.add_argument("--disable-cast")
        chrome_options.add_argument("--disable-cast-streaming-hw-encoding")
        chrome_options.add_argument("--disable-cloud-import")
        chrome_options.add_argument("--disable-popup-blocking")
        chrome_options.add_argument("--ignore-certificate-errors")
        chrome_options.add_argument("--disable-session-crashed-bubble")
        chrome_options.add_argument("--disable-ipv6")
        chrome_options.add_argument("--allow-http-screen-capture")
        chrome_options.add_argument("--start-maximized")
        chrome_options.add_argument('--lang=es')

        return chrome_options

def _get_browser():
    """
    A helper function to get the selenium browser in order 
    to perform the scraping tasks 

    Parameters
    ----------
    chromedriver : str
        the path to the location of the chromedriver 

    Returns
    -------
    browser : webdriver.chrome
        a chrome web driver 

    wait : WebDriverWait
        this is wait object that allows the program to hang around for a period
        of time since we need some time to listen to the server 

    """
    options = _build_chrome_options()

    browser = webdriver.Chrome(ChromeDriverManager().install(), chrome_options=options)
    browser.get(police_url)
    wait = WebDriverWait(browser, 10) # maximum wait time is 20 seconds 
    return browser, wait

In [5]:
browser, wait = _get_browser()

Trying to download new driver from http://chromedriver.storage.googleapis.com/79.0.3945.36/chromedriver_mac64.zip
Unpack archive /Users/nailiding/.wdm/drivers/chromedriver/79.0.3945.36/mac64/chromedriver.zip




In [14]:
# get scrollbar height
scrollHeight = browser.execute_script('return document.getElementById("incidentsList").scrollHeight')

In [15]:
scrollHeight

64800

In [21]:
case_numbers = []
elem = browser.find_element_by_xpath("//*[@id='incidentsList']/div[1]/ce-incident-item")
case_numbers.append(elem.find_element_by_xpath("//*[@id='incident-case-number']").text)

i=1
while 100*(i-1) <= scrollHeight:
    js = f'document.getElementById("incidentsList").scrollTop=100*{i}'
    browser.execute_script(js)
    
    elem = browser.find_element_by_xpath("//*[@id='incidentsList']/div[1]/ce-incident-item")
    case_number = elem.find_element_by_xpath("//*[@id='incident-case-number']").text
    if case_number != case_numbers[-1]:
        case_numbers.append(case_number)
    i += 1

elems = browser.find_elements_by_xpath("//*[@id='incident-case-number']")
for i in range(1,len(elems)):
    case_number = elems[i].text
    case_numbers.append(case_number)

StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
  (Session info: chrome=79.0.3945.88)


In [17]:
elems = browser.find_elements_by_xpath("//*[@id='incident-case-number']")

In [18]:
elems[0].text

'201906065347'

In [19]:
case_numbers

['201977006461',
 '201906059593',
 '201906059608',
 '201906059626',
 '201903084233',
 '201977006461',
 '201906059664',
 '201906059634',
 '201906059679',
 '201903084233',
 '201903084341',
 '201906059779',
 '201906059774',
 '201906059794',
 '201906059799',
 '201906059801',
 '201906059305',
 '201906059313',
 '201906059330',
 '201906059364',
 '201906059362',
 '201906059433',
 '201906059435',
 '201906059423',
 '201909048263',
 '201909048253',
 '201906059488',
 '201906059561',
 '201903084587',
 '201906059914',
 '201903084596',
 '201903084621',
 '201906059920',
 '201906059937',
 '201906059968',
 '201906059945',
 '201906059977',
 '201909048610',
 '201909048651',
 '201906060007',
 '201906059658',
 '201906059992',
 '201906060140',
 '201906060001',
 '201906060146',
 '201906060047',
 '201906060154',
 '201906060072',
 '201909048651',
 '201906059658',
 '201906060140',
 '201906060146',
 '201906060154',
 '201906060188',
 '201909048802',
 '201909048795',
 '201906060216',
 '201906060157',
 '201906060220

In [20]:
len(case_numbers)

324