In [6]:
import os
import re
import time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

In [7]:
def _random_user_agent():
    """
    A helper function to generate a random header to 
    avoid getting blocked by the website

    Parameters
    ----------
    None

    Returns
    -------
    str
    a random user agent 

    >>> _random_user_agent()
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) \
                AppleWebKit/537.36 (KHTML, like Gecko) \
                Chrome/58.0.3029.110 Safari/537.36'
    """
    try:
        ua = UserAgent()
        return ua.random
    except:
        default_ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) \
                AppleWebKit/537.36 (KHTML, like Gecko) \
                Chrome/58.0.3029.110 Safari/537.36'
        return default_ua

def _get_soup(url):
    """
    This is a helper function that will automatically generate a 
    BeautifulSoup object based on the given URL of the apartment 
    webpage

    Parameters
    ----------
    url : str
        the URL of a specific apartment or a general website 

    Returns
    -------
    soup : bs4.BeautifulSoup
        a scraper for a specified webpage
    """

    # generate a random header 
    headers = {'User-Agent': _random_user_agent()}
    # send a request and get the soup
    response = requests.get(url, headers=headers)
    results = response.content
    if not response.status_code == 404:
        soup = BeautifulSoup(results, 'lxml')
    return soup

def _soup_attempts(url, total_attempts=5):

    """
    A helper function that will make several attempts
    to obtain a soup to avoid getting blocked

    Parameters
    ----------
    url : str
        the URL of a specific apartment or a general website 

    total_attempts: int
        the number of attempts you want to try to obtain the 
        soup before you already give up. Default is 5 attempts

    Returns
    -------
    soup : bs4.BeautifulSoup
        a scraper for a specified webpage        

    """

    soup = _get_soup(url)

    # if we get the soup with the first attempt
    if soup:
        return soup
    # if we don't get the soup during our first
    # attempt
    else:
        attempts = 0
        while attempts < total_attempts:
            # put the program idle to avoid detection
            time.sleep(3)
            soup = self._get_soup(url)
            if soup:
                return soup
        # time to give up, try to find what's going on 
        raise ValueError(f'FAILED to get soup for apt url {url}')

In [8]:
# CityProtect incident-list URL, one query parameter per line. Adjacent
# string literals are concatenated, so the value is a single URL string.
police_url = (
    'https://www.cityprotect.com/map/list/incidents'
    '?toUpdateDate=12%2F18%2F2019'
    '&fromUpdateDate=11%2F18%2F2019'
    '&pageSize=2000'
    '&parentIncidentTypeIds=149,150,148,8,97,104,165,98,100,179,178,180,101,99,103,163,168,166,12'
    '&zoomLevel=16'
    '&latitude=39.94761343841498'
    '&longitude=-75.15636979615388'
    '&days=1,2,3,4,5,6,7'
    '&startHour=0'
    '&endHour=24'
    '&timezone=-05:00'
)

In [9]:
def _build_chrome_options():
        chrome_options = webdriver.ChromeOptions()
        chrome_options.accept_untrusted_certs = True
        chrome_options.assume_untrusted_cert_issuer = True
        
        # chrome configuration
        # More: https://github.com/SeleniumHQ/docker-selenium/issues/89
        # And: https://github.com/SeleniumHQ/docker-selenium/issues/87
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-impl-side-painting")
        chrome_options.add_argument("--disable-setuid-sandbox")
        chrome_options.add_argument("--disable-seccomp-filter-sandbox")
        chrome_options.add_argument("--disable-breakpad")
        chrome_options.add_argument("--disable-client-side-phishing-detection")
        chrome_options.add_argument("--disable-cast")
        chrome_options.add_argument("--disable-cast-streaming-hw-encoding")
        chrome_options.add_argument("--disable-cloud-import")
        chrome_options.add_argument("--disable-popup-blocking")
        chrome_options.add_argument("--ignore-certificate-errors")
        chrome_options.add_argument("--disable-session-crashed-bubble")
        chrome_options.add_argument("--disable-ipv6")
        chrome_options.add_argument("--allow-http-screen-capture")
        chrome_options.add_argument("--start-maximized")
        chrome_options.add_argument('--lang=es')

        return chrome_options

def _get_browser():
    """
    A helper function to get the selenium browser in order 
    to perform the scraping tasks 

    Parameters
    ----------
    chromedriver : str
        the path to the location of the chromedriver 

    Returns
    -------
    browser : webdriver.chrome
        a chrome web driver 

    wait : WebDriverWait
        this is wait object that allows the program to hang around for a period
        of time since we need some time to listen to the server 

    """
    options = _build_chrome_options()

    browser = webdriver.Chrome(ChromeDriverManager().install(), chrome_options=options)
    browser.get(police_url)
    wait = WebDriverWait(browser, 10) # maximum wait time is 20 seconds 
    return browser, wait

In [20]:
# Start Chrome and load police_url; `wait` is the WebDriverWait that
# _get_browser creates for this browser.
browser, wait = _get_browser()


Looking for [chromedriver 79.0.3945.36 mac64] driver in cache 
File found in cache by path [/Users/nailiding/.wdm/drivers/chromedriver/79.0.3945.36/mac64/chromedriver]




In [21]:
# Collect the fields we need from one incident item and return them as a list
def collect(elem):
    """
    Read the text fields of a single incident list item.

    Parameters
    ----------
    elem : selenium WebElement
        the incident item to read; every lookup is relative to it

    Returns
    -------
    list of str
        [case_number, title, address, date, time, agency, description]
    """
    # XPaths start with ".//" so the search is scoped to `elem`. A leading
    # "//" searches the whole document even when called on an element, which
    # would return the page's first match regardless of `elem`.
    case_number = elem.find_element_by_xpath(".//*[@id='incident-case-number']").text
    title = elem.find_element_by_xpath(".//*[@class='incident-title']").text
    address = elem.find_element_by_xpath(".//*[@class='incident-subtitle']").text

    date = elem.find_element_by_xpath(".//*[@id='incident-date']").text
    # not named `time`, to avoid shadowing the imported time module
    time_of_day = elem.find_element_by_xpath(".//*[@id='incident-time']").text
    agency = elem.find_element_by_xpath(".//*[@id='incident-agency']").text
    description = elem.find_element_by_xpath(".//*[@id='incident-description']").text

    return [case_number, title, address, date, time_of_day, agency, description]

In [22]:
def scrape(cases, all_case_number, browser):
    """
    Collect every incident listed for the map area currently shown.

    The incidents list is scrolled in 50-pixel steps; at each step the first
    list item is read with ``collect``. Once the bottom is reached, all
    incident fields present in the document are read in bulk to pick up the
    remaining items. Incidents whose case number is already in
    ``all_case_number`` are skipped.

    Parameters
    ----------
    cases : list of list
        rows collected so far; new rows are appended in place
    all_case_number : list of str
        case numbers collected so far; new ones are appended in place
    browser : webdriver.Chrome
        the browser showing the incident map

    Returns
    -------
    cases, all_case_number
        the same two lists that were passed in

    Notes
    -----
    Scraping is best effort. An area with no incident prints a message and
    returns immediately. Any other error while scrolling prints the error
    and returns what was collected so far. KeyboardInterrupt is no longer
    swallowed, so a long run can be interrupted.
    """
    first_item_xpath = "//*[@id='incidentsList']/div[1]/ce-incident-item"
    scroll_height = browser.execute_script(
        'return document.getElementById("incidentsList").scrollHeight')

    # If the area has no incident the first list item does not exist and
    # the lookup raises.
    try:
        browser.find_element_by_xpath(first_item_xpath)
    except NoSuchElementException:
        print("Oops! This district has no incident!")
        return cases, all_case_number

    # Set mirror of all_case_number for O(1) duplicate checks.
    seen = set(all_case_number)

    try:
        # Scroll down 50 at a time and collect the first item of the list
        # at each position; skip it if it was already collected.
        step = 0
        while 50 * step <= scroll_height:
            browser.execute_script(
                f'document.getElementById("incidentsList").scrollTop=50*{step}')
            if step == 0:
                # initial pause after the first scroll (original behaviour)
                time.sleep(5)

            information = collect(browser.find_element_by_xpath(first_item_xpath))
            if information[0] not in seen:
                seen.add(information[0])
                cases.append(information)
                all_case_number.append(information[0])
            step += 1

        # At the bottom: collect everything left in the list besides the
        # first item. The XPaths start with "//", so they match across the
        # whole document, not only inside `elem`.
        elem = browser.find_element_by_xpath("//ce-incident-item[@class='ng-star-inserted']")
        columns = [
            elem.find_elements_by_xpath(xpath)
            for xpath in (
                "//span[@id='incident-case-number']",
                "//span[@class='incident-title']",
                "//span[@class='incident-subtitle']",
                "//span[@id='incident-date']",
                "//span[@id='incident-time']",
                "//span[@id='incident-agency']",
                "//span[@id='incident-description']",
            )
        ]
        # zip stops at the shortest column, so a partially rendered item
        # cannot cause an IndexError.
        for fields in zip(*columns):
            number = fields[0].text
            if number not in seen:
                seen.add(number)
                all_case_number.append(number)
                cases.append([number] + [field.text for field in fields[1:]])
    except Exception as exc:
        # Keep what was collected, but report the real failure instead of
        # claiming the area was empty.
        print(f"Stopped scraping this district early: {exc!r}")

    return cases, all_case_number

In [23]:
def move(browser, direction, times):
    """
    Pan the map by sending arrow-key presses to the map element.

    Parameters
    ----------
    browser : webdriver.Chrome
        the browser showing the incident map
    direction : str
        one of 'left', 'right', 'up' or 'down'
    times : int
        number of steps to move; one step is 500 key presses for
        left/right and 400 key presses for up/down

    Raises
    ------
    ValueError
        if ``direction`` is not one of the four supported values
        (previously an unknown direction was silently ignored)
    """
    map_xpath = ("//*[@id='mapMainContainer']/ce-map-wrapper/div/"
                 "uwm-universal-web-map/div/div[1]/div[1]")
    # direction -> (key to send, key presses per step)
    presses = {
        'left': (Keys.LEFT, 500),
        'right': (Keys.RIGHT, 500),
        'up': (Keys.UP, 400),
        'down': (Keys.DOWN, 400),
    }
    if direction not in presses:
        raise ValueError(
            f"direction must be one of {sorted(presses)}, got {direction!r}")

    key, presses_per_step = presses[direction]
    for _ in range(presses_per_step * times):
        # The element is looked up again for every key press, as before.
        browser.find_element_by_xpath(map_xpath).send_keys(key)

In [24]:
def scrape_map(browser, left=1, right=1, up=1, down=1):
    """
    Scrape a grid of map views around the starting position.

    The map is first moved ``left`` steps left and ``up`` steps up. It is
    then swept row by row in a zig-zag: even rows move right, odd rows move
    left, and after each row the map moves one step down. ``scrape`` is
    called once for every view visited.

    Parameters
    ----------
    browser : webdriver.Chrome
        the browser showing the incident map
    left, right, up, down : int
        how many steps the grid extends from the starting view in each
        direction. Default is 1 each

    Returns
    -------
    cases : list
        the rows accumulated by ``scrape``
    all_case_number : list
        the case numbers accumulated by ``scrape``
    """
    cases, all_case_number = [], []

    # go to the top-left corner of the grid
    move(browser, 'left', left)
    move(browser, 'up', up)
    time.sleep(5)

    row_count = up + down + 1
    hops_per_row = left + right

    for row in range(row_count):
        # alternate the sweep direction on every row
        sweep = 'right' if row % 2 == 0 else 'left'
        for _ in range(hops_per_row):
            cases, all_case_number = scrape(cases, all_case_number, browser)
            move(browser, sweep, 1)
            time.sleep(5)

        # last view of the row, then drop down to the next row
        cases, all_case_number = scrape(cases, all_case_number, browser)
        move(browser, 'down', 1)
        time.sleep(5)

    return cases, all_case_number
    
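# First move to the top-left corner
# for i in range(up+down+1):
#     if i is even:
#         for j in range(left+right):
#             scrape the current tile, then move one step right
#     if i is odd:
#         for j in range(left+right):
#             scrape the current tile, then move one step left
#     scrape the current tile, then move one step down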

In [25]:
# Scrape a 5 x 5 grid of map views (2 steps in each direction around the
# start); the timestamps printed before and after bracket the run.
print(time.ctime())
cases, all_case_number = scrape_map(browser, left=2, right=2, up=2, down=2)
print(time.ctime())

Thu Dec 19 18:57:16 2019
Oops! This district has no incident!
Oops! This district has no incident!
Oops! This district has no incident!
Oops! This district has no incident!
Thu Dec 19 19:16:41 2019


In [26]:
# number of incident rows collected
len(cases)

1242

In [17]:
# number of distinct case numbers collected
# NOTE(review): this cell's execution count (In [17]) is lower than that of
# the len(cases) cell above (In [26]), so the two outputs presumably come
# from different runs -- re-run both cells to confirm the counts agree.
len(all_case_number)

868

In [18]:
# preview the first ten rows:
# [case_number, title, address, date, time, agency, description]
cases[:10]

[['201977006461',
  'Theft',
  'D & E CHECKPOINT',
  '11/16/2019',
  '5PM',
  'Philadelphia Police Department',
  ''],
 ['201909048372',
  'Theft',
  '1600 Block WALNUT ST',
  '11/16/2019',
  '11PM',
  'Philadelphia Police Department',
  ''],
 ['201906059679',
  'Theft',
  'N 12TH ST',
  '11/17/2019',
  '12AM',
  'Philadelphia Police Department',
  ''],
 ['201909048436',
  'Theft',
  '1500 Block CHESTNUT ST',
  '11/17/2019',
  '1PM',
  'Philadelphia Police Department',
  ''],
 ['201909048443',
  'Theft',
  '1500 Block LOCUST ST',
  '11/17/2019',
  '2PM',
  'Philadelphia Police Department',
  ''],
 ['201909048441',
  'Theft from Vehicle',
  '100 Block N 21ST ST',
  '11/17/2019',
  '2PM',
  'Philadelphia Police Department',
  ''],
 ['201909048452',
  'Theft',
  '1700 Block WALNUT ST',
  '11/17/2019',
  '3PM',
  'Philadelphia Police Department',
  ''],
 ['201909048464',
  'Theft',
  '100 Block S 21ST ST',
  '11/17/2019',
  '5PM',
  'Philadelphia Police Department',
  ''],
 ['201909048475'

In [510]:
# move down one step (400 DOWN key presses), via the shared move() helper
move(browser, 'down', 1)

In [511]:
# move up one step (400 UP key presses), via the shared move() helper
move(browser, 'up', 1)

In [514]:
# move left one step (500 LEFT key presses), via the shared move() helper
move(browser, 'left', 1)

In [515]:
# move right one step (500 RIGHT key presses), via the shared move() helper
move(browser, 'right', 1)

In [None]:
def scrape(cases, all_case_number, browser):
    """
    Collect every incident listed for the map area currently shown.

    NOTE(review): this cell redefines ``scrape``, which is already defined
    in an earlier cell; when the notebook runs top to bottom this definition
    replaces the earlier one from here on. It handles an area without
    incidents the same way as the earlier definition, so the replacement no
    longer crashes on empty map views. Consider deleting one of the two.

    The incidents list is scrolled in 50-pixel steps; at each step the first
    list item is read with ``collect``. Once the bottom is reached, all
    incident fields present in the document are read in bulk to pick up the
    remaining items. Incidents whose case number is already in
    ``all_case_number`` are skipped.

    Parameters
    ----------
    cases : list of list
        rows collected so far; new rows are appended in place
    all_case_number : list of str
        case numbers collected so far; new ones are appended in place
    browser : webdriver.Chrome
        the browser showing the incident map

    Returns
    -------
    cases, all_case_number
        the same two lists that were passed in
    """
    first_item_xpath = "//*[@id='incidentsList']/div[1]/ce-incident-item"
    scroll_height = browser.execute_script(
        'return document.getElementById("incidentsList").scrollHeight')

    # If the area has no incident the first list item does not exist and
    # the lookup raises.
    try:
        browser.find_element_by_xpath(first_item_xpath)
    except NoSuchElementException:
        print("Oops! This district has no incident!")
        return cases, all_case_number

    # Set mirror of all_case_number for O(1) duplicate checks.
    seen = set(all_case_number)

    try:
        # Scroll down 50 at a time and collect the first item of the list
        # at each position; skip it if it was already collected.
        step = 0
        while 50 * step <= scroll_height:
            browser.execute_script(
                f'document.getElementById("incidentsList").scrollTop=50*{step}')
            if step == 0:
                # initial pause after the first scroll (original behaviour)
                time.sleep(5)

            information = collect(browser.find_element_by_xpath(first_item_xpath))
            if information[0] not in seen:
                seen.add(information[0])
                cases.append(information)
                all_case_number.append(information[0])
            step += 1

        # At the bottom: collect everything left in the list. The XPaths
        # start with "//", so they match across the whole document, not
        # only inside `elem`.
        elem = browser.find_element_by_xpath("//ce-incident-item[@class='ng-star-inserted']")
        columns = [
            elem.find_elements_by_xpath(xpath)
            for xpath in (
                "//span[@id='incident-case-number']",
                "//span[@class='incident-title']",
                "//span[@class='incident-subtitle']",
                "//span[@id='incident-date']",
                "//span[@id='incident-time']",
                "//span[@id='incident-agency']",
                "//span[@id='incident-description']",
            )
        ]
        # zip stops at the shortest column, so a partially rendered item
        # cannot cause an IndexError.
        for fields in zip(*columns):
            number = fields[0].text
            if number not in seen:
                seen.add(number)
                all_case_number.append(number)
                cases.append([number] + [field.text for field in fields[1:]])
    except Exception as exc:
        # Keep what was collected, but report the real failure.
        print(f"Stopped scraping this district early: {exc!r}")

    return cases, all_case_number