In [2]:
import re
import time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [9]:
def _random_user_agent():
    """
    A helper function to generate a random header to 
    avoid getting blocked by the website

    Parameters
    ----------
    None

    Returns
    -------
    str
    a random user agent 

    >>> _random_user_agent()
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) \
                AppleWebKit/537.36 (KHTML, like Gecko) \
                Chrome/58.0.3029.110 Safari/537.36'
    """
    try:
        ua = UserAgent()
        return ua.random
    except:
        default_ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) \
                AppleWebKit/537.36 (KHTML, like Gecko) \
                Chrome/58.0.3029.110 Safari/537.36'
        return default_ua

def _get_soup(url):
    """
    This is a helper function that will automatically generate a
    BeautifulSoup object based on the given URL of the apartment
    webpage

    Parameters
    ----------
    url : str
        the URL of a specific apartment or a general website

    Returns
    -------
    soup : bs4.BeautifulSoup or None
        a scraper for the specified webpage, or ``None`` when the server
        answers with HTTP 404. Responses with any other status code are
        parsed as-is.
    """

    # generate a random header
    headers = {'User-Agent': _random_user_agent()}
    # send a request and get the soup
    response = requests.get(url, headers=headers)
    if response.status_code == 404:
        # Nothing useful to parse. Returning None (instead of falling
        # through to an unassigned local) gives callers a falsy value
        # they can test, as _soup_attempts does with ``if soup:``.
        return None
    return BeautifulSoup(response.content, 'lxml')

def _soup_attempts(url, total_attempts=5):
    """
    A helper function that will make several attempts
    to obtain a soup to avoid getting blocked

    Parameters
    ----------
    url : str
        the URL of a specific apartment or a general website

    total_attempts : int
        the number of retries made after the initial request fails,
        before giving up. Default is 5 retries

    Returns
    -------
    soup : bs4.BeautifulSoup
        a scraper for a specified webpage

    Raises
    ------
    ValueError
        if neither the initial request nor any retry yields a soup
    """

    soup = _get_soup(url)

    # if we get the soup with the first attempt
    if soup:
        return soup

    # otherwise retry a bounded number of times; iterating over range()
    # guarantees the loop terminates and the error below is reachable
    for _ in range(total_attempts):
        # put the program idle to avoid detection
        time.sleep(3)
        # module-level helper: there is no ``self`` in a free function
        soup = _get_soup(url)
        if soup:
            return soup

    # time to give up, try to find what's going on
    raise ValueError(f'FAILED to get soup for apt url {url}')

In [15]:
# Dashboard URL for crimereports.com; the query string carries the map
# centre (lat/lng), zoom, incident types, date range and search text.
# NOTE(review): everything after '#' in a URL is a fragment, which HTTP
# clients do not send to the server, so this request presumably fetches
# only /home/ and the dashboard filters never reach the site -- confirm.
police_url = 'https://www.crimereports.com/home/#!/dashboard?lat=40.01144453463675&lng=-75.13275146484375&zoom=13&incident_types=Assault%252CAssault%2520with%2520Deadly%2520Weapon%252CBreaking%2520%2526%2520Entering%252CDisorder%252CDrugs%252CHomicide%252CKidnapping%252CLiquor%252COther%2520Sexual%2520Offense%252CProperty%2520Crime%252CProperty%2520Crime%2520Commercial%252CProperty%2520Crime%2520Residential%252CQuality%2520of%2520Life%252CRobbery%252CSexual%2520Assault%252CSexual%2520Offense%252CTheft%252CTheft%2520from%2520Vehicle%252CTheft%2520of%2520Vehicle&start_date=2019-06-09&end_date=2019-11-30&days=sunday%252Cmonday%252Ctuesday%252Cwednesday%252Cthursday%252Cfriday%252Csaturday&start_time=0&end_time=23&include_sex_offenders=false&current_tab=list&shapeIds=&position_id=33m9-hupm-row-qv5k_airg_asnz&shape_id=false&searchText=Philadelphia%252C%2520Pennsylvania%252C%2520United%2520States'
# Single fetch through _get_soup (no retries), then display the parsed page.
soup_police = _get_soup(police_url)
soup_police

<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8"/>
<meta content="IE=Edge,chrome=1" http-equiv="X-UA-Compatible"/>
<meta content="Fh_dHO98PhEAYLXUNSH2UKGJ3JHFukFWVcxCQqoZ_K4" name="google-site-verification"/>
<meta content="#!" name="fragment"/>
<meta content="width=device-width, initial-scale=1.0, minimum-scale=1.0, maximum-scale=1.0, user-scalable=no" name="viewport"/>
<meta content="no" name="msapplication-tap-highlight"/>
<meta content="CrimeReports helps residents see and understand where crime is happening in their neighborhood and engage with their local law enforcement agencies. Find out more!" name="description"/>
<meta content="CrimeReports.com " property="og:title"/>
<meta content="CrimeReports.com allows law enforcement agencies to provide neighborhood crime information to citizens in near real-time and empowers citizen participation in community policing. " property="og:description"/>
<meta content="http://api.url2png.com/v6/P51D9C4D7BDAF31/b2e4fecc0dbf679c392

In [12]:
# Collect every <circle> element with CSS class "hole" from the parsed page.
# NOTE(review): the recorded output for this notebook is an empty list;
# presumably the circles are drawn client-side by JavaScript and are not
# present in the static HTML that requests downloads -- confirm.
circles = soup_police.find_all('circle', class_='hole')

In [13]:
# Display the matches (the recorded output is an empty list).
circles

[]