### Imports

In [1]:
from bs4 import BeautifulSoup
import requests
import time
import re
import folium
import pandas as pd

### Constants

In [2]:
BOSTON_URL = r'https://311.boston.gov'

### Main Page Scraper Class

In [3]:
class Scraper:
    """Scrapes report links from Boston's 311 listing pages with BeautifulSoup.

    The main page is downloaded on construction; further listing pages can be
    merged into the same soup with get_next_page_soup(), and the report URLs
    found in everything fetched so far are collected by get_report_urls().
    """

    # Highest listing page number get_next_page_soup() will request.
    MAX_PAGE = 20
    # Seconds to wait on the server before a request is abandoned.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        # Number of the most recently fetched listing page (main page = 1).
        self.current_page = 1
        # Absolute report URLs; populated by get_report_urls().
        self.report_urls = []

        # Get main page soup
        self.soup = self._fetch_soup(BOSTON_URL)

    def _fetch_soup(self, url):
        """Downloads ``url`` and returns it parsed with the html5lib parser.

        Raises:
            requests.exceptions.RequestException: on timeout, connection
                failure, or a 4xx/5xx response (via raise_for_status), so an
                error page is never parsed as if it were a listing.
        """
        res = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        res.raise_for_status()
        return BeautifulSoup(res.text, "html5lib")

    def get_next_page_soup(self):
        """Fetches the next listing page and appends its <body> to self.soup.

        Does nothing once current_page has reached MAX_PAGE.
        """
        if self.current_page >= self.MAX_PAGE:
            return

        # Pause before each extra request (presumably to go easy on the server).
        time.sleep(1)

        next_page = self.current_page + 1
        new_soup = self._fetch_soup(BOSTON_URL + "/?page=" + str(next_page))

        # Only advance the counter after the page was actually fetched, so a
        # failed request can simply be retried.
        self.current_page = next_page
        self.soup.append(new_soup.body)

    def get_report_urls(self):
        """Collects absolute report URLs from every '.report' element in self.soup.

        The relative path is taken from between the first pair of single
        quotes in the element's 'onclick' attribute and prefixed with
        BOSTON_URL. Elements without an 'onclick' attribute, or whose value
        contains no quoted part, are skipped. The result replaces
        self.report_urls.
        """
        urls = []
        for elem in self.soup.select(".report"):
            onclick_text = elem.get("onclick")
            if not onclick_text:
                continue

            pieces = onclick_text.split("'")
            if len(pieces) < 2:
                continue

            urls.append(BOSTON_URL + pieces[1])

        self.report_urls = urls

In [5]:
# Constructing the scraper downloads and parses the main listing page.
scraper = Scraper()
# Merge one more listing page (page 2) into the same soup.
scraper.get_next_page_soup()
# Extract the report URLs from everything fetched so far, then display them.
scraper.get_report_urls()
scraper.report_urls



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "html5lib")

  markup_type=markup_type))


['https://311.boston.gov/reports/101002277498',
 'https://311.boston.gov/reports/101002277505',
 'https://311.boston.gov/reports/101002277502',
 'https://311.boston.gov/reports/101002277297',
 'https://311.boston.gov/reports/101002277501',
 'https://311.boston.gov/reports/101002277500',
 'https://311.boston.gov/reports/101002277499',
 'https://311.boston.gov/reports/101002277304',
 'https://311.boston.gov/reports/101002277497',
 'https://311.boston.gov/reports/101002277495',
 'https://311.boston.gov/reports/101002277494',
 'https://311.boston.gov/reports/101002277493',
 'https://311.boston.gov/reports/101002277490',
 'https://311.boston.gov/reports/101002277491',
 'https://311.boston.gov/reports/101002277492',
 'https://311.boston.gov/reports/101002277439',
 'https://311.boston.gov/reports/101002277488',
 'https://311.boston.gov/reports/101002277489',
 'https://311.boston.gov/reports/101002277486',
 'https://311.boston.gov/reports/101002277485']

In [55]:
scraper.soup

<!DOCTYPE html>
<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7" lang="en"> <![endif]--><!--[if IE 7]>    <html class="no-js lt-ie9 lt-ie8" lang="en"> <![endif]--><!--[if IE 8]>    <html class="no-js lt-ie9" lang="en"> <![endif]--><!--[if gt IE 8]><!--><html class="no-js" lang="en"><!--<![endif]--><head>
  <meta charset="utf-8"/>
  <meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
  <meta content="width=device-width, initial-scale=1.0" name="viewport"/>

  <title>BOS:311</title>

  <meta content="BOS:311 powered by Spot Reporters" name="description"/>
  <meta content="Connected Bits LLC" name="author"/>

  <link href="/assets/application-b6b83d55ffbde5ba59e649aa0b9aaa68.css" media="all" rel="stylesheet" type="text/css"/>
  <style class="js-account-style" id="account-style">
  .site-bg-color {
  	background-color: #FDB71A !important;
  }

  .site-cloud-color {
  	background-color: #C2E9F2 !important;
  }

  .site-button-color {
  		background-color: #77D1E7;
  }


### Report Class


In [6]:
class Report:
    """Downloads one report page and extracts selected fields from it.

    Each get_* method stores its result in self.report_dict under the keys
    'id', 'category', 'latitude', 'longitude' and 'address'.
    """

    # Seconds to wait on the server before the request is abandoned.
    REQUEST_TIMEOUT = 10

    def __init__(self, report_url):
        """Fetches and parses ``report_url``.

        Raises:
            requests.exceptions.RequestException: on timeout, connection
                failure, or a 4xx/5xx response (via raise_for_status).
        """
        self.report_url = report_url

        res = requests.get(self.report_url, timeout=self.REQUEST_TIMEOUT)
        res.raise_for_status()
        self.soup = BeautifulSoup(res.text, "html5lib")

        # Extracted fields, filled in by the get_* methods.
        self.report_dict = {}

    def _labelled_paragraph_text(self, label):
        """Returns the text of the <p> enclosing the <strong> whose text is ``label``."""
        return self.soup.find("strong", text=label).find_parent("p").get_text()

    def get_id(self):
        """Stores the last path segment of the report URL as 'id'."""
        # rstrip so a URL ending in "/" still yields the id, not "".
        self.report_dict['id'] = self.report_url.rstrip("/").split("/")[-1]

    def get_category(self):
        """Stores the category of the report as 'category'.

        The category is the part of the first '.content-head' element's <h2>
        text that precedes the first " at ".
        """
        category_text = self.soup.select(".content-head")[0].h2.get_text()
        self.report_dict['category'] = category_text.split(" at ")[0]

    def get_lat_long(self):
        """Stores the report's coordinates as floats under 'latitude' and 'longitude'.

        The first two signed decimal numbers in the "coordinates lat,lng: "
        paragraph are used, in that order.
        """
        lat_long_elem_text = self._labelled_paragraph_text("coordinates lat,lng: ")
        # Raw string: "\d" in a normal string literal is an invalid escape.
        lat_long = re.findall(r"[+-]?\d+\.\d+", lat_long_elem_text)

        self.report_dict['latitude'] = float(lat_long[0])
        self.report_dict['longitude'] = float(lat_long[1])

    def get_address(self):
        """Stores the text following "address: " in its paragraph as 'address'."""
        address_text = self._labelled_paragraph_text("address: ")
        # maxsplit=1 keeps the whole remainder even if it repeats the label.
        self.report_dict['address'] = address_text.split("address: ", 1)[1]

In [7]:
# Fetch every report page in turn and collect one dict of fields per report.
report_dicts = []
for report_url in scraper.report_urls:
    # One-second pause ahead of each request.
    time.sleep(1)

    report = Report(report_url)

    # Each getter writes its field(s) into report.report_dict.
    report.get_id()
    report.get_category()
    report.get_lat_long()
    report.get_address()

    report_dicts.append(report.report_dict)


In [8]:
report_dicts

[{'address': 'Intersection Of N Bennet St & Salem St, Boston',
  'category': 'Other',
  'id': '101002277498',
  'latitude': 42.36581475162517,
  'longitude': -71.05501641336656},
 {'address': '15 Montrose St, Roxbury',
  'category': 'Illegal Parking',
  'id': '101002277505',
  'latitude': 42.32360285239393,
  'longitude': -71.08106223748538},
 {'address': 'Intersection Of N Bennet St & Salem St, Boston',
  'category': 'Schedule a Bulk Item Pickup',
  'id': '101002277502',
  'latitude': 42.36587506278542,
  'longitude': -71.05506168661309},
 {'address': '36 Worcester Sq, Roxbury',
  'category': 'Other',
  'id': '101002277297',
  'latitude': 42.33642,
  'longitude': -71.073918},
 {'address': "Intersection Of Hawthorne Pl & William Cardinal O'connell Way, Boston",
  'category': 'Illegal Graffiti',
  'id': '101002277501',
  'latitude': 42.363444,
  'longitude': -71.066525},
 {'address': '6 Beethoven St, Roxbury',
  'category': 'Illegal Parking',
  'id': '101002277500',
  'latitude': 42.315

In [9]:
# Column order for the table; these are the keys of each report dict.
report_columns = ['id', 'category', 'address', 'latitude', 'longitude']
df = pd.DataFrame(report_dicts, columns=report_columns)
df

Unnamed: 0,id,category,address,latitude,longitude
0,101002277498,Other,"Intersection Of N Bennet St & Salem St, Boston",42.365815,-71.055016
1,101002277505,Illegal Parking,"15 Montrose St, Roxbury",42.323603,-71.081062
2,101002277502,Schedule a Bulk Item Pickup,"Intersection Of N Bennet St & Salem St, Boston",42.365875,-71.055062
3,101002277297,Other,"36 Worcester Sq, Roxbury",42.33642,-71.073918
4,101002277501,Illegal Graffiti,Intersection Of Hawthorne Pl & William Cardina...,42.363444,-71.066525
5,101002277500,Illegal Parking,"6 Beethoven St, Roxbury",42.315362,-71.099243
6,101002277499,Illegal Parking,"222 Foster St, Apt 1, Brighton",42.342879,-71.157976
7,101002277304,Other,"106 Saratoga St, East Boston",42.37685,-71.036091
8,101002277497,Illegal Parking,"1 Centre Ave, Dorchester",42.292879,-71.062721
9,101002277495,Illegal Parking,"131 Arlington St, Boston",42.349046,-71.069664


### Folium map of all reports

In [15]:
# [latitude, longitude] pairs and the matching popup labels, in row order.
locations = df[['latitude', 'longitude']]
categories = df['category'].tolist()
location_list = locations.values.tolist()

# Centre the map on the first report.
loc_map = folium.Map(location=location_list[0], zoom_start=12)

# One marker per report, with its category as the popup.
for point, label in zip(location_list, categories):
    folium.Marker(point, popup=label).add_to(loc_map)

loc_map


### Folium map for a user-selected category

In [11]:
# Strip surrounding whitespace so a stray space typed at the prompt does not
# stop the category from matching later.
search_category = input("Enter a category to search for: ").strip()

Enter a category to search for: Illegal Parking


In [13]:
# Compare case-insensitively and ignore surrounding whitespace, since the
# category is typed by hand; exact matches still match as before.
wanted_category = search_category.strip().lower()
category_matches = df['category'].str.strip().str.lower() == wanted_category

search_df = df.loc[category_matches]
search_df

Unnamed: 0,id,category,address,latitude,longitude
1,101002277505,Illegal Parking,"15 Montrose St, Roxbury",42.323603,-71.081062
5,101002277500,Illegal Parking,"6 Beethoven St, Roxbury",42.315362,-71.099243
6,101002277499,Illegal Parking,"222 Foster St, Apt 1, Brighton",42.342879,-71.157976
8,101002277497,Illegal Parking,"1 Centre Ave, Dorchester",42.292879,-71.062721
9,101002277495,Illegal Parking,"131 Arlington St, Boston",42.349046,-71.069664
19,101002277485,Illegal Parking,"428 E Eighth St, South Boston",42.331445,-71.043849


In [14]:
# The map is centred on the first matching report, so an empty search result
# would otherwise fail with a bare IndexError; report it clearly instead.
if search_df.empty:
    raise ValueError(
        "No reports found for category {!r}".format(search_category)
    )

# [latitude, longitude] pairs and the matching popup labels, in row order.
locations = search_df[['latitude', 'longitude']]
categories = search_df['category'].tolist()
location_list = locations.values.tolist()

loc_map = folium.Map(location=location_list[0], zoom_start=12)

# One marker per matching report, with its category as the popup.
for point, label in zip(location_list, categories):
    folium.Marker(point, popup=label).add_to(loc_map)

loc_map