### Imports

In [51]:
from bs4 import BeautifulSoup
import requests
import time
import re

### Constants

In [52]:
BOSTON_URL = r'https://311.boston.gov'

### Main Page Scraper Class

In [53]:
class Scraper:
    """Scrapes the Boston's 311 main page with BeautifulSoup"""
    
    def __init__(self):
        self.current_page = 1
        self.report_urls = []
        
        # Get main page soup
        res = requests.get(BOSTON_URL)
        self.soup = BeautifulSoup(res.text, "html5lib")
        
    def get_next_page_soup(self):
        """Adds soup from next page to soup"""
        if not self.current_page == 20:
            time.sleep(1)
            self.current_page += 1
            
            res = requests.get(BOSTON_URL + "/?page=" + str(self.current_page), "html5lib")
            new_soup = BeautifulSoup(res.text)
            
            self.soup.append(new_soup.body)
            
    def get_report_urls(self):
        """Gets all '.report' class 'onclick' attributes, and gets the appropriate URLs"""
        onclick_attrs = [elem['onclick'] for elem in self.soup.select(".report")]
        self.report_urls = [BOSTON_URL + onclick_text.split("'")[1][:-2] for onclick_text in onclick_attrs]

In [54]:
scraper = Scraper()
scraper.get_report_urls()
scraper.report_urls

['https://311.boston.gov/reports/1010022774',
 'https://311.boston.gov/reports/1010022774',
 'https://311.boston.gov/reports/1010022774',
 'https://311.boston.gov/reports/1010022774',
 'https://311.boston.gov/reports/1010022774',
 'https://311.boston.gov/reports/1010022774',
 'https://311.boston.gov/reports/1010022774',
 'https://311.boston.gov/reports/1010022774',
 'https://311.boston.gov/reports/1010022774',
 'https://311.boston.gov/reports/1010022774']

### Report Class


In [48]:
class Report:
    """Gets individual report data"""
    
    def __init__(self, report_url):
        self.report_url = report_url
        
        res = requests.get(self.report_url)
        self.soup = BeautifulSoup(res.text, "html5lib")
        
        self.report_dict = {}
        
    def get_id(self):
        """Get the id of the bid"""
        self.report_dict['id'] = self.report_url.split("/")[-1]
        
    def get_lat_long(self):
        """Get latitude and longitude of the report"""
        lat_long_elem_text = self.soup.find("strong", text="coordinates lat,lng: ").find_parent("p").text()
        lat_long_text = re.findall("\d+\.\d+", lat_long_elem_text)
        print(lat_long_text)
        
    def get_address(self):
        """Gets the address of the report"""
        address_text = self.soup.find("strong", text="address: ").find_parent("p").text()
        self.report_dict['address'] = address_text.split("address: ")[1]

In [49]:
report = Report(scraper.report_urls[0])

In [50]:
report.soup

<!DOCTYPE html>
<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7" lang="en"> <![endif]--><!--[if IE 7]>    <html class="no-js lt-ie9 lt-ie8" lang="en"> <![endif]--><!--[if IE 8]>    <html class="no-js lt-ie9" lang="en"> <![endif]--><!--[if gt IE 8]><!--><html class="no-js" lang="en"><!--<![endif]--><head>
  <meta charset="utf-8"/>
  <meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
  <meta content="width=device-width, initial-scale=1.0" name="viewport"/>

  <title>BOS:311</title>

  <meta content="BOS:311 powered by Spot Reporters" name="description"/>
  <meta content="Connected Bits LLC" name="author"/>

  <link href="/assets/application-b6b83d55ffbde5ba59e649aa0b9aaa68.css" media="all" rel="stylesheet" type="text/css"/>
  <style class="js-account-style" id="account-style">
  .site-bg-color {
  	background-color: #FDB71A !important;
  }

  .site-cloud-color {
  	background-color: #C2E9F2 !important;
  }

  .site-button-color {
  		background-color: #77D1E7;
  }


In [46]:
report.get_lat_long()

AttributeError: 'NoneType' object has no attribute 'find_parent'