# Scrape Boston's 311 site with Selenium

### Imports

In [10]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import re
import pandas as pd

### Constants

In [9]:
BOSTON_URL = r'https://311.boston.gov'
SERVICE_ELEMS_XPATH = "//h3[contains(text(),'Services')]/following-sibling::ul/li"

### Scraper Class

In [11]:
class Scraper:
    """Class that scrapes 311.boston.gov"""
    
    def __init__(self, driver):
        self.report_urls = []
        self.driver = driver
        self.driver.get(BOSTON_URL)
        self.services = []
        self.get_services()
        
    def get_services(self):
        """Get service categories and service category count"""
        service_elems = self.driver.find_elements_by_xpath(SERVICE_ELEMS_XPATH)
        
        service_category_elems = [elem.find_element_by_xpath("./a") for elem in service_elems]
        service_count_elems = [elem.find_element_by_xpath("./span") for elem in service_elems]
        
        service_categories = [elem.text for elem in service_category_elems]
        service_counts = [''.join(re.findall('\d+', elem.text)) for elem in service_count_elems]
        
        self.services = list(zip(service_categories, service_counts))
        
    def get_reports_on_page(self):
        """Append report urls from a page onto self.report_urls"""
        report_ids = [elem.get_attribute("onclick").split("location.href='")[1][:-2]
                        for elem in self.driver.find_elements_by_class_name('report')]
        
        self.report_urls += [BOSTON_URL + report_id
                            for report_id in report_ids
                            if BOSTON_URL + report_id not in self.report_urls]
        
    def next_page(self):
        """Go to next page"""
        next_page_button = self.driver.find_element_by_xpath("//li/a[contains(text(),'Next')]")
        next_page_button.click()
        
    def search_for(self, search_text):
        """Search for inputed text in search field and go to the results"""
        search_box = self.driver.find_element_by_class_name('search-query')
        search_box.send_keys(search_text)
        search_box.send_keys(Keys.ENTER)

In [63]:
driver = webdriver.Chrome()

In [64]:
scraper = Scraper(driver)

In [65]:
scraper.get_reports_on_page()

In [15]:
scraper.report_urls

['https://311.boston.gov/reports/101002276877',
 'https://311.boston.gov/reports/101002276975',
 'https://311.boston.gov/reports/101002276873',
 'https://311.boston.gov/reports/101002276974',
 'https://311.boston.gov/reports/101002276910',
 'https://311.boston.gov/reports/101002276944',
 'https://311.boston.gov/reports/101002276972',
 'https://311.boston.gov/reports/101002276973',
 'https://311.boston.gov/reports/101002276971',
 'https://311.boston.gov/reports/101002276939']

### Report Class

In [149]:
class Report:
    """Handles information for individual reports"""
    
    def __init__(self, driver, report_url):
        self.driver = driver
        self.report_url = report_url
        self.driver.get(self.report_url)
        
        self.report_dict = {}
        self.status = ""
        
    def get_id(self):
        """Gets the ID of the ticket using the current URL"""
        self.report_dict['id'] = self.driver.current_url.split('/')[-1]
        
    def get_status(self):
        """Gets the status (Opened, Closed, Submitted) of the report"""
        self.report_dict['status'] = self.driver.find_element_by_class_name("label").text
        
    def get_description(self):
        """Gets user-submitted description of the report"""
        self.report_dict['description'] = self.driver.find_element_by_xpath("//blockquote/p").text
        
    def get_notes_tab(self):
        """Gets info inside the notes tab"""
        # Click on the Notes tab
        self.driver.find_element_by_xpath("//a[@href='#notes-tab']").click()
        
        # Get all items from the table, then organize in tuples
        notes_table_elem = self.driver.find_element_by_xpath("//div[@id='notes-tab']/table/tbody")
        table_elems = notes_table_elem.find_elements_by_xpath("./tr/td")
        table_elems = [elem.text for elem in table_elems]
        row_data = list(zip(table_elems[0::2], table_elems[1::2]))
        
        # Three cases
        # 1. Submitted - 1 row
        # 2. Opened - 2 rows
        # 3. Closed - 3 rows

        # Get submit info
        submit_row = row_data[-1]
        self.report_dict['time submitted'] = submit_row[0]
        self.report_dict['submitted using'] = submit_row[1].split(" ")[-1]
        
        # Get open info
        if len(row_data) >= 2:
            open_row = row_data[-2]
            self.report_dict['time opened'] = open_row[0]
            self.report_dict['open description'] = open_row[1]
            
        # Get closed info
        if len(row_data) == 3:
            closed_row = row_data[0]
            self.report_dict['time closed'] = closed_row[0]
            self.report_dict['closing description'] = closed_row[1]

In [154]:
report = Report(driver, scraper.report_urls[4])

In [155]:
report.get_notes_tab()
report.report_dict

{'submitted using': 'Opened', 'time submitted': 'Sat Dec 02, 2017 02:04pm'}

In [29]:
report.report_dict

{'description': 'Connecticut car parked blocking hydrant since yesterday.',
 'id': '101002276877#notes-tab',
 'status': 'CLOSED'}