In [16]:
import time

import numpy as np
import pandas as pd
import selenium
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By

### Let's grab state names from our data to use in our scraping loop

In [183]:
# Load the fatalities data, using the file's first column as the index
df = pd.read_csv(
    filepath_or_buffer='../Data/fatalities_geocoded_with_pop.csv',
    index_col=0,
)

In [184]:
# Distinct values of the `state` column, in order of first appearance
states = df['state'].unique()

### Base URL is the interactive map on the "action" section of Campaign Zero's web page

In [185]:
# Page hosting the interactive map we want to scrape
base_url = 'https://campaign-zero-cartogram.s3.amazonaws.com/index.html'

# ChromeDriver executable that Selenium uses to control the browser
webdriver = "chromedriver.exe"

# Launch a fresh Chrome window driven by that executable
driver = Chrome(executable_path=webdriver)

### Open page using URL

In [186]:
# Point the Chrome window at the interactive map page (base_url)

driver.get(base_url)

### Loop through each state, click its corresponding link on the map, pull and store info about legislation passed during the 2014-2019 period

In [204]:
# Seconds to pause after clicking a state so its pop-up window can load
POPUP_WAIT_SECONDS = 5

# Parallel lists with one entry per scraped bill: state name, policy
# category, bill number, and bill description
state_names = []
policy_category = []
bill_num = []
bill_desc = []

# (state, reason) pairs for every state that could not be scraped completely,
# kept so that failures stay visible instead of being silently discarded
skipped_states = []

# Loop through each state, click its corresponding link on the map,
# pull and store info about legislation passed during the 2014-2019 period
for state in states:

    try:

        # Handle of the map window, so we can return to it after the pop-up
        window_before = driver.window_handles[0]

        # Click the map element whose id attribute equals the state name
        driver.find_element(By.XPATH, "//*[@id='" + state + "']").click()

        # Give the pop-up window time to load
        time.sleep(POPUP_WAIT_SECONDS)

        # Any handle other than the map window belongs to the pop-up
        popup_handles = [handle for handle in driver.window_handles
                         if handle != window_before]

        if not popup_handles:
            skipped_states.append((state, 'no pop-up window opened'))
            continue

        # Switch driver to the pop-up window
        driver.switch_to.window(popup_handles[0])

        try:

            # Each element of class "bill" describes one piece of legislation
            for bill in driver.find_elements(By.CLASS_NAME, 'bill'):

                paragraphs = bill.find_elements(By.TAG_NAME, 'p')

                # Parse all three fields before touching the lists, so a
                # malformed bill can never leave the lists with unequal lengths
                category = paragraphs[0].text.split(': ')[1]
                number = paragraphs[1].text.split(': ')[1]
                description = paragraphs[2].text

                state_names.append(state)
                policy_category.append(category)
                bill_num.append(number)
                bill_desc.append(description)

        finally:

            # Always close the pop-up and return to the map, even when parsing
            # fails; otherwise the driver would stay on the pop-up and every
            # later state lookup would fail
            try:
                driver.close()
            finally:
                driver.switch_to.window(window_before)

    except Exception as error:

        # Best-effort scrape: record the failure and carry on with the next
        # state. Catching Exception (not a bare except) lets Ctrl-C interrupt.
        skipped_states.append((state, repr(error)))

if skipped_states:
    print(len(skipped_states), 'state(s) skipped; inspect `skipped_states` for details')

### Let's create a dataframe from what we scraped and export it as a CSV file

In [205]:
# Column name -> scraped values; each list is wrapped in a Series so that
# pandas aligns the columns on their positional index
scraped_columns = {
    'state': state_names,
    'policy_category': policy_category,
    'bill_num': bill_num,
    'bill_desc': bill_desc,
}

policies = pd.DataFrame(
    {column: pd.Series(values) for column, values in scraped_columns.items()}
)

### Note that, just as with the map, 9 states do not have any recorded legislation

In [206]:
# Number of distinct values in the `state` column (missing values included)
policies['state'].nunique(dropna=False)

41

### Export file

In [208]:
# Write the scraped legislation table to disk
policies.to_csv(path_or_buf='../Data/state_legislation.csv')

# Sandbox

In [214]:
# Count bills per (state, policy category) pair and name that column `count`
policy_category_counts = (
    policies
    .groupby(['state', 'policy_category'], as_index=False)['bill_num']
    .count()
    .rename(columns={'bill_num': 'count'})
)

In [217]:
# Rows whose policy category is exactly 'Training'
policy_category_counts.loc[policy_category_counts['policy_category'].eq('Training')]

Unnamed: 0,state,policy_category,count
0,AK,Training,1
8,CA,Training,2
13,CO,Training,1
18,CT,Training,1
33,LA,Training,1
40,MD,Training,2
50,NJ,Training,1
57,OK,Training,1
61,PA,Training,1
73,UT,Training,1


In [218]:
# Distinct policy categories present in the counts table
policy_category_counts['policy_category'].unique()

array(['Training', 'Limit Use of Force', 'Body Cams/ Film the Police',
       'End For-profit Policing', 'End Broken Windows Policing',
       'Independently Investigate & Prosecute',
       'Community Representation', 'Community Oversight',
       'Fair Police Union Contracts', 'Demilitarization'], dtype=object)