### Package Imports

In [1]:
# Package imports
import pandas as pd
import numpy as np
import selenium
from selenium import webdriver
from bs4 import BeautifulSoup

### Initial Scrape (to Pandas MultiIndex)

In [2]:
# Define primary source URL and driver path
primary_url = 'http://www2.durhamcountync.gov/sheriff/ips/default.aspx'
driver = webdriver.Chrome(executable_path='/Users/orion/Downloads/chromedriver')

# Initiate page and change table view to 'Incarcerated' (automatic view is 'Last 24 hours')
driver.get(primary_url)
driver.find_element_by_xpath('//*[@id="ddlDateListing"]/option[3]').click()

# Parse HTML using BS4 and close browser
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
driver.close()

In [3]:
# Initiate dictionary of inmates and lists of offenses; isolate table
offenses_list = []
inmate_dictionary = {}
table = soup.find('table', id='Table1')
rows = table.find_all('tr')
stripchars = '][\'\"'

# For each inmate row, append data to list
for row in rows:
    data = row.find_all('td')
    data = [element.text.strip() for element in data]
    offenses_list.append([element for element in data if element])

# For each list, set inmate name as key and format as dictionary ({Name: [[Offense 1], [Offense 2],...]...})
for element in offenses_list:
    if len(element) == 1:
        inmate_dictionary[str(element).lstrip(stripchars).rstrip(stripchars)] = []
        last_offender = str(element).lstrip(stripchars).rstrip(stripchars)
    elif (len(element) != 1) & ('D' not in element[0]):
        inmate_dictionary[last_offender].extend([element])

In [4]:
# Convert dictionary into cleaned MultiIndex DataFrame
series = pd.concat({key.replace(',', ''): pd.Series(value) for key, value in inmate_dictionary.items()})
transition_dataframe = pd.DataFrame(series, columns=['Incidents'])
offenses = transition_dataframe['Incidents'].apply(pd.Series)
offenses.columns=['Date Confined', 'Date Charged', 
                 'Date Released', 'Statute Description', 
                 'Bond Type', 'Bond Amount', 'Court Docket', 
                 'Days in Jail/Charge']
offenses.index.levels[0].name = 'Name'

### Secondary Scrape (demographic data from hyperlinks)

In [5]:
# Initiate list of hyperlinks and dictionary of additional data
secondary_urls = []
demographics_dictionary = {}

# Populate list of hyperlinks from NC SAVAN/VINELink
for anchor in soup.findAll('a', href=True):
    secondary_urls.append(anchor['href'])   
secondary_urls = secondary_urls[2:-1]

In [6]:
# Define function to check for presence of xpath
def hasxpath(xpath):
    try: 
        driver.find_element_by_xpath(xpath)
        return True
    except:
        return False

In [7]:
# For each hyperlinked URL, initiate page and wait 3 seconds

for url in url_list:
    driver = webdriver.Chrome('D:/Java/TestChrome/lib/chromedriver.exe')
    print(url)
    driver.get(url)
    time.sleep(3)
    if hasxpath('xxx') == True:

        if hasxpath('xxx') == True:
            driver.find_element_by_xpath('xxx').click()

            # For each field, check if it exists and if so collect data
            # (example below for 'Name' field)
            if hasxpath('xxx') == True:
                name = driver.find_element_by_xpath('xxx')
                name = name.text
            else:
                name = ''
            driver.close()

        else:
            driver.close()

    else:
        driver.close()
        print('Closing Browser : ' + url)

print('pass') 






for url in secondary_urls:
    driver = webdriver.Chrome(executable_path='/Users/orion/Downloads/chromedriver')
    print(url)
    driver.get(url)
    time.sleep(3)
    # driver.implicitly_wait(2)
    
    # If search result page exists, continue, else close driver
    if hasxpath('//*[@id="ngVewDiv"]/div/div/div/div[3]/div[3]/div[2]/div/search-result/div/div[1]/div/div[1]/span[1]/span[1]/div/div/div[2]/span') == True:
        name = driver.find_element_by_xpath('//*[@id="ngVewDiv"]/div/div/div/div[3]/div[3]/div[2]/div/search-result/div/div[1]/div/div[1]/span[1]/span[1]/div/div/div[2]/span')
        
        # For each datum, if information exists, collect, else set blank
        
        
        # If 'More Info' button exists, expand page and collect additional data
        if hasxpath('//*[@id="ngVewDiv"]/div/div/div/div[3]/div[3]/div[2]/div/search-result/div/div[4]/div[1]/more-info/div[1]/button') == True:
            driver.find_element_by_xpath('//*[@id="ngVewDiv"]/div/div/div/div[3]/div[3]/div[2]/div/search-result/div/div[4]/div[1]/more-info/div[1]/button').click()
           
            # For each datum, if information exists, collect, else set blank
            if hasxpath('//*[@id="ngVewDiv"]/div/div/div/div[3]/div[3]/div[2]/div/search-result/div/div[1]/div/div[1]/span[10]/div/div/div[2]/span') == True:
                age = driver.find_element_by_xpath('//*[@id="ngVewDiv"]/div/div/div/div[3]/div[3]/div[2]/div/search-result/div/div[1]/div/div[1]/span[10]/div/div/div[2]/span')
                age = age.text
            else:
                age = ''
            if hasxpath('//*[@id="ngVewDiv"]/div/div/div/div[3]/div[3]/div[2]/div/search-result/div/div[1]/div/div[1]/span[5]/div/div/div[2]/span') == True:
                custodystatus = driver.find_element_by_xpath('//*[@id="ngVewDiv"]/div/div/div/div[3]/div[3]/div[2]/div/search-result/div/div[1]/div/div[1]/span[5]/div/div/div[2]/span')
                custodystatus = custodystatus.text
            else:
                custodystatus = ''
            if hasxpath('//*[@id="ngVewDiv"]/div/div/div/div[3]/div[3]/div[2]/div/search-result/div/div[1]/div/div[1]/span[14]/div/div/div[2]/span') == True:
                race = driver.find_element_by_xpath('//*[@id="ngVewDiv"]/div/div/div/div[3]/div[3]/div[2]/div/search-result/div/div[1]/div/div[1]/span[14]/div/div/div[2]/span')
                race = race.text
            else:
                race = ''
            if hasxpath('//*[@id="ngVewDiv"]/div/div/div/div[3]/div[3]/div[2]/div/search-result/div/div[4]/div[1]/more-info/div[2]/div/div/div[2]/div[1]/div/div[2]/span') == True:
                offenderid = driver.find_element_by_xpath('//*[@id="ngVewDiv"]/div/div/div/div[3]/div[3]/div[2]/div/search-result/div/div[4]/div[1]/more-info/div[2]/div/div/div[2]/div[1]/div/div[2]/span')
                offenderid = offenderid.text
            else:
                offenderid = ''
            if hasxpath('//*[@id="ngVewDiv"]/div/div/div/div[3]/div[3]/div[2]/div/search-result/div/div[4]/div[1]/more-info/div[2]/div/div/div[2]/div[2]/div/div[2]/span') == True:
                gender = driver.find_element_by_xpath('//*[@id="ngVewDiv"]/div/div/div/div[3]/div[3]/div[2]/div/search-result/div/div[4]/div[1]/more-info/div[2]/div/div/div[2]/div[2]/div/div[2]/span')
                gender = gender.text
            else:
                gender = ''
            if hasxpath('//*[@id="ngVewDiv"]/div/div/div/div[3]/div[3]/div[2]/div/search-result/div/div[4]/div[1]/more-info/div[2]/div/div/div[2]/div[4]/div/div[2]/span') == True:
                dob = driver.find_element_by_xpath('//*[@id="ngVewDiv"]/div/div/div/div[3]/div[3]/div[2]/div/search-result/div/div[4]/div[1]/more-info/div[2]/div/div/div[2]/div[4]/div/div[2]/span')
                dob = dob.text
            else:
                dob = ''
        
            demographics_dictionary[name] = [offenderid, age, dob, race, gender, custodystatus]
            driver.close()
            
        else:
            offenderid = ''
            dob = ''
            gender = ''
            
            if hasxpath('//*[@id="ngVewDiv"]/div/div/div/div[3]/div[3]/div[2]/div/search-result/div/div[1]/div/div[1]/span[1]/span[1]/div/div/div[2]/span') == True:
                name = driver.find_element_by_xpath('//*[@id="ngVewDiv"]/div/div/div/div[3]/div[3]/div[2]/div/search-result/div/div[1]/div/div[1]/span[1]/span[1]/div/div/div[2]/span')
                name = name.text
            else:
                name = ''
            if hasxpath('//*[@id="ngVewDiv"]/div/div/div/div[3]/div[3]/div[2]/div/search-result/div/div[1]/div/div[1]/span[10]/div/div/div[2]/span') == True:
                age = driver.find_element_by_xpath('//*[@id="ngVewDiv"]/div/div/div/div[3]/div[3]/div[2]/div/search-result/div/div[1]/div/div[1]/span[10]/div/div/div[2]/span')
                age = age.text
            else:
                age = ''
            if hasxpath('//*[@id="ngVewDiv"]/div/div/div/div[3]/div[3]/div[2]/div/search-result/div/div[1]/div/div[1]/span[5]/div/div/div[2]/span') == True:
                custodystatus = driver.find_element_by_xpath('//*[@id="ngVewDiv"]/div/div/div/div[3]/div[3]/div[2]/div/search-result/div/div[1]/div/div[1]/span[5]/div/div/div[2]/span')
                custodystatus = custodystatus.text
            else:
                custodystatus = ''
            if hasxpath('//*[@id="ngVewDiv"]/div/div/div/div[3]/div[3]/div[2]/div/search-result/div/div[1]/div/div[1]/span[14]/div/div/div[2]/span') == True:
                race = driver.find_element_by_xpath('//*[@id="ngVewDiv"]/div/div/div/div[3]/div[3]/div[2]/div/search-result/div/div[1]/div/div[1]/span[14]/div/div/div[2]/span')
                race = race.text
            else:
                race = ''
            
            demographics_dictionary[name] = [offenderid, age, dob, race, gender, custodystatus]
            driver.close()
            
    else:
        driver.close()

WebDriverException: Message: unknown error: Element <button type="button" class="btn btn-link underlineLinkMoreInfo ng-binding" ng-click="showAdvanced_$index = !showAdvanced_$index; moreInfoCall(showAdvanced_$index);">...</button> is not clickable at point (92, 370). Other element would receive the click: <span class="us-spinner-wrapper ng-scope" us-spinner="{radius:30, width:8, length: 16, shadow: true}">...</span>
  (Session info: chrome=71.0.3578.98)
  (Driver info: chromedriver=2.44.609545 (c2f88692e98ce7233d2df7c724465ecacfe74df5),platform=Mac OS X 10.14.2 x86_64)


In [8]:
# Strip commas from names so they share format with offenses dataframe
demographics_dictionary = {key.replace(',', ''): item for key, item in demographics_dictionary.items()}

# Convert demographic information into DataFrame and assign column names
demographics = pd.DataFrame.from_dict(demographics_dictionary, 'index')
demographics.columns = ['Offender ID', 'Age', 'Date of Birth', 'Race', 'Gender', 'Custody Status']
demographics.index.name = 'Name'

### Data Merge and Export

In [9]:
# Join offenses and demographics to create final MultiIndex DataFrame
final = offenses.join(demographics)

In [10]:
final

Unnamed: 0_level_0,Unnamed: 1_level_0,Date Confined,Date Charged,Date Released,Statute Description,Bond Type,Bond Amount,Court Docket,Days in Jail/Charge,Offender ID,Age,Date of Birth,Race,Gender,Custody Status
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
ALLEN CHRISTOPHER LEE,0,8/20/2018,8/20/2018,[incarcerated],DELIVER COCAINE,SECURED,$0.00,18CRS53065,158,B34159,21,02/16/1997,African American,Male,In Custody
ALLEN CHRISTOPHER LEE,1,8/20/2018,8/20/2018,[incarcerated],M/S/D/P CS W/N 1000FT OF PARK,SECURED,"$30,000.00",18CRS53066,158,B34159,21,02/16/1997,African American,Male,In Custody
ALLEN CHRISTOPHER LEE,2,8/20/2018,8/20/2018,[incarcerated],M/S/D/P CS W/N 1000FT OF PARK,SECURED,"$100,000.00",18CRS53065,158,B34159,21,02/16/1997,African American,Male,In Custody
ALLEN CHRISTOPHER LEE,3,8/20/2018,8/20/2018,[incarcerated],MAINTN VEH/DWELL/PLACE CS,SECURED,$0.00,18CRS53066,158,B34159,21,02/16/1997,African American,Male,In Custody
ALLEN CHRISTOPHER LEE,4,8/20/2018,8/20/2018,[incarcerated],POSSESS DRUG PARAPHERNALIA,SECURED,$0.00,18CRS53066,158,B34159,21,02/16/1997,African American,Male,In Custody
ALLEN CHRISTOPHER LEE,5,8/20/2018,8/20/2018,[incarcerated],SELL COCAINE,SECURED,$0.00,18CRS53065,158,B34159,21,02/16/1997,African American,Male,In Custody
ALSTON LADRRIOUS LAMONTE,0,11/1/2018,11/1/2018,[incarcerated],AID AND ABET ARMED ROBBERY,SECURED,"$200,000.00",18CR057750,85,,,,,,
ALSTON LADRRIOUS LAMONTE,1,11/1/2018,11/1/2018,[incarcerated],FAILURE TO APPEAR ON MISDEMEANOR(M-BREAKING OR...,SECURED,$0.00,18CR053902,85,,,,,,
ALSTON LADRRIOUS LAMONTE,2,11/1/2018,11/1/2018,[incarcerated],FAILURE TO APPEAR ON MISDEMEANOR(MISDEMEANOR L...,SECURED,$0.00,18CR703606,85,,,,,,
ALSTON LADRRIOUS LAMONTE,3,11/1/2018,11/1/2018,[incarcerated],OBTAIN PROPERTY FALSE PRETENSE,SECURED,$0.00,18CR057710,85,,,,,,


In [None]:
# Convert final product to CSV file
final.to_csv('durham-data.csv')