In [2]:
# Package imports

from bs4 import BeautifulSoup
import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from random import randint
import datetime
import requests
import numpy as np
import os
import time
import pandas as pd
import re
import webbrowser
import json

In [3]:
# Load the Durham County inmate listing and parse it with BeautifulSoup.
# The page defaults to the last 24 hours, so the third entry of the
# date-range dropdown is selected first to switch the table view to
# all incarcerated.

primary_url = 'http://www2.durhamcountync.gov/sheriff/ips/default.aspx'

# The chromedriver location can be overridden per machine through the
# CHROMEDRIVER_PATH environment variable; the original path is the fallback.
CHROMEDRIVER_PATH = os.environ.get('CHROMEDRIVER_PATH',
                                   '/Users/orion/Downloads/chromedriver')

driver = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH)
try:
    driver.get(primary_url)
    driver.find_element_by_xpath('//*[@id="ddlDateListing"]/option[3]').click()
    html = driver.page_source
finally:
    # quit() also shuts down the chromedriver process (close() only closes
    # the window); doing it in `finally` avoids leaking a browser on errors.
    driver.quit()

soup = BeautifulSoup(html, 'html.parser')

In [4]:
# Pull table records into a list of rows, then group them by inmate:
# {Name: [[Offense 1 fields], [Offense 2 fields], ...], ...}

table = soup.find('table', id='Table1')
if table is None:
    raise ValueError("Inmate table (id='Table1') not found in page source")
rows = table.find_all('tr')

# One list per table row holding its non-empty, whitespace-stripped cell texts
offenses_list = []
for row in rows:
    cell_texts = [cell.text.strip() for cell in row.find_all('td')]
    offenses_list.append([text for text in cell_texts if text])

offenses_dictionary = {}
last_offender = None
for record in offenses_list:
    if not record:
        # Row without any populated <td> cell: nothing to store
        continue
    if len(record) == 1:
        # A row with a single populated cell is treated as an inmate-name row.
        # setdefault keeps charges already collected if the same name repeats.
        last_offender = record[0]
        offenses_dictionary.setdefault(last_offender, [])
    elif 'D' not in record[0] and last_offender is not None:
        # Rows whose first cell contains 'D' are skipped — presumably the
        # column-header rows ('Date Confined', ...); TODO confirm on the page.
        offenses_dictionary[last_offender].append(record)

In [6]:
# Convert the {name: [offense records]} dictionary into a cleaned
# MultiIndex DataFrame: level 0 is the inmate name (commas removed),
# level 1 is the position of the offense within that inmate's list.

offense_columns = ['Date Confined', 'Date Charged',
                   'Date Released', 'Statute Description',
                   'Bond Type', 'Bond Amount', 'Court Docket',
                   'Days in Jail/Charge']

# Inmates with no recorded offenses contribute no rows, so they are skipped
# up front instead of concatenating empty Series.
series = pd.concat({k.replace(',', ''): pd.Series(v)
                    for k, v in offenses_dictionary.items() if v})
# Not used below; kept in case other cells still reference it.
transition_dataframe = pd.DataFrame(series, columns=['Incidents'])

# Each element of `series` is one offense record (a list of fields); build
# the frame from those lists directly rather than via a per-row apply.
offenses = pd.DataFrame(series.tolist(), index=series.index, columns=offense_columns)

# Level names must be set through set_names; assigning to
# index.levels[0].name is rejected by current pandas.
offenses.index = offenses.index.set_names('Name', level=0)
offenses.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Date Confined,Date Charged,Date Released,Statute Description,Bond Type,Bond Amount,Court Docket,Days in Jail/Charge
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ADAMS RICKY DEVON,0,8/20/2018,8/21/2018,[incarcerated],CHILD SUPPORT,[N/A],$0.00,12CVD4557,99
ADAMS RICKY DEVON,1,8/20/2018,8/20/2018,[incarcerated],FAILURE TO APPEAR ON MISDEMEANOR(PROBATION VIOL),[N/A],$0.00,17CR050843,100
ADAMS WESLEY CHURCHILL,0,10/20/2018,10/20/2018,[incarcerated],FAILURE TO APPEAR ON FELONY(HABITUAL FELON),SECURED,"$7,000.00",17CRS003412,39
ADAMS WESLEY CHURCHILL,1,10/20/2018,10/20/2018,[incarcerated],FAILURE TO APPEAR ON FELONY(LAR REMOVE/DEST/DE...,SECURED,$0.00,17CRS57151,39
ADCOCK JAMIE LYNN,0,11/6/2018,11/6/2018,[incarcerated],FELONY PROBATION VIOLATION,SECURED,"$10,000.00",18CRS050933,22
ADCOCK JAMIE LYNN,1,11/6/2018,11/14/2018,[incarcerated],FELONY PROBATION VIOLATION,UNSECURED,"$1,000.00",18CRS644,14
ALEXANDER TYRELL,0,2/8/2018,2/8/2018,[incarcerated],ASSAULT ON A FEMALE,SECURED,$0.00,18CR50948,293
ALEXANDER TYRELL,1,2/8/2018,2/13/2018,[incarcerated],ASSAULT ON A FEMALE,SECURED,$0.00,18CR51087,288
ALEXANDER TYRELL,2,2/8/2018,2/8/2018,[incarcerated],ATMPT 2ND DEGREE FORCIBLE RAPE,SECURED,$0.00,18CR50946,293
ALEXANDER TYRELL,3,2/8/2018,2/13/2018,[incarcerated],FALSE IMPRISONMENT,SECURED,$0.00,18CR51087,288


In [None]:
# Collect the href of every anchor on the listing page; these link to
# offender profile search results in the NC SAVAN/VINELink system.
# The first two anchors and the last one are dropped — presumably page
# navigation rather than offender links (TODO confirm against the page).

secondary_urls = [anchor['href'] for anchor in soup.find_all('a', href=True)][2:-1]

# Filled in by the scraping loop: {offender name: [demographic fields]}
demographics_dictionary = {}

In [None]:
# For each VINELink search URL: open it and, if a search result with a
# "More Info" button is present, expand it and store the offender's
# demographics under their name. Pages without a result (or without a
# name) are skipped; any individual datum that is missing is stored as None.
#
# NOTE: the XPaths are positional and tied to the VINELink page layout
# at the time of writing; they must be updated if that layout changes.

CHROMEDRIVER_PATH = os.environ.get('CHROMEDRIVER_PATH',
                                   '/Users/orion/Downloads/chromedriver')

result_xpath = '//*[@id="ngVewDiv"]/div/div/div/div[3]/div[3]/div[2]/div/search-result/div'
summary_xpath = result_xpath + '/div[1]/div/div[1]'
more_info_xpath = result_xpath + '/div[4]/div[1]/more-info'
details_xpath = more_info_xpath + '/div[2]/div/div/div[2]'

more_info_button_xpath = more_info_xpath + '/div[1]/button'
name_xpath = summary_xpath + '/span[1]/span[1]/div/div/div[2]/span'

# Stored order: offender id, age, date of birth, race, gender, custody status
demographic_xpaths = [
    details_xpath + '/div[1]/div/div[2]/span',        # offender id
    summary_xpath + '/span[10]/div/div/div[2]/span',  # age
    details_xpath + '/div[4]/div/div[2]/span',        # date of birth
    summary_xpath + '/span[14]/div/div/div[2]/span',  # race
    details_xpath + '/div[2]/div/div[2]/span',        # gender
    summary_xpath + '/span[5]/div/div/div[2]/span',   # custody status
]


def _first_text_or_none(driver, xpath):
    """Return the text of the first element matching ``xpath``, or None if nothing matches."""
    # find_elements_* returns an empty list when nothing matches, whereas
    # find_element_* raises NoSuchElementException.
    matches = driver.find_elements_by_xpath(xpath)
    return matches[0].text if matches else None


for url in secondary_urls:
    driver = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH)
    try:
        driver.implicitly_wait(2)
        driver.get(url)

        more_info_buttons = driver.find_elements_by_xpath(more_info_button_xpath)
        if not more_info_buttons:
            # No search result for this offender
            continue
        more_info_buttons[0].click()

        name = _first_text_or_none(driver, name_xpath)
        if name is None:
            # Without a name there is no key to store the record under
            continue
        demographics_dictionary[name] = [_first_text_or_none(driver, xpath)
                                         for xpath in demographic_xpaths]
    finally:
        # Always shut the browser down, including on skipped pages and errors
        driver.quit()

In [None]:
# Strip commas from names so they share the format used by the offenses
# DataFrame, then convert the demographic records into a DataFrame
# with one row per offender, indexed by name.

demographics_columns = ['Offender ID', 'Age', 'Date of Birth',
                        'Race', 'Gender', 'Custody Status']

demographics_dictionary = {key.replace(',', ''): item for key, item in demographics_dictionary.items()}

# Passing the column names to from_dict (rather than assigning them
# afterwards) also yields a valid, empty frame when nothing was scraped.
demographics = pd.DataFrame.from_dict(demographics_dictionary, orient='index',
                                      columns=demographics_columns)
demographics.index.name = 'Name'

In [1]:
# Join offenses and demographics on the inmate name to create the
# final MultiIndex DataFrame. This is a left join: every offense row is
# kept, and the demographic columns are NaN for names that have no
# demographics record.

final = offenses.join(demographics)

In [None]:
# Write the joined offenses/demographics table to a CSV file
# (relative path, so it is created in the current working directory)

output_csv = 'durham-data.csv'
final.to_csv(output_csv)