In [52]:
# Import packages and specify dependencies
from bs4 import BeautifulSoup
import requests
import os
import re

## Problem statement

**Source:** Inter-university Consortium of Political and Social Research (ICPSR)  
**Parent directory:** https://www.icpsr.umich.edu/files/NACJD/ORIs/

**Goal:** Obtain a list of all law enforcement agencies in the US with their corresponding ORI codes and other pertaining information (county, state, FIPS and UCR codes)

**Definitions:**
- UCR (uniform crime reporting) code:  code
- FIPS (federal information processing standards) code: unique county (and county equivalents) identification code 
- ORI (originating agency identifier) code

### Use BeatifulSoup html parser to extract links

In [24]:
# Extract all links to lists of law enforcement agencies in a state
base_url = r'https://www.icpsr.umich.edu/files/NACJD/ORIs/'
response = requests.get(base_url)

soup = BeautifulSoup(response.text, 'html.parser')

# list of links to pages with counties, county law enforcemen agencies and their ORIs
links = [os.path.join(base_url, link.get('href')) for link in soup.find_all('a')][4:-2]

In [None]:
def find_county_codes(text):
    # - accepts raw string and cleans it
    # - outputs a dictionary with a county name, its FIPS code and UCR
    county_dict = {}
    county_raw, code_raw = text.a.get('name').split('_(FIPS=')
    county = ' '.join(county_raw.split('_')[1:])
    code = [i.strip(')') for i in re.split('=|_UCR', code_raw) if i]
    county_dict['county'] = county
    county_dict['fips'] = code[0]
    county_dict['ucr'] = code[1]
    
    return county_dict

In [None]:
def find_agency_ori(text):
    # - accepts raw string and cleans it
    # - outputs a list of dictionaries with an agency name, its ORI7 and ORI9 codes and abbreviated state name
    agency_raw = text.text.split('ORI9')[1].strip().split('\n')
    agency_ori = []
    for element in agency_raw:
        data = [i for i in element.strip().split('   ') if i]
        agency_dict = {}
        agency_dict['agency'] = data[0]
        agency_dict['ori7'] = data[1]
        agency_dict['ori9'] = data[2]
        agency_dict['state_abbr'] = data[2][:2]
        agency_ori.append(agency_dict)
    
    return agency_ori


### Get a list of all law enforcement agencies with codes

The list of all law enforcement agencies is yielded as a list of dictionaries. Each dictionary contains the following fileds:
- agency
- county
- state_abbr
- ORI7
- ORI9
- FIPS 
- UCR

In [134]:
def scrape_by_state(link):
    # - Scrapes all html text in a given link
    # - Searches for tags 'h3' and 'pre'
    # - Passes found text to find_agency_ori and find_county_codes functions
    # - Merges the two agency and county dictionaties
    # - Outputs a list of dictonaries with keys: county, agency, state_abbr, ori9, ori7, fips, ucr 

    response = requests.get(link)
    soup = BeautifulSoup(response.text, 'html.parser')

    agencies = []
    for s, a in zip(soup.find_all('h3'), soup.find_all('pre')):
        county = find_county_codes(s)
        ori_codes = find_agency_ori(a)
        for agency in ori_codes:
            agencies.append({**county, **agency})


    return agencies   

agencies = []
for link in links:
    agencies.extend(scrape_by_state(link))

In [142]:
# Example agency record
agencies[100]

{'county': 'COOSA',
 'fips': '037',
 'ucr': '019',
 'agency': "COOSA COUNTY SHERIFF'S OFFICE",
 'ori7': '  AL02200',
 'ori9': 'AL0220000',
 'state_abbr': 'AL'}

### Save data as json file

In [141]:
import json
with open('data/agencies.txt', 'w+') as outfile:
    json.dump(agencies, outfile)