In [1]:
# python 3.7

In [146]:
import requests
import pdftotext
import time
import io
import json
from bs4 import BeautifulSoup

In [108]:
# Base URL that the relative PDF paths scraped from the index page are appended to
crime_log_prefix = "http://www.police.ucsd.edu/docs/reports/CallsandArrests/"
# Index page whose <option> tags list the available daily log PDFs
crime_log_page = "http://www.police.ucsd.edu/docs/reports/CallsandArrests/Calls_and_Arrests.asp"

In [109]:
# Fetch the index page. A timeout keeps the cell from hanging on a stalled
# connection, and raise_for_status() stops on an HTTP error instead of
# silently scraping an error page.
page = requests.get(crime_log_page, timeout=30)
page.raise_for_status()

In [110]:
# Name the parser explicitly: without it BeautifulSoup emits a warning and
# uses whichever parser happens to be installed, which can differ by machine.
page_text = BeautifulSoup(page.content, "html.parser")

In [111]:
# Collect every <option> tag on the index page
option_list = page_text.find_all("option")

In [112]:
# Will hold the "value" attribute of each <option> tag
value_list = []

In [113]:
# Rebuild the list from scratch so that re-running this cell does not append
# duplicate entries to an already-populated value_list.
value_list = [option["value"] for option in option_list]

In [114]:
# Show the scraped values; the first one is the empty placeholder option
value_list

['',
 'CallsForService/April 20, 2020.pdf',
 'CallsForService/April 19, 2020.pdf',
 'CallsForService/April 18, 2020.pdf',
 'CallsForService/April 17, 2020.pdf',
 'CallsForService/April 15, 2020.pdf',
 'CallsForService/April 14, 2020.pdf',
 'CallsForService/April 11, 2020.pdf',
 'CallsForService/April 10, 2020.pdf',
 'CallsForService/April 12, 2020.pdf',
 'CallsForService/April 9, 2020.pdf',
 'CallsForService/April 8, 2020.pdf',
 'CallsForService/April 7, 2020.pdf',
 'CallsForService/April 6, 2020.pdf',
 'CallsForService/April 4, 2020.pdf',
 'CallsForService/April 3, 2020.pdf',
 'CallsForService/April 5, 2020.pdf',
 'CallsForService/April 2, 2020.pdf',
 'CallsForService/April 1, 2020.pdf',
 'CallsForService/Mar 31, 2020.pdf',
 'CallsForService/Mar 30, 2020.pdf',
 'CallsForService/Mar 27, 2020.pdf',
 'CallsForService/Mar 29, 2020.pdf',
 'CallsForService/Mar 28, 2020.pdf',
 'CallsForService/Mar 26, 2020.pdf',
 'CallsForService/Mar 25, 2020.pdf',
 'CallsForService/Mar 24, 2020.pdf',
 'Call

### Typical Format

- Call Category
- Location
- Date Reported
- Case #
- Date Occurred
- Time Occurred
- Summary
- Disposition
- Arrest Info (if applicable) 

In [52]:
# Field names of a parsed log record, in the order they are produced
data_fields = [
    "CALL_CATEGORY",
    "LOCATION",
    "DATE_REPORTED",
    "CASE_NUMBER",
    "DATE_OCCURRED",
    "TIME_OCCURRED",
    "SUMMARY",
    "DISPOSITION",
    "ARREST",
    "IS_UPDATE",
]

In [20]:
def get_entries(page_text):
    """Split the lines of one log page into one sub-list of lines per entry.

    An entry starts two lines above every line that begins with
    "Date Reported" (in the typical format those two lines are the call
    category and the location) and runs up to the start of the next entry,
    or to the end of the page for the last entry.

    Args:
        page_text: list of strings, the lines of a single page.

    Returns:
        A list of lists of strings, one inner list per entry. Empty when no
        line starts with "Date Reported".
    """
    LINES_BEFORE_DATE_REPORTED = 2
    # Clamp at 0: a negative start would make the slice below count from the
    # END of the page when "Date Reported" sits in the first two lines.
    entry_starts = [max(index - LINES_BEFORE_DATE_REPORTED, 0)
                    for index, line in enumerate(page_text)
                    if line.startswith("Date Reported")]
    # The end of the page closes the last entry
    entry_bounds = entry_starts + [len(page_text)]
    return [page_text[start:stop]
            for start, stop in zip(entry_bounds, entry_bounds[1:])]

In [72]:
def parse_log_entry(entry_text_list, is_update):
    """Parse the lines of a single log entry into a dict of named fields.

    The first six lines are read positionally as the fixed-length fields
    (call category, location, date reported, case number, date occurred,
    time occurred). The remaining lines belong to the variable-length fields:
    a line starting with "Summary:", "Disposition:" or "Arrest Date" opens
    that field, and following lines without such a prefix continue it.

    Args:
        entry_text_list: list of strings, the lines of one entry.
        is_update: value stored unchanged under "IS_UPDATE".

    Returns:
        dict with the six fixed-length keys, "SUMMARY" / "DISPOSITION" when
        present in the entry, "ARREST" (True if an "Arrest Date" line was
        seen, else False) and "IS_UPDATE".
    """
    VAR_LEN_FIELD_PREFIXES = {"Summary:": "SUMMARY", "Disposition:": "DISPOSITION", "Arrest Date": "ARREST"}
    FIXED_LEN_FIELD_PREFIXES = ("Date Reported", "Incident/Case#", "Date Occurred", "Time Occurred ")
    # Same names and order as the first six items of the notebook's data_fields,
    # kept local so the function does not depend on cell execution order.
    FIXED_LEN_FIELDS = ("CALL_CATEGORY", "LOCATION", "DATE_REPORTED", "CASE_NUMBER",
                        "DATE_OCCURRED", "TIME_OCCURRED")
    var_prefixes = tuple(VAR_LEN_FIELD_PREFIXES)
    all_prefixes = var_prefixes + FIXED_LEN_FIELD_PREFIXES

    def match_prefix(line, prefixes):
        """Return the first prefix that `line` starts with, or None."""
        return next((candidate for candidate in prefixes if line.startswith(candidate)), None)

    entry = dict()

    # Fixed-length fields: one line each, read by position.
    for position, field in enumerate(FIXED_LEN_FIELDS):
        # A truncated entry yields empty strings rather than an IndexError
        value = entry_text_list[position] if position < len(entry_text_list) else ""
        matched = match_prefix(value, all_prefixes)
        if matched is not None:
            # Slice the prefix off. str.strip(prefix) would instead remove any
            # of the prefix's CHARACTERS from both ends and corrupt the value.
            value = value[len(matched):].strip()
        entry[field] = value

    # Variable-length fields: a prefixed line opens a field, others continue it.
    current_field = None
    for line in entry_text_list[len(FIXED_LEN_FIELDS):]:
        matched = match_prefix(line, var_prefixes)
        if matched is not None:
            current_field = VAR_LEN_FIELD_PREFIXES[matched]
            text = line[len(matched):].strip()
        else:
            text = line.strip()
        if current_field is None:
            # Line appears before any variable-length field has started
            # (for example a blank line); there is no field to attach it to.
            continue
        if current_field not in entry:
            entry[current_field] = text
        elif text:
            # Join wrapped lines with a space so words do not run together
            entry[current_field] = (entry[current_field] + " " + text).strip()

    # The arrest details are collapsed into a flag
    entry["ARREST"] = "ARREST" in entry
    entry["IS_UPDATE"] = is_update
    return entry

In [137]:
def parse_daily_log(pdf):
    """Parse every page of one daily log into a flat list of entry dicts.

    Args:
        pdf: iterable of page texts (strings), one per page.

    Returns:
        A list with one dict per log entry, in page order, as produced by
        parse_log_entry. Entries on a page whose header date line contains
        "UPDATE" are flagged with is_update=True.
    """
    DATE_LINE = 2 # index with the dateline in the header
    corpus = []
    for page in pdf:
        if not page.strip():
            # Blank page: no entries, and no header line to inspect
            continue
        # split page into array of strings based on new line character
        page_text = page.split('\n')
        # TODO: move this check earlier in the processing
        # The length guard avoids an IndexError on pages shorter than the header.
        is_update = len(page_text) > DATE_LINE and "UPDATE" in page_text[DATE_LINE]
        # extend() grows the list in place instead of copying it for every page
        corpus.extend(parse_log_entry(entry, is_update) for entry in get_entries(page_text))
    return corpus

In [138]:
# Accumulator for the parsed entries of all downloaded daily logs
full_data = []

In [140]:
# Download every daily log PDF listed on the index page and parse it.
# value_list[0] is the empty placeholder option, so it is skipped.
for url_suffix in value_list[1:]:
    log_response = requests.get(crime_log_prefix + url_suffix, timeout=30)
    # Stop on an HTTP error instead of handing an error page to the PDF parser
    log_response.raise_for_status()

    raw_pdf_data = log_response.content

    # Uncomment to keep a local copy of each downloaded PDF:
    #with open(("./" + url_suffix), 'wb+') as f:
    #    f.write(raw_pdf_data)

    with io.BytesIO(raw_pdf_data) as open_pdf_file:
        read_pdf = pdftotext.PDF(open_pdf_file)
        new_entries = parse_daily_log(read_pdf)
    # Grow the accumulator in place rather than rebuilding it on every iteration
    full_data.extend(new_entries)
    time.sleep(0.5)  # short pause between requests

In [144]:
# Serialize all parsed entries and write them out as a single JSON document
serialized_entries = json.dumps(full_data)
with open("initial_pull.json", "w+") as json_out:
    json_out.write(serialized_entries)