In [22]:
raw_data_dir = '../data/raw/'
interim_data_dir = '../data/interim/'

In [23]:
import requests
import json
import pandas as pd

url= "https://crashviewer.nhtsa.dot.gov/CrashAPI"
#/crashes/GetCrashesByLocation?fromCaseYear=2014&toCaseYear=2015&state=1&county=1&format=json

fromCaseYear = "2010"
toCaseYear = "2020"
state = "6"
qurl = f"{url}/crashes/GetCrashesByLocation?fromCaseYear={fromCaseYear}&toCaseYear={toCaseYear}&state={state}&county=73&format=json"

cali = requests.get(qurl).json()

In [24]:
cali_df = pd.DataFrame(cali['Results'][0])
len(cali_df)

1895

In [25]:
year = 2015
st_case = 60022

qurl = f"{url}/crashes/GetCaseDetails?stateCase={st_case}&caseYear={year}&state=6&format=json"
data = requests.get(qurl).json()
case = data['Results'][0][0]['CrashResultSet']


In [26]:
case.keys()

dict_keys(['ARR_HOUR', 'ARR_HOURNAME', 'ARR_MIN', 'ARR_MINNAME', 'CEvents', 'CF1', 'CF1NAME', 'CF2', 'CF2NAME', 'CF3', 'CF3NAME', 'CITY', 'CITYNAME', 'COUNTY', 'COUNTYNAME', 'CaseYear', 'DAY', 'DAY_WEEK', 'DAY_WEEKNAME', 'DRUNK_DR', 'FATALS', 'FUNC_SYS', 'FUNC_SYSNAME', 'HARM_EV', 'HARM_EVNAME', 'HOSP_HR', 'HOSP_HRNAME', 'HOSP_MN', 'HOSP_MNNAME', 'HOUR', 'HOURNAME', 'LATITUDE', 'LATITUDENAME', 'LGT_COND', 'LGT_CONDNAME', 'LONGITUD', 'LONGITUDNAME', 'MAN_COLL', 'MAN_COLLNAME', 'MILEPT', 'MILEPTNAME', 'MINUTE', 'MINUTENAME', 'MONTH', 'MonthName', 'NHS', 'NHSNAME', 'NOT_HOUR', 'NOT_HOURNAME', 'NOT_MIN', 'NOT_MINNAME', 'NPersons', 'NmCrashes', 'NmImpairs', 'NmPriors', 'PEDS', 'PERMVIT', 'PERNOTMVIT', 'PERSONS', 'PVH_INVL', 'ParkWorks', 'PbTypes', 'RAIL', 'RAILNAME', 'RD_OWNER', 'RD_OWNERNAME', 'RELJCT1', 'RELJCT1NAME', 'RELJCT2', 'RELJCT2NAME', 'REL_ROAD', 'REL_ROADNAME', 'ROAD_FNC', 'ROAD_FNCNAME', 'ROUTE', 'ROUTENAME', 'RUR_URB', 'RUR_URBNAME', 'SCH_BUS', 'SCH_BUSNAME', 'SP_JUR', 'SP_JUR

In [27]:
case['LONGITUD']

'-117.063127780'

In [28]:
def extract_people(v):
    for p in v['Persons']:
        yield {
            'Speed Limit Exceeded': v['SPEEDRELNAME'],
            'Speed limit': v['TRAV_SP'],
            'Vin Number': v['VINNAME'],
            'Traveled Speed Veh': v['VSPD_LIM'],
            'Make': v['MAKENAME'],
            'Make/Model': v['MAK_MODNAME'],
            'Model': v['MODELNAME'],
            'Type of Vehicle': v['BODY_TYPNAME'],
            "ZIP Code": v['DR_ZIP'],
            
            "Age": p['AGE'],
            "Age Name": p['AGENAME'],
            "County": p['COUNTYNAME'],
            "Death Day of Month": p['DEATH_DANAME'],
            "DOA Name": p['DOANAME'],
            # injury sev
            "Injury Severity Name": p['INJ_SEVNAME'],
            "Race": p['RACENAME'],
            "Road Type": p["ROAD_FNCNAME"],
            "Sex": p["SEXNAME"],
            "Make": p["MAKENAME"],
        }


        
def get_people(case):

    hour = case['HOUR']
    minute = case['MINUTE']
    time = f"{hour}:{minute}"
        
    accident_info = {
        'Lng': case['LONGITUD'],
        'Lat': case['LATITUDE'],
        'Case Number': case['ST_CASE'],
        "Description of Veh Coll": case['CF2NAME'], 
        "Day of Week": case['DAY_WEEKNAME'],
        "Drunk Driver": case['DRUNK_DR'],
        "Year": case['CaseYear'],
        "Month": case['MonthName'],
        "Hour": hour,
        "Time of Accident": time,
    }

    vehicles = case['Vehicles']
    
    people = [{**accident_info, **p} for v in vehicles for p in extract_people(v)]
    
    return pd.DataFrame(people)

def get_events(case):
    c_events = [{
        'Case Number': case['ST_CASE'],
# In a traffic accident AOI is Area of Impact. The spot the two cars collided is measured 
# to a fixed object, usually the curb, so it can be reconstructed later.
        'Area of Impact': e['AOI1NAME'],
# standard of evidence
# https://safety.fhwa.dot.gov/rsdp/cdip_rpti.aspx
        'Standard of Evenidence': e['SOENAME'],
        'Event Number': e['EVENTNUM'],
        'Vehicle 1': e['VNUMBER1'],
        'Vehicle 2': e['VNUMBER2'],
    } for e in case['CEvents']]
    
    return pd.DataFrame(c_events)




In [73]:
import json
from shapely.geometry import shape, Point
# depending on your version, use: from shapely.geometry import shape, Point


class ZipCoder(object):
    
    def __init__(self):
        self.js = None
    
    def __get_zip(self, lat, lng):
        
        point = Point(lng, lat)

        for feature in js['features']:
            polygon = shape(feature['geometry'])
            if polygon.contains(point):
                zip_code = feature['properties']['zip']
                return zip_code


    def __row_to_zip(self, r):
        lat = r['Lat']
        lng = r['Lng']
        return self.__get_zip(lat, lng)

    
    def ensure_acc_zips(self, df):
        with open(f'{ raw_data_dir }Zip Codes.geojson') as f:
            self.js = json.load(f)
            
        acc_zip_col = 'Accident ZIP'

        if acc_zip_col not in df.columns: 
            zip_codes = df.apply(self.__row_to_zip, axis=1)
            df[acc_zip_col] = zip_codes
            file_path = f"{ interim_data_dir }people.csv"
            df.to_csv(file_path)
            

In [None]:
from IPython.display import clear_output
import grequests
from itertools import islice
import json
import os


def chunk(it, size):
    it = iter(it)
    return iter(lambda: tuple(islice(it, size)), ())

# LARGE CHUNK SIZE WILL BLOW UP SERVER AND CAUSING: AttributeError: 'NoneType' object has no attribute 'json'
chunk_size = 5
fromCaseYear = "2010"
toCaseYear = "2020"
state = "6"
case_file_base = raw_data_dir

data_lists = {}


def url_from_row(r):
    statecase = r["ST_CASE"]
    caseyear = r["CaseYear"]
    return f"{url}/crashes/GetCaseDetails?stateCase={statecase}&caseYear={caseyear}&state=6&format=json"


def get_file_path(case):
    return f'{ case_file_base }{ case["ST_CASE"] }.json'
    
    
def load_case(file_path):
    with open(file_path, 'r') as f:
        case = json.load(f)
        return case

    
def __get_cases():
    urls = []
    found_locally = 0
    for i, r in cali_df.iterrows():
        file_path = get_file_path(r)
        if os.path.exists(file_path):
            found_locally += 1
            clear_output(wait=True)
            print(f'{ found_locally } files found locally')
            yield load_case(file_path)
        else:
            url = url_from_row(r)
            urls.append(url)
    print(f'{ len(urls) } need to be fetched. ')
    for c in __chunk_and_fetch(urls):
        yield c
    

def __fetch_cases(urls):
    rs = (grequests.get(u) for u in urls)
    case_data = grequests.map(rs)
    return [data.json()['Results'][0][0]['CrashResultSet'] for data in case_data]
    
    
def __save_case(case):
    file_path = get_file_path(case)
    with open(file_path, 'w') as json_file:
        json.dump(case, json_file)
    
    
def __chunk_and_fetch(urls):
    chunked = chunk(urls, chunk_size)
    
    i = 0
    for chunked_urls in chunked:
        i += 1
        clear_output(wait=True)
        print(f'Retrieving chunk { i } of { len(urls) / chunk_size } ...')
        cases = __fetch_cases(chunked_urls) 
        for case in cases:
            __save_case(case)
            yield case

        
people_key = 'people'
events_key = 'events'


def __get_case_lists():
    # actualize list to avoid redundant api calls
    case_list = list(__get_cases())
    
    file_path_people = f"{ interim_data_dir }people.csv"
    people_list = [get_people(case) for case in case_list]
    people_df = pd.concat(people_list, ignore_index=True)
    people_df.to_csv(file_path_people)
    data_lists[people_key] = people_df
    
    file_path_events = f"{ interim_data_dir }events.csv"
    event_list = [get_events(case) for case in case_list]
    event_df = pd.concat(event_list)
    event_df.to_csv(file_path_events)
    data_lists[events_key] = event_df

    return people_df, event_df
    

def get_people_list():
    cached = get_cached_list(people_key)
    if cached is not None:
        ZipCoder().ensure_acc_zips(cached)
        return cached
    df = __get_case_lists()[0]
    ZipCoder().ensure_acc_zips(df)
    return df


def get_event_list():
    cached = get_cached_list(events_key)
    if cached is not None:
        return cached
    df = __get_case_lists()[1]
    file_path = f"{ interim_data_dir }events.csv"
    df.to_csv(file_path)
    return df


def get_cached_list(key):
    if key in data_lists:
        return data_lists[key]
    
    file_path = f"{ interim_data_dir }{ key }.csv"
    if os.path.exists(file_path):
        df = pd.read_csv(file_path)
        data_lists[key] = df
        return df
    
    return None

In [None]:
# Dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
from census import Census

# Census API Key
from api_config import census_api_key
c = Census(census_api_key, year=2013)

In [None]:
# Run Census Search to retrieve data on all zip codes (2013 ACS5 Census)
# See: https://github.com/CommerceDataService/census-wrapper for library documentation
# See: https://gist.github.com/afhaque/60558290d6efd892351c4b64e5c01e9b for labels
import os


census_cache = {}

def census_by_year(year):
    
    if year in census_cache:
        return census_cache[year]
    
    file_path = f'{ interim_data_dir }census_{ year }'
    if os.path.exists(file_path):
        return pd.read_csv(file_path)
        
    try:
        census_data = c.acs5.get(("NAME", "B19013_001E", "B01003_001E", "B01002_001E",
                              "B19301_001E",
                              "B17001_002E"), {'for': 'zip code tabulation area:*'}, year=year)
    # Convert to DataFrame
        census_pd = pd.DataFrame(census_data)

    # Column Reordering
        census_pd = census_pd.rename(columns={"B01003_001E": "Population",
                                              "B01002_001E": "Median Age",
                                              "B19013_001E": "Household Income",
                                              "B19301_001E": "Per Capita Income",
                                              "B17001_002E": "Poverty Count",
                                              "NAME": "Name",
                                              "zip code tabulation area": "Zipcode"})
        census_pd.to_csv(file_path)
        return census_pd

    except:
        print('no data')
        return None

    
# load all census tables 2011 - 2018
years = range(2011, 2019)
for y in years: census_by_year(y)

census_pd = census_by_year(2016)

# Visualize
print(len(census_pd))
census_pd.head()

33120


Unnamed: 0.1,Unnamed: 0,Median Age,Population,Poverty Count,Household Income,Per Capita Income,Name,Zipcode
0,0,46.5,724.0,78.0,57500.0,25551.0,ZCTA5 12810,12810
1,1,45.9,67.0,0.0,-666666666.0,11590.0,ZCTA5 12811,12811
2,2,64.1,58.0,0.0,49583.0,23600.0,ZCTA5 12812,12812
3,3,46.4,1282.0,51.0,58176.0,35508.0,ZCTA5 12814,12814
4,4,54.2,1103.0,159.0,60458.0,30685.0,ZCTA5 12815,12815
