In [52]:
raw_data_dir = '../data/raw/'
interim_data_dir = '../data/interim/'

In [53]:
import requests
import json
import pandas as pd

url= "https://crashviewer.nhtsa.dot.gov/CrashAPI"
#/crashes/GetCrashesByLocation?fromCaseYear=2014&toCaseYear=2015&state=1&county=1&format=json

fromCaseYear = "2010"
toCaseYear = "2020"
state = "6"
qurl = f"{url}/crashes/GetCrashesByLocation?fromCaseYear={fromCaseYear}&toCaseYear={toCaseYear}&state={state}&county=73&format=json"

cali = requests.get(qurl).json()

In [54]:
cali_df = pd.DataFrame(cali['Results'][0])

In [55]:
year = 2015
st_case = 60022

qurl = f"{url}/crashes/GetCaseDetails?stateCase={st_case}&caseYear={year}&state=6&format=json"
data = requests.get(qurl).json()
case = data['Results'][0][0]['CrashResultSet']

In [56]:
def extract_people(v):
    for p in v['Persons']:
        yield {
            'Speed Limit Exceeded': v['SPEEDRELNAME'],
            'Speed limit': v['TRAV_SP'],
            'Vin Number': v['VINNAME'],
            'Traveled Speed Veh': v['VSPD_LIM'],
            'Make': v['MAKENAME'],
            'Make/Model': v['MAK_MODNAME'],
            'Model': v['MODELNAME'],
            'Type of Vehicle': v['BODY_TYPNAME'],
            "ZIP Code": v['DR_ZIP'],
            
            "Age": p['AGE'],
            "Age Name": p['AGENAME'],
            "County": p['COUNTYNAME'],
            "Death Day of Month": p['DEATH_DANAME'],
            "DOA Name": p['DOANAME'],
            # injury sev
            "Injury Severity Name": p['INJ_SEVNAME'],
            "Race": p['RACENAME'],
            "Road Type": p["ROAD_FNCNAME"],
            "Sex": p["SEXNAME"],
            "Make": p["MAKENAME"],
        }


        
def get_people(case):

    hour = case['HOUR']
    minute = case['MINUTE']
    time = f"{hour}:{minute}"
        
    accident_info = {
        'Case Number': case['ST_CASE'],
        "Description of Veh Coll": case['CF2NAME'], 
        "Day of Week": case['DAY_WEEKNAME'],
        "Drunk Driver": case['DRUNK_DR'],
        "Year": case['CaseYear'],
        "Month": case['MonthName'],
        "Hour": hour,
        "Time of Accident": time,
    }

    vehicles = case['Vehicles']
    
    people = [{**accident_info, **p} for v in vehicles for p in extract_people(v)]
    
    return pd.DataFrame(people)

def get_events(case):
    c_events = [{
        'Case Number': case['ST_CASE'],
# In a traffic accident AOI is Area of Impact. The spot the two cars collided is measured 
# to a fixed object, usually the curb, so it can be reconstructed later.
        'Area of Impact': e['AOI1NAME'],
# standard of evidence
# https://safety.fhwa.dot.gov/rsdp/cdip_rpti.aspx
        'Standard of Evenidence': e['SOENAME'],
        'Event Number': e['EVENTNUM'],
        'Vehicle 1': e['VNUMBER1'],
        'Vehicle 2': e['VNUMBER2'],
    } for e in case['CEvents']]
    
    return pd.DataFrame(c_events)




In [57]:
from IPython.display import clear_output


fromCaseYear = "2010"
toCaseYear = "2020"
state = "6"

data_lists = {}

def __get_cases():
    for i, r in cali_df.iterrows():
        statecase = r["ST_CASE"]
        clear_output(wait=True)
        print(f'Retrieving case { i + 1 } of { len(cali_df) }: { statecase } ...')
        caseyear = r["CaseYear"]
        qurl = f"{url}/crashes/GetCaseDetails?stateCase={statecase}&caseYear={caseyear}&state=6&format=json"
        data = requests.get(qurl).json()
        case = data['Results'][0][0]['CrashResultSet']
        yield case

        
people_key = 'people'
events_key = 'events'


def __get_case_lists():
    # actualize list to avoid redundant api calls
    case_list = list(__get_cases())
    
    file_path_people = f"{ interim_data_dir }people.csv"
    people_list = [get_people(case) for case in case_list]
    people_df = pd.concat(people_list, ignore_index=True)
    people_df.to_csv(file_path_people)
    data_lists[people_key] = people_df
    
    file_path_events = f"{ interim_data_dir }events.csv"
    event_list = [get_events(case) for case in case_list]
    event_df = pd.concat(event_list)
    event_df.to_csv(file_path_events)
    data_lists[events_key] = event_df

    return people_df, event_df
    

def get_people_list():
    if people_key in data_lists:
        return data_lists[people_key]
    
    if os.path.exists(file_path):
        people_df = pd.read_csv(file_path)
        data_lists[people_key] = people_df
        return people_df
    
    return __get_case_lists()[0]


def get_event_list():
    if events_key in data_lists:
        return data_lists[events_key]
    
    if os.path.exists(file_path):
        event_df = pd.read_csv(file_path)
        data_lists[events_key] = event_df
        return event_df
    
    return __get_case_lists()[1]


In [58]:
get_people(case)

Unnamed: 0,Age,Age Name,Case Number,County,DOA Name,Day of Week,Death Day of Month,Description of Veh Coll,Drunk Driver,Hour,...,Road Type,Sex,Speed Limit Exceeded,Speed limit,Time of Accident,Traveled Speed Veh,Type of Vehicle,Vin Number,Year,ZIP Code
0,21,21 Years,60022,SAN DIEGO (73),Died at Scene,Sunday,11,,1,0,...,,Male,"Yes, Too Fast for Conditions",88,0:13,40,"Standard pickup (GVWR 4,500 to 10,00 lbs.)(Jee...",1N6AA07D29N3,2015,91911
1,20,20 Years,60022,SAN DIEGO (73),Not Applicable,Sunday,Not Applicable (Non-Fatal),,1,0,...,,Male,"Yes, Too Fast for Conditions",88,0:13,40,"Standard pickup (GVWR 4,500 to 10,00 lbs.)(Jee...",1N6AA07D29N3,2015,91911
2,20,20 Years,60022,SAN DIEGO (73),Not Applicable,Sunday,Not Applicable (Non-Fatal),,1,0,...,,Female,"Yes, Too Fast for Conditions",88,0:13,40,"Standard pickup (GVWR 4,500 to 10,00 lbs.)(Jee...",1N6AA07D29N3,2015,91911
3,21,21 Years,60022,SAN DIEGO (73),Not Applicable,Sunday,Not Applicable (Non-Fatal),,1,0,...,,Female,"Yes, Too Fast for Conditions",88,0:13,40,"Standard pickup (GVWR 4,500 to 10,00 lbs.)(Jee...",1N6AA07D29N3,2015,91911


In [59]:
get_events(case)

Unnamed: 0,Area of Impact,Case Number,Event Number,Standard of Evenidence,Vehicle 1,Vehicle 2
0,Non-Collision,60022,1,Rollover/Overturn,1,9999
1,Non-Harmful Event,60022,2,Ran Off Roadway - Left,1,5555
2,Not Reported,60022,3,Fence,1,9999
3,Not Reported,60022,4,Fence,1,9999
4,Not Reported,60022,5,Other Fixed Object,1,9999
5,Not Reported,60022,6,Traffic Sign Support,1,9999
6,Not Reported,60022,7,Tree (Standing Only),1,9999
7,Not Reported,60022,8,Building,1,9999


In [60]:
# Get average traffic volume for zip code possibly of google maps

In [61]:
# Dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
from census import Census

# Census API Key
from api_config import census_api_key
c = Census(census_api_key, year=2013)

In [62]:
# Run Census Search to retrieve data on all zip codes (2013 ACS5 Census)
# See: https://github.com/CommerceDataService/census-wrapper for library documentation
# See: https://gist.github.com/afhaque/60558290d6efd892351c4b64e5c01e9b for labels
import os


census_cache = {}

def census_by_year(year):
    
    if year in census_cache:
        return census_cache[year]
    
    file_path = f'{ interim_data_dir }census_{t[0]}'
    if os.path.exists(file_path):
        return pd.read_csv(file_path)
        
    try:
        census_data = c.acs5.get(("NAME", "B19013_001E", "B01003_001E", "B01002_001E",
                              "B19301_001E",
                              "B17001_002E"), {'for': 'zip code tabulation area:*'}, year=year)
    # Convert to DataFrame
        census_pd = pd.DataFrame(census_data)

    # Column Reordering
        census_pd = census_pd.rename(columns={"B01003_001E": "Population",
                                              "B01002_001E": "Median Age",
                                              "B19013_001E": "Household Income",
                                              "B19301_001E": "Per Capita Income",
                                              "B17001_002E": "Poverty Count",
                                              "NAME": "Name",
                                              "zip code tabulation area": "Zipcode"})
        census_pd.to_csv(file_path)
        return census_pd

    except:
        print('no data')
        return None

    
# load all census tables 2011 - 2018
years = range(2011, 2019)
for y in years: census_by_year(y)

census_pd = census_by_year(2016)

# Visualize
print(len(census_pd))
census_pd.head()

10124


Unnamed: 0.1,Unnamed: 0,Median Age,Population,Poverty Count,Household Income,Per Capita Income,Name,Zipcode
0,0,38.9,17599.0,11282.0,11757.0,7041.0,ZCTA5 00601,601
1,1,40.9,39209.0,20428.0,16190.0,8978.0,ZCTA5 00602,602
2,2,40.4,50135.0,25176.0,16645.0,10897.0,ZCTA5 00603,603
3,3,42.8,6304.0,4092.0,13387.0,5960.0,ZCTA5 00606,606
4,4,41.4,27590.0,12553.0,18741.0,9266.0,ZCTA5 00610,610
