# Vehicular Crashes in Longmont, Colorado

This script reads in vehicle crash data spreadsheets downloaded from CDOT. It cleans and processes the data from crashes in Longmont only and concatenates data over several years to create a single CSV time series.

Source: https://www.codot.gov/safety/traffic-safety/data-analysis/crash-data

## 1. Import Libraries

In [1]:
import pandas as pd
import numpy as np
import requests
import time
import json
import os

## 2. Get Organized

In [2]:
# Accumulator that each year's cleaned crash data is concatenated onto
all_crashes = pd.DataFrame()

# CDOT crash listing workbooks to process, one per year
#    (2021 & newer only, older files follow a different format)
files = [
    'CDOTRM_CD_Crash_Listing_-_2021.xlsx',
    'CDOTRM_CD_Crash_Listing_-_2022.xlsx',
    'CDOTRM_CD_Crash_Listing_-_2023.xlsx',
    'CDOTRM_CD_Crash_Listing_-_2024.xlsx',
]

# Load the Google Maps geocoding API key from a local text file, removing any
#    newline characters so the key can be sent as a request parameter
with open('google_map_api_key.txt') as key_file:
    my_api_key = ''.join(key_file.read().split('\n'))

## 3. Define Functions

In [3]:
def process_crashes(crashes):
    """Clean one CDOT crash listing and keep only the Longmont records.

    Args:
        crashes: DataFrame read from a crash listing workbook. It must have the
            columns City, CUID, Crash Date (datetime) and the TU-1/TU-2
            'Estimated Speed' and 'Speed Limit' columns.

    Returns:
        A new DataFrame (the input is not modified) with CUID rewritten as
        'year-CUID', two 1/0 speeding flag columns appended, and every
        remaining NaN replaced by an empty string.
    """
    # Work on a copy of the Longmont rows so the caller's frame is untouched
    in_longmont = crashes.City == 'LONGMONT'
    longmont = crashes.loc[in_longmont].copy()

    # Prefix each CUID with the year of the crash
    year_text = longmont['Crash Date'].dt.year.astype(str)
    cuid_text = longmont['CUID'].astype(str)
    longmont['CUID'] = year_text + '-' + cuid_text

    # 1 when the estimated speed exceeds the posted limit, else 0. A missing
    #    speed or limit compares as False and is therefore flagged 0.
    tu1_over_limit = longmont['TU-1 Estimated Speed'] > longmont['TU-1 Speed Limit']
    tu2_over_limit = longmont['TU-2 Estimated Speed'] > longmont['TU-2 Speed Limit']
    longmont['TU-1 Speeding'] = np.where(tu1_over_limit, 1, 0)
    longmont['TU-2 Speeding'] = np.where(tu2_over_limit, 1, 0)

    # Blank out whatever is still missing
    return longmont.fillna('')

def geocode_with_google(address, api_key):
    """Geocode an address with the Google Maps Geocoding API.

    Args:
        address: Address or intersection text to look up.
        api_key: Google Maps API key sent with the request.

    Returns:
        A (lat, lng) tuple taken from the first result, or None when the
        request fails or the API reports a status other than 'OK'. Failures
        are printed rather than raised.
    """
    endpoint = "https://maps.googleapis.com/maps/api/geocode/json"
    query = {'address': address, 'key': api_key}

    # Only the HTTP round trip and body decoding are expected to raise
    #    requests.RequestException
    try:
        resp = requests.get(endpoint, params=query, timeout=10)
        resp.raise_for_status()
        data = resp.json()
    except requests.RequestException as e:
        print(f"Request failed: {e}")
        return None

    # The API reports lookup problems through the 'status' field of the body
    if data['status'] != 'OK':
        print(f"Geocoding failed: {data['status']} - {data.get('error_message', '')}")
        return None

    location = data['results'][0]['geometry']['location']
    return location['lat'], location['lng']


# Functions that return an existing coordinate or, if missing, look up the coordinate
#    in the dictionary of geocoded intersections
def get_lat(intersection, current_lat, lookup=None):
    """Return a crash's latitude, falling back to the geocoded intersection.

    Args:
        intersection: Key to look up when the crash has no latitude of its own.
        current_lat: Latitude already on the record. NaN/None and the empty
            string both count as missing, because process_crashes replaces
            NaN with '' before the coordinates are filled in.
        lookup: Optional dict mapping intersection -> (lat, long). Defaults to
            the module-level geocoded_intersections.

    Returns:
        current_lat when it is present, otherwise the latitude stored for the
        intersection, or None when the intersection is not in the lookup.
    """
    if pd.notnull(current_lat) and current_lat != '':
        return current_lat
    if lookup is None:
        lookup = geocoded_intersections
    return lookup.get(intersection, (None, None))[0]
def get_long(intersection, current_long, lookup=None):
    """Return a crash's longitude, falling back to the geocoded intersection.

    Args:
        intersection: Key to look up when the crash has no longitude of its own.
        current_long: Longitude already on the record. NaN/None and the empty
            string both count as missing, because process_crashes replaces
            NaN with '' before the coordinates are filled in.
        lookup: Optional dict mapping intersection -> (lat, long). Defaults to
            the module-level geocoded_intersections.

    Returns:
        current_long when it is present, otherwise the longitude stored for the
        intersection, or None when the intersection is not in the lookup.
    """
    if pd.notnull(current_long) and current_long != '':
        return current_long
    if lookup is None:
        lookup = geocoded_intersections
    return lookup.get(intersection, (None, None))[1]

## 4. Compile Cleaned Data into a Single DataFrame

In [4]:
print('Processing:')

# Collect each year's cleaned frame and concatenate once at the end. Growing
#    all_crashes inside the loop re-copies every earlier year on each pass and
#    appends duplicate rows if this cell is run a second time.
yearly_frames = []

for file in files:

    print('\t'+file)

    # read each file into a dataframe (calamine engine is many times faster than the
    #    default openpyxl engine)
    temp_df = pd.read_excel(file, header = 0, engine='calamine')

    # call function to clean data and keep the result for the final concat
    yearly_frames.append(process_crashes(temp_df))

# pd.concat raises ValueError when given no frames, so fall back to an empty frame
if yearly_frames:
    all_crashes = pd.concat(yearly_frames, ignore_index = True)
else:
    all_crashes = pd.DataFrame()

print('Finished')

Processing:
	CDOTRM_CD_Crash_Listing_-_2021.xlsx
	CDOTRM_CD_Crash_Listing_-_2022.xlsx
	CDOTRM_CD_Crash_Listing_-_2023.xlsx
	CDOTRM_CD_Crash_Listing_-_2024.xlsx
Finished


## 5. Drop Unwanted Columns

In [5]:
# Columns that are not wanted in the compiled data set
drop_columns = [
    'Agency Id',
    'City',
    'County',
    'Rd_Section',
    'Rd_Number',
    'Record Status',
    'Processing Status',
    'Last Updated',
    'Link',
]

# Remove them from all_crashes in place
all_crashes.drop(columns=drop_columns, inplace=True)

## 6. Create an Intersections Column

This column will contain the cross streets of each accident, followed by the city and state, formatted so that the value can be submitted to the Google Maps API for geocoding.

In [6]:
# Join the two location fields into "<Location 1> and <Location 2>", then append the
#    city and state to form the address text stored in the intersections column
cross_streets = all_crashes['Location 1'] + ' and ' + all_crashes['Location 2']
all_crashes['intersections'] = cross_streets + ', Longmont, CO'

In [14]:
# Crashes whose Latitude is blank (process_crashes replaces NaN with ''). The column
#    is selected by name rather than by position so the check does not depend on
#    column order, and the mask is computed once for both counts below.
missing_location = all_crashes['Latitude'] == ''

# show number of crashes w/ missing lat & long
n_missing_loc = int(missing_location.sum())
print('\nNumber of accidents with no Lat & Long: ', n_missing_loc)

# number of unique intersections to geocode
n_intersections = len(all_crashes.loc[missing_location, 'intersections'].unique())
print('Number of unique intersections among accidents w/out location: ', n_intersections)

# estimate assumes 0.3 seconds per intersection
print('Time to query (minutes): ', round(n_intersections*0.3/60,1))


Number of accidents with no Lat & Long:  0
Number of unique intersections among accidents w/out location:  0
Time to query (minutes):  0.0


## 7. Create an Array of Unique Intersections

This list will be geocoded. Only unique values are used to minimize calls to the API.

In [8]:
# Intersections of crashes whose Latitude is blank ('' after cleaning). The column is
#    selected by name instead of by position so the filter does not depend on
#    column order.
unique_intersections = all_crashes.loc[all_crashes['Latitude'] == '', 'intersections'].unique()

## 8. Use Google Maps API to Geocode Unique Intersections

In [9]:
# Name of file storing geocoded intersections
geocoded_intersections_json = 'geocoded_intersections.json'

# Check for geocoded intersection file & load it
if os.path.exists(geocoded_intersections_json):

    # Open the same path that was just checked instead of repeating the literal
    with open(geocoded_intersections_json, 'r') as f:
        geocoded_intersections = json.load(f)

    # Convert each stored coordinate pair to a (lat, long) tuple
    geocoded_intersections = {k: tuple(v) for k, v in geocoded_intersections.items()}

    print('Loaded geocoded intersections')

else:
    print('No geocoded intersection file found, geocoding w/ Google API now...')

    geocoded_intersections = {}

    for intersection in unique_intersections:
        coordinates = geocode_with_google(intersection, my_api_key)

        # geocode_with_google returns None on failure. Record successful lookups
        #    only, so a failed intersection is never given the previous
        #    intersection's coordinates (or an undefined name on the first pass).
        if coordinates:
            lat, long = coordinates
            geocoded_intersections[intersection] = (lat, long)

        # pause between requests (presumably to stay under the API rate limit)
        time.sleep(0.3)

    # NOTE(review): the results are not written to geocoded_intersections_json in
    #    this cell, so the file must be created elsewhere for the load branch to
    #    be used on a later run.


Loaded geocoded intersections


## 9. Populate Geocoded Results for Records Missing Latitude & Longitude

Now that all crash intersections have been geocoded, the latitude & longitude values in the dataframe can be populated.

In [10]:
def _coordinate_or_lookup(row, column, position):
    """Return the row's own coordinate, or the geocoded one when it is blank.

    column is the coordinate column to read ('Latitude' or 'Longitude') and
    position is the index of that coordinate within the (lat, long) tuples
    stored in geocoded_intersections.
    """
    existing = row[column]
    if pd.isnull(existing) or existing == '':
        # Intersections that were never geocoded fall back to None
        return geocoded_intersections.get(row['intersections'], (None, None))[position]
    return existing


# Fill each coordinate column row by row, keeping any value already present
all_crashes['Latitude'] = all_crashes.apply(
    _coordinate_or_lookup, axis=1, args=('Latitude', 0)
)

all_crashes['Longitude'] = all_crashes.apply(
    _coordinate_or_lookup, axis=1, args=('Longitude', 1)
)

In [11]:
# Spot-check the first six records for latitude & longitude values
all_crashes.head(n=6)

Unnamed: 0,CUID,System Code,City_Street,Crash Date,Crash Time,Latitude,Longitude,Location 1,Location 2,Location,...,TU-2 NM Safety Helmet,TU-1 NM Alcohol Suspected,TU-2 NM Alcohol Suspected,TU-1 NM Marijuana Suspected,TU-2 NM Marijuana Suspected,TU-1 NM Other Drugs Suspected,TU-2 NM Other Drugs Suspected,TU-1 Speeding,TU-2 Speeding,intersections
0,2021-40665,City Street,HOVER,2021-01-02,17:04:00,40.167007,-105.130895,9TH AVE,HOVER ST,On Roadway,...,,,,,,,,0,0,"9TH AVE and HOVER ST, Longmont, CO"
1,2021-41331,City Street,MAINS,2021-01-03,14:19:00,40.199466,-105.104269,23RD AVE,MAIN ST,On Roadway,...,,,,,,,,0,0,"23RD AVE and MAIN ST, Longmont, CO"
2,2021-41336,City Street,3RDAV,2021-01-03,18:09:00,40.166961,-105.130811,HOVER ST,3RD AVE,On Roadway,...,,,,,,,,0,0,"HOVER ST and 3RD AVE, Longmont, CO"
3,2021-40743,City Street,KENPR,2021-01-04,11:10:00,40.141876,-105.130777,S. HOVER ST,KEN PRATT BLVD,On Roadway,...,,,,,,,,0,0,"S. HOVER ST and KEN PRATT BLVD, Longmont, CO"
4,2021-41333,City Street,WARRE,2021-01-05,09:45:00,40.174925,-105.123685,TULIP ST,WARREN AVE,Ran off right side,...,,,,,,,,0,0,"TULIP ST and WARREN AVE, Longmont, CO"
5,2021-41321,State Highway,,2021-01-05,11:18:00,40.152102,-105.117192,KEN PRATT BLVD E,NELSON RD,Ran off right side,...,,,,,,,,0,0,"KEN PRATT BLVD E and NELSON RD, Longmont, CO"


## 10. Write Crash Data to CSV

In [12]:
# Save the compiled crash data as a comma-separated file (comma is the default separator)
all_crashes.to_csv('longmont_crashes.csv')