Imports

In [2]:
import pandas as pd
import random
import numpy as np
import warnings

Global functions

In [3]:
def get_accident_df(subset="standard_small", n_rows=10000, is_refined=False):
    """Load US accident data from CSV. Outdated: always emits a warning.

    Parameters
    ----------
    subset : str
        Which data to load:
        - "standard_small": the whole of "DataSubsetRaw.csv";
        - "new": a random sample of ``n_rows`` data rows from
          "US_Accidents_June20.csv";
        - "all": the whole of "US_Accidents_June20.csv".
    n_rows : int
        Number of rows to sample. Only used when ``subset == "new"``.
    is_refined : bool
        When True, the loaded frame is passed through ``refine_df`` before
        being returned.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ValueError
        If ``subset`` is not one of the names above (this used to return
        ``None`` silently), or if ``n_rows`` is outside ``0..total_rows``.
    """
    # Emitted once per call; the refined path no longer re-enters this
    # function, so the warning is not raised a second time.
    warnings.warn("This function is outdated and should not be used.")

    if subset == "standard_small":
        df = pd.read_csv("DataSubsetRaw.csv")
    elif subset == "new":
        total_rows = 3513617  # number of rows in the dataset
        if not 0 <= n_rows <= total_rows:
            raise ValueError(
                f"n_rows must be between 0 and {total_rows}, got {n_rows!r}"
            )
        # Pick the line numbers to drop; numbering starts at 1 so the header
        # line (index 0) is never skipped.
        skip = sorted(random.sample(range(1, total_rows + 1), total_rows - n_rows))
        df = pd.read_csv("US_Accidents_June20.csv", skiprows=skip)
    elif subset == "all":
        df = pd.read_csv("US_Accidents_June20.csv")
    else:
        raise ValueError(
            f"Unknown subset {subset!r}; expected 'standard_small', 'new' or 'all'."
        )

    return refine_df(df) if is_refined else df

Internal functions

In [None]:
def get_small_df(size=100):
    """Read a random sample of ``size`` data rows from "US_Accidents_June20.csv".

    The sample is drawn by choosing which line numbers to skip, so only the
    kept rows are parsed. Line numbering starts at 1, which means the header
    line (index 0) is never skipped.

    Parameters
    ----------
    size : int
        Number of rows to keep.

    Returns
    -------
    pandas.DataFrame
    """
    total_rows = 3513617  # number of rows in the dataset
    n_to_skip = total_rows - size
    rows_to_skip = random.sample(range(1, total_rows + 1), n_to_skip)
    rows_to_skip.sort()
    return pd.read_csv("US_Accidents_June20.csv", skiprows=rows_to_skip)

In [4]:
def refine_df(df):
    """Apply the refinement steps to ``df`` in order and return the result.

    Steps, in order: ``change_column_names``, ``convert_to_metric``,
    ``timeStamp_df``, ``duration_df``. Each step receives the frame returned
    by the previous one.
    """
    pipeline = (change_column_names, convert_to_metric, timeStamp_df, duration_df)
    for step in pipeline:
        df = step(df)
    return df

In [5]:
def change_column_names(df):
    """Simplify the column labels of ``df`` in place and return ``df``.

    For every label, underscores are removed and anything from the first
    "(" onwards is dropped, e.g. "Wind_Chill(F)" becomes "WindChill".
    """
    def _simplify(label):
        # Underscores are stripped first, then the "(...)" suffix is cut off.
        without_underscores = label.replace("_", "")
        return without_underscores.split("(", 1)[0]

    df.columns = [_simplify(label) for label in df.columns]
    return df

In [6]:
def convert_to_metric(df):
    """Convert the imperial-unit columns of ``df`` to metric, in place.

    Columns rewritten (all must be present):
    - "Distance", "Visibility": miles to km
    - "WindSpeed": mph to kph
    - "Temperature", "WindChill": Fahrenheit to Celsius
    - "Pressure": inches of mercury to bar
    - "Precipitation": inches to cm

    Returns the same frame. Calling this twice converts the values twice.
    """
    km_per_mile = 1.60934

    def miles_to_km(values):
        return values * km_per_mile

    def fahrenheit_to_celsius(values):
        return (values - 32) * 5 / 9

    def inhg_to_bar(values):
        return values / 29.53

    def inches_to_cm(values):
        return values * 2.54

    conversions = (
        ("Distance", miles_to_km),
        ("Temperature", fahrenheit_to_celsius),
        ("WindChill", fahrenheit_to_celsius),
        ("Pressure", inhg_to_bar),
        ("Visibility", miles_to_km),
        ("WindSpeed", miles_to_km),  # mph to kph uses the same factor
        ("Precipitation", inches_to_cm),
    )
    for column, convert in conversions:
        df[column] = convert(df[column])
    return df

In [7]:
def timeStamp_df(df):
    """Parse the 'StartTime' and 'EndTime' columns of ``df`` as datetimes.

    Both columns are replaced in place with the result of ``pd.to_datetime``;
    the same frame is returned.
    """
    for column in ('StartTime', 'EndTime'):
        df[column] = pd.to_datetime(df[column])
    return df

In [8]:
def duration_df(df):
    """Add a 'Duration' column holding 'EndTime' minus 'StartTime'.

    The column is written onto ``df`` itself, which is also returned. With
    datetime inputs the result is a timedelta column.
    """
    elapsed = df['EndTime'].sub(df['StartTime'])
    df['Duration'] = elapsed
    return df

Functions for population data

In [None]:
def accidentCounts(df):
    """Count accidents per (City, State) pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'City', 'State' and 'ID' columns.

    Returns
    -------
    pandas.DataFrame
        Columns 'City', 'State', 'Counts', one row per city/state pair, where
        'Counts' is the number of non-null 'ID' values in that group.
    """
    # Select 'ID' before aggregating so only that column is counted, rather
    # than counting every column of the frame and discarding all but one.
    return (
        df.groupby(['City', 'State'])['ID']
        .count()
        .reset_index(name='Counts')
    )

In [None]:
def populationData():
    """Load "CityPopulations.csv" and normalise its state names.

    Full state names in the 'State' column (the 50 states plus the District
    of Columbia) are replaced by their two-letter codes; other values are
    left as they are. A 'City + State' column of the form "City, ST" is added
    to make it easier to label bar plots.

    Returns
    -------
    pandas.DataFrame
    """
    code_by_state_name = {
        'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR',
        'California': 'CA', 'Colorado': 'CO', 'Connecticut': 'CT',
        'Delaware': 'DE', 'District of Columbia': 'DC', 'Florida': 'FL',
        'Georgia': 'GA', 'Hawaii': 'HI', 'Idaho': 'ID', 'Illinois': 'IL',
        'Indiana': 'IN', 'Iowa': 'IA', 'Kansas': 'KS', 'Kentucky': 'KY',
        'Louisiana': 'LA', 'Maine': 'ME', 'Maryland': 'MD',
        'Massachusetts': 'MA', 'Michigan': 'MI', 'Minnesota': 'MN',
        'Mississippi': 'MS', 'Missouri': 'MO', 'Montana': 'MT',
        'Nebraska': 'NE', 'Nevada': 'NV', 'New Hampshire': 'NH',
        'New Jersey': 'NJ', 'New Mexico': 'NM', 'New York': 'NY',
        'North Carolina': 'NC', 'North Dakota': 'ND', 'Ohio': 'OH',
        'Oklahoma': 'OK', 'Oregon': 'OR', 'Pennsylvania': 'PA',
        'Rhode Island': 'RI', 'South Carolina': 'SC', 'South Dakota': 'SD',
        'Tennessee': 'TN', 'Texas': 'TX', 'Utah': 'UT', 'Vermont': 'VT',
        'Virginia': 'VA', 'Washington': 'WA', 'West Virginia': 'WV',
        'Wisconsin': 'WI', 'Wyoming': 'WY',
    }
    full_names = list(code_by_state_name.keys())
    short_codes = list(code_by_state_name.values())

    population = pd.read_csv("CityPopulations.csv")
    population['State'] = population['State'].replace(full_names, short_codes)
    # Built after the replacement so the label carries the two-letter code.
    population['City + State'] = population['City'] + ', ' + population['State']
    return population

In [None]:
def mergedPopulationCounts(df):
    """Join city populations with accident counts and add a per-capita rate.

    Calls ``accidentCounts(df)`` and ``populationData()``, merges the two
    frames with pandas' default merge settings (no explicit key is passed),
    and adds 'Counts per 100K' = 'Counts' / ('Population' / 100000).

    Returns
    -------
    pandas.DataFrame
        The merged frame including the 'Counts per 100K' column.
    """
    accident_counts = accidentCounts(df)
    population = populationData()
    combined = population.merge(accident_counts)
    per_100k = combined['Counts'] / (combined['Population'] / 100000)
    return combined.assign(**{'Counts per 100K': per_100k})