In [1]:
import pandas as pd
# Notebook-wide display setting: pandas renders floats through '%.0f', i.e. with
# no decimal places. This only affects how values are shown, not the stored data.
pd.set_option('display.float_format', lambda x: '%.0f' % x)
import os
from sklearn.cluster import KMeans
import numpy as np 

### Extracting Data From NHTSA GES Repository Years 2010-2015

Data files, file formats, and storage directories differ across years, which made it difficult to fetch the data in a single batch directly from the FTP server set up by NHTSA, so the directory for each year was downloaded manually. Because feature column names, value categories, and filenames also vary considerably from year to year, the attribute names, years, and file paths were first stored in data containers to make it easier to build a data cleaning script.

In [2]:
# Survey years covered by this notebook: 2010 through 2015 inclusive.
years = list(range(2010, 2016))

# One local directory per survey year, listed in the same order as `years`.
# Each directory holds every SAS file needed for that year of the NHTSA GES dataset.
dir_years = [
    r"C:\Users\murra667\Documents\Springboard\Capstone_2\After 2010\GES10_PCSAS\repost GES",
    r"C:\Users\murra667\Documents\Springboard\Capstone_2\After 2010\GES11_PCSAS",
    r"C:\Users\murra667\Documents\Springboard\Capstone_2\After 2010\GES12_PCSAS",
    r"C:\Users\murra667\Documents\Springboard\Capstone_2\After 2010\GES13_PCSAS",
    r"C:\Users\murra667\Documents\Springboard\Capstone_2\After 2010\GES2014SAS",
    r"C:\Users\murra667\Documents\Springboard\Capstone_2\After 2010\GES2015sas",
]

# Attributes to extract from each SAS file type (person, accident, vehicle, vevent).
# Where a column is spelled differently between years, both spellings are listed;
# the extraction loop later keeps whichever one applies to the year being read.
sas_file_dict = {
    "person": {
        "cols": [
            'VEHNO', 'VEH_NO', 'CASENUM', 'PER_TYP',
            'AGE', 'SEX', 'ALC_RES', 'ALTRSULT',
        ],
    },
    "accident": {
        "cols": [
            'CASENUM', 'MONTH', 'DAY_WEEK', 'YEAR', 'HOUR', 'MINUTE',
            'LAND_USE', 'RELJCT2', 'REL_ROAD', 'WRK_ZONE',
            'INT_HWY', 'TYP_INT', 'NON_INVL', 'MAN_COL',
            'MAX_SEV', 'MAN_COLL', 'PERNOTMVIT',
        ],
    },
    "vehicle": {
        "cols": [
            'VEHNO', 'VEH_NO', 'CASENUM', 'TRAV_SP', 'VSPD_LIM',
            'ACC_TYPE', 'MAKE', 'VNUM_LAN', 'NUMOCCS',
        ],
    },
    "vevent": {
        "cols": ['CASENUM', 'AOI1', 'GAD', 'VEHNO', 'VEH_NO'],
    },
}

# Attach a year -> directory lookup to every file type, so the data is addressed
# first by file type and then by year. Each file type gets its own dict object.
for file_attrs in sas_file_dict.values():
    file_attrs['years'] = {yr: folder for yr, folder in zip(years, dir_years)}

### First create a dataframe with Distracted Driving Cases

In [3]:
# Collect the "distract" SAS table of every survey year, reduced to one row per
# vehicle per crash, then stack the years into a single dataframe.
distracted_dfs = []
for year, file in zip(years, dir_years):
    df = pd.read_sas(os.path.join(file, "distract.sas7bdat"))
    # only the 2010 table is renamed: its vehicle-number column is called "VEHNO"
    legacy_names = {"VEHNO": "VEH_NO"} if year == 2010 else {}
    df = df.rename(columns=legacy_names)
    # both key parts become strings so that they can be concatenated below
    df = df.astype({"VEH_NO": str, "CASENUM": str})
    # unique identifier for each car involved in any crash:
    # the case number immediately followed by the vehicle number
    df = df.assign(CASENUM_VEH_NO=df["CASENUM"] + df["VEH_NO"])
    # keep only the first row recorded for each vehicle
    df = df.drop_duplicates(subset="CASENUM_VEH_NO", keep="first")
    distracted_dfs.append(df)

# stack all years, drop rows that are exact duplicates, then empty the
# per-year container
distracted_df = pd.concat(distracted_dfs).drop_duplicates(keep="first")
distracted_dfs.clear()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.




### Bring together NHTSA GES crash data across years and file types

In [4]:
# Build one table per GES file type (person, accident, vehicle, vevent) by stacking
# that file type's yearly SAS files, then inner-join each table onto distracted_df.
#
# Reads : sas_file_dict (columns and year -> directory for each file type) and
#         distracted_df (one row per vehicle, keyed by CASENUM_VEH_NO).
# Writes: distracted_df (re-bound after every merge) and involved_non_occupants
#         (one (CASENUM, non_motorist) frame per year of the person file).

# sas_file_dict lists two spellings for several attributes. For a given year only
# one spelling is selected and, where needed, renamed so that every year ends up
# with the same column names. The 2010 file is handled differently from the rest.
skip_cols_2010 = {"VEH_NO", "MAN_COLL", "PERNOTMVIT", "AOI1", "ALC_RES"}
skip_cols_later = {"VEHNO", "MAN_COL", "NON_INVL", "GAD", "ALTRSULT"}
renames_2010 = {"VEHNO": "VEH_NO", "MAN_COL": "MAN_COLL",
                "NON_INVL": "PERNOTMVIT", "GAD": "AOI1"}
renames_later = {"ALC_RES": "ALTRSULT"}
# PER_TYP codes that are flagged as non-motorists
non_motorist_codes = [5, 6, 7, 19]

# (CASENUM, non_motorist) rows of every person file, consumed by a later cell
involved_non_occupants = []
# yearly frames of the file type currently being processed
concat_file_type = []
# iterate through file types and their respective settings in sas_file_dict
for file_type, file_type_attrs in sas_file_dict.items():
    # full list of candidate columns for this file type (both spellings)
    columns = file_type_attrs['cols']
    # iterate through each year
    for year, file_list in file_type_attrs['years'].items():
        filepath = os.path.join(file_list, file_type + ".sas7bdat")
        is_2010 = (year == 2010)
        skipped = skip_cols_2010 if is_2010 else skip_cols_later
        # columns to select for this year, in their original listed order
        columns_adj = [col for col in columns if col not in skipped]

        current_df = pd.read_sas(filepath)
        if file_type == "person":
            # Capture data on involved non-occupants (peds, cyclists, etc.) before
            # the frame is restricted to drivers (PER_TYP == 1) just below;
            # otherwise those people would be lost.
            current_df['non_motorist'] = (
                current_df['PER_TYP'].isin(non_motorist_codes).astype("int64")
            )
            involved_non_occupants.append(current_df[['CASENUM', 'non_motorist']])
            current_df = current_df.loc[current_df['PER_TYP'] == 1]

        # Select the year's columns and harmonise their names for joining. Keys of
        # the rename maps that are not present in this file type are ignored by
        # rename, so one map per era serves every file type.
        current_df = current_df[columns_adj].rename(
            columns=renames_2010 if is_2010 else renames_later
        )

        # Drop rows with a missing key BEFORE casting to str: once cast, a missing
        # value becomes the literal text "nan" and dropna no longer recognises it.
        # The accident file has no vehicle number, so only CASENUM is a key there.
        key_cols = ["CASENUM"] if file_type == "accident" else ["CASENUM", "VEH_NO"]
        # .copy() so the assignments below act on an independent frame
        current_df = current_df.dropna(subset=key_cols).copy()
        current_df["CASENUM"] = current_df["CASENUM"].astype(str)
        if file_type != "accident":
            current_df["VEH_NO"] = current_df["VEH_NO"].astype(str)
            # same per-vehicle key as the one built on distracted_df
            current_df["CASENUM_VEH_NO"] = current_df["CASENUM"] + current_df["VEH_NO"]
        # for each file type, across years, add to container for concatenation
        concat_file_type.append(current_df)

    # concatenate file type across each year
    current_df = pd.concat(concat_file_type)
    # empty the data container after concatenation
    concat_file_type.clear()
    # The accident file has no vehicle number, and therefore no CASENUM_VEH_NO, so
    # it is joined on the case number alone.
    # NOTE(review): this branch relies on "person" having been merged first (it is
    # the first key of sas_file_dict); that merge is what leaves the suffixed
    # CASENUM_x / CASENUM_y columns used here.
    if file_type == "accident":
        distracted_df = distracted_df.drop(columns = ["CASENUM_y"])
        distracted_df = pd.merge(left=distracted_df, right=current_df, left_on='CASENUM_x', right_on='CASENUM')
    # merge each file type into single dataframe (already concatenated by year)
    else:
        distracted_df = pd.merge(left = distracted_df, right = current_df, left_on="CASENUM_VEH_NO", right_on="CASENUM_VEH_NO")
     

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.




### Ensure That Data is Kept On Accidents Where A Non-Motorist Is Involved

In [5]:
# Stack the per-year (CASENUM, non_motorist) frames collected from the person files.
involved_non_occupant = pd.concat(involved_non_occupants)
del (involved_non_occupants)
# Collapse to one row per CASENUM. drop_duplicates keeps the first row it meets
# for each case, so rows flagged non_motorist == 1 are moved to the front first.
# Without this, a crash whose first person row is a vehicle occupant would be
# recorded as 0 even though a non-motorist was involved. The stable sort
# ("mergesort") leaves the relative order of equally-flagged rows unchanged.
involved_non_occupant = (
    involved_non_occupant
    .sort_values("non_motorist", ascending=False, kind="mergesort")
    .drop_duplicates(subset="CASENUM")
)

### Merge Main Dataframe with Dataframe Containing Crashes Involving Non-Motorists

In [6]:
# keep a single row for each vehicle key (CASENUM_VEH_NO)
distracted_df = distracted_df.drop_duplicates(subset="CASENUM_VEH_NO")

# leftover merge-suffixed key columns plus fields that are not carried into the output
unwanted_columns = [
    "CASENUM_x", "CASENUM_y", "VEH_NO_x", "VEH_NO_y",
    "PSUSTRAT", "PSU", "STRATUM", "PERNOTMVIT", "PJ",
]
final_df = distracted_df.drop(columns=unwanted_columns)

# Cast CASENUM to float and then to str in both frames; presumably this is so
# the two sides hold the case number in the same textual form for the join.
involved_non_occupant["CASENUM"] = involved_non_occupant["CASENUM"].astype(float).astype(str)
final_df["CASENUM"] = final_df["CASENUM"].astype(float).astype(str)

# right join: every row of final_df is kept and gains the non_motorist flag
# of its case where one exists
final_df = pd.merge(
    left=involved_non_occupant,
    right=final_df,
    how="right",
    left_on="CASENUM",
    right_on="CASENUM",
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [7]:
# Write the merged dataframe to "nhtsa_ges_extracted.csv", a relative path that
# resolves against the current working directory. to_csv is called with its
# defaults, so the dataframe index is written as the first column.
final_df.to_csv("nhtsa_ges_extracted.csv")