In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [36]:
import os
import pandas as pd

# Data files to keep for each sub dir in Data
csv_files = {
    'accident': 'accident.csv',
    'drimpair': 'DrImpair.csv',
    'weather': 'weather.csv'
}

# Create an empty DataFrame to store merged data
merged_data = pd.DataFrame()

for year in range(2016, 2020):
    print(f'Processing year {year}')
    
    file_paths = {key: f'Data/FARS{year}NationalCSV/{filename}' for key, filename in csv_files.items()}
    
    # Check if accident.csv exists first (primary dataset)
    if not os.path.exists(file_paths['accident']):
        print(f"File {file_paths['accident']} does not exist, skipping year {year}.")
        continue
    
    # Read accident dataset
    accident_df = pd.read_csv(file_paths['accident'], encoding='ISO-8859-1')
    
    # Read and merge drimpair data if available
    if os.path.exists(file_paths['drimpair']):
        drimpair_df = pd.read_csv(file_paths['drimpair'], encoding='ISO-8859-1')
        accident_df = accident_df.merge(drimpair_df, on=['STATE', 'ST_CASE'], how='left')
    else:
        print(f"File {file_paths['drimpair']} not found for {year}, skipping.")
    
    # Read and merge weather data if available
    if os.path.exists(file_paths['weather']):
        weather_df = pd.read_csv(file_paths['weather'], encoding='ISO-8859-1')
        accident_df = accident_df.merge(weather_df, on=['STATE', 'ST_CASE'], how='left')
    else:
        print(f"File {file_paths['weather']} not found for {year}, skipping.")
    
    # Append merged data for this year
    merged_data = pd.concat([merged_data, accident_df], ignore_index=True)

# Display merged data structure
merged_data

Processing year 2016
File Data/FARS2016NationalCSV/weather.csv not found for 2016, skipping.
Processing year 2017


  accident_df = pd.read_csv(file_paths['accident'], encoding='ISO-8859-1')


File Data/FARS2017NationalCSV/weather.csv not found for 2017, skipping.
Processing year 2018
File Data/FARS2018NationalCSV/weather.csv not found for 2018, skipping.
Processing year 2019


  accident_df = pd.read_csv(file_paths['accident'], encoding='ISO-8859-1')


File Data/FARS2019NationalCSV/weather.csv not found for 2019, skipping.


Unnamed: 0,STATE,STATENAME_x,ST_CASE,VE_TOTAL,VE_FORMS,PVH_INVL,PEDS,PERSONS,PERMVIT,PERNOTMVIT,...,CF2,CF2NAME,CF3,CF3NAME,FATALS,DRUNK_DR,STATENAME_y,VEH_NO,DRIMPAIR,DRIMPAIRNAME
0,1,Alabama,10001,1,1,0,0,1,1,0,...,0,,0,,1,1,Alabama,1,9,"Under the Influence of Alcohol, Drugs or Medic..."
1,1,Alabama,10002,1,1,0,0,2,2,0,...,0,,0,,1,1,Alabama,1,9,"Under the Influence of Alcohol, Drugs or Medic..."
2,1,Alabama,10003,2,1,1,0,2,1,1,...,0,,0,,1,0,Alabama,1,2,Asleep or Fatigued
3,1,Alabama,10004,1,1,0,0,1,1,0,...,0,,0,,1,1,Alabama,1,9,"Under the Influence of Alcohol, Drugs or Medic..."
4,1,Alabama,10005,1,1,0,0,1,1,0,...,0,,0,,1,0,Alabama,1,99,Unknown if Impaired
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210523,56,Wyoming,560119,1,1,0,0,1,1,0,...,0,,0,,1,1,Wyoming,1,9,"Under the Influence of Alcohol, Drugs or Medic..."
210524,56,Wyoming,560120,1,1,0,0,2,2,0,...,0,,0,,1,1,Wyoming,1,2,Asleep or Fatigued
210525,56,Wyoming,560120,1,1,0,0,2,2,0,...,0,,0,,1,1,Wyoming,1,9,"Under the Influence of Alcohol, Drugs or Medic..."
210526,56,Wyoming,560121,2,2,0,0,3,3,0,...,0,,0,,2,1,Wyoming,1,99,Reported as Unknown if Impaired


In [37]:
cols_to_drop = ['MONTHNAME', 'DAY_WEEKNAME', 'HOURNAME', 'MINUTENAME', 'STATENAME_y',
                'NHSNAME', 'ROUTENAME', 'RUR_URBNAME', 'FUNC_SYSNAME', 'RD_OWNERNAME', 'LATITUDENAME', 'LONGITUDNAME', 
                'SP_JURNAME', 'HARM_EVNAME', 'MAN_COLLNAME', 'RELJCT1NAME', 'RELJCT2NAME', 'TYP_INTNAME', 'WRK_ZONENAME', 
                'REL_ROADNAME', 'LGT_CONDNAME', 'WEATHER1NAME', 'WEATHER2NAME', 'WEATHERNAME', 'SCH_BUSNAME', 'RAILNAME']
merged_data.drop(columns=[col for col in cols_to_drop if col in merged_data.columns], inplace=True)
merged_data.rename(columns={'STATENAME_x': 'STATENAME'}, inplace=True)

# Perform Exploratory Data Analysis (EDA)
print("Basic Info:")
print(merged_data.info())
print("\nSummary Statistics:")
print(merged_data.describe())
print("\nMissing Values:")
print(merged_data.isnull().sum())
print("\nTop 5 States with Most Accidents:")
print(merged_data['STATENAME'].value_counts().head())
print("\nTop 5 Weather Conditions in Accidents:")
print(merged_data['WEATHER'].value_counts().head())
print("\nTop 5 Harmful Events in Accidents:")
print(merged_data['HARM_EV'].value_counts().head())

# Save cleaned dataset to CSV
output_file = "merged_fars_data.csv"
merged_data.to_csv(output_file, index=False)
print(f"Cleaned data saved to {output_file}")


Basic Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210528 entries, 0 to 210527
Data columns (total 69 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   STATE         210528 non-null  int64  
 1   STATENAME     210528 non-null  object 
 2   ST_CASE       210528 non-null  int64  
 3   VE_TOTAL      210528 non-null  int64  
 4   VE_FORMS      210528 non-null  int64  
 5   PVH_INVL      210528 non-null  int64  
 6   PEDS          210528 non-null  int64  
 7   PERSONS       210528 non-null  int64  
 8   PERMVIT       210528 non-null  int64  
 9   PERNOTMVIT    210528 non-null  int64  
 10  COUNTY        210528 non-null  int64  
 11  COUNTYNAME    210528 non-null  object 
 12  CITY          210528 non-null  int64  
 13  CITYNAME      210528 non-null  object 
 14  DAY           210528 non-null  int64  
 15  DAYNAME       210528 non-null  int64  
 16  MONTH         210528 non-null  int64  
 17  YEAR          210528 non-null  int64

In [34]:
# # check if STATENAME_x and STATENAME_y are ever different
# if 'STATENAME_x' in merged_data.columns and 'STATENAME_y' in merged_data.columns:
#     different_states = merged_data[merged_data['STATENAME_x'] != merged_data['STATENAME_y']]
#     if not different_states.empty:
#         print("There are rows where STATENAME_x and STATENAME_y are different:")
#         print(different_states[['STATENAME_x', 'STATENAME_y']].head())
#     else:
#         print("STATENAME_x and STATENAME_y are always the same.")

STATENAME_x and STATENAME_y are always the same.
