In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [2]:
import os
import pandas as pd

# Data files to keep for each sub dir in Data
csv_files = {
    'accident': 'accident.csv',
    'drimpair': 'DrImpair.csv',
    'weather': 'weather.csv'
}

# Create an empty DataFrame to store merged data
merged_data = pd.DataFrame()

for year in range(2016, 2023):
    print(f'Processing year {year}')
    
    file_paths = {key: f'Data/FARS{year}NationalCSV/{filename}' for key, filename in csv_files.items()}
    
    # Check if accident.csv exists first (primary dataset)
    if not os.path.exists(file_paths['accident']):
        print(f"File {file_paths['accident']} does not exist, skipping year {year}.")
        continue
    
    # Read accident dataset
    accident_df = pd.read_csv(file_paths['accident'], encoding='ISO-8859-1')
    
    # Read and merge drimpair data if available
    if os.path.exists(file_paths['drimpair']):
        drimpair_df = pd.read_csv(file_paths['drimpair'], encoding='ISO-8859-1')
        accident_df = accident_df.merge(drimpair_df, on=['STATE', 'ST_CASE'], how='left')
    else:
        print(f"File {file_paths['drimpair']} not found for {year}, skipping.")
    
    # Read and merge weather data if available
    if os.path.exists(file_paths['weather']):
        weather_df = pd.read_csv(file_paths['weather'], encoding='ISO-8859-1')
        accident_df = accident_df.merge(weather_df, on=['STATE', 'ST_CASE'], how='left')
    else:
        print(f"File {file_paths['weather']} not found for {year}, skipping.")
    
    # Append merged data for this year
    merged_data = pd.concat([merged_data, accident_df], ignore_index=True)

# Display merged data structure
merged_data

Processing year 2016
File Data/FARS2016NationalCSV/weather.csv not found for 2016, skipping.
Processing year 2017


  accident_df = pd.read_csv(file_paths['accident'], encoding='ISO-8859-1')


File Data/FARS2017NationalCSV/weather.csv not found for 2017, skipping.
Processing year 2018
File Data/FARS2018NationalCSV/weather.csv not found for 2018, skipping.
Processing year 2019


  accident_df = pd.read_csv(file_paths['accident'], encoding='ISO-8859-1')


File Data/FARS2019NationalCSV/weather.csv not found for 2019, skipping.
Processing year 2020
Processing year 2021
Processing year 2022


Unnamed: 0,STATE,STATENAME_x,ST_CASE,VE_TOTAL,VE_FORMS,PVH_INVL,PEDS,PERSONS,PERMVIT,PERNOTMVIT,...,DRUNK_DR,STATENAME_y,VEH_NO,DRIMPAIR,DRIMPAIRNAME,WEATHER_x,WEATHERNAME_x,STATENAME,WEATHER_y,WEATHERNAME_y
0,1,Alabama,10001,1,1,0,0,1,1,0,...,1.0,Alabama,1,9,"Under the Influence of Alcohol, Drugs or Medic...",,,,,
1,1,Alabama,10002,1,1,0,0,2,2,0,...,1.0,Alabama,1,9,"Under the Influence of Alcohol, Drugs or Medic...",,,,,
2,1,Alabama,10003,2,1,1,0,2,1,1,...,0.0,Alabama,1,2,Asleep or Fatigued,,,,,
3,1,Alabama,10004,1,1,0,0,1,1,0,...,1.0,Alabama,1,9,"Under the Influence of Alcohol, Drugs or Medic...",,,,,
4,1,Alabama,10005,1,1,0,0,1,1,0,...,0.0,Alabama,1,99,Unknown if Impaired,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
389867,56,Wyoming,560114,2,2,0,0,2,2,0,...,,Wyoming,2,0,None/Apparently Normal,1.0,Clear,Wyoming,1.0,Clear
389868,56,Wyoming,560115,1,1,0,0,1,1,0,...,,Wyoming,1,0,None/Apparently Normal,4.0,Snow,Wyoming,4.0,Snow
389869,56,Wyoming,560116,1,1,0,0,1,1,0,...,,Wyoming,1,1,"Ill, Blackout",4.0,Snow,Wyoming,4.0,Snow
389870,56,Wyoming,560117,1,1,0,0,1,1,0,...,,Wyoming,1,9,"Under the Influence of Alcohol, Drugs or Medic...",4.0,Snow,Wyoming,4.0,Snow


In [3]:
cols_to_drop = ['MONTHNAME', 'DAY_WEEKNAME', 'HOURNAME', 'MINUTENAME', 'STATENAME_y', "STATENAME",
                'NHSNAME', 'ROUTENAME', 'RUR_URBNAME', 'FUNC_SYSNAME', 'RD_OWNERNAME', 'LATITUDENAME', 'LONGITUDNAME', 
                'SP_JURNAME', 'HARM_EVNAME', 'MAN_COLLNAME', 'RELJCT1NAME', 'RELJCT2NAME', 'TYP_INTNAME', 'WRK_ZONENAME', 
                'REL_ROADNAME', 'LGT_CONDNAME', 'WEATHER1NAME', 'WEATHER2NAME', 'WEATHERNAME', 'SCH_BUSNAME', 'RAILNAME']
merged_data.drop(columns=[col for col in cols_to_drop if col in merged_data.columns], inplace=True)
merged_data.rename(columns={'STATENAME_x': 'STATENAME'}, inplace=True)

# Perform Exploratory Data Analysis (EDA)
print("Basic Info:")
print(merged_data.info())
print("\nSummary Statistics:")
print(merged_data.describe())
print("\nMissing Values:")
print(merged_data.isnull().sum())
print("\nTop 5 States with Most Accidents:")
print(merged_data['STATENAME'].value_counts().head())
print("\nTop 5 Weather Conditions in Accidents:")
print(merged_data['WEATHER'].value_counts().head())
print("\nTop 5 Harmful Events in Accidents:")
print(merged_data['HARM_EV'].value_counts().head())

# Split the dataset into two halves
split_index = len(merged_data) // 2
merged_data_1 = merged_data.iloc[:split_index]
merged_data_2 = merged_data.iloc[split_index:]

# Save the first half to CSV
output_file_1 = "merged_fars_data_part1.csv"
merged_data_1.to_csv(output_file_1, index=False)
print(f"First half of cleaned data saved to {output_file_1}")

# Save the second half to CSV
output_file_2 = "merged_fars_data_part2.csv"
merged_data_2.to_csv(output_file_2, index=False)
print(f"Second half of cleaned data saved to {output_file_2}")


Basic Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 389872 entries, 0 to 389871
Data columns (total 73 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   STATE          389872 non-null  int64  
 1   STATENAME      389872 non-null  object 
 2   ST_CASE        389872 non-null  int64  
 3   VE_TOTAL       389872 non-null  int64  
 4   VE_FORMS       389872 non-null  int64  
 5   PVH_INVL       389872 non-null  int64  
 6   PEDS           389872 non-null  int64  
 7   PERSONS        389872 non-null  int64  
 8   PERMVIT        389872 non-null  int64  
 9   PERNOTMVIT     389872 non-null  int64  
 10  COUNTY         389872 non-null  int64  
 11  COUNTYNAME     389872 non-null  object 
 12  CITY           389872 non-null  int64  
 13  CITYNAME       389872 non-null  object 
 14  DAY            389872 non-null  int64  
 15  DAYNAME        389872 non-null  int64  
 16  MONTH          389872 non-null  int64  
 17  YEAR           38

In [8]:
merged_data['COUNTYNAME'].describe()
# See how many null values are in the COUNTYNAME column
# merged_data['COUNTYNAME'].isnull().sum()

count               389872
unique                3031
top       LOS ANGELES (37)
freq                  7989
Name: COUNTYNAME, dtype: object

In [9]:
merged_data['COUNTYNAME']

0          JEFFERSON (73)
1          JEFFERSON (73)
2          JEFFERSON (73)
3          JEFFERSON (73)
4          JEFFERSON (73)
               ...       
389867    SWEETWATER (37)
389868       LINCOLN (23)
389869       LARAMIE (21)
389870       JOHNSON (19)
389871       LARAMIE (21)
Name: COUNTYNAME, Length: 389872, dtype: object

In [10]:
# get county names where STATENAME is Louisiana
louisiana_counties = merged_data[merged_data['STATENAME'] == 'Louisiana']['COUNTYNAME']

In [11]:
louisiana_counties

22875                CADDO (17)
22876          TERREBONNE (109)
22877          TERREBONNE (109)
22878            CALCASIEU (19)
22879          TANGIPAHOA (105)
                  ...          
355954             ORLEANS (71)
355955          VERMILION (113)
355956    EAST BATON ROUGE (33)
355957    EAST BATON ROUGE (33)
355958             RAPIDES (79)
Name: COUNTYNAME, Length: 8032, dtype: object

In [12]:
merged_data

Unnamed: 0,STATE,STATENAME,ST_CASE,VE_TOTAL,VE_FORMS,PVH_INVL,PEDS,PERSONS,PERMVIT,PERNOTMVIT,...,CF3NAME,FATALS,DRUNK_DR,VEH_NO,DRIMPAIR,DRIMPAIRNAME,WEATHER_x,WEATHERNAME_x,WEATHER_y,WEATHERNAME_y
0,1,Alabama,10001,1,1,0,0,1,1,0,...,,1,1.0,1,9,"Under the Influence of Alcohol, Drugs or Medic...",,,,
1,1,Alabama,10002,1,1,0,0,2,2,0,...,,1,1.0,1,9,"Under the Influence of Alcohol, Drugs or Medic...",,,,
2,1,Alabama,10003,2,1,1,0,2,1,1,...,,1,0.0,1,2,Asleep or Fatigued,,,,
3,1,Alabama,10004,1,1,0,0,1,1,0,...,,1,1.0,1,9,"Under the Influence of Alcohol, Drugs or Medic...",,,,
4,1,Alabama,10005,1,1,0,0,1,1,0,...,,1,0.0,1,99,Unknown if Impaired,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
389867,56,Wyoming,560114,2,2,0,0,2,2,0,...,,1,,2,0,None/Apparently Normal,1.0,Clear,1.0,Clear
389868,56,Wyoming,560115,1,1,0,0,1,1,0,...,,1,,1,0,None/Apparently Normal,4.0,Snow,4.0,Snow
389869,56,Wyoming,560116,1,1,0,0,1,1,0,...,,1,,1,1,"Ill, Blackout",4.0,Snow,4.0,Snow
389870,56,Wyoming,560117,1,1,0,0,1,1,0,...,,1,,1,9,"Under the Influence of Alcohol, Drugs or Medic...",4.0,Snow,4.0,Snow
