In [None]:
import sys
import os
import requests
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math
# Put the sibling ../source directory (resolved against the current working
# directory) on the import path. The membership check keeps the cell
# idempotent: re-running it does not stack duplicate entries in sys.path.
source_dir = os.path.abspath('../source')
if source_dir not in sys.path:
    sys.path.append(source_dir)

In [None]:
# Endpoint queried once per year to download the "Accident" dataset as CSV.
base_url = "https://crashviewer.nhtsa.dot.gov/CrashAPI/FARSData/GetFARSData"

# Request headers
headers = {
    "Accept": "application/json", #TODO: change to "Accept": "text/csv"
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
}

# Create the folder that stores the downloaded CSV files
output_dir = "../data"
os.makedirs(output_dir, exist_ok=True)

# Download one file per year
for year in range(2017, 2023):  # 2017 through 2022 inclusive
    # Build the URL for this year
    url = f"{base_url}?dataset=Accident&FromYear={year}&ToYear={year}&State=*&format=csv"
    print(f"Descargando datos para el año {year}...")

    output_file = os.path.join(output_dir, f"FARS_data_{year}.csv")
    # The body is streamed into a sibling ".part" file and only renamed to the
    # final name once complete, so an interrupted download can never leave a
    # truncated CSV for the loading cell to pick up.
    partial_file = f"{output_file}.part"

    try:
        # timeout=600 is the per-operation connect/read timeout in seconds,
        # not a cap on the total download time.
        # The `with` block releases the streamed connection on every path.
        with requests.get(url, headers=headers, timeout=600, stream=True) as response:
            # Check whether the request succeeded
            if response.status_code == 200:
                with open(partial_file, "wb") as file:
                    for chunk in response.iter_content(chunk_size=1024):
                        if chunk:
                            file.write(chunk)
                # Atomically move the finished download into place.
                os.replace(partial_file, output_file)
                print(f"✅ Datos del año {year} guardados exitosamente en {output_file}")
            else:
                print(f"❌ Error al obtener los datos para el año {year}: Código HTTP {response.status_code}")
                print(response.text)

    except requests.exceptions.Timeout:
        print(f"⏱️ La solicitud para el año {year} excedió el tiempo límite.")
    except requests.exceptions.RequestException as e:
        print(f"❌ Error en la solicitud para el año {year}: {e}")
    finally:
        # Drop whatever an aborted download left behind; after a successful
        # os.replace the partial file no longer exists and this is a no-op.
        if os.path.exists(partial_file):
            os.remove(partial_file)

In [108]:
# Load the six yearly FARS CSV files (2017-2022) with a single expression
# instead of six copy-pasted calls; the per-year names are kept because the
# concatenation cell refers to them.
#
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk.
# NOTE(review): the saved output of this cell shows a warning raised at the
# 2017 load whose message is not visible; presumably a mixed-type DtypeWarning
# -- confirm it is gone after re-running.
(
    data_2017,
    data_2018,
    data_2019,
    data_2020,
    data_2021,
    data_2022,
) = (
    pd.read_csv(f'../data/FARS_data_{year}.csv', low_memory=False)
    for year in range(2017, 2023)
)

  data_2017 = pd.read_csv('../data/FARS_data_2017.csv')


In [109]:
# Stack the yearly frames in chronological order; ignore_index=True discards
# the per-file row labels so the combined frame is numbered 0..n-1.
yearly_frames = [
    data_2017,
    data_2018,
    data_2019,
    data_2020,
    data_2021,
    data_2022,
]
df = pd.concat(yearly_frames, ignore_index=True)
# Preview the first rows of the combined frame.
df.head()

Unnamed: 0,caseyear,state,st_case,statename,ve_total,ve_forms,pvh_invl,peds,pernotmvit,permvit,...,hosp_mn,hosp_mnname,cf1,cf1name,cf2,cf2name,cf3,cf3name,fatals,drunk_dr
0,2017,1,10001,Alabama,1,1,0,0,0,1,...,88,Not Applicable (Not Transported),0.0,,0.0,,0.0,,1,0.0
1,2017,1,10002,Alabama,1,1,0,0,0,1,...,88,Not Applicable (Not Transported),0.0,,0.0,,0.0,,1,0.0
2,2017,1,10003,Alabama,3,3,0,0,0,3,...,88,Not Applicable (Not Transported),0.0,,0.0,,0.0,,1,0.0
3,2017,1,10004,Alabama,1,1,0,0,0,1,...,88,Not Applicable (Not Transported),20.0,Police Pursuit Involved,0.0,,0.0,,1,0.0
4,2017,1,10005,Alabama,1,1,0,0,0,2,...,11,11,0.0,,0.0,,0.0,,1,0.0


In [110]:
# Size of the combined frame as a (rows, columns) tuple.
df.shape

(218576, 93)

In [111]:
# Print the per-column summary (non-null count and dtype) of the combined frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 218576 entries, 0 to 218575
Data columns (total 93 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   caseyear      218576 non-null  int64  
 1   state         218576 non-null  int64  
 2   st_case       218576 non-null  int64  
 3   statename     218576 non-null  object 
 4   ve_total      218576 non-null  int64  
 5   ve_forms      218576 non-null  int64  
 6   pvh_invl      218576 non-null  int64  
 7   peds          218576 non-null  int64  
 8   pernotmvit    218576 non-null  int64  
 9   permvit       218576 non-null  int64  
 10  persons       218576 non-null  int64  
 11  county        218576 non-null  int64  
 12  countyname    218576 non-null  object 
 13  city          218576 non-null  int64  
 14  cityname      218576 non-null  object 
 15  day           218576 non-null  int64  
 16  month         218576 non-null  int64  
 17  monthname     218576 non-null  object 
 18  year

In [112]:
# Show every column label of the combined frame.
df.columns

Index(['caseyear', 'state', 'st_case', 'statename', 've_total', 've_forms',
       'pvh_invl', 'peds', 'pernotmvit', 'permvit', 'persons', 'county',
       'countyname', 'city', 'cityname', 'day', 'month', 'monthname', 'year',
       'day_week', 'day_weekname', 'hour', 'hourname', 'minute', 'minutename',
       'nhs', 'nhsname', 'rur_urb', 'rur_urbname', 'func_sys', 'func_sysname',
       'rd_owner', 'rd_ownername', 'route', 'routename', 'tway_id', 'tway_id2',
       'milept', 'mileptname', 'latitude', 'latitudename', 'longitud',
       'longitudname', 'sp_jur', 'sp_jurname', 'harm_ev', 'harm_evname',
       'man_coll', 'man_collname', 'reljct1', 'reljct1name', 'reljct2',
       'reljct2name', 'typ_int', 'typ_intname', 'wrk_zone', 'wrk_zonename',
       'road_fnc', 'road_fncname', 'rel_road', 'rel_roadname', 'lgt_cond',
       'lgt_condname', 'weather1', 'weather1name', 'weather2', 'weather2name',
       'weather', 'weathername', 'sch_bus', 'sch_busname', 'rail', 'railname',
       'no

In [113]:
"""df = df[[
    "countyname",      
    "cityname",        
    "statename",       
    "hour",            
    "minute",         
    "rd_ownername",   
    "milept",          
    "harm_evname",     
    "man_collname",            
    "fatals",          
    "drunk_dr",        
    "arr_hour",       
    "arr_min",         
    "hosp_hr",         
    "hosp_mn"          
]]"""

'df = df[[\n    "countyname",      \n    "cityname",        \n    "statename",       \n    "hour",            \n    "minute",         \n    "rd_ownername",   \n    "milept",          \n    "harm_evname",     \n    "man_collname",            \n    "fatals",          \n    "drunk_dr",        \n    "arr_hour",       \n    "arr_min",         \n    "hosp_hr",         \n    "hosp_mn"          \n]]'