# EDA API

### Importación de librerías

In [3]:
import sys
import os
import requests
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math
sys.path.append(os.path.abspath('../source'))


### Descarga de Datos de Accidentes FARS desde la API de NHTSA

In [38]:
base_url = "https://crashviewer.nhtsa.dot.gov/CrashAPI/FARSData/GetFARSData"

# Encabezados
headers = {
    "Accept": "text/csv", 
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
}


output_dir = "../data"
os.makedirs(output_dir, exist_ok=True)


for year in range(2017, 2023):  

    url = f"{base_url}?dataset=Accident&FromYear={year}&ToYear={year}&State=*&format=csv"
    print(f"Descargando datos para el año {year}...")

    try:
        # Realizar la solicitud al API
        response = requests.get(url, headers=headers, timeout=600, stream=True)  # Timeout de 10 minutos

        # Verificar si la solicitud fue exitosa
        if response.status_code == 200:
            # Guardar los datos en el archivo CSV
            output_file = os.path.join(output_dir, f"FARS_data_{year}.csv")
            with open(output_file, "wb") as file:
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:
                        file.write(chunk)
            print(f"Datos del año {year} guardados exitosamente en {output_file}")
        else:
            print(f"Error al obtener los datos para el año {year}: Código HTTP {response.status_code}")
            print(response.text)

    except requests.exceptions.Timeout:
        print(f"La solicitud para el año {year} excedió el tiempo límite.")
    except requests.exceptions.RequestException as e:
        print(f"Error en la solicitud para el año {year}: {e}")

Descargando datos para el año 2017...
Datos del año 2017 guardados exitosamente en ../data\FARS_data_2017.csv
Descargando datos para el año 2018...
Datos del año 2018 guardados exitosamente en ../data\FARS_data_2018.csv
Descargando datos para el año 2019...
Datos del año 2019 guardados exitosamente en ../data\FARS_data_2019.csv
Descargando datos para el año 2020...
Datos del año 2020 guardados exitosamente en ../data\FARS_data_2020.csv
Descargando datos para el año 2021...
Datos del año 2021 guardados exitosamente en ../data\FARS_data_2021.csv
Descargando datos para el año 2022...
Datos del año 2022 guardados exitosamente en ../data\FARS_data_2022.csv


In [39]:
base_url = "https://crashviewer.nhtsa.dot.gov/CrashAPI/FARSData/GetFARSData"

# Encabezados
headers = {
    "Accept": "text/csv", 
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
}

output_dir = "../data"
os.makedirs(output_dir, exist_ok=True)

for year in range(2017, 2023):  
    
    url = f"{base_url}?dataset=Person&FromYear={year}&ToYear={year}&State=*&format=csv"
    print(f"Descargando datos para el año {year}...")

    try:
        # Realizar la solicitud al API
        response = requests.get(url, headers=headers, timeout=600, stream=True)  # Timeout de 10 minutos

        # Verificar si la solicitud fue exitosa
        if response.status_code == 200:
            # Guardar los datos en el archivo CSV
            output_file = os.path.join(output_dir, f"FARS_person_{year}.csv")
            with open(output_file, "wb") as file:
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:
                        file.write(chunk)
            print(f"Datos del año {year} guardados exitosamente en {output_file}")
        else:
            print(f"Error al obtener los datos para el año {year}: Código HTTP {response.status_code}")
            print(response.text)

    except requests.exceptions.Timeout:
        print(f"La solicitud para el año {year} excedió el tiempo límite.")
    except requests.exceptions.RequestException as e:
        print(f"Error en la solicitud para el año {year}: {e}")

Descargando datos para el año 2017...
Datos del año 2017 guardados exitosamente en ../data\FARS_person_2017.csv
Descargando datos para el año 2018...
Datos del año 2018 guardados exitosamente en ../data\FARS_person_2018.csv
Descargando datos para el año 2019...
Datos del año 2019 guardados exitosamente en ../data\FARS_person_2019.csv
Descargando datos para el año 2020...
Datos del año 2020 guardados exitosamente en ../data\FARS_person_2020.csv
Descargando datos para el año 2021...
Datos del año 2021 guardados exitosamente en ../data\FARS_person_2021.csv
Descargando datos para el año 2022...
Datos del año 2022 guardados exitosamente en ../data\FARS_person_2022.csv


#### Carga de Datos de Accidentes FARS desde Archivos CSV

In [4]:
# Definir los años a procesar
years = range(2017, 2023)  # Desde 2017 hasta 2022

# Lista para almacenar DataFrames
dataframes = []

# Cargar archivos en un ciclo
for year in years:
    file_path = f'../data/FARS_data_{year}.csv'  # Ajusta el path si es necesario
    df = pd.read_csv(file_path)
    df["Year"] = year  # Agregar la columna de año para referencia
    dataframes.append(df)

# Concatenar todos los DataFrames en uno solo
accidents = pd.concat(dataframes, ignore_index=True)

# Ver los primeros registros
accidents.head()

  df = pd.read_csv(file_path)


Unnamed: 0,caseyear,state,st_case,statename,ve_total,ve_forms,pvh_invl,peds,pernotmvit,permvit,...,hosp_mnname,cf1,cf1name,cf2,cf2name,cf3,cf3name,fatals,drunk_dr,Year
0,2017,1,10001,Alabama,1,1,0,0,0,1,...,Not Applicable (Not Transported),0.0,,0.0,,0.0,,1,0.0,2017
1,2017,1,10002,Alabama,1,1,0,0,0,1,...,Not Applicable (Not Transported),0.0,,0.0,,0.0,,1,0.0,2017
2,2017,1,10003,Alabama,3,3,0,0,0,3,...,Not Applicable (Not Transported),0.0,,0.0,,0.0,,1,0.0,2017
3,2017,1,10004,Alabama,1,1,0,0,0,1,...,Not Applicable (Not Transported),20.0,Police Pursuit Involved,0.0,,0.0,,1,0.0,2017
4,2017,1,10005,Alabama,1,1,0,0,0,2,...,11,0.0,,0.0,,0.0,,1,0.0,2017


In [5]:
# Años a procesar
years = range(2017, 2023)  # Hasta 2022 

# Lista para almacenar DataFrames
dataframes = []

# Cargar archivos en un ciclo
for year in years:
    file_path = f'../data/FARS_person_{year}.csv'  # Ajusta el path si es necesario
    df = pd.read_csv(file_path)
    df["Year"] = year  # Agregar la columna de año para referencia
    dataframes.append(df)

# Concatenar todos los DataFrames en uno solo
persons = pd.concat(dataframes, ignore_index=True)

  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)


#### Unión de Datos de Accidentes y Persons 

In [128]:
print(accidents.columns)  # Muestra todas las columnas disponibles

Index(['caseyear', 'state', 'st_case', 'statename', 've_total', 've_forms',
       'pvh_invl', 'peds', 'pernotmvit', 'permvit', 'persons', 'county',
       'countyname', 'city', 'cityname', 'day', 'dayname', 'month',
       'monthname', 'year', 'day_week', 'day_weekname', 'hour', 'hourname',
       'minute', 'minutename', 'nhs', 'nhsname', 'rur_urb', 'rur_urbname',
       'func_sys', 'func_sysname', 'rd_owner', 'rd_ownername', 'route',
       'routename', 'tway_id', 'tway_id2', 'milept', 'mileptname', 'latitude',
       'latitudename', 'longitud', 'longitudname', 'sp_jur', 'sp_jurname',
       'harm_ev', 'harm_evname', 'man_coll', 'man_collname', 'reljct1',
       'reljct1name', 'reljct2', 'reljct2name', 'typ_int', 'typ_intname',
       'wrk_zone', 'wrk_zonename', 'road_fnc', 'road_fncname', 'rel_road',
       'rel_roadname', 'lgt_cond', 'lgt_condname', 'weather1', 'weather1name',
       'weather2', 'weather2name', 'weather', 'weathername', 'sch_bus',
       'sch_busname', 'rail', 'rai

In [129]:
print(persons.columns)  

Index(['caseyear', 'state', 'statename', 'st_case', 've_forms', 'veh_no',
       'per_no', 'str_veh', 'str_vehname', 'county',
       ...
       'icfinalbodyname', 'gvwr_from', 'gvwr_fromname', 'gvwr_to',
       'gvwr_toname', 'devtype', 'devtypename', 'devmotor', 'devmotorname',
       'Year'],
      dtype='object', length=160)


In [6]:
df = accidents.merge(persons[['st_case', 'age', 'sex', 'alc_res']], on='st_case', how='left')

In [7]:
df_copy = df.copy()

In [131]:
df.head()


Unnamed: 0,caseyear,state,st_case,statename,ve_total,ve_forms,pvh_invl,peds,pernotmvit,permvit,...,cf2,cf2name,cf3,cf3name,fatals,drunk_dr,Year,age,sex,alc_res
0,2017,1,10001,Alabama,1,1,0,0,0,1,...,0.0,,0.0,,1,0.0,2017,42,1,996
1,2017,1,10001,Alabama,1,1,0,0,0,1,...,0.0,,0.0,,1,0.0,2017,55,1,0
2,2017,1,10001,Alabama,1,1,0,0,0,1,...,0.0,,0.0,,1,0.0,2017,34,2,996
3,2017,1,10001,Alabama,1,1,0,0,0,1,...,0.0,,0.0,,1,0.0,2017,53,1,996
4,2017,1,10001,Alabama,1,1,0,0,0,1,...,0.0,,0.0,,1,0.0,2017,59,1,0


### Generalidades

In [132]:
df.shape

(3012888, 98)

#### Tipos de Datos

In [133]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3012888 entries, 0 to 3012887
Data columns (total 98 columns):
 #   Column        Dtype  
---  ------        -----  
 0   caseyear      int64  
 1   state         int64  
 2   st_case       int64  
 3   statename     object 
 4   ve_total      int64  
 5   ve_forms      int64  
 6   pvh_invl      int64  
 7   peds          int64  
 8   pernotmvit    int64  
 9   permvit       int64  
 10  persons       int64  
 11  county        int64  
 12  countyname    object 
 13  city          int64  
 14  cityname      object 
 15  day           int64  
 16  dayname       int64  
 17  month         int64  
 18  monthname     object 
 19  year          int64  
 20  day_week      int64  
 21  day_weekname  object 
 22  hour          int64  
 23  hourname      object 
 24  minute        int64  
 25  minutename    object 
 26  nhs           int64  
 27  nhsname       object 
 28  rur_urb       int64  
 29  rur_urbname   object 
 30  func_sys      int6

In [134]:
df.columns

Index(['caseyear', 'state', 'st_case', 'statename', 've_total', 've_forms',
       'pvh_invl', 'peds', 'pernotmvit', 'permvit', 'persons', 'county',
       'countyname', 'city', 'cityname', 'day', 'dayname', 'month',
       'monthname', 'year', 'day_week', 'day_weekname', 'hour', 'hourname',
       'minute', 'minutename', 'nhs', 'nhsname', 'rur_urb', 'rur_urbname',
       'func_sys', 'func_sysname', 'rd_owner', 'rd_ownername', 'route',
       'routename', 'tway_id', 'tway_id2', 'milept', 'mileptname', 'latitude',
       'latitudename', 'longitud', 'longitudname', 'sp_jur', 'sp_jurname',
       'harm_ev', 'harm_evname', 'man_coll', 'man_collname', 'reljct1',
       'reljct1name', 'reljct2', 'reljct2name', 'typ_int', 'typ_intname',
       'wrk_zone', 'wrk_zonename', 'road_fnc', 'road_fncname', 'rel_road',
       'rel_roadname', 'lgt_cond', 'lgt_condname', 'weather1', 'weather1name',
       'weather2', 'weather2name', 'weather', 'weathername', 'sch_bus',
       'sch_busname', 'rail', 'rai

### Limpieza

In [13]:
columns = [
    've_total', 'fatals', 'peds','arr_hour', 'arr_min',
    'year', 'monthname', 'day_weekname','hour','minute',
    'state', 'statename', 'rur_urbname', 'func_sysname', 
    'weathername', 'lgt_condname','harm_evname',
    'age', 'sex', 'alc_res'
]

df = df[columns]


In [210]:
df.head()

Unnamed: 0,ve_total,fatals,peds,arr_hour,arr_min,year,monthname,day_weekname,hour,minute,state,statename,rur_urbname,func_sysname,weathername,lgt_condname,harm_evname,age,sex,alc_res
0,1,1,0,99,99,2017,February,Sunday,23,35,1,Alabama,Urban,Interstate,Clear,Dark - Not Lighted,Fence,42,1,996
1,1,1,0,99,99,2017,February,Sunday,23,35,1,Alabama,Urban,Interstate,Clear,Dark - Not Lighted,Fence,55,1,0
2,1,1,0,99,99,2017,February,Sunday,23,35,1,Alabama,Urban,Interstate,Clear,Dark - Not Lighted,Fence,34,2,996
3,1,1,0,99,99,2017,February,Sunday,23,35,1,Alabama,Urban,Interstate,Clear,Dark - Not Lighted,Fence,53,1,996
4,1,1,0,99,99,2017,February,Sunday,23,35,1,Alabama,Urban,Interstate,Clear,Dark - Not Lighted,Fence,59,1,0


In [211]:
df.dtypes

ve_total         int64
fatals           int64
peds             int64
arr_hour         int64
arr_min          int64
year             int64
monthname       object
day_weekname    object
hour             int64
minute           int64
state            int64
statename       object
rur_urbname     object
func_sysname    object
weathername     object
lgt_condname    object
harm_evname     object
age              int64
sex              int64
alc_res          int64
dtype: object

#### Manejo de datos de faltantes

In [212]:
df.isnull().sum()

ve_total        0
fatals          0
peds            0
arr_hour        0
arr_min         0
year            0
monthname       0
day_weekname    0
hour            0
minute          0
state           0
statename       0
rur_urbname     0
func_sysname    0
weathername     0
lgt_condname    0
harm_evname     0
age             0
sex             0
alc_res         0
dtype: int64

#### Manejo de datos duplicados

In [14]:
df.duplicated().sum()

106711

In [214]:
df[df.duplicated()]

Unnamed: 0,ve_total,fatals,peds,arr_hour,arr_min,year,monthname,day_weekname,hour,minute,state,statename,rur_urbname,func_sysname,weathername,lgt_condname,harm_evname,age,sex,alc_res
17,1,1,0,15,9,2017,February,Tuesday,14,59,1,Alabama,Urban,Interstate,Clear,Daylight,Rollover/Overturn,24,1,996
40,3,1,0,99,99,2017,January,Tuesday,20,31,1,Alabama,Urban,Interstate,Clear,Dark - Not Lighted,Motor Vehicle In-Transport,22,1,0
55,1,1,0,16,58,2017,January,Sunday,16,55,1,Alabama,Urban,Minor Arterial,Rain,Dark - Lighted,Utility Pole/Light Support,18,1,996
85,2,1,0,18,46,2017,January,Friday,18,40,1,Alabama,Rural,Interstate,Cloudy,Dark - Not Lighted,Ditch,43,1,996
86,2,1,0,18,46,2017,January,Friday,18,40,1,Alabama,Rural,Interstate,Cloudy,Dark - Not Lighted,Ditch,38,1,996
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3012725,2,1,0,23,2,2022,November,Thursday,22,43,56,Wyoming,Rural,Interstate,Clear,Dark - Not Lighted,Motor Vehicle In-Transport,33,1,996
3012728,2,1,0,23,2,2022,November,Thursday,22,43,56,Wyoming,Rural,Interstate,Clear,Dark - Not Lighted,Motor Vehicle In-Transport,25,1,996
3012794,4,1,0,16,40,2022,November,Monday,16,30,56,Wyoming,Rural,Interstate,Snow,Dusk,Motor Vehicle In-Transport,46,1,996
3012798,4,1,0,16,40,2022,November,Monday,16,30,56,Wyoming,Rural,Interstate,Snow,Dusk,Motor Vehicle In-Transport,46,1,996


In [15]:
df = df.drop_duplicates()

In [216]:
df.shape

(2906177, 20)

In [217]:
df['rur_urbname'].value_counts()    

rur_urbname
Urban                                1690093
Rural                                1207277
Trafficway Not in State Inventory       5351
Unknown                                 2811
Not Reported                             645
Name: count, dtype: int64

In [16]:
df = df.loc[~df["rur_urbname"].isin(["Not Reported", "Trafficway Not in State Inventory", "Unknown"])].copy()

In [219]:
df['weathername'].value_counts()  

weathername
Clear                       2057900
Cloudy                       390569
Rain                         206128
Not Reported                 157998
Fog, Smog, Smoke              31155
Snow                          25821
Reported as Unknown            9883
Severe Crosswinds              4758
Sleet or Hail                  3512
Other                          2720
Freezing Rain or Drizzle       2318
Blowing Snow                   2298
Unknown                        1442
Blowing Sand, Soil, Dirt        868
Name: count, dtype: int64

In [17]:
df = df.loc[~df["weathername"].isin([
    "Not Reported", "Other", "Unknown", "Reported as Unknown"
])].copy()

In [221]:
df.shape

(2725327, 20)

In [18]:
# Diccionario con códigos especiales por columna
special_codes = {
    'arr_hour': [88, 99],
    'arr_min': [88, 97, 98, 99],
}

for col, codes in special_codes.items():
    df[col] = df[col].replace({code: -1 for code in codes})  # -1 como marcador en lugar de NaN


In [19]:
df = df.loc[~df[['arr_hour', 'arr_min']].isin([-1]).any(axis=1)].copy()



Las siguientes variables de tiempo del dataset FARS fueron procesadas para eliminar códigos especiales o valores fuera del rango válido:

- `arr_hour`: hora de llegada
- `arr_min`: minuto de llegada


Estas variables contenían códigos utilizados por el sistema FARS para representar valores especiales:

| Código | Significado común en FARS |
|--------|----------------------------|
| `88`   | Not Applicable             |
| `96`   | Anómalo (fuera de rango)   |
| `97`   | Imputed                    |
| `98`   | Estimated                  |
| `99`   | Unknown / Missing          |

Estos códigos fueron reemplazados por `-1` y eliminados dado que no eran muchos.

Por ejemplo:
- `arr_min` contenía valores como `97`, `98`, `99`, que no representan minutos válidos.

### Manejo de la variable `sex`

In [20]:
df["sex"].value_counts()[[8, 9]]

sex
8     5943
9    11595
Name: count, dtype: int64

In [22]:
df = df.loc[~df["sex"].isin(["Not Reported", "Unknown"])].copy()

In [23]:
sex_mapping = {
    1: "Male",
    2: "Female",
    3: "Other",
}

df["sex"] = df["sex"].map(sex_mapping)

In [178]:
df.describe(include="object")

Unnamed: 0,monthname,day_weekname,hourname,statename,rur_urbname,func_sysname,weathername,lgt_condname,harm_evname,sex
count,2709327,2709327,2709327,2709327,2709327,2709327,2709327,2709327,2709327,2709327
unique,12,7,24,52,2,9,10,10,60,4
top,July,Saturday,9:00pm-9:59pm,California,Urban,Principal Arterial - Other,Clear,Daylight,Motor Vehicle In-Transport,Male
freq,256027,469510,162904,306328,1594521,820980,2046007,1268291,1141442,1781050


In [179]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ve_total,2709327.0,1.646331,0.986304,1.0,1.0,1.0,2.0,59.0
fatals,2709327.0,1.097594,0.38678,1.0,1.0,1.0,1.0,20.0
peds,2709327.0,0.240353,0.622343,0.0,0.0,0.0,0.0,73.0
arr_hour,2709327.0,58.774042,42.982939,0.0,15.0,99.0,99.0,99.0
arr_min,2709327.0,66.61609,36.483364,0.0,32.0,98.0,99.0,99.0
year,2709327.0,2019.566633,1.720164,2017.0,2018.0,2020.0,2021.0,2022.0
state,2709327.0,26.604721,16.287862,1.0,12.0,26.0,41.0,56.0
age,2709327.0,60.695062,140.41569,0.0,24.0,38.0,56.0,999.0
alc_res,2709327.0,658.501497,453.663407,0.0,56.0,996.0,996.0,999.0


In [180]:
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns

outlier_log = []

for col in numerical_cols:
    print(f"\n--- Analyzing Outliers for '{col}' ---")

    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_limit = Q1 - 1.5 * IQR
    upper_limit = Q3 + 1.5 * IQR

    outliers = df[(df[col] < lower_limit) | (df[col] > upper_limit)]
    num_outliers = len(outliers)

    print(f"Number of Outliers: {num_outliers}")
    print(f"Lower Limit: {lower_limit:.2f}, Upper Limit: {upper_limit:.2f}")

    outlier_log.append({
        'Variable': col,
        'Outliers': num_outliers,
        'Lower Limit': lower_limit,
        'Upper Limit': upper_limit,
        'Porcentaje de Outliers': round(num_outliers / len(df) * 100, 2)
    })

print("\nResumen de Outliers:")
print(outlier_log)



--- Analyzing Outliers for 've_total' ---
Number of Outliers: 82922
Lower Limit: -0.50, Upper Limit: 3.50

--- Analyzing Outliers for 'fatals' ---
Number of Outliers: 210798
Lower Limit: 1.00, Upper Limit: 1.00

--- Analyzing Outliers for 'peds' ---
Number of Outliers: 596239
Lower Limit: 0.00, Upper Limit: 0.00

--- Analyzing Outliers for 'arr_hour' ---
Number of Outliers: 0
Lower Limit: -111.00, Upper Limit: 225.00

--- Analyzing Outliers for 'arr_min' ---
Number of Outliers: 0
Lower Limit: -68.50, Upper Limit: 199.50

--- Analyzing Outliers for 'year' ---
Number of Outliers: 0
Lower Limit: 2013.50, Upper Limit: 2025.50

--- Analyzing Outliers for 'state' ---
Number of Outliers: 0
Lower Limit: -31.50, Upper Limit: 84.50

--- Analyzing Outliers for 'age' ---
Number of Outliers: 58268
Lower Limit: -24.00, Upper Limit: 104.00

--- Analyzing Outliers for 'alc_res' ---
Number of Outliers: 0
Lower Limit: -1354.00, Upper Limit: 2406.00

Resumen de Outliers:
[{'Variable': 've_total', 'Outli

# Transformations for the dimensional modeling

In [24]:
df.columns

Index(['ve_total', 'fatals', 'peds', 'arr_hour', 'arr_min', 'year',
       'monthname', 'day_weekname', 'hour', 'minute', 'state', 'statename',
       'rur_urbname', 'func_sysname', 'weathername', 'lgt_condname',
       'harm_evname', 'age', 'sex', 'alc_res'],
      dtype='object')

In [25]:
df = df.drop(columns=['statename', 'state'])

In [26]:
df["lgt_condname"].unique()

array(['Daylight', 'Dark - Lighted', 'Dark - Not Lighted', 'Dusk', 'Dawn',
       'Dark - Unknown Lighting', 'Unknown', 'Other', 'Not Reported',
       'Reported as Unknown'], dtype=object)

In [78]:
print(df['alc_res'].unique())

[996   0 294  69 120 288  55 166 235  66 195 273 209  80 173 999 154 167
  33 153 187 159 112 234 146  81 183  35 197 188 210  25 174 157 247 238
 300 150 116 182 283 324 287 229 278  89  12  34 236  78 275 175 205 125
  53 181 126 130 262 401 252 233  64 108 165 192 237 204  60 225  45  10
 142  71 256 135  41 186  92   1 172  57 223 330 141  23 137 111  11 246
 271 265  51 207 212 301 263 214 143 331 302 117  20 155  96 218 176 115
  94 248 105  82  14 148 121 162  79  83 269 284 151 325 133  15 281 190
 199 101  13 161  37  26 179 245 124 251 177 217 280  18  59 298  52  30
 270 242 184  97 346 226 156 139 221  87 109  19  16 119 215 152 127 100
  54 282 158 149 316  88 313  29 296 230  17 312 241 244 277 122 249 227
 104  36 170  43  84 264 577 310 353 261 382  98 213 185 169  85 118 191
 318 253 220 194  99 254 198 189 202 106 293 110  22  70 144 180 309 134
 147 308 224 289 206 129  47  86 193  40 131 307 132 997 107 368 267 612
  21 196 231  32 160 268 240 114 400 232 319 339 21

In [None]:
visibility_mapping = {
    'Daylight': 'Alta',
    'Dawn': 'Moderada',
    'Dusk': 'Moderada',
    'Dark - Lighted': 'Baja',
    'Dark - Not Lighted': 'Muy Baja',
    'Dark - Unknown Lighting': 'Muy Baja',
    'Unknown': 'Unknown',
    'Not Reported': 'Unknown',
    'Reported as Unknown': 'Unknown',
    'Other': 'Unknown'
}

df["lgt_condname"] = df["lgt_condname"].map(visibility_mapping)

In [None]:
df["lgt_condname"].unique()

In [163]:
df["weathername"].unique()

array(['Clear', 'Rain', 'Cloudy', 'Unknown', 'Fog, Smog, Smoke', 'Snow',
       'Sleet or Hail', 'Freezing Rain or Drizzle', 'Blowing Snow',
       'Not Reported', 'Blowing Sand, Soil, Dirt', 'Severe Crosswinds',
       'Other', 'Reported as Unknown'], dtype=object)

In [None]:
# Convertir el BAC a su valor real en g/dL
df['alc_res'] = df['alc_res'] / 1000

# Función de clasificación
def clasificar_bac(bac):
    if bac <= 0.03:
        return "Bajo"
    elif bac <= 0.08:
        return "Moderado"
    elif bac <= 0.20:
        return "Alto"
    elif bac <= 0.40:
        return "Peligroso"
    else:
        return "Letal"

# Aplicar la función
df['alc_res'] = df['alc_res'].apply(clasificar_bac)

# Mostrar el resultado
print(df.head())


In [None]:
weather_map = {
    'Clear': 'Clear',
    'Rain': 'Rainy',
    'Cloudy': 'Windy',
    'Fog, Smog, Smoke': 'Foggy',
    'Snow': 'Snowy',
    'Sleet or Hail': 'Snowy',
    'Freezing Rain or Drizzle': 'Rainy',
    'Blowing Snow': 'Snowy',
    'Blowing Sand, Soil, Dirt': 'Windy',
    'Severe Crosswinds': 'Windy',
    'Clear': 'Clear',
    'Not Reported': 'Unknown',
    'Other': 'Unknown',
    'Reported as Unknown': 'Unknown'
    # 'Unknown' se mantiene igual
}

# Aplicamos el reemplazo
df['weathername'] = df['weathername'].replace(weather_map)


In [226]:
accidents['arr_hour'].unique()

array([99, 15, 16, 20, 18, 17, 14,  4,  5, 22,  0,  2, 23,  7, 19,  9, 11,
        1,  6, 13,  3, 10,  8, 21, 12, 88], dtype=int64)

In [224]:
df['arr_hour'].unique()

array([nan, 15., 16., 20., 18., 17., 14.,  4.,  5., 22.,  0.,  2., 23.,
       19.,  9., 11.,  1.,  6., 13.,  3., 10.,  8., 21.,  7., 12.])

In [10]:
df[df["arr_hour"] == 99]["arr_hour"].nunique()

1

In [11]:
df[df["arr_hour"] == 88]["arr_hour"].nunique()

1

In [12]:
df[df["arr_min"].isin([88, 97, 98, 99])]["arr_hour"].nunique()

26

In [225]:
df['arr_min'].unique()

array([nan,  9., 58., 31., 46., 10., 40., 39.,  0., 30., 15., 42., 23.,
       50., 59., 17., 45., 53.,  5.,  6., 35., 27., 47., 55., 43., 25.,
       44.,  1.,  3., 28.,  4., 34., 48., 33., 36., 54.,  8., 56.,  7.,
       49., 12., 32., 20., 22., 21.,  2., 24., 26., 16., 41., 52., 11.,
       13., 38., 51., 18., 29., 19., 37., 14., 57.])