# Imports & settings

In [18]:
import pandas as pd
import numpy as np
import folium

# Preprocessing

## 1. Collisions dataset preprocessing

In [19]:
# Loads the collision dataset.
collisions = pd.read_csv('data/collisions.csv')

  collisions = pd.read_csv('data/collisions.csv')


In [20]:
def filter_collisions(collisions):
    """
        Filters the collisions dataset for the years 2018 and 2020.
    """

    # Converts the 'CRASH DATE' column to a datetime format.
    collisions['CRASH DATE'] = pd.to_datetime(collisions['CRASH DATE'], format='%m/%d/%Y')

    # Filters the datasets for the summer months of 2018 and 2020.
    def filter_summer_collisions(data, year):
        return data[
            ((data['CRASH DATE'].dt.year == year) & (data['CRASH DATE'].dt.month >= 6) & (data['CRASH DATE'].dt.month <= 8))
        ]

    summer_collisions_2018 = filter_summer_collisions(collisions, 2018)
    summer_collisions_2020 = filter_summer_collisions(collisions, 2020)

    return summer_collisions_2018, summer_collisions_2020

# Gets the summer collisions for 2018 and 2020 datasets.
summer_collisions_2018, summer_collisions_2020 = filter_collisions(collisions)

In [21]:
# Differentiates the information contained in the columns of the dataset into the following categories:
# Columns that identify collision in time.
time_columns = [
    'CRASH DATE',
    'CRASH TIME',
    'BOROUGH',
    'ZIP CODE',
    'COLLISION_ID']

# Columns that identify collision in space (locally).
location_columns = [
    'ON STREET NAME', 
    'CROSS STREET NAME',
    'OFF STREET NAME'
]

# Columns about coordinate location of the collision.
coordinates = [
    'LATITUDE',
    'LONGITUDE'
]

# Columns about the type of vehicles involved in the collision.
vehicle_type = [
    'VEHICLE TYPE CODE 1',
    'VEHICLE TYPE CODE 2',
    'VEHICLE TYPE CODE 3',
    'VEHICLE TYPE CODE 4',
    'VEHICLE TYPE CODE 5'
]

In [22]:
def filter_columns_of_interest(summer_collisions, columns_of_interest):
    """
        Filters the summer collisions to keep only the columns of interest.
    """

    # Gets the columns of no interest in.
    columns_to_drop = [col for col in summer_collisions.columns if col not in columns_of_interest]
    
    # Drops the columns not in columns_of_interest.
    summer_collisions = summer_collisions.drop(columns=columns_to_drop)
    
    return summer_collisions

# Defines columns of interest.
columns_of_interest = time_columns + coordinates + vehicle_type

# Filter columns of interest for both summer_collisions_2018 and summer_collisions_2020.
summer_collisions_2018 = filter_columns_of_interest(summer_collisions_2018, columns_of_interest)
summer_collisions_2020 = filter_columns_of_interest(summer_collisions_2020, columns_of_interest)

In [23]:
def manage_missing_values(summer_collisions):
    """
        Processes the datasets with specified criteria on information category selection (column selection).
    """

    # Replaces 'Unspecified' with NaN values.
    summer_collisions = summer_collisions.replace('Unspecified', np.nan)
    
    # Drops rows with missing values in categories time_columns, coordinates.
    summer_collisions = summer_collisions.dropna(subset=time_columns, how='any')
    summer_collisions = summer_collisions.dropna(subset=coordinates, how='any')
    
    # Drops rows with missing values in vehicle_type where at least two types are specified.
    summer_collisions = summer_collisions.dropna(subset=vehicle_type, thresh=2)
    
    return summer_collisions

# Gets the minimum missing values dataframe, with all information needed to conduct the visualizations, for both summer_collisions_2018 and summer_collisions_2020.
summer_collisions_2018 = manage_missing_values(summer_collisions_2018)
summer_collisions_2020 = manage_missing_values(summer_collisions_2020)

In [24]:
def add_daytype(summer_collisions):
    """
        Adds a column to the datasets indicating wheter is weekday or weekend.
    """

    # Adds a new column indicating wheter is weekday (1) or weekend (0).
    summer_collisions['DAY TYPE'] = (summer_collisions['CRASH DATE'].dt.dayofweek // 5 == 1).astype(int)
    
    return summer_collisions

# Adds the day type column into both summer_collisions_2018 and summer_collisions_2020.
summer_collisions_2018 = add_daytype(summer_collisions_2018)
summer_collisions_2020 = add_daytype(summer_collisions_2020)

In [25]:
def add_covid_restrictions(summer_collisions, threshold_date):
    """
        Adds a column indicating wheter COVID-19 restrictions were dictated or not.
    """

    # Adds a new column indicating wheter the COVID-19 restrictions were dictated (1) or not (0).
    summer_collisions['COVID-19 RESTRICTIONS'] = (summer_collisions['CRASH DATE'] >= threshold_date).astype(int)

    return summer_collisions

# Threshold date for the dictation of COVID-19 restrictions.
covid_threshold_date = pd.to_datetime('2020-03-15')

# Adds the COVID-19 restriction column into both summer_collisions_2018 and summer_collisions_2020.
summer_collisions_2018 = add_covid_restrictions(summer_collisions_2018, covid_threshold_date)
summer_collisions_2020 = add_covid_restrictions(summer_collisions_2020, covid_threshold_date)

In [26]:
# Changes data type of ZIP CODE column to integer.
summer_collisions_2018['ZIP CODE'] = pd.to_numeric(summer_collisions_2018['ZIP CODE'], errors='coerce', downcast='integer')
summer_collisions_2020['ZIP CODE'] = pd.to_numeric(summer_collisions_2020['ZIP CODE'], errors='coerce', downcast='integer')

In [27]:
# Reset index for summer_collisions_2018 dataset.
summer_collisions_2018.reset_index(drop=True, inplace=True)

# Reset index for summer_collisions_2020 dataset.
summer_collisions_2020.reset_index(drop=True, inplace=True)

In [28]:
summer_collisions_2018.head()

Unnamed: 0,CRASH DATE,CRASH TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,COLLISION_ID,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5,DAY TYPE,COVID-19 RESTRICTIONS
0,2018-08-30,14:00,BROOKLYN,11204,40.61662,-73.99972,3971526,Sedan,Sedan,,,,0,0
1,2018-08-31,7:55,BRONX,10472,40.827168,-73.870125,3973140,Sedan,Sedan,,,,0,0
2,2018-08-28,18:15,BROOKLYN,11211,40.70654,-73.95041,3969590,Station Wagon/Sport Utility Vehicle,Station Wagon/Sport Utility Vehicle,,,,0,0
3,2018-08-29,14:50,QUEENS,11694,40.586067,-73.82263,3970294,Station Wagon/Sport Utility Vehicle,Sedan,,,,0,0
4,2018-08-05,16:45,MANHATTAN,10075,40.77364,-73.95986,3955175,Taxi,Station Wagon/Sport Utility Vehicle,,,,1,0


## 2. Weather dataset preprocessing

In [29]:
# Loads the weather dataset.
weather = pd.read_csv('data/weather.csv')

  weather = pd.read_csv('data/weather.csv')


In [30]:
def filter_weather(weather):
    """
        Filters the weather dataset for the years 2018 and 2020.
    """
    
    # Convert 'DATE' column to datetime format.
    weather['DATE'] = pd.to_datetime(weather['DATE'], errors='coerce')

    # Filters the datasets for the summer months of 2018 and 2020.
    def filter_summer_weather(data, year):
        return data[
            ((data['DATE'].dt.year == year) & (data['DATE'].dt.month >= 6) & (data['DATE'].dt.month <= 8))
        ]

    summer_weather_2018 = filter_summer_weather(weather, 2018)
    summer_weather_2020 = filter_summer_weather(weather, 2020)

    return summer_weather_2018, summer_weather_2020

# Gets the summer weather for 2018 and 2020 datasets.
summer_weather_2018, summer_weather_2020 = filter_weather(weather)

In [31]:
# Differentiates the information contained in the columns of the dataset into the following categories:
# Columns that identify the weather station and the date of the observation.
observation_columns = [
    'STATION', 
    'NAME', 
    'DATE'
]

# Columns about coordinate location of the observation.
coordinates = [
    'LATITUDE', 
    'LONGITUDE', 
    'ELEVATION'
]

# Columns about the weather conditions (snowfall).
snowfall_columns = [
    'SNOW', # Snowfall.
    'SNWD'  # Snow depth.
]

# Columns about the weather conditions (temperature).
temperature_columns = [
    'TAVG', # Average temperature.
    'TMAX', # Maximum temperature.
    'TMIN'  # Minimum temperature.
]

# Columns about the weather conditions (wind).
wind_columns = [
    'AWND', # Average daily wind speed.
    'WDF2', # Direction of the fastest 2-minute wind.
    'WDF5', # Direction of the fastest 5-second wind.
    'WSF2', # Fastest 2-minute wind speed.
    'WSF5'  # Fastest 5-second wind speed.
]

# Columns about the weather conditions (precipitation).
# Consider only 'PRCP' column.

In [32]:
def filter_columns_of_interest(summer_weather, columns_of_interest):
    """
        Filters the summer weather to keep only the columns of interest.
    """

    # Gets the columns of no interest in.
    columns_to_drop = [col for col in summer_weather.columns if col not in columns_of_interest]
    
    # Drops the columns not in columns_of_interest.
    summer_weather = summer_weather.drop(columns=columns_to_drop)
    
    return summer_weather

# Defines columns of interest.
columns_of_interest = observation_columns + coordinates + snowfall_columns + temperature_columns + wind_columns + ['PRCP']

# Filter columns of interest for both summer_weather_2018 and summer_weather_2020.
summer_weather_2018 = filter_columns_of_interest(summer_weather_2018, columns_of_interest)
summer_weather_2020 = filter_columns_of_interest(summer_weather_2020, columns_of_interest)

In [None]:
def replace_nan_with_zero(summer_weather):
    # Replaces all NaN values with 0 values.
   summer_weather.fillna(0, inplace=True)
   return summer_weather

summer_weather_2018 = replace_nan_with_zero(summer_weather_2018)
summer_weather_2020 = replace_nan_with_zero(summer_weather_2020)

In [33]:
def encode_weather_conditions(summer_weather):
    # Creates new columns to define labels for weather conditions
    summer_weather['WINDY'] = (summer_weather['AWND'].gt(0) | summer_weather['WDF2'].gt(0) | summer_weather['WDF5'].gt(0) | summer_weather['WSF2'].gt(0) | summer_weather['WSF5'].gt(0))
    summer_weather['RAINY'] = summer_weather['PRCP'].gt(0)  # Considers only non-zero precipitation as rainy
    summer_weather['SNOWY'] = (summer_weather['SNOW'].gt(0) | summer_weather['SNWD'].gt(0))
    summer_weather['SUNNY'] = ~(summer_weather['WINDY'] | summer_weather['RAINY'] | summer_weather['SNOWY'])

    return summer_weather

summer_weather_2018 = encode_weather_conditions(summer_weather_2018)
summer_weather_2020 = encode_weather_conditions(summer_weather_2020)

In [35]:
# Reset index for summer_weather_2018 dataset.
summer_weather_2018.reset_index(drop=True, inplace=True)

# Reset index for summer_weather_2020 dataset.
summer_weather_2020.reset_index(drop=True, inplace=True)

In [36]:
summer_weather_2018.head()

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,AWND,PRCP,SNOW,SNWD,...,TMAX,TMIN,WDF2,WDF5,WSF2,WSF5,WINDY,RAINY,SNOWY,SUNNY
0,USC00280907,"BOONTON 1 SE, NJ US",40.89174,-74.39635,85.3,2018-06-01,0.0,1.8,0.0,0.0,...,20.6,16.7,0.0,0.0,0.0,0.0,False,True,True,0
1,USC00280907,"BOONTON 1 SE, NJ US",40.89174,-74.39635,85.3,2018-06-02,0.0,5.3,0.0,0.0,...,28.9,18.9,0.0,0.0,0.0,0.0,False,True,True,0
2,USC00280907,"BOONTON 1 SE, NJ US",40.89174,-74.39635,85.3,2018-06-03,0.0,14.0,0.0,0.0,...,29.4,14.4,0.0,0.0,0.0,0.0,False,True,True,0
3,USC00280907,"BOONTON 1 SE, NJ US",40.89174,-74.39635,85.3,2018-06-04,0.0,16.8,0.0,0.0,...,18.9,10.6,0.0,0.0,0.0,0.0,False,True,True,0
4,USC00280907,"BOONTON 1 SE, NJ US",40.89174,-74.39635,85.3,2018-06-05,0.0,0.0,0.0,0.0,...,23.3,11.1,0.0,0.0,0.0,0.0,False,True,True,0


# 3. New York City (NYC) map

In [37]:
# Load your dataset containing car crash coordinates
# Example DataFrame with columns 'latitude' and 'longitude'
data = {
    'latitude': [40.7128, 40.7214, 40.7306],  # Example latitude data
    'longitude': [-74.0060, -74.0052, -74.0060]  # Example longitude data
}
df = pd.DataFrame(data)

# Create a map centered around NYC using OpenStreetMap tiles.
nyc_map = folium.Map(location=[40.7128, -74.0060], tiles='OpenStreetMap', zoom_start=11)

# Iterate through the dataset and add markers to the map
for index, row in df.iterrows():
    folium.Marker([row['latitude'], row['longitude']]).add_to(nyc_map)

# Save the map as an HTML file
nyc_map.save("car_crash_map_osm.html")

Save to pickle the final dataset in order to load it in posterior visualizations, if not exists process raw data, if exists load. in other file???