# VI: Second Practical Work

**Authors:** Gerard Comas & Marc Franquesa.


## Data Processing
Processing all datasets in this notebook

In [64]:
# Initial imports
import pandas as pd
import numpy as np
import altair as alt
import geopandas as gpd
import warnings
from shapely.geometry import shape, Point

warnings.simplefilter(action="ignore", category=FutureWarning)

### Collisions dataset

In [65]:
# read the dataset
collisions = pd.read_csv("./original-data/collisions.csv")

collisions.head()

  collisions = pd.read_csv("./original-data/collisions.csv")


Unnamed: 0,CRASH DATETIME,CRASH WEEKDAY,AFTER COVID,BOROUGH,LATITUDE,LONGITUDE,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,ORIGINAL VEHICLE,VEHICLE,ORIGINAL FACTOR,FACTOR,LOCATION,tmpf,relh,sknt,p01i,vsby,DISTRICT
0,2020-09-06 18:00:00,Sunday,True,,40.771038,-73.83413,0.0,1.0,Station Wagon/Sport Utility Vehicle,Car,Unsafe Lane Changing,Driving Infraction,"[40.771038, -73.83413]",27.222222,42.19,0.0,0.0,16.09344,4.0
1,2020-09-06 18:00:00,Sunday,True,BRONX,40.88845,-73.841965,0.0,0.0,Station Wagon/Sport Utility Vehicle,Car,Failure to Yield Right-of-Way,Driving Infraction,"[40.88845, -73.841965]",27.222222,42.19,0.0,0.0,16.09344,22.0
2,2020-09-06 18:00:00,Sunday,True,,40.638527,-73.87853,1.0,0.0,Station Wagon/Sport Utility Vehicle,Car,Driver Inattention/Distraction,Driver Inattention,"[40.638527, -73.87853]",27.222222,42.19,0.0,0.0,16.09344,
3,2020-09-06 18:00:00,Sunday,True,BRONX,40.80684,-73.9275,0.0,0.0,Station Wagon/Sport Utility Vehicle,Car,Unspecified,Unspecified,"[40.80684, -73.9275]",27.222222,42.19,0.0,0.0,16.09344,50.0
4,2020-09-06 18:00:00,Sunday,True,BROOKLYN,40.71043,-73.9437,0.0,0.0,Sedan,Car,Driver Inattention/Distraction,Driver Inattention,"[40.71043, -73.9437]",27.222222,42.19,0.0,0.0,16.09344,68.0


In [66]:
# select only the values from 2018
collisions = collisions[collisions["CRASH DATETIME"] < "2019-01-01"] 

# select only the columns we need
collisions = collisions[["CRASH DATETIME", "LATITUDE", "LONGITUDE", "ORIGINAL VEHICLE"]]

In [67]:
categories = {
    "Taxi": ["Taxi"],
    "Ambulance": ["Ambulance", "AMBUL", "Ambul", "ambul", "AMB", "AMBU", "AMBULANCE"],
    "Fire truck": ["Fire", "FIRET", "FIRE", "FDNY", "fdny", "FD tr", "fd tr", "firet", "fire"],
}

reverse_categories = {val: key for key, values in categories.items() for val in values}

# Assume df is your DataFrame and 'column_name' is the column you want to classify
collisions["VEHICLE"] = collisions["ORIGINAL VEHICLE"].map(reverse_categories)

collisions = collisions.dropna(subset=["VEHICLE"])

collisions = collisions[["CRASH DATETIME", "LATITUDE", "LONGITUDE", "VEHICLE"]]

In [68]:
collisions.head()

Unnamed: 0,CRASH DATETIME,LATITUDE,LONGITUDE,VEHICLE
506,2018-08-30 14:00:00,40.65939,-73.76581,Taxi
509,2018-08-30 14:00:00,40.797478,-73.93684,Ambulance
527,2018-09-18 21:00:00,40.76155,-73.96659,Taxi
535,2018-09-18 21:00:00,40.67069,-73.91703,Taxi
538,2018-09-18 21:00:00,40.743423,-73.999855,Taxi


In [69]:
# Lets add emojis
vehicle_emojis = {
    "Taxi": "🚕",
    "Ambulance": "🚑",
    "Fire truck": "🚒",
}

collisions["VEHICLE EMOJI"] = collisions["VEHICLE"].map(vehicle_emojis)

collisions.head()

Unnamed: 0,CRASH DATETIME,LATITUDE,LONGITUDE,VEHICLE,VEHICLE EMOJI
506,2018-08-30 14:00:00,40.65939,-73.76581,Taxi,🚕
509,2018-08-30 14:00:00,40.797478,-73.93684,Ambulance,🚑
527,2018-09-18 21:00:00,40.76155,-73.96659,Taxi,🚕
535,2018-09-18 21:00:00,40.67069,-73.91703,Taxi,🚕
538,2018-09-18 21:00:00,40.743423,-73.999855,Taxi,🚕


In [70]:
collisions["CRASH DAY"] = pd.to_datetime(collisions["CRASH DATETIME"]).dt.strftime("%Y-%m-%d")
collisions.head()

Unnamed: 0,CRASH DATETIME,LATITUDE,LONGITUDE,VEHICLE,VEHICLE EMOJI,CRASH DAY
506,2018-08-30 14:00:00,40.65939,-73.76581,Taxi,🚕,2018-08-30
509,2018-08-30 14:00:00,40.797478,-73.93684,Ambulance,🚑,2018-08-30
527,2018-09-18 21:00:00,40.76155,-73.96659,Taxi,🚕,2018-09-18
535,2018-09-18 21:00:00,40.67069,-73.91703,Taxi,🚕,2018-09-18
538,2018-09-18 21:00:00,40.743423,-73.999855,Taxi,🚕,2018-09-18


### Weather dataset

In [71]:
weather = pd.read_csv("./original-data/weather2018.csv")

In [72]:
weather = weather[["datetime", "icon"]]
weather["WEATHER"] = weather["icon"]
weather["WEATHER"].unique()

array(['rain', 'partly-cloudy-day', 'clear-day', 'cloudy'], dtype=object)

In [73]:
weather_emojis = {
    "rain" : "🌧",
    "clear-day" : "☀️",
    "cloudy" : "☁️",
    "partly-cloudy-day" : "⛅️",
}

weather["WEATHER EMOJI"] = weather["WEATHER"].map(weather_emojis)

In [74]:
weather.head()

Unnamed: 0,datetime,icon,icon emoji
0,2018-06-01,rain,🌧
1,2018-06-02,rain,🌧
2,2018-06-03,rain,🌧
3,2018-06-04,rain,🌧
4,2018-06-05,partly-cloudy-day,⛅️


### Collisions + Weather

In [75]:
# merge the collisions and weather dataframes on the "CRASH DAY" and "datetime" columns
collisions_weather = pd.merge(collisions, weather, left_on="CRASH DAY", right_on="datetime")

### NY Map

In [76]:
map_data = gpd.read_file(f"./original-data/map.geojson")

collisions_weather["BOROUGH"] = collisions_weather.apply(lambda x: [-1] if pd.isnull(x["LATITUDE"]) or pd.isnull(x["LONGITUDE"]) else np.where(map_data.contains(Point(x["LONGITUDE"], x["LATITUDE"])))[0], axis=1)

collisions_weather["BOROUGH"] = collisions_weather["BOROUGH"].apply(lambda x: -1 if len(x) == 0 else x[0]).replace(-1, np.nan)

collisions_weather.head()

Unnamed: 0,CRASH DATETIME,LATITUDE,LONGITUDE,VEHICLE,VEHICLE EMOJI,CRASH DAY,datetime,icon,icon emoji,BOROUGH
0,2018-08-30 14:00:00,40.65939,-73.76581,Taxi,🚕,2018-08-30,2018-08-30,rain,🌧,2.0
1,2018-08-30 14:00:00,40.797478,-73.93684,Ambulance,🚑,2018-08-30,2018-08-30,rain,🌧,3.0
2,2018-08-30 04:00:00,40.7606,-73.96434,Taxi,🚕,2018-08-30,2018-08-30,rain,🌧,3.0
3,2018-08-30 10:00:00,40.798256,-73.82744,Taxi,🚕,2018-08-30,2018-08-30,rain,🌧,
4,2018-08-30 16:00:00,40.748512,-73.98872,Taxi,🚕,2018-08-30,2018-08-30,rain,🌧,3.0


In [77]:
boroughs = {
    0.0: "Staten Island",
    1.0: "Bronx",
    2.0: "Queens",
    3.0: "Manhattan",
    4.0: "Brooklyn"
}

collisions_weather["BOROUGH"] = collisions_weather["BOROUGH"].map(boroughs)

In [78]:
collisions_weather = collisions_weather[["CRASH DATETIME", "BOROUGH", "VEHICLE", "VEHICLE EMOJI", "WEATHER", "WEATHER EMOJI"]]

collisions_weather.to_csv("./processed-data/collisions_weather.csv", index=False)

In [79]:
map_data["COLLISIONS"] = collisions_weather.groupby(["BOROUGH"]).size()

# Convert to epsh = 4326
# map_data["geometry"] = map_data["geometry"].to_crs(epsg=4326)

map_data["AREA"] = map_data["geometry"].area

map_data["AREA PROPORTION"] = map_data["AREA"] / map_data["AREA"].sum()

# Value found online (wikipedia)
map_data["AREA KM2"] = 783.84 * map_data["AREA PROPORTION"]

map_data["COLLISIONS / KM2"] = map_data["COLLISIONS"] / map_data["AREA KM2"]


  map_data["AREA"] = map_data["geometry"].area


In [80]:
map_data.to_file("processed-data/map.geojson", driver="GeoJSON")