# VI: Second Practical Work

**Authors:** Gerard Comas & Marc Franquesa.


## Data Processing
Processing all datasets in this notebook

In [25]:
# Initial imports
import pandas as pd
import numpy as np
import altair as alt
import geopandas as gpd
import warnings
from shapely.geometry import shape, Point
import math

warnings.simplefilter(action="ignore", category=FutureWarning)

### Collisions dataset

In [26]:
# read the dataset
collisions = pd.read_csv("./original-data/collisions.csv")
collisions["CRASH DATETIME"] = pd.to_datetime(collisions["CRASH DATETIME"])

collisions.head()

  collisions = pd.read_csv("./original-data/collisions.csv")


Unnamed: 0,CRASH DATETIME,CRASH WEEKDAY,AFTER COVID,BOROUGH,LATITUDE,LONGITUDE,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,ORIGINAL VEHICLE,VEHICLE,ORIGINAL FACTOR,FACTOR,LOCATION,tmpf,relh,sknt,p01i,vsby,DISTRICT
0,2020-09-06 18:00:00,Sunday,True,,40.771038,-73.83413,0.0,1.0,Station Wagon/Sport Utility Vehicle,Car,Unsafe Lane Changing,Driving Infraction,"[40.771038, -73.83413]",27.222222,42.19,0.0,0.0,16.09344,4.0
1,2020-09-06 18:00:00,Sunday,True,BRONX,40.88845,-73.841965,0.0,0.0,Station Wagon/Sport Utility Vehicle,Car,Failure to Yield Right-of-Way,Driving Infraction,"[40.88845, -73.841965]",27.222222,42.19,0.0,0.0,16.09344,22.0
2,2020-09-06 18:00:00,Sunday,True,,40.638527,-73.87853,1.0,0.0,Station Wagon/Sport Utility Vehicle,Car,Driver Inattention/Distraction,Driver Inattention,"[40.638527, -73.87853]",27.222222,42.19,0.0,0.0,16.09344,
3,2020-09-06 18:00:00,Sunday,True,BRONX,40.80684,-73.9275,0.0,0.0,Station Wagon/Sport Utility Vehicle,Car,Unspecified,Unspecified,"[40.80684, -73.9275]",27.222222,42.19,0.0,0.0,16.09344,50.0
4,2020-09-06 18:00:00,Sunday,True,BROOKLYN,40.71043,-73.9437,0.0,0.0,Sedan,Car,Driver Inattention/Distraction,Driver Inattention,"[40.71043, -73.9437]",27.222222,42.19,0.0,0.0,16.09344,68.0


In [27]:
# select only the values from 2018
collisions = collisions[collisions["CRASH DATETIME"] < "2019-01-01"] 

# select only the columns we need
collisions = collisions[["CRASH DATETIME", "LATITUDE", "LONGITUDE", "ORIGINAL VEHICLE", "NUMBER OF PERSONS INJURED", "NUMBER OF PERSONS KILLED", "ORIGINAL FACTOR", "FACTOR"]]

In [28]:
categories = {
    "Taxi": ["Taxi"],
    "Ambulance": ["Ambulance", "AMBUL", "Ambul", "ambul", "AMB", "AMBU", "AMBULANCE"],
    "Fire truck": ["Fire", "FIRET", "FIRE", "FDNY", "fdny", "FD tr", "fd tr", "firet", "fire"],
}

reverse_categories = {val: key for key, values in categories.items() for val in values}

# Assume df is your DataFrame and 'column_name' is the column you want to classify
collisions["VEHICLE"] = collisions["ORIGINAL VEHICLE"].map(reverse_categories)

collisions = collisions.dropna(subset=["VEHICLE"])

collisions = collisions[["CRASH DATETIME", "LATITUDE", "LONGITUDE", "VEHICLE", "NUMBER OF PERSONS INJURED", "NUMBER OF PERSONS KILLED", "ORIGINAL FACTOR", "FACTOR"]]

In [29]:
collisions.head()

Unnamed: 0,CRASH DATETIME,LATITUDE,LONGITUDE,VEHICLE,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,ORIGINAL FACTOR,FACTOR
506,2018-08-30 14:00:00,40.65939,-73.76581,Taxi,0.0,0.0,Unspecified,Unspecified
509,2018-08-30 14:00:00,40.797478,-73.93684,Ambulance,0.0,0.0,View Obstructed/Limited,Environmental Factors
527,2018-09-18 21:00:00,40.76155,-73.96659,Taxi,0.0,0.0,Driver Inattention/Distraction,Driver Inattention
535,2018-09-18 21:00:00,40.67069,-73.91703,Taxi,2.0,0.0,Passing or Lane Usage Improper,Driving Infraction
538,2018-09-18 21:00:00,40.743423,-73.999855,Taxi,1.0,0.0,Passing Too Closely,Driving Infraction


In [30]:
# Lets add emojis
vehicle_emojis = {
    "Taxi": "🚕",
    "Ambulance": "🚑",
    "Fire truck": "🚒",
}

collisions["VEHICLE EMOJI"] = collisions["VEHICLE"].map(vehicle_emojis)

collisions.head()

Unnamed: 0,CRASH DATETIME,LATITUDE,LONGITUDE,VEHICLE,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,ORIGINAL FACTOR,FACTOR,VEHICLE EMOJI
506,2018-08-30 14:00:00,40.65939,-73.76581,Taxi,0.0,0.0,Unspecified,Unspecified,🚕
509,2018-08-30 14:00:00,40.797478,-73.93684,Ambulance,0.0,0.0,View Obstructed/Limited,Environmental Factors,🚑
527,2018-09-18 21:00:00,40.76155,-73.96659,Taxi,0.0,0.0,Driver Inattention/Distraction,Driver Inattention,🚕
535,2018-09-18 21:00:00,40.67069,-73.91703,Taxi,2.0,0.0,Passing or Lane Usage Improper,Driving Infraction,🚕
538,2018-09-18 21:00:00,40.743423,-73.999855,Taxi,1.0,0.0,Passing Too Closely,Driving Infraction,🚕


In [31]:
# Add day information
collisions["CRASH DAY"] = collisions["CRASH DATETIME"].dt.strftime("%Y-%m-%d")
collisions["CRASH WEEKDAY"] = collisions["CRASH DATETIME"].dt.day_name()
collisions["CRASH WEEK NUMBER"] = collisions["CRASH DATETIME"].dt.isocalendar().week
collisions.head()

Unnamed: 0,CRASH DATETIME,LATITUDE,LONGITUDE,VEHICLE,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,ORIGINAL FACTOR,FACTOR,VEHICLE EMOJI,CRASH DAY,CRASH WEEKDAY,CRASH WEEK NUMBER
506,2018-08-30 14:00:00,40.65939,-73.76581,Taxi,0.0,0.0,Unspecified,Unspecified,🚕,2018-08-30,Thursday,35
509,2018-08-30 14:00:00,40.797478,-73.93684,Ambulance,0.0,0.0,View Obstructed/Limited,Environmental Factors,🚑,2018-08-30,Thursday,35
527,2018-09-18 21:00:00,40.76155,-73.96659,Taxi,0.0,0.0,Driver Inattention/Distraction,Driver Inattention,🚕,2018-09-18,Tuesday,38
535,2018-09-18 21:00:00,40.67069,-73.91703,Taxi,2.0,0.0,Passing or Lane Usage Improper,Driving Infraction,🚕,2018-09-18,Tuesday,38
538,2018-09-18 21:00:00,40.743423,-73.999855,Taxi,1.0,0.0,Passing Too Closely,Driving Infraction,🚕,2018-09-18,Tuesday,38


### Weather dataset

In [32]:
weather = pd.read_csv("./original-data/weather2018.csv")

In [33]:
weather = weather[["datetime", "icon"]]
weather["WEATHER"] = weather["icon"]
weather["WEATHER"].unique()

array(['rain', 'partly-cloudy-day', 'clear-day', 'cloudy'], dtype=object)

In [34]:
weather_emojis = {
    "rain" : "🌧",
    "clear-day" : "☀️",
    "cloudy" : "☁️",
    "partly-cloudy-day" : "⛅️",
}

weather["WEATHER EMOJI"] = weather["WEATHER"].map(weather_emojis)

In [35]:
weather.head()

Unnamed: 0,datetime,icon,WEATHER,WEATHER EMOJI
0,2018-06-01,rain,rain,🌧
1,2018-06-02,rain,rain,🌧
2,2018-06-03,rain,rain,🌧
3,2018-06-04,rain,rain,🌧
4,2018-06-05,partly-cloudy-day,partly-cloudy-day,⛅️


### Collisions + Weather

In [36]:
# merge the collisions and weather dataframes on the "CRASH DAY" and "datetime" columns
collisions_weather = pd.merge(collisions, weather, left_on="CRASH DAY", right_on="datetime")

### NY Map

In [37]:
map_data = gpd.read_file(f"./original-data/map.geojson")

collisions_weather["BOROUGH"] = collisions_weather.apply(lambda x: [-1] if pd.isnull(x["LATITUDE"]) or pd.isnull(x["LONGITUDE"]) else np.where(map_data.contains(Point(x["LONGITUDE"], x["LATITUDE"])))[0], axis=1)

collisions_weather["BOROUGH"] = collisions_weather["BOROUGH"].apply(lambda x: -1 if len(x) == 0 else x[0]).replace(-1, np.nan)

collisions_weather.head()

Unnamed: 0,CRASH DATETIME,LATITUDE,LONGITUDE,VEHICLE,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,ORIGINAL FACTOR,FACTOR,VEHICLE EMOJI,CRASH DAY,CRASH WEEKDAY,CRASH WEEK NUMBER,datetime,icon,WEATHER,WEATHER EMOJI,BOROUGH
0,2018-08-30 14:00:00,40.65939,-73.76581,Taxi,0.0,0.0,Unspecified,Unspecified,🚕,2018-08-30,Thursday,35,2018-08-30,rain,rain,🌧,2.0
1,2018-08-30 14:00:00,40.797478,-73.93684,Ambulance,0.0,0.0,View Obstructed/Limited,Environmental Factors,🚑,2018-08-30,Thursday,35,2018-08-30,rain,rain,🌧,3.0
2,2018-08-30 04:00:00,40.7606,-73.96434,Taxi,0.0,0.0,Driver Inattention/Distraction,Driver Inattention,🚕,2018-08-30,Thursday,35,2018-08-30,rain,rain,🌧,3.0
3,2018-08-30 10:00:00,40.798256,-73.82744,Taxi,3.0,0.0,Following Too Closely,Driving Infraction,🚕,2018-08-30,Thursday,35,2018-08-30,rain,rain,🌧,
4,2018-08-30 16:00:00,40.748512,-73.98872,Taxi,0.0,0.0,Unspecified,Unspecified,🚕,2018-08-30,Thursday,35,2018-08-30,rain,rain,🌧,3.0


In [38]:
map_data["COLLISIONS"] = collisions_weather.groupby(["BOROUGH"]).size()

map_data.head()

Unnamed: 0,boro_code,boro_name,shape_area,shape_leng,geometry,COLLISIONS
0,5,Staten Island,1623620725.05,325917.35395,"MULTIPOLYGON (((-74.05051 40.56642, -74.05047 ...",9
1,2,Bronx,1187174784.85,463179.772813,"MULTIPOLYGON (((-73.89681 40.79581, -73.89694 ...",444
2,4,Queens,3041418505.55,888199.730955,"MULTIPOLYGON (((-73.82645 40.59053, -73.82642 ...",540
3,1,Manhattan,636520502.801,357713.30866,"MULTIPOLYGON (((-74.01093 40.68449, -74.01193 ...",2182
4,3,Brooklyn,1934138258.43,728148.53241,"MULTIPOLYGON (((-73.86327 40.58388, -73.86381 ...",585


In [39]:
boroughs = {
    0.0: "Staten Island",
    1.0: "Bronx",
    2.0: "Queens",
    3.0: "Manhattan",
    4.0: "Brooklyn"
}

collisions_weather["BOROUGH"] = collisions_weather["BOROUGH"].map(boroughs)

collisions_weather = collisions_weather[["CRASH DATETIME", "CRASH DAY", "CRASH WEEK NUMBER", "CRASH WEEKDAY", "BOROUGH", "VEHICLE", "VEHICLE EMOJI", "WEATHER", "WEATHER EMOJI", "NUMBER OF PERSONS INJURED", "NUMBER OF PERSONS KILLED", "ORIGINAL FACTOR", "FACTOR"]]

In [40]:
# Convert to epsh = 4326
# map_data["geometry"] = map_data["geometry"].to_crs(epsg=4326)

map_data["AREA"] = map_data["geometry"].area

map_data["AREA PROPORTION"] = map_data["AREA"] / map_data["AREA"].sum()

# Value found online (wikipedia)
map_data["AREA KM2"] = 783.84 * map_data["AREA PROPORTION"]

map_data["COLLISIONS / KM2"] = map_data["COLLISIONS"] / map_data["AREA KM2"]


  map_data["AREA"] = map_data["geometry"].area


In [41]:
map_data.to_file("processed-data/map.geojson", driver="GeoJSON")

### Filling with empty values

We will now fill the data set so we have values for all combination of day - vehicle - weather - borough

In [42]:
collisions_weather.head()

Unnamed: 0,CRASH DATETIME,CRASH DAY,CRASH WEEK NUMBER,CRASH WEEKDAY,BOROUGH,VEHICLE,VEHICLE EMOJI,WEATHER,WEATHER EMOJI,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,ORIGINAL FACTOR,FACTOR
0,2018-08-30 14:00:00,2018-08-30,35,Thursday,Queens,Taxi,🚕,rain,🌧,0.0,0.0,Unspecified,Unspecified
1,2018-08-30 14:00:00,2018-08-30,35,Thursday,Manhattan,Ambulance,🚑,rain,🌧,0.0,0.0,View Obstructed/Limited,Environmental Factors
2,2018-08-30 04:00:00,2018-08-30,35,Thursday,Manhattan,Taxi,🚕,rain,🌧,0.0,0.0,Driver Inattention/Distraction,Driver Inattention
3,2018-08-30 10:00:00,2018-08-30,35,Thursday,,Taxi,🚕,rain,🌧,3.0,0.0,Following Too Closely,Driving Infraction
4,2018-08-30 16:00:00,2018-08-30,35,Thursday,Manhattan,Taxi,🚕,rain,🌧,0.0,0.0,Unspecified,Unspecified


In [43]:
dates = collisions_weather[["CRASH DATETIME", "CRASH DAY", "CRASH WEEK NUMBER", "CRASH WEEKDAY"]]
dates["CRASH DATETIME"] = dates["CRASH DATETIME"].dt.floor("D")
dates = dates.drop_duplicates()
dates["key"] = 0

hours = pd.DataFrame({"HOUR": range(0, 24)})
hours["key"] = 0

boroughs = collisions_weather[["BOROUGH"]].drop_duplicates().dropna()
boroughs["key"] = 0

vehicles = collisions_weather[["VEHICLE", "VEHICLE EMOJI"]].drop_duplicates().dropna()
vehicles["key"] = 0

weathers = collisions_weather[["WEATHER", "WEATHER EMOJI"]].drop_duplicates().dropna()
weathers["key"] = 0

filler = (
    dates
    .merge(boroughs, on="key")
    .merge(vehicles, on="key")
    .merge(weathers, on="key")
    .merge(hours, on="key")
    .drop(columns=["key"])
)

filler["VALID"] = 0

filler.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dates["CRASH DATETIME"] = dates["CRASH DATETIME"].dt.floor("D")


Unnamed: 0,CRASH DATETIME,CRASH DAY,CRASH WEEK NUMBER,CRASH WEEKDAY,BOROUGH,VEHICLE,VEHICLE EMOJI,WEATHER,WEATHER EMOJI,HOUR,VALID
0,2018-08-30,2018-08-30,35,Thursday,Queens,Taxi,🚕,rain,🌧,0,0
1,2018-08-30,2018-08-30,35,Thursday,Queens,Taxi,🚕,rain,🌧,1,0
2,2018-08-30,2018-08-30,35,Thursday,Queens,Taxi,🚕,rain,🌧,2,0
3,2018-08-30,2018-08-30,35,Thursday,Queens,Taxi,🚕,rain,🌧,3,0
4,2018-08-30,2018-08-30,35,Thursday,Queens,Taxi,🚕,rain,🌧,4,0


In [44]:
collisions_weather.count()

CRASH DATETIME               3953
CRASH DAY                    3953
CRASH WEEK NUMBER            3953
CRASH WEEKDAY                3953
BOROUGH                      3760
VEHICLE                      3953
VEHICLE EMOJI                3953
WEATHER                      3953
WEATHER EMOJI                3953
NUMBER OF PERSONS INJURED    3953
NUMBER OF PERSONS KILLED     3953
ORIGINAL FACTOR              3953
FACTOR                       3953
dtype: int64

In [45]:
collisions_weather["HOUR"] = collisions_weather["CRASH DATETIME"].dt.hour
collisions_weather["VALID"] = 1

collisions_weather = pd.concat([collisions_weather, filler])

collisions_weather["MONTH"] = collisions_weather["CRASH DATETIME"].dt.strftime("%B")
collisions_weather["DAY"] = collisions_weather["CRASH DATETIME"].dt.strftime("%d")

### Some final details
Mapping some names and stuff like that. To make nicer plots.

In [46]:
weekdays = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

weekdays_3 = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]

collisions_weather["CRASH WEEKDAY"] = collisions_weather["CRASH WEEKDAY"].map(dict(zip(weekdays, weekdays_3)))


In [47]:
collisions_weather

Unnamed: 0,CRASH DATETIME,CRASH DAY,CRASH WEEK NUMBER,CRASH WEEKDAY,BOROUGH,VEHICLE,VEHICLE EMOJI,WEATHER,WEATHER EMOJI,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,ORIGINAL FACTOR,FACTOR,HOUR,VALID,MONTH,DAY
0,2018-08-30 14:00:00,2018-08-30,35,Thu,Queens,Taxi,🚕,rain,🌧,0.0,0.0,Unspecified,Unspecified,14,1,August,30
1,2018-08-30 14:00:00,2018-08-30,35,Thu,Manhattan,Ambulance,🚑,rain,🌧,0.0,0.0,View Obstructed/Limited,Environmental Factors,14,1,August,30
2,2018-08-30 04:00:00,2018-08-30,35,Thu,Manhattan,Taxi,🚕,rain,🌧,0.0,0.0,Driver Inattention/Distraction,Driver Inattention,4,1,August,30
3,2018-08-30 10:00:00,2018-08-30,35,Thu,,Taxi,🚕,rain,🌧,3.0,0.0,Following Too Closely,Driving Infraction,10,1,August,30
4,2018-08-30 16:00:00,2018-08-30,35,Thu,Manhattan,Taxi,🚕,rain,🌧,0.0,0.0,Unspecified,Unspecified,16,1,August,30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175675,2018-06-02 00:00:00,2018-06-02,22,Sat,Staten Island,Fire truck,🚒,cloudy,☁️,,,,,19,0,June,02
175676,2018-06-02 00:00:00,2018-06-02,22,Sat,Staten Island,Fire truck,🚒,cloudy,☁️,,,,,20,0,June,02
175677,2018-06-02 00:00:00,2018-06-02,22,Sat,Staten Island,Fire truck,🚒,cloudy,☁️,,,,,21,0,June,02
175678,2018-06-02 00:00:00,2018-06-02,22,Sat,Staten Island,Fire truck,🚒,cloudy,☁️,,,,,22,0,June,02


In [48]:
collisions_weather.to_csv("./processed-data/collisions_weather.csv", index=False)