# VI: Second Practical Work

**Authors:** Gerard Comas & Marc Franquesa.


## Data Processing
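All datasets are processed in this notebook: the raw files are read from `./original-data/` and the cleaned results are written to `./processed-data/`.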

In [1]:
# Initial imports
import pandas as pd
import numpy as np
import altair as alt
import geopandas as gpd
import warnings
from shapely.geometry import shape, Point

warnings.simplefilter(action="ignore", category=FutureWarning)

### Collisions dataset

In [2]:
# Load the raw collisions table from the original-data folder
collisions_path = "./original-data/collisions.csv"
collisions = pd.read_csv(collisions_path)

# Show the first rows to get a feel for the available columns
collisions.head()

  collisions = pd.read_csv("./original-data/collisions.csv")


Unnamed: 0,CRASH DATETIME,CRASH WEEKDAY,AFTER COVID,BOROUGH,LATITUDE,LONGITUDE,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,ORIGINAL VEHICLE,VEHICLE,ORIGINAL FACTOR,FACTOR,LOCATION,tmpf,relh,sknt,p01i,vsby,DISTRICT
0,2020-09-06 18:00:00,Sunday,True,,40.771038,-73.83413,0.0,1.0,Station Wagon/Sport Utility Vehicle,Car,Unsafe Lane Changing,Driving Infraction,"[40.771038, -73.83413]",27.222222,42.19,0.0,0.0,16.09344,4.0
1,2020-09-06 18:00:00,Sunday,True,BRONX,40.88845,-73.841965,0.0,0.0,Station Wagon/Sport Utility Vehicle,Car,Failure to Yield Right-of-Way,Driving Infraction,"[40.88845, -73.841965]",27.222222,42.19,0.0,0.0,16.09344,22.0
2,2020-09-06 18:00:00,Sunday,True,,40.638527,-73.87853,1.0,0.0,Station Wagon/Sport Utility Vehicle,Car,Driver Inattention/Distraction,Driver Inattention,"[40.638527, -73.87853]",27.222222,42.19,0.0,0.0,16.09344,
3,2020-09-06 18:00:00,Sunday,True,BRONX,40.80684,-73.9275,0.0,0.0,Station Wagon/Sport Utility Vehicle,Car,Unspecified,Unspecified,"[40.80684, -73.9275]",27.222222,42.19,0.0,0.0,16.09344,50.0
4,2020-09-06 18:00:00,Sunday,True,BROOKLYN,40.71043,-73.9437,0.0,0.0,Sedan,Car,Driver Inattention/Distraction,Driver Inattention,"[40.71043, -73.9437]",27.222222,42.19,0.0,0.0,16.09344,68.0


In [3]:
# Keep only the crashes that happened during 2018.
# The comparison is done on the raw strings: "CRASH DATETIME" looks like
# "YYYY-MM-DD HH:MM:SS" in the preview above, so lexicographic order matches
# chronological order. Both bounds are checked so that rows from before 2018
# (if the file contains any) are excluded as well, not just later years.
crash_datetime = collisions["CRASH DATETIME"]
in_2018 = (crash_datetime >= "2018-01-01") & (crash_datetime < "2019-01-01")

# Keep only the columns needed downstream. The explicit copy makes the result
# an independent frame, so later cells can add columns to it safely.
collisions = collisions.loc[in_2018, ["CRASH DATETIME", "BOROUGH", "ORIGINAL VEHICLE"]].copy()

In [4]:
# Vehicle categories of interest, each listing the raw "ORIGINAL VEHICLE"
# spellings that belong to it (matching is exact and case-sensitive).
categories = {
    "Taxi": ["Taxi"],
    "Ambulance": ["Ambulance", "AMBUL", "Ambul", "ambul", "AMB", "AMBU", "AMBULANCE"],
    "Fire truck": ["Fire", "FIRET", "FIRE", "FDNY", "fdny", "FD tr", "fd tr", "firet", "fire"],
}

# Inverted lookup: raw spelling -> category name.
reverse_categories = {
    raw_label: category
    for category, raw_labels in categories.items()
    for raw_label in raw_labels
}

# Label every collision with its category. Raw values that are not listed in
# `categories` map to NaN and are dropped, so only taxis, ambulances and fire
# trucks remain. `.assign` builds a new frame instead of writing a column into
# a frame obtained by slicing, and the final `.copy()` leaves an independent
# frame for the following cells to extend.
collisions = (
    collisions
    .assign(VEHICLE=collisions["ORIGINAL VEHICLE"].map(reverse_categories))
    .dropna(subset=["VEHICLE"])
    .loc[:, ["CRASH DATETIME", "BOROUGH", "VEHICLE"]]
    .copy()
)

In [5]:
# Preview the first rows of the classified collisions table
collisions.head()

Unnamed: 0,CRASH DATETIME,BOROUGH,VEHICLE
506,2018-08-30 14:00:00,QUEENS,Taxi
509,2018-08-30 14:00:00,MANHATTAN,Ambulance
527,2018-09-18 21:00:00,MANHATTAN,Taxi
535,2018-09-18 21:00:00,BROOKLYN,Taxi
538,2018-09-18 21:00:00,MANHATTAN,Taxi


In [6]:
# Emoji used to represent each vehicle category
vehicle_emojis = {"Taxi": "🚕", "Ambulance": "🚑", "Fire truck": "🚒"}

# Look up the emoji for every row's category and store it in a new column
emoji_per_row = collisions["VEHICLE"].map(vehicle_emojis)
collisions["VEHICLE EMOJI"] = emoji_per_row

collisions.head()

Unnamed: 0,CRASH DATETIME,BOROUGH,VEHICLE,VEHICLE EMOJI
506,2018-08-30 14:00:00,QUEENS,Taxi,🚕
509,2018-08-30 14:00:00,MANHATTAN,Ambulance,🚑
527,2018-09-18 21:00:00,MANHATTAN,Taxi,🚕
535,2018-09-18 21:00:00,BROOKLYN,Taxi,🚕
538,2018-09-18 21:00:00,MANHATTAN,Taxi,🚕


In [19]:
# Parse the crash timestamps, then keep only the calendar day as a
# "YYYY-MM-DD" string in a new column
crash_timestamps = pd.to_datetime(collisions["CRASH DATETIME"])
collisions["CRASH DAY"] = crash_timestamps.dt.strftime("%Y-%m-%d")

collisions.head()

Unnamed: 0,CRASH DATETIME,BOROUGH,VEHICLE,VEHICLE EMOJI,CRASH DAY
506,2018-08-30 14:00:00,QUEENS,Taxi,🚕,2018-08-30
509,2018-08-30 14:00:00,MANHATTAN,Ambulance,🚑,2018-08-30
527,2018-09-18 21:00:00,MANHATTAN,Taxi,🚕,2018-09-18
535,2018-09-18 21:00:00,BROOKLYN,Taxi,🚕,2018-09-18
538,2018-09-18 21:00:00,MANHATTAN,Taxi,🚕,2018-09-18


In [7]:
# Persist the processed collisions table (without the row index)
collisions.to_csv(path_or_buf="./processed-data/collisions.csv", index=False)

### Weather dataset

In [8]:
# Load the raw 2018 weather table from the original-data folder
weather_path = "./original-data/weather2018.csv"
weather = pd.read_csv(weather_path)

In [11]:
# Only the date and the weather icon label are kept
weather_columns = ["datetime", "icon"]
weather = weather[weather_columns]

# Distinct icon labels present in the data
weather["icon"].unique()

array(['rain', 'partly-cloudy-day', 'clear-day', 'cloudy'], dtype=object)

In [12]:
# Emoji used to represent each weather icon label
weather_emojis = {
    "rain": "🌧",
    "clear-day": "☀️",
    "cloudy": "☁️",
    "partly-cloudy-day": "⛅️",
}

# Look up the emoji for every row's icon label and store it in a new column
icon_emoji_per_row = weather["icon"].map(weather_emojis)
weather["icon emoji"] = icon_emoji_per_row

In [13]:
# Preview the first rows of the weather table
weather.head()

Unnamed: 0,datetime,icon,icon emoji
0,2018-06-01,rain,🌧
1,2018-06-02,rain,🌧
2,2018-06-03,rain,🌧
3,2018-06-04,rain,🌧
4,2018-06-05,partly-cloudy-day,⛅️


In [14]:
# Persist the processed weather table (without the row index)
weather.to_csv(path_or_buf="./processed-data/weather.csv", index=False)

### Collisions + Weather

In [21]:
# Attach the weather of the crash day to every collision: rows are matched
# where the collision's "CRASH DAY" equals the weather table's "datetime"
# (inner join, the pandas default, so unmatched rows on either side are dropped)
collisions_weather = collisions.merge(
    weather,
    how="inner",
    left_on="CRASH DAY",
    right_on="datetime",
)

Unnamed: 0,CRASH DATETIME,BOROUGH,VEHICLE,VEHICLE EMOJI,CRASH DAY,datetime,icon,icon emoji
0,2018-08-30 14:00:00,QUEENS,Taxi,🚕,2018-08-30,2018-08-30,rain,🌧
1,2018-08-30 14:00:00,MANHATTAN,Ambulance,🚑,2018-08-30,2018-08-30,rain,🌧
2,2018-08-30 04:00:00,MANHATTAN,Taxi,🚕,2018-08-30,2018-08-30,rain,🌧
3,2018-08-30 10:00:00,,Taxi,🚕,2018-08-30,2018-08-30,rain,🌧
4,2018-08-30 16:00:00,MANHATTAN,Taxi,🚕,2018-08-30,2018-08-30,rain,🌧
...,...,...,...,...,...,...,...,...
3948,2018-06-02 10:00:00,MANHATTAN,Taxi,🚕,2018-06-02,2018-06-02,rain,🌧
3949,2018-06-02 10:00:00,,Taxi,🚕,2018-06-02,2018-06-02,rain,🌧
3950,2018-06-02 21:00:00,MANHATTAN,Taxi,🚕,2018-06-02,2018-06-02,rain,🌧
3951,2018-06-02 20:00:00,BROOKLYN,Taxi,🚕,2018-06-02,2018-06-02,rain,🌧


In [None]:
# Persist the merged collisions + weather table (without the row index)
collisions_weather.to_csv(path_or_buf="./processed-data/collisions_weather.csv", index=False)

### NY Map