# VI: Second Practical Work

**Authors:** Gerard Comas & Marc Franquesa.


## Data Processing
This notebook processes all the datasets: collisions, weather, and the NYC map.

In [None]:
# Initial imports
import ast
import warnings

import altair as alt
import geopandas as gpd
import numpy as np
import pandas as pd
from shapely.geometry import shape, Point

# Silence FutureWarning messages for the rest of the notebook
warnings.simplefilter(action="ignore", category=FutureWarning)

### Collisions dataset

In [None]:
# Read the raw collisions dataset from the original-data folder
collisions = pd.read_csv("./original-data/collisions.csv")

# Preview the first rows
collisions.head()

In [None]:
# Keep only the crashes that happened during 2018. The timestamps are parsed
# for the comparison so the filter does not depend on lexicographic string
# ordering; values that cannot be parsed become NaT and are dropped.
# The "CRASH DATETIME" column itself is left untouched.
crash_time = pd.to_datetime(collisions["CRASH DATETIME"], errors="coerce")
in_2018 = crash_time.dt.year == 2018

# Keep only the columns we need. A single .loc selection followed by .copy()
# yields an independent frame, so adding columns in later cells does not
# raise SettingWithCopyWarning.
collisions = collisions.loc[
    in_2018, ["CRASH DATETIME", "BOROUGH", "LOCATION", "ORIGINAL VEHICLE"]
].copy()

In [None]:
# Vehicle categories of interest and the raw spellings that belong to each one.
categories = {
    "Taxi": ["Taxi"],
    "Ambulance": ["Ambulance", "AMBUL", "Ambul", "ambul", "AMB", "AMBU", "AMBULANCE"],
    "Fire truck": ["Fire", "FIRET", "FIRE", "FDNY", "fdny", "FD tr", "fd tr", "firet", "fire"],
}

# Inverted view: raw spelling -> category
reverse_categories = {val: key for key, values in categories.items() for val in values}

# Case- and whitespace-insensitive lookup table, so that variants such as
# "TAXI" or " ambulance " are classified like the spellings listed above
# instead of silently falling into "Other".
normalized_categories = {
    raw.strip().lower(): category for raw, category in reverse_categories.items()
}

# Classify every collision by its normalised "ORIGINAL VEHICLE" value; anything
# that is missing or not listed in `categories` becomes "Other".
normalized_vehicle = collisions["ORIGINAL VEHICLE"].str.strip().str.lower()
collisions["VEHICLE"] = normalized_vehicle.map(normalized_categories).fillna("Other")

In [None]:
# Show the distinct values of the "VEHICLE" column
collisions["VEHICLE"].unique()

In [None]:
# Preview the first rows of the collisions frame
collisions.head()

In [None]:
# Write the collisions frame to the processed-data folder, without the index column
collisions.to_csv("./processed-data/collisions.csv", index=False)

### Weather dataset

In [None]:
# Read the raw weather dataset from the original-data folder
weather = pd.read_csv("./original-data/weather2018.csv")

In [None]:
# Keep only the "datetime" and "icon" columns
weather = weather[["datetime", "icon"]]
# Show the distinct values of the "icon" column
weather["icon"].unique()

In [None]:
# Write the weather frame to the processed-data folder, without the index column
weather.to_csv("./processed-data/weather.csv", index=False)

### NYC Map

In [None]:
# Total area of New York City in km² (value found online, Wikipedia). It is
# used to rescale each district's share of the map area into km².
NYC_AREA_KM2 = 783.84

# Equal-area projection (metres) used only to measure district areas; areas
# measured directly on latitude/longitude coordinates are in squared degrees
# and distorted.
AREA_CRS = "EPSG:6933"


def _district_of(location, districts):
    """Return the position of the district polygon that contains ``location``.

    Parameters
    ----------
    location : str, sequence or NaN
        A ``LOCATION`` value: either the "(lat, lon)" text obtained when the
        column is read from CSV, or an already parsed (lat, lon) pair.
    districts : GeoSeries
        District polygons, one per row of the map.

    Returns
    -------
    int or float
        Positional index of the first matching district, or NaN when the
        location is missing or lies outside every district.

    Raises
    ------
    ValueError, SyntaxError
        If a text location cannot be parsed into a (lat, lon) pair.
    """
    if location is None or (isinstance(location, float) and np.isnan(location)):
        return np.nan
    if isinstance(location, str):
        # Values read from CSV are text such as "(40.7, -73.9)", not tuples
        location = ast.literal_eval(location)
    lat, lon = location
    # shapely points are (x, y), i.e. (longitude, latitude)
    matches = np.flatnonzero(districts.contains(Point(lon, lat)).to_numpy())
    return matches[0] if matches.size else np.nan


map_data = gpd.read_file("./original-data/map.geojson")

# Assign every collision to the district that contains it (NaN when unknown)
collisions["DISTRICT"] = collisions["LOCATION"].apply(
    lambda location: _district_of(location, map_data.geometry)
)

# Number of collisions per district; districts without any collision get 0
# instead of NaN.
collisions_per_district = collisions.groupby("DISTRICT").size()
map_data["COLLISIONS"] = collisions_per_district.reindex(map_data.index, fill_value=0)

# Measure areas on projected coordinates when the map uses a geographic CRS
district_shapes = map_data.geometry
if district_shapes.crs is not None and district_shapes.crs.is_geographic:
    district_shapes = district_shapes.to_crs(AREA_CRS)
map_data["AREA"] = district_shapes.area

map_data["AREA PROPORTION"] = map_data["AREA"] / map_data["AREA"].sum()

# Distribute the city's total area among the districts proportionally
map_data["AREA KM2"] = NYC_AREA_KM2 * map_data["AREA PROPORTION"]

map_data["COLLISIONS / KM2"] = map_data["COLLISIONS"] / map_data["AREA KM2"]