# VI: Second Practical Work

**Authors:** Gerard Comas & Marc Franquesa.

**COLAB INSTRUCTIONS:**
1. Upload notebooks
2. Create `original-data` and `processed-data` folders
3. Upload datasets found in local `original-data` to colab `original-data`
4. Execute `pre-processing.ipynb` notebook
5. Execute `design.ipynb` notebook

To keep all of our visualisations coherent and working well with one another, we have only kept collisions involving the vehicles noted in the project statement.


In [None]:
# Switch controlling whether the processed dataset is padded with filler rows.
#   True  -> the filler rows are NOT appended (current setting).
#   False -> the filler rows (VALID == 0) are concatenated to the collisions.
# The padding is not needed anymore as we have found workarounds for every plot.

using_colab_or_streamlit = True

## Data Processing
Processing all datasets in this notebook

In [None]:
# Initial imports
import pandas as pd
import numpy as np
import geopandas as gpd
import warnings
from shapely.geometry import shape, Point

# Silence FutureWarning messages so they do not clutter the cell outputs.
warnings.simplefilter(action="ignore", category=FutureWarning)

### Collisions dataset

In [None]:
# Load the raw collisions file and turn the crash timestamp column into
# real datetimes so it can be filtered and decomposed later on.
collisions = pd.read_csv("./original-data/collisions.csv").assign(
    **{"CRASH DATETIME": lambda frame: pd.to_datetime(frame["CRASH DATETIME"])}
)

collisions.head()

In [None]:
# Keep only the crashes that happened during 2018. Both bounds are enforced so
# that records from earlier years cannot slip through.
crash_time = collisions["CRASH DATETIME"]
in_2018 = (crash_time >= "2018-01-01") & (crash_time < "2019-01-01")

# Keep only the columns we need. The .copy() makes the result an independent
# frame, so adding columns to it later does not raise SettingWithCopyWarning.
collisions = collisions.loc[
    in_2018,
    ["CRASH DATETIME", "LATITUDE", "LONGITUDE", "ORIGINAL VEHICLE", "NUMBER OF PERSONS INJURED", "NUMBER OF PERSONS KILLED", "ORIGINAL FACTOR", "FACTOR"],
].copy()

In [None]:
# Raw "ORIGINAL VEHICLE" spellings grouped by the vehicle category they stand for.
categories = {
    "Taxi": ["Taxi"],
    "Ambulance": ["Ambulance", "AMBUL", "Ambul", "ambul", "AMB", "AMBU", "AMBULANCE"],
    "Fire truck": ["Fire", "FIRET", "FIRE", "FDNY", "fdny", "FD tr", "fd tr", "firet", "fire"],
}

# Inverted lookup: raw spelling -> category name.
reverse_categories = {val: key for key, values in categories.items() for val in values}

# Classify every crash by exact match on the raw spelling. Spellings that are
# not listed above map to NaN and those rows are dropped, so only the three
# categories remain. The whole transformation is one chain ending in .copy():
# nothing is assigned onto a filtered slice, and the resulting frame can be
# extended by later cells without SettingWithCopyWarning.
collisions = (
    collisions
    .assign(VEHICLE=collisions["ORIGINAL VEHICLE"].map(reverse_categories))
    .dropna(subset=["VEHICLE"])
    [["CRASH DATETIME", "LATITUDE", "LONGITUDE", "VEHICLE", "NUMBER OF PERSONS INJURED", "NUMBER OF PERSONS KILLED", "ORIGINAL FACTOR", "FACTOR"]]
    .copy()
)

In [None]:
# Preview the first rows after the vehicle classification.
collisions.head()

In [None]:
# Emoji displayed next to each vehicle category.
vehicle_emojis = dict([
    ("Taxi", "🚕"),
    ("Ambulance", "🚑"),
    ("Fire truck", "🚒"),
])

# Look up the emoji of every row from its vehicle category.
emoji_per_row = collisions["VEHICLE"].map(vehicle_emojis)
collisions["VEHICLE EMOJI"] = emoji_per_row

collisions.head()

In [None]:
# Add day information derived from the crash timestamp.
crash_time = collisions["CRASH DATETIME"]

# Calendar day as a "YYYY-MM-DD" string.
collisions["CRASH DAY"] = crash_time.dt.strftime("%Y-%m-%d")
collisions["CRASH WEEKDAY"] = crash_time.dt.day_name()

# Week of the calendar year with Monday as the first day of the week ("%W").
# ISO week numbering is deliberately avoided: it assigns Monday 2018-12-31 to
# week 1 (of ISO year 2019), which would merge it with Monday 2018-01-01 in any
# weekday x week-number view. With "%W" the weeks of 2018 match the ISO numbers
# for every day except 2018-12-31, which becomes week 53.
collisions["CRASH WEEK NUMBER"] = crash_time.dt.strftime("%W").astype(int)
collisions.head()

### Weather dataset

In [None]:
# Load the raw 2018 weather file.
weather = pd.read_csv("./original-data/weather2018.csv")

In [None]:
# Keep only the date and the weather icon label. The .copy() turns the column
# selection into an independent frame, so the new column below is added without
# triggering SettingWithCopyWarning.
weather = weather[["datetime", "icon"]].copy()

# The icon label is used as the weather category.
weather["WEATHER"] = weather["icon"]

# Show which weather categories appear in the data.
weather["WEATHER"].unique()

In [None]:
# Emoji displayed next to each weather category.
weather_emojis = dict([
    ("rain", "🌧"),
    ("clear-day", "☀️"),
    ("cloudy", "☁️"),
    ("partly-cloudy-day", "⛅️"),
])

# Categories that are not listed above end up with a missing emoji.
emoji_per_day = weather["WEATHER"].map(weather_emojis)
weather["WEATHER EMOJI"] = emoji_per_day

In [None]:
# Preview the weather table with its category and emoji columns.
weather.head()

### Collisions + Weather

In [None]:
# Attach the weather of the day to every crash: "CRASH DAY" on the collisions
# side is matched against "datetime" on the weather side. The join is an inner
# join, so crashes on days without a weather record are not kept.
collisions_weather = collisions.merge(
    weather,
    how="inner",
    left_on="CRASH DAY",
    right_on="datetime",
)

### NY Map

In [None]:
map_data = gpd.read_file("./original-data/map.geojson")


def _borough_position(row):
    """Return the position in ``map_data`` of the first polygon containing the crash.

    Gives NaN when either coordinate is missing or when no polygon of
    ``map_data`` contains the crash location.
    """
    if pd.isnull(row["LATITUDE"]) or pd.isnull(row["LONGITUDE"]):
        return np.nan
    crash_point = Point(row["LONGITUDE"], row["LATITUDE"])
    matches = np.where(map_data.contains(crash_point))[0]
    if len(matches) == 0:
        return np.nan
    return matches[0]


# At this stage BOROUGH holds the row position of the borough in map_data.
collisions_weather["BOROUGH"] = collisions_weather.apply(_borough_position, axis=1)

collisions_weather.head()

In [None]:
# Count the located collisions per borough. At this point BOROUGH still holds
# the row position of the borough in map_data, so the counts line up with
# map_data's index.
borough_counts = collisions_weather.groupby(["BOROUGH"]).size()

# A borough without any located collision gets an explicit 0 instead of NaN,
# which would otherwise propagate into every figure derived from this column.
map_data["COLLISIONS"] = borough_counts.reindex(map_data.index, fill_value=0)

map_data.head()

In [None]:
# Borough names listed by the positional code stored in BOROUGH.
borough_names = ["Staten Island", "Bronx", "Queens", "Manhattan", "Brooklyn"]
boroughs = {float(position): name for position, name in enumerate(borough_names)}

# Columns kept for the processed dataset, in output order.
output_columns = [
    "CRASH DATETIME", "CRASH DAY", "CRASH WEEK NUMBER", "CRASH WEEKDAY",
    "BOROUGH", "VEHICLE", "VEHICLE EMOJI", "WEATHER", "WEATHER EMOJI",
    "NUMBER OF PERSONS INJURED", "NUMBER OF PERSONS KILLED",
    "ORIGINAL FACTOR", "FACTOR",
]

# Replace the positional code by the borough name, then trim the columns.
collisions_weather["BOROUGH"] = collisions_weather["BOROUGH"].map(boroughs)
collisions_weather = collisions_weather[output_columns]

In [None]:
# Area of New York City in km² (value found online, Wikipedia).
NYC_AREA_KM2 = 783.84

# Planar areas are only meaningful in a projected CRS: on geographic
# (longitude/latitude) coordinates .area returns square degrees and GeoPandas
# warns that the result is likely incorrect. The shapes are therefore projected
# to EPSG:2263 (NAD83 / New York Long Island) for the area computation only;
# map_data itself keeps its original geometry and CRS.
borough_shapes = map_data["geometry"]
if borough_shapes.crs is not None and borough_shapes.crs.is_geographic:
    borough_shapes = borough_shapes.to_crs(epsg=2263)

map_data["AREA"] = borough_shapes.area

# Share of the total mapped area covered by each borough (unit independent).
map_data["AREA PROPORTION"] = map_data["AREA"] / map_data["AREA"].sum()

# Split the known city area between the boroughs according to those shares.
map_data["AREA KM2"] = NYC_AREA_KM2 * map_data["AREA PROPORTION"]

map_data["COLLISIONS / KM2"] = map_data["COLLISIONS"] / map_data["AREA KM2"]

In [None]:
# Expose the borough name and the area under their output column names, keep
# only the columns written to the processed map, and save it as GeoJSON.
map_data = map_data.assign(
    BOROUGH=map_data["boro_name"],
    AREA_KM2=map_data["AREA KM2"],
)

exported_columns = ["BOROUGH", "AREA_KM2", "COLLISIONS", "COLLISIONS / KM2", "geometry"]
map_data = map_data[exported_columns]

map_data.to_file("processed-data/map.geojson", driver="GeoJSON")

### Filling with empty values

We will now fill the dataset so that it has a row for every combination of day, hour, borough, vehicle and weather, so that our plots have sensible values even where no collision was recorded.

In [None]:
# Preview the merged table before building the filler rows.
collisions_weather.head()

In [None]:
# Every dimension table below carries a constant "key" column so that merging
# on it yields the cartesian product of all the dimensions.

# One row per calendar day present in the data (timestamp truncated to midnight
# so that all crashes of the same day collapse into a single row). Built with
# .assign so that no column is set on a slice of collisions_weather.
dates = (
    collisions_weather[["CRASH DATETIME", "CRASH DAY", "CRASH WEEK NUMBER", "CRASH WEEKDAY"]]
    .assign(**{"CRASH DATETIME": lambda frame: frame["CRASH DATETIME"].dt.floor("D")})
    .drop_duplicates()
    .assign(key=0)
)

# The 24 hours of a day.
hours = pd.DataFrame({"HOUR": range(0, 24)}).assign(key=0)

# Distinct, non-missing values of the remaining dimensions.
boroughs = collisions_weather[["BOROUGH"]].drop_duplicates().dropna().assign(key=0)

vehicles = collisions_weather[["VEHICLE", "VEHICLE EMOJI"]].drop_duplicates().dropna().assign(key=0)

weathers = collisions_weather[["WEATHER", "WEATHER EMOJI"]].drop_duplicates().dropna().assign(key=0)

# Cartesian product: day x borough x vehicle x weather x hour.
filler = (
    dates
    .merge(boroughs, on="key")
    .merge(vehicles, on="key")
    .merge(weathers, on="key")
    .merge(hours, on="key")
    .drop(columns=["key"])
)

# Move each filler timestamp from midnight to its own hour. Without this, any
# hour label derived from "CRASH DATETIME" would read 00 for every filler row,
# contradicting the row's HOUR value.
filler["CRASH DATETIME"] = filler["CRASH DATETIME"] + pd.to_timedelta(filler["HOUR"], unit="h")

# VALID == 0 marks a row as filler rather than a recorded collision.
filler["VALID"] = 0

filler.head()

In [None]:
# Number of non-null values in each column.
collisions_weather.count()

In [None]:
# Tag the recorded collisions: hour of the day of the crash and VALID == 1.
collisions_weather = collisions_weather.assign(
    HOUR=collisions_weather["CRASH DATETIME"].dt.hour,
    VALID=1,
)

# Append the filler rows only when the padding is switched on.
fill_dataset = not using_colab_or_streamlit
if fill_dataset:
    collisions_weather = pd.concat([collisions_weather, filler])

# Month name and zero-padded day of the month of every row.
crash_time = collisions_weather["CRASH DATETIME"]
collisions_weather = collisions_weather.assign(
    MONTH=crash_time.dt.strftime("%B"),
    DAY=crash_time.dt.strftime("%d"),
)

### Some final details
Here we map some names to produce nicer plots. This could have been done in the design notebook, but we decided to keep all dataframe formatting here.

In [None]:
# Full weekday names and the three-letter labels that replace them.
old_weekdays = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
new_weekdays = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]

# Raw weather categories and the labels that replace them.
old_weather = ["rain", "clear-day", "cloudy", "partly-cloudy-day"]
new_weather = ["Rainy", "Clear", "Cloudy", "Partly cloudy"]

weekday_labels = dict(zip(old_weekdays, new_weekdays))
weather_labels = dict(zip(old_weather, new_weather))

# Hour label such as "07:00H", taken from the crash timestamp.
crash_hour = collisions_weather["CRASH DATETIME"].dt.strftime("%H") + ":00H"

# Existing columns are relabelled in place; the two new columns are appended
# in the order "CRASH HOUR", "LOCATION AT HOUR".
collisions_weather = collisions_weather.assign(**{
    "CRASH WEEKDAY": collisions_weather["CRASH WEEKDAY"].map(weekday_labels),
    "CRASH HOUR": crash_hour,
    "LOCATION AT HOUR": collisions_weather["BOROUGH"] + ", " + crash_hour,
    "WEATHER": collisions_weather["WEATHER"].map(weather_labels),
})

In [None]:
# Display the final processed table.
collisions_weather

In [None]:
# Save the processed table as CSV, leaving out the dataframe index.
collisions_weather.to_csv("./processed-data/collisions_weather.csv", index=False)