# VI: First Practical Work

**Authors:** Gerard Comas & Marc Franquesa.

## Data Processing
Processing all datasets in this notebook

In [None]:
# Initial imports
import pandas as pd
import numpy as np
import altair as alt
import geopandas as gpd
import warnings
from shapely.geometry import shape, Point

warnings.simplefilter(action='ignore', category=FutureWarning)

### Collisions dataset

In [None]:
# Read the raw collisions export, then report its columns and row count
collisions = pd.read_csv("./original-data/collisions.csv")

print(collisions.columns)
print(f"Initial amount of rows: {collisions.shape[0]}")

In [None]:
# Adding a CRASH DATETIME column as well as several checks to make sure we have the correct dataset

# Truncating to the hour because some (most) rows are already truncated and we don't need more information.
# Lowercase "h" is the hourly alias; the uppercase "H" spelling is deprecated in recent pandas.
collisions["CRASH DATETIME"] = pd.to_datetime(
    collisions["CRASH DATE"] + " " + collisions["CRASH TIME"]
).dt.floor("h")

# Adding day of week column
collisions["CRASH WEEKDAY"] = collisions["CRASH DATETIME"].dt.day_name()

# Adding AFTER COVID column: True for crashes dated 2020, False for every other year
crash_datetime = collisions["CRASH DATETIME"]
crash_year = crash_datetime.dt.year
collisions["AFTER COVID"] = crash_year == 2020

# Sanity checks on the date range. min()/max() replace the repeated full sorts
# and skip missing timestamps instead of reporting NaT as the "last" crash.
print(f"First crash: {crash_datetime.min()}")
print(f"Last crash of 2018: {crash_datetime[crash_year == 2018].max()}")
print(f"First crash of 2020: {crash_datetime[crash_year == 2020].min()}")
print(f"Last crash: {crash_datetime.max()}")
print(f"Collisions in 2019: {(crash_year == 2019).sum()}")

In [None]:
# Checking if LOCATION contains the same information as LATITUDE and LONGITUDE
def same_information(df=None):
    """Return True when LOCATION agrees with LATITUDE/LONGITUDE on every row.

    Parameters
    ----------
    df : DataFrame, optional
        Frame with LOCATION, LATITUDE and LONGITUDE columns. Defaults to the
        notebook-level `collisions` frame, which keeps `same_information()`
        working exactly as before.

    Returns
    -------
    bool
        False as soon as one row disagrees: either the "(lat, lon)" text in
        LOCATION does not parse to the row's LATITUDE/LONGITUDE, or LOCATION
        is missing while at least one coordinate is present.
    """
    if df is None:
        df = collisions

    rows = zip(df["LOCATION"].tolist(), df["LATITUDE"].tolist(), df["LONGITUDE"].tolist())
    for location, lat, lon in rows:
        if pd.isna(location):
            # LOCATION is missing: both coordinates have to be missing as well
            if not (pd.isna(lat) and pd.isna(lon)):
                return False
        else:
            # LOCATION looks like "(lat, lon)": strip the parentheses and parse both numbers
            parsed = [float(part) for part in location[1:-1].split(", ")]
            if parsed != [lat, lon]:
                return False
    return True

# Prints True when every row's LOCATION agrees with its LATITUDE/LONGITUDE pair
print(same_information())

In [None]:
# Column selection
cols = [
    "CRASH DATETIME",
    "CRASH WEEKDAY",
    "AFTER COVID",
    "BOROUGH",
    "LATITUDE",
    "LONGITUDE",
    "NUMBER OF PERSONS INJURED",
    "NUMBER OF PERSONS KILLED",
    "VEHICLE TYPE CODE 1",
]
# .copy() makes the selection an independent frame, so the column assignments
# in later cells do not operate on a slice of the raw data
collisions = collisions[cols].copy()

# Number of missing values in each column
print(collisions.isnull().sum())

In [None]:
# Fill in missing values with 0 for the injured/killed columns.
# Assigning the result back (instead of calling fillna(..., inplace=True) on a
# selected column) guarantees the frame itself is updated; the chained in-place
# form is deprecated and has no effect when pandas Copy-on-Write is enabled.
casualty_cols = ["NUMBER OF PERSONS INJURED", "NUMBER OF PERSONS KILLED"]
collisions[casualty_cols] = collisions[casualty_cols].fillna(0)

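We will now classify all vehicle types into the categories below. Note that in the code `car/suv` is labelled `car`, `truck/bus` is split into `truck` and `bus`, and no raw value is currently mapped to `ATV`: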
* ATV
* bicycle
* car/suv
* taxi
* ebike
* escooter
* truck/bus
* motorcycle
* other
* unknown

In [None]:
# Lookup from the raw "VEHICLE TYPE CODE 1" spellings to the category used in the analysis.
# Keys are matched exactly (case and spacing included), so every spelling variant needs its own entry.
# Categories produced: car, bus, truck, taxi, escooter, motorcycle, bicycle, ebike, other, unknown.
classified_vehicles = {
    "Station Wagon/Sport Utility Vehicle": "car",
    "Sedan": "car",
    "Bus": "bus",
    "Tractor Truck Diesel": "truck",
    "Taxi": "taxi",
    "E-Scooter": "escooter",
    "Flat Bed": "truck",
    "Motorbike": "motorcycle",
    "Motorcycle": "motorcycle",
    "Box Truck": "truck",
    "Pick-up Truck": "truck",
    "Bike": "bicycle",
    "Dump": "truck",
    "Concrete Mixer": "truck",
    "Van": "truck",
    "PK": "other",
    "Golf Cart": "other",
    "LIMO": "car",
    "Tanker": "truck",
    "AMBULANCE": "other",
    "Convertible": "car",
    "E-Bike": "ebike",
    "Moped": "motorcycle",
    "Fire Truck": "truck",
    # NOTE(review): this only matches the literal string "nan", not a missing (NaN) value
    "nan": "other",
    "Tractor Truck Gasoline": "truck",
    "Ambulance": "other",
    "forlift": "other",
    "MOTOR SKAT": "other",
    "FDNY LADDE": "other",
    "Tow Truck / Wrecker": "truck",
    "FIRE TRUCK": "truck",
    "PICK UP": "other",
    "Garbage or Refuse": "truck",
    "GARBAGE TR": "truck",
    "Chassis Cab": "truck",
    "Bulk Agriculture": "other",
    "Can": "other",
    "van": "truck",
    "Carry All": "other",
    "FLATBED FR": "truck",
    "Open Body": "other",
    "4 dr sedan": "car",
    "Motorscooter": "motorcycle",
    "Minibike": "motorcycle",
    "Flat Rack": "other",
    "Armored Truck": "truck",
    "School Bus": "bus",
    "FDNY TRUCK": "truck",
    "truck": "truck",
    "UNK": "unknown",
    "TRAILER": "other",
    "FIRTRUCK": "truck",
    "MOPED": "motorcycle",
    "Lift Boom": "other",
    "fdny ems": "other",
    "AMBULACE": "other",
    "bus": "bus",
    "BOX TRUCK": "truck",
    "Street Swe": "other",
    "Scooter": "escooter",
    "FDNY fire": "other",
    "DELIVERY": "other",
    "Cement Tru": "truck",
    "USPS/GOVT": "other",
    "Pedicab": "other",
    "TRUCK VAN": "truck",
    "UTILITY": "other",
    "Pick up tr": "other",
    "UNKNOWN": "unknown",
    "Multi-Wheeled Vehicle": "other",
    "SUV": "car",
    "utility": "other",
    "POWER SHOV": "other",
    "DELIVERY T": "other",
    "SWT": "other",
    "Trac": "other",
    "FDNY AMBUL": "other",
    "AMBU": "other",
    "USPS": "other",
    "FLAT": "other",
    "Beverage Truck": "truck",
    "E-BIKE": "ebike",
    "3-Door": "car",
    "Fork Lift": "other",
    "Refrigerated Van": "truck",
    "PSD": "other",
    "Fire Engin": "other",
    "FORKLIFT": "other",
    "TRAC": "other",
    "Tow Truck": "truck",
    "COURIER": "other",
    "Courier": "other",
    "Leased amb": "other",
    "SMART CAR": "car",
    "message si": "other",
    "scooter": "escooter",
    "E-UNICYCLE": "escooter",
    "Street Cle": "other",
    "box": "other",
    "F550": "truck",
    "DELV": "other",
    "SKATEBOARD": "other",
    "Lawnmower": "other",
    "almbulance": "other",
    "dark color": "other",
    "Work Van": "other",
    "ford van": "truck",
    "ambulance": "other",
    "Fire truck": "truck",
    "Minicycle": "motorcycle",
    "PC": "other",
    "box truck": "truck",
    "FDNY ENGIN": "other",
    "commercial": "other",
    "Unknown": "unknown",
    "Tractor tr": "truck",
    "2 dr sedan": "car",
    "FD LADDER": "other",
    "abulance": "other",
    "FDNY Engin": "other",
    "OTH": "other",
    "Go kart": "other",
    "Trailer": "other",
    "TRUCK": "truck",
    "Stake or Rack": "other",
    "COMMERCIAL": "other",
    "CHEVY EXPR": "other",
    "SLINGSHOT": "other",
    "dilevery t": "other",
    "FDNY #226": "other",
    "FREIGHT FL": "other",
    "Fork lift": "other",
    "UTIL": "other",
    # Classified as "unknown" like every other spelling of unknown (UNK, UNKNO, UNKNOWN, ...)
    "UNKN": "unknown",
    "FDNY FIRE": "other",
    "ELECTRIC S": "other",
    "FIRETRUCK": "truck",
    "MOVING VAN": "truck",
    "usps": "other",
    "moped": "motorcycle",
    "forklift": "other",
    "UPS TRUCK": "truck",
    "backhoe": "other",
    "Delv": "other",
    "dump truck": "truck",
    "Freight": "other",
    "Horse": "other",
    "Cargo Van": "truck",
    "USPS VAN": "other",
    "TRUCK FLAT": "truck",
    "BOBCAT FOR": "other",
    "Tractor Tr": "truck",
    "Pumper": "other",
    "DELIVERY V": "other",
    "DOT EQUIPM": "other",
    "fire truck": "truck",
    "Livestock Rack": "other",
    "GEN  AMBUL": "other",
    "J1": "other",
    "DUMP": "other",
    "18 WHEELER": "truck",
    "MAIL TRUCK": "other",
    "UTILITY VE": "other",
    "MOTORSCOOT": "motorcycle",
    "government": "other",
    "trailer": "other",
    "FIRE ENGIN": "other",
    "Front-Load": "other",
    "DRILL RIG": "other",
    "SCOOTER": "escooter",
    "Wh Ford co": "other",
    "suburban": "car",
    "E REVEL SC": "other",
    "ROAD SWEEP": "other",
    "LIGHT TRAI": "other",
    "Tractor": "truck",
    "UT": "other",
    "USPS TRUCK": "other",
    "cross": "other",
    "Van Camper": "other",
    "AMBULENCE": "other",
    "FOOD TRUCK": "other",
    "Bucket Tru": "other",
    "gator": "other",
    "FDNY Ambul": "other",
    "JOHN DEERE": "other",
    "f-250": "other",
    "MECHANICAL": "other",
    "WORK VAN": "other",
    "NYC FD": "other",
    "MTA BUS": "bus",
    "NYC AMBULA": "other",
    "GOLF CART": "other",
    "FLATBED": "truck",
    "Trc": "other",
    "FORK LIFT": "other",
    "Pick up Tr": "other",
    "postal bus": "bus",
    "F150XL PIC": "other",
    "ambu": "other",
    "Pick up": "other",
    "CAT": "other",
    "ELEC. UNIC": "escooter",
    "1C": "other",
    "SCOOT": "escooter",
    "FREIG": "other",
    "AMBUL": "other",
    "VAN T": "other",
    "MINI": "other",
    "Garba": "other",
    "motor": "other",
    "Lunch Wagon": "other",
    "E-Bik": "ebike",
    "Ambul": "other",
    "FDNY": "other",
    "SCHOO": "other",
    "Comm": "other",
    "Fire": "other",
    "Sanit": "other",
    "mail": "other",
    "RV": "other",
    "GARBA": "other",
    "ambul": "other",
    "FIRET": "other",
    "FIRE": "other",
    "SELF": "other",
    "STAK": "other",
    "WORKH": "other",
    "FORKL": "other",
    "Tract": "other",
    "freig": "other",
    "DELIV": "other",
    "trail": "other",
    "PICKU": "other",
    "Dumps": "other",
    "forkl": "other",
    "fire": "other",
    "TRK": "other",
    "ELECT": "other",
    "2- to": "other",
    "BROOM": "other",
    "TRAIL": "other",
    "EBIKE": "ebike",
    "Trail": "other",
    "Glass Rack": "other",
    "Motorized Home": "other",
    "US POSTAL": "other",
    "TRT": "other",
    "BLOCK": "other",
    "pas": "other",
    "COM": "other",
    "CONCR": "other",
    "Pallet": "other",
    "unknown": "unknown",
    "CHERR": "other",
    "UTV": "other",
    "MOTOR": "other",
    "MTA B": "bus",
    "TRACT": "other",
    "NYC": "other",
    "UHAUL": "other",
    "scoot": "escooter",
    "FED E": "other",
    "COMME": "other",
    "TRLR": "other",
    "LOADE": "other",
    "rv": "other",
    "TOWER": "other",
    "Pick": "other",
    "AMB": "other",
    "NS AM": "other",
    "UNKNO": "unknown",
    "NEW Y": "other",
    "TOW T": "other",
    "GRAY": "other",
    "tract": "other",
    "STREE": "other",
    "MAIL": "other",
    "e-bik": "ebike",
    "unk": "unknown",
    "box t": "other",
    "CRANE": "other",
    "garba": "other",
    "Pickup with mounted Camper": "other",
    "FRONT": "other",
    "Sprin": "other",
    "delv": "other",
    "POWER": "other",
    "Box t": "other",
    "CAMP": "other",
    "Enclosed Body - Removable Enclosure": "other",
    "RGS": "other",
    "GOVER": "other",
    "FORK": "other",
    "UTILI": "other",
    "POSTO": "other",
    "firet": "other",
    "WORK": "other",
    "R/V C": "other",
    "sgws": "other",
    "Cat 9": "other",
    "BACKH": "other",
    "E-MOT": "other",
    "MACK": "other",
    "SPC": "other",
    "fork": "other",
    "OMR": "other",
    "semi": "other",
    "FORK-": "other",
    "Wheel": "other",
    "Utili": "other",
    "E-BIK": "ebike",
    "fd tr": "other",
    "SWEEP": "other",
    "BOX T": "other",
    "CASE": "other",
    "FD TR": "other",
    "Work": "other",
    "LIBER": "other",
    "fdny": "other",
    "COMB": "other",
    "HEAVY": "other",
    "DUMPS": "other",
    "MTA b": "bus",
    "Hopper": "other",
    "R/V": "other",
    "FOOD": "other",
    "FD tr": "other",
    "Spc": "other",
    "BED T": "other",
    "comme": "other",
    "UPS T": "other",
    "PAS": "other",
    "BICYC": "bicycle",
    "Subn": "other",
    "WHEEL": "other",
    "Util": "other",
    "ACCES": "other",
    "e sco": "escooter",
    "BOBCA": "other",
    "TANK": "other",
    "TRACK": "other",
    "utili": "other",
    "DEMA-": "other",
    "tow": "other",
    "dump": "other",
    "Elect": "other",
    "deliv": "other",
    "Backh": "other",
    "CEMEN": "other",
    "99999": "other",
    "BULLD": "other",
    "seagr": "other",
    "schoo": "other",
    "CONST": "other",
    "self": "other",
    "BK": "other",
    "Semi": "other",
    "Scoot": "escooter",
    "NYPD": "other",
    "Taxis": "taxi"
}

In [None]:
# Mapping the raw vehicle types onto the categories defined in classified_vehicles.
# Raw values without an entry in the dictionary are left as they are, and
# missing values become "unknown".
vehicle_col = "VEHICLE TYPE CODE 1"
collisions[vehicle_col] = (
    collisions[vehicle_col]
    .replace(classified_vehicles)
    .fillna("unknown")
)

collisions[vehicle_col].value_counts()

In [None]:
# Replacing LATITUDE and LONGITUDE values that don't make sense for NYC with NaNs
lat_in_range = collisions["LATITUDE"].between(38, 42)
collisions["LATITUDE"] = collisions["LATITUDE"].where(lat_in_range)

lon_in_range = collisions["LONGITUDE"].between(-76, -72)
collisions["LONGITUDE"] = collisions["LONGITUDE"].where(lon_in_range)

# Adding our own LOCATION column, we do know that it already exists but it was easier for us this way
def combine_columns(row):
    """Return [LATITUDE, LONGITUDE] for `row`, or NaN when either value is missing."""
    if pd.isna(row["LATITUDE"]) or pd.isna(row["LONGITUDE"]):
        return np.nan
    return [row["LATITUDE"], row["LONGITUDE"]]

collisions["LOCATION"] = collisions.apply(combine_columns, axis=1)

# A row is kept as long as at least one of LOCATION / BOROUGH is known;
# only rows missing both are dropped
collisions = collisions.dropna(subset=["LOCATION", "BOROUGH"], how="all")

print(f"Current amount of rows: {collisions.shape[0]}")

In [None]:
# Remaining missing values per column after cleaning
print(collisions.isna().sum())

In [None]:
# Preview the first rows of the cleaned table
collisions.head()

In [None]:
# Persist the cleaned table without the index column.
# NOTE(review): LOCATION holds Python lists, so the CSV will contain their text form — confirm downstream readers expect that.
collisions.to_csv("./processed-data/collisions.csv", index=False)

### Weather dataset

### NYC Map
Currently using [NYC community district boundaries](https://data.cityofnewyork.us/City-Government/Community-Districts/yfnk-k7r4) in GeoJSON format. Let's add the number of collisions in each district.

In [None]:
map_data = gpd.read_file("./original-data/map.geojson")


def _district_index(location):
    """Return the row position in `map_data` of the first district containing `location`.

    `location` is a [latitude, longitude] list, or NaN when the collision has no
    usable coordinates. NaN is returned when the location is missing or falls
    outside every district polygon.
    """
    # NaN is the only value that is different from itself
    if location != location:
        return np.nan
    lat, lon = location
    # shapely points are (x, y), i.e. (longitude, latitude)
    hits = np.where(map_data.contains(Point(lon, lat)))[0]
    return hits[0] if len(hits) else np.nan


# NOTE(review): this scans every district once per collision; a spatial join
# would likely be much faster if this cell becomes a bottleneck.
collisions["DISTRICT"] = collisions["LOCATION"].apply(_district_index)

# Collisions per district. DISTRICT is a float column because of the NaNs, so the
# group labels are cast back to integer positions before aligning with map_data,
# and districts without any collision get an explicit 0 instead of NaN.
district_counts = collisions.groupby("DISTRICT").size()
district_counts.index = district_counts.index.astype(int)
map_data["collision_count"] = district_counts.reindex(map_data.index, fill_value=0)

## Design and implementation

**Q IDEAS:**

* Q1: basic barplot?
* Q2: slope chart
* Q3: histogram?
* Q4: basic map plot
* Q5:

---
**O IDEAS:**
* Color for vehicle type


In [None]:
# Helpful functions

def before_covid(df: pd.DataFrame) -> pd.DataFrame:
    """Return the rows of `df` whose "AFTER COVID" flag equals False."""
    is_before = df["AFTER COVID"].eq(False)
    return df.loc[is_before]

def after_covid(df: pd.DataFrame) -> pd.DataFrame:
    """Return the rows of `df` whose "AFTER COVID" flag equals True."""
    is_after = df["AFTER COVID"].eq(True)
    return df.loc[is_after]

### 1. Are accidents more frequent during weekdays or weekends? Is there any difference between before COVID-19 and after?

With an ambitious goal in mind, let's first plot the total number of collisions for each day of the week before COVID.

In [None]:
# Collisions per weekday, restricted to the pre-COVID rows
before_covid_day_count = (
    before_covid(collisions)
    .groupby("CRASH WEEKDAY")
    .size()
    .reset_index(name="counts")
)

# Calendar order for the x axis
weekdayorder = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

(
    alt.Chart(before_covid_day_count)
    .mark_bar()
    .encode(
        x=alt.X("CRASH WEEKDAY:O", sort=weekdayorder, axis=alt.Axis(title="Week Day")),
        y=alt.Y("counts:Q", axis=alt.Axis(title="Collisions")),
    )
    .properties(width=400)
)

Let's now make a grouped bar chart, separating before and after COVID.

In [None]:
# One row per (weekday, period) pair holding the number of collisions
days_df = (
    collisions
    .groupby(["CRASH WEEKDAY", "AFTER COVID"])
    .size()
    .reset_index(name="counts")
)

# Legend labels for the two periods and for the combined series
before = "Summer 2018 (Before Covid)"
after = "Summer 2020 (After Covid)"
all_time = "All"

days_df["MOMENT"] = np.where(days_df["AFTER COVID"], after, before)

weekdayorder = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

opacity = 0.5

colors = {
    before: "#fdc086",  # Before COVID
    after: "#7fc97f",  # After COVID
    all_time: "#beaed4",
}

# Fixed label -> colour assignment for the period encoding
moment_color = alt.Color(
    "MOMENT:O",
    scale=alt.Scale(domain=list(colors.keys()), range=list(colors.values())),
    legend=alt.Legend(title=None),
)

days_ch = (
    alt.Chart(days_df)
    .mark_bar(opacity=opacity)
    .encode(
        x=alt.X("CRASH WEEKDAY:O", axis=alt.Axis(labelAngle=-30, title=None), sort=weekdayorder),
        xOffset="MOMENT:O",
        y=alt.Y("counts:Q", axis=alt.Axis(title="Collisions", grid=True)),
        color=moment_color,
    )
)

days_ch

Let's now add the average number of collisions before and after COVID.

In [None]:
# Horizontal rules at the mean of `counts`, coloured by period
averages = (
    alt.Chart(days_df)
    .mark_rule(opacity=1)
    .encode(
        y="mean(counts):Q",
        color="MOMENT:O",
        size=alt.value(2),
    )
)

alt.layer(averages, days_ch)

Let's now separate the days of the week into two categories: weekdays and weekends.

In [None]:
weekdays = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday"]
weekends = ["Saturday", "Sunday"]

is_weekday = days_df["CRASH WEEKDAY"].isin(weekdays)
is_weekend = days_df["CRASH WEEKDAY"].isin(weekends)
weekdays_df = days_df[is_weekday]
weekends_df = days_df[is_weekend]

# Pieces shared by both panels: the same colour per period, the same fixed
# y range (so the panels are directly comparable) and the same x axis styling
period_scale = alt.Scale(domain=list(colors.keys()), range=list(colors.values()))
counts_scale = alt.Scale(domain=[0, 13000])
day_axis = alt.Axis(labelAngle=-30, title=None)


def _mean_rule(df):
    """Build horizontal rules at the mean of `counts` in `df`, coloured by MOMENT."""
    return alt.Chart(df).mark_rule(opacity=1).encode(
        y="mean(counts):Q",
        size=alt.value(2),
        color="MOMENT:O",
    )


# Left panel: Monday to Friday, carrying the y axis
weekdays_ch = (
    alt.Chart(weekdays_df)
    .mark_bar(opacity=opacity)
    .encode(
        x=alt.X("CRASH WEEKDAY:O", axis=day_axis, sort=weekdayorder),
        xOffset="MOMENT:O",
        y=alt.Y(
            "counts:Q",
            axis=alt.Axis(title="Collisions / Means", grid=True),
            scale=counts_scale,
        ),
        color=alt.Color("MOMENT:O", scale=period_scale),
    )
    .properties(title=alt.Title("Weekdays", fontSize=10, fontWeight=600))
)

averages_weekday = _mean_rule(weekdays_df)

# Right panel: Saturday and Sunday, with the y axis hidden apart from the grid lines
weekends_ch = (
    alt.Chart(weekends_df)
    .mark_bar(opacity=opacity)
    .encode(
        x=alt.X("CRASH WEEKDAY:O", axis=day_axis, sort=weekdayorder),
        xOffset="MOMENT:O",
        y=alt.Y(
            "counts:Q",
            axis=alt.Axis(title=None, labels=False, domain=False, ticks=False, grid=True),
            scale=counts_scale,
        ),
        color=alt.Color("MOMENT:O", scale=period_scale, legend=alt.Legend(title=None)),
    )
    .properties(title=alt.Title("Weekends", fontSize=10, fontWeight=600))
)

averages_weekend = _mean_rule(weekends_df)

q1 = (weekdays_ch + averages_weekday) | (weekends_ch + averages_weekend)

q1.configure_legend(symbolOpacity=1)

### 2. Is there any type of vehicle more prone to participate in accidents?
Obviously, this is impossible to answer with the data we currently have: cars are by far the most common vehicle, so they will naturally be involved in the most collisions. Let's start off by viewing this data with a simple bar plot.

In [None]:
# Number of collisions per vehicle category
vehicles = (
    collisions
    .groupby("VEHICLE TYPE CODE 1")
    .size()
    .reset_index(name="counts")
)

(
    alt.Chart(vehicles)
    .mark_bar()
    .encode(
        x=alt.X("VEHICLE TYPE CODE 1:O", axis=alt.Axis(title=None, labelAngle=-30)),
        y=alt.Y("counts:Q", axis=alt.Axis(title="Collisions")),
    )
    .properties(width=400)
)

This confirms what we hypothesized earlier.

### 3. At what time of the day are accidents more common?
Let's make a simple histogram with the overall average, as well as a small mark indicating the hour with the most collisions.

In [None]:
# Hour of day for every collision. The column is written on `collisions` itself
# (as the previous `time_df = collisions` alias did implicitly), so it stays
# available to other cells.
collisions["HOUR"] = collisions["CRASH DATETIME"].dt.hour

# Collisions per (hour, period)
time_df = collisions.groupby(["HOUR", "AFTER COVID"]).size().reset_index(name="counts")
time_df["MOMENT"] = np.where(time_df["AFTER COVID"], after, before)

time_ch = alt.Chart(time_df).mark_bar(opacity=opacity).encode(
    x=alt.X("HOUR:O", axis=alt.Axis(labelAngle=0), title="Hour"),
    y=alt.Y("counts:Q", title="Collisions / Mean"),
    color=alt.Color(
        "MOMENT:O",
        scale=alt.Scale(domain=list(colors.keys()), range=list(colors.values())),
        legend=alt.Legend(title=None)
    ),
    order=alt.Order("MOMENT:O", sort='ascending')
)

# Total collisions per hour across both periods. Only `counts` is summed:
# aggregating the whole frame would also aggregate the MOMENT and AFTER COVID
# columns, which is meaningless here.
time_all_df = time_df.groupby("HOUR", as_index=False)["counts"].sum()

# Overall hourly mean. It has its own name so the weekend rule used by q1
# (`averages_weekend`) is no longer overwritten.
hourly_mean = alt.Chart(time_all_df).mark_rule(opacity=1, color=colors[all_time]).encode(
    y="mean(counts):Q",
    size=alt.value(2),
)

# Label the busiest hour from the data instead of hard-coding the text "16" and
# a pixel position: the label sits just above the top of that hour's stacked bar.
peak_hour_df = time_all_df.nlargest(1, "counts")
max_hour = alt.Chart(peak_hour_df).mark_text(dy=-8).encode(
    x=alt.X("HOUR:O"),
    y=alt.Y("counts:Q"),
    text=alt.Text("HOUR:O"),
)

q3 = (time_ch + hourly_mean + max_hour)

In [None]:
# Q1 and Q3 side by side; legend symbols are drawn fully opaque although the bars are translucent
alt.hconcat(q1, q3).configure_legend(symbolOpacity=1)

### 4. Are there any areas with a larger number of accidents?
Let's make a choropleth map. First, let's just plot a small sample of collisions over NYC. We are using a district map.

In [None]:
# District outlines used as the background layer
base = (
    alt.Chart(map_data)
    .mark_geoshape(fill="lightgray", stroke="black")
    .project(type="albersUsa")
    .properties(width=700, height=700)
)

# First 1000 collisions that have a location. Only the three encoded columns
# are handed to Altair, so the chart does not embed the unused ones
# (timestamps, the list-valued LOCATION, ...).
located = collisions.loc[
    collisions["LOCATION"].notna(),
    ["LATITUDE", "LONGITUDE", "BOROUGH"],
]

pts = alt.Chart(located.head(1000)).mark_circle().encode(
    latitude="LATITUDE",
    longitude="LONGITUDE",
    color='BOROUGH'
)

(base + pts)

In [None]:
# Districts shaded by their number of collisions (darker red = more collisions)
collision_color = alt.Color("collision_count:Q", scale=alt.Scale(scheme='reds'))

choropleth = (
    alt.Chart(map_data)
    .mark_geoshape()
    .encode(color=collision_color)
    .project(type="albersUsa")
    .properties(width=500, height=500)
)

choropleth