# Data Processing
Processing all datasets in this notebook

In [None]:
# Initial imports
import pandas as pd
from collections import Counter

## Collisions dataset

In [None]:
# Read the original collisions CSV, then report its columns and row count.
collisions = pd.read_csv(filepath_or_buffer="./original-data/collisions.csv")

column_index = collisions.columns
print(column_index)
print(f"Initial amount of rows: {len(collisions)}")

In [None]:
# Restrict the frame to the crash date/time, the coordinates and the type
# codes of the first two vehicles.
cols = [
    "CRASH DATE", "CRASH TIME",
    "LATITUDE", "LONGITUDE",
    "VEHICLE TYPE CODE 1", "VEHICLE TYPE CODE 2",
]
collisions = collisions[cols]

# Per-column count of missing entries
missing_per_column = collisions.isna().sum()
print(missing_per_column)

In [None]:
# Discard every row that has a missing value in any column.
collisions = collisions.dropna(axis="index", how="any")
print(f"Current amount of rows: {len(collisions)}")

We will now remove all vehicle types that appear 10 times or fewer across both vehicle columns, and then classify the remaining types into these eight categories:
* ATV
* bicycle
* car/suv
* ebike
* escooter
* truck/bus
* motorcycle
* other

In [None]:
# Only vehicle types seen more than 10 times across both vehicle columns are
# kept; anything rarer is assumed to be a data-entry (human) error.

types_count = Counter()
for i in range(2):
    column = f"VEHICLE TYPE CODE {i + 1}"
    # Normalise case first so that e.g. "Sedan" and "SEDAN" are tallied together
    lowered = collisions[column].str.lower()
    collisions[column] = lowered
    types_count.update(dict(lowered.value_counts()))

types_keeping = {name for name in types_count if types_count[name] > 10}

# A row survives only when BOTH of its vehicle types are frequent enough
first_is_common = collisions["VEHICLE TYPE CODE 1"].isin(types_keeping)
second_is_common = collisions["VEHICLE TYPE CODE 2"].isin(types_keeping)
collisions = collisions[first_is_common & second_is_common]


# Lookup table from raw (lower-cased) vehicle type labels to the classification
# we want to use. The list of categories is found in the NYC collision dataset:
# ATV, bicycle, car/suv, ebike, escooter, truck/bus, motorcycle, other.
#
# Every value in this table must be one of those eight categories. Forklift /
# pallet-jack labels have no category of their own in that list, so they are
# filed under "other". No raw label in this table currently maps to "ATV".

vehicle_classifications = {
    "sedan": "car/suv",
    "station wagon/sport utility vehicle": "car/suv",
    "taxi": "car/suv",
    "pick-up truck": "truck/bus",
    "box truck": "truck/bus",
    "bus": "truck/bus",
    "bike": "bicycle",
    "tractor truck diesel": "truck/bus",
    "motorcycle": "motorcycle",
    "van": "car/suv",
    "ambulance": "truck/bus",
    "convertible": "car/suv",
    "dump": "truck/bus",
    "flat bed": "truck/bus",
    "pk": "truck/bus",
    "garbage or refuse": "truck/bus",
    "carry all": "truck/bus",
    "tractor truck gasoline": "truck/bus",
    "e-scooter": "escooter",
    "tow truck / wrecker": "truck/bus",
    "moped": "motorcycle",
    "chassis cab": "truck/bus",
    "e-bike": "ebike",
    "tanker": "truck/bus",
    "concrete mixer": "truck/bus",
    "ambul": "truck/bus",
    "flat rack": "truck/bus",
    "motorscooter": "motorcycle",
    "motorbike": "motorcycle",
    "refrigerated van": "car/suv",
    "armored truck": "truck/bus",
    "3-door": "car/suv",
    "fire": "truck/bus",
    "beverage truck": "truck/bus",
    "4 dr sedan": "car/suv",
    "trail": "other",
    "lift boom": "truck/bus",
    "truck": "truck/bus",
    "fire truck": "truck/bus",
    "stake or rack": "truck/bus",
    "ambu": "truck/bus",
    "fdny": "truck/bus",
    "multi-wheeled vehicle": "other",
    "usps": "truck/bus",
    "trailer": "other",
    "limo": "car/suv",
    "tract": "truck/bus",
    "unk": "other",
    "open body": "other",
    "scooter": "escooter",
    "utili": "truck/bus",
    "tow truck": "truck/bus",
    "box t": "truck/bus",
    "firet": "truck/bus",
    "garba": "truck/bus",
    "school bus": "truck/bus",
    "van camper": "car/suv",
    "scoot": "escooter",
    "tow t": "truck/bus",
    "comme": "truck/bus",
    "elect": "ebike",
    "minibike": "motorcycle",
    "fdny ambul": "truck/bus",
    "delv": "truck/bus",
    "bulk agriculture": "other",
    "rv": "other",
    "commercial": "truck/bus",
    "motor": "motorcycle",
    "com": "other",
    "deliv": "truck/bus",
    "forkl": "other",
    "unknown": "other",
    "e-bik": "ebike",
    "pedicab": "other",
    "util": "truck/bus",
    "flat": "other",
    "fdny fire": "truck/bus",
    "forklift": "other",
    "minicycle": "motorcycle",
    "lunch wagon": "truck/bus",
    "fork": "other",
    "pallet": "other",
    "firetruck": "truck/bus",
    "schoo": "truck/bus",
    "pick up": "truck/bus",
    "unkno": "other",
    "power": "other",
    "pickup with mounted camper": "truck/bus",
    "fdny truck": "truck/bus"
}

# Translate the raw vehicle labels in both vehicle columns using the
# vehicle_classifications lookup. The new columns are built with assign(),
# which returns a fresh DataFrame rather than writing into the frame produced
# by the boolean-mask filter; on pandas versions without copy-on-write the
# direct column assignment can raise SettingWithCopyWarning.
# Labels that are not keys of the lookup are left unchanged by replace().
vehicle_type_cols = [f"VEHICLE TYPE CODE {i + 1}" for i in range(2)]
collisions = collisions.assign(
    **{col: collisions[col].replace(vehicle_classifications) for col in vehicle_type_cols}
)

print(f"Current amount of rows: {len(collisions)}")

# Combined tally of every category over both vehicle columns
vehicle_count = sum(
    [Counter(dict(collisions[col].value_counts())) for col in vehicle_type_cols],
    Counter(),
)
print(f"Final vehicle count:\n{vehicle_count}")

In [None]:
# Persist the processed collisions table; the row index is not written out.
collisions.to_csv(path_or_buf="./processed-data/collisions.csv", index=False)

## Weather dataset