# VI: First Practical Work

**Authors:** Gerard Comas & Marc Franquesa.

## Data Processing
Processing all datasets in this notebook

In [None]:
# Initial imports
import pandas as pd
import numpy as np
import altair as alt
import geopandas as gpd
import warnings
from shapely.geometry import shape, Point

# Silence every FutureWarning for the rest of the notebook.
# NOTE(review): this also hides library deprecation notices raised by later cells.
warnings.simplefilter(action="ignore", category=FutureWarning)

### Collisions dataset

In [None]:
# Load the raw collisions export.
collisions = pd.read_csv("./original-data/collisions.csv")

# First look at the data: column names, row count and missing values per column.
print(collisions.columns)
print(f"Initial amount of rows: {collisions.shape[0]}")
print(collisions.isna().sum())

In [None]:
# Adding a CRASH DATETIME column as well as several checks to make sure we have the correct dataset

# Truncating to the hour because some (most) rows are already truncated and we don't need more information.
# The lower-case "h" alias is used because the upper-case "H" alias is deprecated in recent pandas releases.
collisions["CRASH DATETIME"] = pd.to_datetime(collisions["CRASH DATE"] + " " + collisions["CRASH TIME"]).dt.floor("h")

# Adding day of week column
collisions["CRASH WEEKDAY"] = collisions["CRASH DATETIME"].dt.day_name()

# Adding AFTER COVID column: True for every crash that happened in 2020
collisions["AFTER COVID"] = collisions["CRASH DATETIME"].dt.year == 2020

# Range checks. min()/max() give the earliest/latest timestamp without sorting the
# whole column, and return NaT (instead of raising IndexError) when a year has no rows.
crash_times = collisions["CRASH DATETIME"]

print(f"First crash: {crash_times.min()}")

print(f"Last crash of 2018: {crash_times[crash_times.dt.year == 2018].max()}")

print(f"First crash of 2020: {crash_times[crash_times.dt.year == 2020].min()}")

print(f"Last crash: {crash_times.max()}")

# Expected to be 0 if the file only covers 2018 and 2020
print(f"Collisions in 2019: {(crash_times.dt.year == 2019).sum()}")

In [None]:
# Checking if LOCATION contains the same information as LATITUDE and LONGITUDE
# We will take advantage of the fact that if a value is NaN in python then
# value == value will return False
def same_information(df=None):
    """Check whether LOCATION carries the same data as LATITUDE and LONGITUDE.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame with ``LOCATION``, ``LATITUDE`` and ``LONGITUDE`` columns.
        When omitted, the notebook-level ``collisions`` frame is checked.

    Returns
    -------
    bool
        True when every non-missing LOCATION parses to exactly
        ``[LATITUDE, LONGITUDE]`` and every missing LOCATION goes together
        with a missing LATITUDE and a missing LONGITUDE; False otherwise.
    """
    if df is None:
        df = collisions

    rows = zip(df["LOCATION"].tolist(), df["LATITUDE"].tolist(), df["LONGITUDE"].tolist())
    for location, lat, lon in rows:
        # NaN is the only value that differs from itself
        if location != location:
            # LOCATION is missing: both coordinates must be missing as well
            if lat == lat or lon == lon:
                return False
            continue

        # LOCATION looks like "(lat, lon)"; float() ignores surrounding blanks,
        # so the separator may or may not be followed by a space
        parsed = [float(part) for part in location.strip("() ").split(",")]
        if parsed != [lat, lon]:
            return False
    return True

# Run the consistency check over the whole collisions table and show the verdict
print(same_information())

In [None]:
# Column selection
cols = [
    "CRASH DATETIME",
    "CRASH WEEKDAY",
    "AFTER COVID",
    "BOROUGH",
    "LATITUDE",
    "LONGITUDE",
    "NUMBER OF PERSONS INJURED",
    "NUMBER OF PERSONS KILLED",
    "VEHICLE TYPE CODE 1",
    "CONTRIBUTING FACTOR VEHICLE 1"
]
# .copy() turns the selection into an independent frame, so the column
# assignments made in later cells do not raise SettingWithCopyWarning.
collisions = collisions[cols].copy()

# Number of missing values in each column
print(collisions.isnull().sum())

In [None]:
# Fill in missing values with 0 for the injured/killed columns.
# The result is assigned back to the frame: calling fillna(..., inplace=True) on a
# selected column is chained assignment and is not guaranteed to update `collisions`.
collisions = collisions.fillna({
    "NUMBER OF PERSONS INJURED": 0,
    "NUMBER OF PERSONS KILLED": 0,
})

We will now classify all vehicle types into these categories:
* Ambulance
* Bicycle
* Car
* E-bike
* E-scooter
* Truck
* Bus
* Motorcycle
* Other
* Unknown

In [None]:
# Lookup table from the raw, free-text vehicle type to one of the categories:
# Ambulance, Bicycle, Car, E-bike, E-scooter, Truck, Bus, Motorcycle, Other, Unknown.
# Keys are matched exactly (case and spacing included).
# "almbulance" and "abulance" are mapped to Ambulance (like the other misspellings
# "AMBULACE" / "AMBULENCE") and "UNKN" to Unknown (like "UNK" / "UNKNO").
# NOTE(review): several other truncated spellings (e.g. "AMBU", "DUMP", "FIRET")
# are still mapped to Other while their full spelling has a specific category.
classified_vehicles = {
    "Station Wagon/Sport Utility Vehicle": "Car",
    "Sedan": "Car",
    "Bus": "Bus",
    "Tractor Truck Diesel": "Truck",
    "Taxi": "Car",
    "E-Scooter": "E-scooter",
    "Flat Bed": "Truck",
    "Motorbike": "Motorcycle",
    "Motorcycle": "Motorcycle",
    "Box Truck": "Truck",
    "Pick-up Truck": "Truck",
    "Bike": "Bicycle",
    "Dump": "Truck",
    "Concrete Mixer": "Truck",
    "Van": "Truck",
    "PK": "Other",
    "Golf Cart": "Other",
    "LIMO": "Car",
    "Tanker": "Truck",
    "AMBULANCE": "Ambulance",
    "Convertible": "Car",
    "E-Bike": "E-bike",
    "Moped": "Motorcycle",
    "Fire Truck": "Truck",
    "nan": "Other",
    "Tractor Truck Gasoline": "Truck",
    "Ambulance": "Ambulance",
    "forlift": "Other",
    "MOTOR SKAT": "Other",
    "FDNY LADDE": "Other",
    "Tow Truck / Wrecker": "Truck",
    "FIRE TRUCK": "Truck",
    "PICK UP": "Other",
    "Garbage or Refuse": "Truck",
    "GARBAGE TR": "Truck",
    "Chassis Cab": "Truck",
    "Bulk Agriculture": "Other",
    "Can": "Other",
    "van": "Truck",
    "Carry All": "Other",
    "FLATBED FR": "Truck",
    "Open Body": "Other",
    "4 dr sedan": "Car",
    "Motorscooter": "Motorcycle",
    "Minibike": "Motorcycle",
    "Flat Rack": "Other",
    "Armored Truck": "Truck",
    "School Bus": "Bus",
    "FDNY TRUCK": "Truck",
    "truck": "Truck",
    "UNK": "Unknown",
    "TRAILER": "Other",
    "FIRTRUCK": "Truck",
    "MOPED": "Motorcycle",
    "Lift Boom": "Other",
    "fdny ems": "Other",
    "AMBULACE": "Ambulance",
    "bus": "Bus",
    "BOX TRUCK": "Truck",
    "Street Swe": "Other",
    "Scooter": "Motorcycle",
    "FDNY fire": "Other",
    "DELIVERY": "Other",
    "Cement Tru": "Truck",
    "USPS/GOVT": "Other",
    "Pedicab": "Other",
    "TRUCK VAN": "Truck",
    "UTILITY": "Other",
    "Pick up tr": "Other",
    "UNKNOWN": "Unknown",
    "Multi-Wheeled Vehicle": "Other",
    "SUV": "Car",
    "utility": "Other",
    "POWER SHOV": "Other",
    "DELIVERY T": "Other",
    "SWT": "Other",
    "Trac": "Other",
    "FDNY AMBUL": "Ambulance",
    "AMBU": "Other",
    "USPS": "Other",
    "FLAT": "Other",
    "Beverage Truck": "Truck",
    "E-BIKE": "E-bike",
    "3-Door": "Car",
    "Fork Lift": "Other",
    "Refrigerated Van": "Truck",
    "PSD": "Other",
    "Fire Engin": "Other",
    "FORKLIFT": "Other",
    "TRAC": "Other",
    "Tow Truck": "Truck",
    "COURIER": "Other",
    "Courier": "Other",
    "Leased amb": "Other",
    "SMART CAR": "Car",
    "message si": "Other",
    "scooter": "Motorcycle",
    "E-UNICYCLE": "E-scooter",
    "Street Cle": "Other",
    "box": "Other",
    "F550": "Truck",
    "DELV": "Other",
    "SKATEBOARD": "Other",
    "Lawnmower": "Other",
    "almbulance": "Ambulance",
    "dark color": "Other",
    "Work Van": "Other",
    "ford van": "Truck",
    "ambulance": "Ambulance",
    "Fire truck": "Truck",
    "Minicycle": "Motorcycle",
    "PC": "Other",
    "box truck": "Truck",
    "FDNY ENGIN": "Other",
    "commercial": "Other",
    "Unknown": "Unknown",
    "Tractor tr": "Truck",
    "2 dr sedan": "Car",
    "FD LADDER": "Other",
    "abulance": "Ambulance",
    "FDNY Engin": "Other",
    "OTH": "Other",
    "Go kart": "Other",
    "Trailer": "Other",
    "TRUCK": "Truck",
    "Stake or Rack": "Other",
    "COMMERCIAL": "Other",
    "CHEVY EXPR": "Other",
    "SLINGSHOT": "Other",
    "dilevery t": "Other",
    "FDNY #226": "Other",
    "FREIGHT FL": "Other",
    "Fork lift": "Other",
    "UTIL": "Other",
    "UNKN": "Unknown",
    "FDNY FIRE": "Other",
    "ELECTRIC S": "Other",
    "FIRETRUCK": "Truck",
    "MOVING VAN": "Truck",
    "usps": "Other",
    "moped": "Motorcycle",
    "forklift": "Other",
    "UPS TRUCK": "Truck",
    "backhoe": "Other",
    "Delv": "Other",
    "dump truck": "Truck",
    "Freight": "Other",
    "Horse": "Other",
    "Cargo Van": "Truck",
    "USPS VAN": "Other",
    "TRUCK FLAT": "Truck",
    "BOBCAT FOR": "Other",
    "Tractor Tr": "Truck",
    "Pumper": "Other",
    "DELIVERY V": "Other",
    "DOT EQUIPM": "Other",
    "fire truck": "Truck",
    "Livestock Rack": "Other",
    "GEN  AMBUL": "Ambulance",
    "J1": "Other",
    "DUMP": "Other",
    "18 WHEELER": "Truck",
    "MAIL TRUCK": "Other",
    "UTILITY VE": "Other",
    "MOTORSCOOT": "Motorcycle",
    "government": "Other",
    "trailer": "Other",
    "FIRE ENGIN": "Other",
    "Front-Load": "Other",
    "DRILL RIG": "Other",
    "SCOOTER": "Motorcycle",
    "Wh Ford co": "Other",
    "suburban": "Car",
    "E REVEL SC": "Other",
    "ROAD SWEEP": "Other",
    "LIGHT TRAI": "Other",
    "Tractor": "Truck",
    "UT": "Other",
    "USPS TRUCK": "Other",
    "cross": "Other",
    "Van Camper": "Other",
    "AMBULENCE": "Ambulance",
    "FOOD TRUCK": "Other",
    "Bucket Tru": "Other",
    "gator": "Other",
    "FDNY Ambul": "Ambulance",
    "JOHN DEERE": "Other",
    "f-250": "Other",
    "MECHANICAL": "Other",
    "WORK VAN": "Other",
    "NYC FD": "Other",
    "MTA BUS": "Bus",
    "NYC AMBULA": "Ambulance",
    "GOLF CART": "Other",
    "FLATBED": "Truck",
    "Trc": "Other",
    "FORK LIFT": "Other",
    "Pick up Tr": "Other",
    "postal bus": "Bus",
    "F150XL PIC": "Other",
    "ambu": "Other",
    "Pick up": "Other",
    "CAT": "Other",
    "ELEC. UNIC": "E-scooter",
    "1C": "Other",
    "SCOOT": "Motorcycle",
    "FREIG": "Other",
    "AMBUL": "Ambulance",
    "VAN T": "Other",
    "MINI": "Other",
    "Garba": "Other",
    "motor": "Other",
    "Lunch Wagon": "Other",
    "E-Bik": "E-bike",
    "Ambul": "Ambulance",
    "FDNY": "Other",
    "SCHOO": "Other",
    "Comm": "Other",
    "Fire": "Other",
    "Sanit": "Other",
    "mail": "Other",
    "RV": "Other",
    "GARBA": "Other",
    "ambul": "Ambulance",
    "FIRET": "Other",
    "FIRE": "Other",
    "SELF": "Other",
    "STAK": "Other",
    "WORKH": "Other",
    "FORKL": "Other",
    "Tract": "Other",
    "freig": "Other",
    "DELIV": "Other",
    "trail": "Other",
    "PICKU": "Other",
    "Dumps": "Other",
    "forkl": "Other",
    "fire": "Other",
    "TRK": "Other",
    "ELECT": "Other",
    "2- to": "Other",
    "BROOM": "Other",
    "TRAIL": "Other",
    "EBIKE": "E-bike",
    "Trail": "Other",
    "Glass Rack": "Other",
    "Motorized Home": "Other",
    "US POSTAL": "Other",
    "TRT": "Other",
    "BLOCK": "Other",
    "pas": "Other",
    "COM": "Other",
    "CONCR": "Other",
    "Pallet": "Other",
    "unknown": "Unknown",
    "CHERR": "Other",
    "UTV": "Other",
    "MOTOR": "Other",
    "MTA B": "Bus",
    "TRACT": "Other",
    "NYC": "Other",
    "UHAUL": "Other",
    "scoot": "Motorcycle",
    "FED E": "Other",
    "COMME": "Other",
    "TRLR": "Other",
    "LOADE": "Other",
    "rv": "Other",
    "TOWER": "Other",
    "Pick": "Other",
    "AMB": "Other",
    "NS AM": "Other",
    "UNKNO": "Unknown",
    "NEW Y": "Other",
    "TOW T": "Other",
    "GRAY": "Other",
    "tract": "Other",
    "STREE": "Other",
    "MAIL": "Other",
    "e-bik": "E-bike",
    "unk": "Unknown",
    "box t": "Other",
    "CRANE": "Other",
    "garba": "Other",
    "Pickup with mounted Camper": "Other",
    "FRONT": "Other",
    "Sprin": "Other",
    "delv": "Other",
    "POWER": "Other",
    "Box t": "Other",
    "CAMP": "Other",
    "Enclosed Body - Removable Enclosure": "Other",
    "RGS": "Other",
    "GOVER": "Other",
    "FORK": "Other",
    "UTILI": "Other",
    "POSTO": "Other",
    "firet": "Other",
    "WORK": "Other",
    "R/V C": "Other",
    "sgws": "Other",
    "Cat 9": "Other",
    "BACKH": "Other",
    "E-MOT": "E-scooter",
    "MACK": "Other",
    "SPC": "Other",
    "fork": "Other",
    "OMR": "Other",
    "semi": "Other",
    "FORK-": "Other",
    "Wheel": "Other",
    "Utili": "Other",
    "E-BIK": "E-bike",
    "fd tr": "Other",
    "SWEEP": "Other",
    "BOX T": "Other",
    "CASE": "Other",
    "FD TR": "Other",
    "Work": "Other",
    "LIBER": "Other",
    "fdny": "Other",
    "COMB": "Other",
    "HEAVY": "Other",
    "DUMPS": "Other",
    "MTA b": "Bus",
    "Hopper": "Other",
    "R/V": "Other",
    "FOOD": "Other",
    "FD tr": "Other",
    "Spc": "Other",
    "BED T": "Other",
    "comme": "Other",
    "UPS T": "Other",
    "PAS": "Other",
    "BICYC": "Bicycle",
    "Subn": "Other",
    "WHEEL": "Other",
    "Util": "Other",
    "ACCES": "Other",
    "e sco": "E-scooter",
    "BOBCA": "Other",
    "TANK": "Other",
    "TRACK": "Other",
    "utili": "Other",
    "DEMA-": "Other",
    "tow": "Other",
    "dump": "Other",
    "Elect": "Other",
    "deliv": "Other",
    "Backh": "Other",
    "CEMEN": "Other",
    "99999": "Other",
    "BULLD": "Other",
    "seagr": "Other",
    "schoo": "Other",
    "CONST": "Other",
    "self": "Other",
    "BK": "Other",
    "Semi": "Other",
    "Scoot": "Motorcycle",
    "NYPD": "Other",
    "Taxis": "Car",
}

In [None]:
# Collapse the raw vehicle type into the categories of `classified_vehicles`.
# The raw value is preserved in ORIGINAL VEHICLE (missing values become "unknown");
# a value that has no entry in the lookup table is carried over unchanged.

collisions = (
    collisions
    .assign(**{"ORIGINAL VEHICLE": collisions["VEHICLE TYPE CODE 1"].fillna("unknown")})
    .drop(columns="VEHICLE TYPE CODE 1")
)

collisions["VEHICLE"] = collisions["ORIGINAL VEHICLE"].map(
    lambda raw_type: classified_vehicles.get(raw_type, raw_type)
)

collisions["VEHICLE"].value_counts()

In [None]:
# Frequency of every raw contributing factor, used to decide how to group them
collisions["CONTRIBUTING FACTOR VEHICLE 1"].value_counts()

In [None]:
# Lookup table from the raw contributing factor to a coarser factor group.
# Keys are matched exactly, so they keep the spelling used in the dataset.
# "Shoulders Defective/Improper" describes the road shoulder, so it is grouped
# with the other roadway conditions (e.g. "Pavement Defective") rather than
# with the vehicle defects.
classified_factors = {
    "Driver Inattention/Distraction": "Driver Inattention",
    "Unspecified": "Unspecified",
    "Following Too Closely": "Driving Infraction",
    "Failure to Yield Right-of-Way": "Driving Infraction",
    "Backing Unsafely": "Driving Infraction",
    "Passing or Lane Usage Improper": "Driving Infraction",
    "Passing Too Closely": "Driving Infraction",
    "Other Vehicular": "Unspecified",
    "Unsafe Lane Changing": "Driving Infraction",
    "Turning Improperly": "Driving Infraction",
    "Traffic Control Disregarded": "Driving Infraction",
    "Unsafe Speed": "Driving Infraction",
    "Driver Inexperience": "Driving Inexperience",
    "Reaction to Uninvolved Vehicle": "Unspecified",
    "Alcohol Involvement": "Substance Abuse",
    "View Obstructed/Limited": "Environmental Factors",
    "Pedestrian/Bicyclist/Other Pedestrian Error/Confusion": "Pedestrian Error",
    "Oversized Vehicle": "Oversized Vehicle",
    "Aggressive Driving/Road Rage": "Driving Behavior",
    "Pavement Slippery": "Environmental Factors",
    "Brakes Defective": "Vehicle Defect",
    "Passenger Distraction": "Driver Inattention",
    "Fell Asleep": "Medical Condition",
    "Obstruction/Debris": "Environmental Factors",
    "Outside Car Distraction": "Environmental Factors",
    "Steering Failure": "Vehicle Defect",
    "Tire Failure/Inadequate": "Vehicle Defect",
    "Pavement Defective": "Environmental Factors",
    "Glare": "Environmental Factors",
    "Failure to Keep Right": "Driving Infraction",
    "Illnes": "Medical Condition",
    "Fatigued/Drowsy": "Medical Condition",
    "Lost Consciousness": "Medical Condition",
    "Driverless/Runaway Vehicle": "Driverless Vehicle",
    "Drugs (illegal)": "Substance Abuse",
    "Animals Action": "Environmental Factors",
    "Accelerator Defective": "Vehicle Defect",
    "Cell Phone (hand-Held)": "Driver Inattention",
    "Lane Marking Improper/Inadequate": "Environmental Factors",
    "Traffic Control Device Improper/Non-Working": "Environmental Factors",
    "Physical Disability": "Medical Condition",
    "Other Electronic Device": "Driver Inattention",
    "Other Lighting Defects": "Vehicle Defect",
    "Vehicle Vandalism": "Unspecified",
    "Prescription Medication": "Medical Condition",
    "Tinted Windows": "Vehicle Defect",
    "Eating or Drinking": "Driver Inattention",
    "Shoulders Defective/Improper": "Environmental Factors",
    "Headlights Defective": "Vehicle Defect",
    "Using On Board Navigation Device": "Driver Inattention",
    "Cell Phone (hands-free)": "Driver Inattention",
    "Tow Hitch Defective": "Vehicle Defect",
    "Windshield Inadequate": "Vehicle Defect",
    "Texting": "Driver Inattention",
    "Listening/Using Headphones": "Driver Inattention"
}


In [None]:
# Same approach as for the vehicle types: keep the raw factor in ORIGINAL FACTOR
# (missing values become "Unspecified") and derive the grouped FACTOR column.
# A factor that has no entry in `classified_factors` is carried over unchanged.
collisions = (
    collisions
    .assign(**{"ORIGINAL FACTOR": collisions["CONTRIBUTING FACTOR VEHICLE 1"].fillna("Unspecified")})
    .drop(columns="CONTRIBUTING FACTOR VEHICLE 1")
)

collisions["FACTOR"] = collisions["ORIGINAL FACTOR"].map(
    lambda raw_factor: classified_factors.get(raw_factor, raw_factor)
)

collisions["FACTOR"].value_counts()

In [None]:
# Coordinates that fall outside a generous bounding box around NYC
# (latitude 38..42, longitude -76..-72) make no sense and are turned into NaN
collisions["LATITUDE"] = collisions["LATITUDE"].mask(~collisions["LATITUDE"].between(38, 42))
collisions["LONGITUDE"] = collisions["LONGITUDE"].mask(~collisions["LONGITUDE"].between(-76, -72))

# Adding our own LOCATION column, we do know that it already exists but it was easier for us this way
# If either LATITUDE or LONGITUDE is NaN then location will be NaN
def combine_columns(row):
    """Return ``[LATITUDE, LONGITUDE]`` of `row`, or NaN when either value is missing."""
    if pd.isna(row["LATITUDE"]) or pd.isna(row["LONGITUDE"]):
        return np.nan
    return [row["LATITUDE"], row["LONGITUDE"]]

# Build LOCATION row by row from the cleaned coordinates
collisions["LOCATION"] = collisions.apply(combine_columns, axis=1)

# A row is discarded only when it has neither a LOCATION nor a BOROUGH;
# having one of the two is enough to keep it
collisions = collisions.dropna(subset=["LOCATION", "BOROUGH"], how="all")

print(f"Current amount of rows: {collisions.shape[0]}")

In [None]:
# Missing values left in each column after the cleaning steps above
print(collisions.isnull().sum())

In [None]:
# Preview of the cleaned collisions table
collisions.head()

### Weather dataset

Data obtained from the ASOS Network of Iowa State University, with the following link: https://mesonet.agron.iastate.edu/request/download.phtml?network=NY_ASOS. <br>

We have selected the station [NYC] NEW YORK CITY (1943-Now).


In [None]:
# Read the two yearly csv files into pandas DataFrames
weather_2018 = pd.read_csv("./original-data/NYC_weather_2018.csv")
weather_2020 = pd.read_csv("./original-data/NYC_weather_2020.csv")

# concatenate the two dataframes
weather = pd.concat([weather_2018, weather_2020], ignore_index=True)

# "M" and "T" are treated as missing values (per the dataset docs).
# They are replaced by an explicit NaN: replace(x, None) is ambiguous across
# pandas versions, where older releases forward-fill instead of inserting nulls.
# NOTE(review): "T" may stand for trace precipitation rather than a missing
# reading - confirm that treating it as missing is intended.
weather = weather.replace(["M", "T"], np.nan)

# print the columns of the concatenated dataframe
print(weather.columns)

# na values in the dataframe
print(weather.isna().sum())

The following columns will be removed, as they are not deemed relevant. This may be due to the focus on summer data, where columns related to snow lack significance, or because the columns inherently possess a high number of missing values: <br>
- station
- dwpf
- drct
- alti
- gust
- skyc1
- skyc2
- skyc3
- skyc4
- skyl1
- skyl2
- skyl3
- skyl4
- wxcodes
- feel
- ice_accretion_1hr
- ice_accretion_3hr
- ice_accretion_6hr
- peak_wind_gust
- peak_wind_drct
- peak_wind_time
- metar
- snowdepth

In [None]:
# Columns kept for the analysis: the timestamp plus the numeric measurements
cols = [
    "valid",
    "tmpf",
    "relh",
    "sknt",
    "p01i",
    "mslp",
    "vsby",
]

# .copy() makes the selection an independent frame, so the assignment below
# does not raise SettingWithCopyWarning
weather = weather[cols].copy()

# Every column except the timestamp is converted to a numeric dtype
weather[cols[1:]] = weather[cols[1:]].apply(pd.to_numeric)

print(weather.isna().sum())

### The analysis will involve working with the following variables.
- **`valid`**: timestamp of the observation
- **`tmpf`**: Air Temperature in Fahrenheit, typically @ 2 meters 
- **`relh`**: Relative Humidity in %
- **`sknt`**: Wind Speed in knots 
- **`p01i`**: One hour precipitation for the period from the observation time to the time of the previous hourly precipitation reset. Values are in inches.
- **`mslp`**: Sea Level Pressure in millibar 
- **`vsby`**: Visibility in miles 

Let's visually explore the missing values using the `missingno` package.

In [None]:
# missingno provides the nullity plots used in this section
import missingno as msno

# matrix plot
msno.matrix(weather)

The function `msno.matrix(weather)` generates a nullity matrix, which is a graphical representation of the absence of data in the `weather` DataFrame. Each row in the matrix corresponds to a row in the DataFrame, and white marks indicate missing values.

By observing the graph, you can look for patterns in the missing data. For instance, if you notice that white marks tend to cluster in certain areas, it could suggest that the missing data is not randomly distributed but rather related to some variable or condition. This is the case for the `sknt` column, likely due to a failure of the wind speed sensor, and we should take it into account when analyzing the data.

We can also visually see that missing values in the `tmpf` and `relh` columns are related.

Another valuable insight from this graph is that the `mslp` column may not be as interesting as initially thought, as having so many random missing values doesn't make sense.

In [None]:
# heatmap of the nullity correlation between the weather columns
msno.heatmap(weather)

The `msno.heatmap(weather)` function generates a heatmap showing the correlation of missing data in the `weather` DataFrame. The values on the heatmap range from -1 to 1.

A value close to 1 indicates that the presence of a missing value in one column is strongly correlated with the presence of a missing value in another column. This could suggest that missing values in both columns are being caused by the same underlying factor.

On the other hand, a value close to -1 indicates that the presence of a missing value in one column is strongly correlated with the presence of a value in another column. This could suggest that missing values in one column are being caused by the absence of missing values in the other column.

Now, we can confirm the strong correlation between the missing values in the `tmpf` and `relh` columns, as well as the significant correlation between these columns and `vsby`.

In [None]:
# mslp is discarded: it is not used in the rest of the processing
weather = weather.drop(columns="mslp")

Let's determine the intervals where the wind speed sensor may have stopped functioning.

In [None]:
# filter the weather dataframe to show only data from September 2018.
# The upper bound is the first day of October so that September 30th is included.
# ('valid' is still text at this point - presumably 'YYYY-MM-DD HH:MM' - so the
# comparison is a plain string comparison.)
weather_sep2018 = weather[(weather['valid'] >= '2018-09-01') & (weather['valid'] < '2018-10-01')]

# create a mask for missing values in sknt column
mask = weather_sep2018['sknt'].isna()

# create a group identifier for consecutive missing values: the id increases on
# every non-missing row, so a run of missing rows shares the id of the row before it
group_id = (~mask).cumsum()

# group the consecutive missing values together and count the number of missing values in each group
consecutive_missing_values = mask.groupby(group_id).sum()

# print the length of the longest run of missing values
print(f"Number of consecutive missing values in sknt: {consecutive_missing_values.max()}")

# id of the group that holds the longest run
group_id_max = consecutive_missing_values.idxmax()

# first row of that group, and first row of the group that follows it
rows_in_run = group_id.index[group_id.eq(group_id_max)]
rows_after_run = group_id.index[group_id.eq(group_id_max + 1)]

first_element_valid_sep2018 = weather_sep2018.loc[rows_in_run[0], 'valid']
# when the run reaches the end of the month there is no following group:
# fall back to the last row of the run instead of silently picking the first row
last_row = rows_after_run[0] if len(rows_after_run) else rows_in_run[-1]
last_element_valid_sep2018 = weather_sep2018.loc[last_row, 'valid']

# print the valid column of the first and last element of the consecutive missing value
print(f"Valid column of the first element: {first_element_valid_sep2018}")
print(f"Valid column of the last element: {last_element_valid_sep2018}")

In [None]:
# filter the weather dataframe to show only data from June 2020.
# The upper bound is the first day of July so that June 30th is included.
# ('valid' is still text at this point - presumably 'YYYY-MM-DD HH:MM' - so the
# comparison is a plain string comparison.)
weather_jun2020 = weather[(weather['valid'] >= '2020-06-01') & (weather['valid'] < '2020-07-01')]

# create a mask for missing values in sknt column
mask = weather_jun2020['sknt'].isna()

# create a group identifier for consecutive missing values: the id increases on
# every non-missing row, so a run of missing rows shares the id of the row before it
group_id = (~mask).cumsum()

# group the consecutive missing values together and count the number of missing values in each group
consecutive_missing_values = mask.groupby(group_id).sum()

# print the length of the longest run of missing values
print(f"Number of consecutive missing values in sknt: {consecutive_missing_values.max()}")

# id of the group that holds the longest run
group_id_max = consecutive_missing_values.idxmax()

# first row of that group, and first row of the group that follows it
rows_in_run = group_id.index[group_id.eq(group_id_max)]
rows_after_run = group_id.index[group_id.eq(group_id_max + 1)]

first_element_valid_jun2020 = weather_jun2020.loc[rows_in_run[0], 'valid']
# when the run reaches the end of the month there is no following group:
# fall back to the last row of the run instead of silently picking the first row
last_row = rows_after_run[0] if len(rows_after_run) else rows_in_run[-1]
last_element_valid_jun2020 = weather_jun2020.loc[last_row, 'valid']

# print the valid column of the first and last element of the consecutive missing value
print(f"Valid column of the first element: {first_element_valid_jun2020}")
print(f"Valid column of the last element: {last_element_valid_jun2020}")

Given the identified periods of wind speed sensor malfunction, a procedure is initiated to interpolate all remaining missing values, excluding those specific intervals.

In [None]:
# Interpolate the missing sknt values everywhere except inside the two
# sensor-outage windows found above (June 2020 and September 2018);
# both window bounds are inclusive
mask1 = weather['valid'].between(first_element_valid_jun2020, last_element_valid_jun2020)
mask2 = weather['valid'].between(first_element_valid_sep2018, last_element_valid_sep2018)
mask = ~mask1 & ~mask2
weather.loc[mask, 'sknt'] = weather.loc[mask, 'sknt'].interpolate()

In the analysis of the `p01i` column, a visualization is conducted on the values preceding missing entries. This aims to provide an approximate understanding of their magnitudes. Considering the uncertainty about sensor behavior, it is plausible that the sensor may cease recording during periods of no rainfall or excessively high rainfall amounts.

In [None]:
# get the rows that immediately precede a missing p01i entry: the value of the row
# is present while the value of the NEXT row (shift(-1)) is missing.
# shift(1) would look at the previous row and therefore select the values
# that come right after a gap instead.
next_is_missing = weather['p01i'].isna().shift(-1, fill_value=False)
before_missing = weather.loc[weather['p01i'].notna() & next_is_missing]

# one bar per selected observation; the x labels are hidden because there are too many
ch = alt.Chart(before_missing).mark_bar().encode(
    x='valid:O',
    y='p01i:Q'
)

ch.properties(
    width=800,
    height=400
).configure_axisX(labels=False)


In [None]:
# Compare the overall distribution of p01i with the distribution of the selected rows
print("Statistics of weather['p01i']:\n", weather['p01i'].describe())
print("\nStatistics of before_missing['p01i']:\n", before_missing['p01i'].describe())

No specific patterns or behaviors in the sensor data have been identified. Consequently, a decision has been made to interpolate the missing values.

In [None]:
# Interpolate the missing values in p01i column (pandas' default interpolation method)
weather['p01i'] = weather['p01i'].interpolate()

As previously mentioned, the missing values in the `vsby` column are correlated with those in `tmpf`. Consequently, `vsby` is interpolated only on the rows where `tmpf` is present; rows where `tmpf` is also missing are left untouched.

In [None]:
# Interpolate the missing values in vsby column, restricted to the rows where tmpf
# is present; rows where tmpf is missing are neither used nor filled
weather.loc[weather['tmpf'].notna(), 'vsby'] = weather.loc[weather['tmpf'].notna(), 'vsby'].interpolate()

# get the count of missing values in each column
weather.isna().sum()

Now, the conversion to metric units will be performed:
- **tmpf**: Fahrenheit to Celsius <br>
$$ Celsius = (Fahrenheit - 32) \cdot  \frac{5}{9} $$ 
- **sknt**: Knots to km/h
$$ 1 \ knot = 1.852 \ \frac{km}{h}$$
- **p01i**: inches to cm
$$ 1 \ inch  = 2.54 \ cm $$
- **vsby**: miles to km
$$ 1 \ mile = 1.609344 \ km $$

In [None]:
# Unit conversions. The column names are kept, only the values change.
# NOTE: this cell is not idempotent - running it twice converts the values twice.
weather["tmpf"] = (weather["tmpf"] - 32) * 5/9  # Fahrenheit -> Celsius

weather['sknt'] = weather['sknt'] * 1.852  # knots -> km/h

weather['p01i'] = weather['p01i'] * 2.54  # inches -> cm

weather['vsby'] = weather['vsby'] * 1.609344  # miles -> km


# Truncate the timestamps to the hour and average the observations of each hour.
# The lower-case "h" alias is used because the upper-case "H" alias is deprecated
# in recent pandas releases.
weather['valid'] = pd.to_datetime(weather['valid']).dt.floor('h')
weather_grouped = weather.groupby('valid').mean().reset_index()
print(weather_grouped.head())

In [None]:
# Range checks on the hourly weather table, mirroring the ones done for the collisions.
# min()/max() give the earliest/latest timestamp without sorting and return NaT
# (instead of raising IndexError) when a year has no rows.
valid_hours = weather_grouped['valid']

print(f"First weather: {valid_hours.min()}")

print(f"Last weather of 2018: {valid_hours[valid_hours.dt.year == 2018].max()}")

print(f"First weather of 2020: {valid_hours[valid_hours.dt.year == 2020].min()}")

print(f"Last weather: {valid_hours.max()}")

# Expected to be 0 if the files only cover 2018 and 2020
print(f"Weather in 2019: {(valid_hours.dt.year == 2019).sum()}")

In [None]:
# Only the last expression of a cell is rendered, so both summaries are displayed
# explicitly (`display` is provided by the Jupyter/IPython kernel).
# Summary of the individual observations ...
display(weather.describe())

# ... and of the hourly averages
display(weather_grouped.describe())

In [None]:
# Attach the hourly weather to every collision.
# A left join keeps exactly the collision rows: an outer join would also add one
# row per weather hour without any collision, and those rows would have no crash
# data at all once `valid` is dropped.
collisions = (
    collisions
    .merge(weather_grouped, how="left", left_on="CRASH DATETIME", right_on="valid")
    .drop(columns="valid")
)

In [None]:
# Preview of the collisions table with the weather columns attached
collisions.head()

In [None]:
# dtype of the LOCATION column after the merge
collisions["LOCATION"].dtype

### NYC Map
Currently using [NYC community district boundaries](https://data.cityofnewyork.us/City-Government/Community-Districts/yfnk-k7r4) in a geojson format. Let's add the number of collisions in each region. We find that community districts are the right granularity for a choropleth map: not too coarse (boroughs), not too fine (zip codes). Check NYC total area [here](https://en.wikipedia.org/wiki/New_York_City#Geography).

In [None]:
# Community district polygons
map_data = gpd.read_file("./original-data/map.geojson")


def _district_index(location):
    """Return the position in `map_data` of the first district containing `location`.

    `location` is a ``[latitude, longitude]`` pair; anything else (i.e. a missing
    LOCATION) and points that fall outside every district yield NaN.
    """
    if not isinstance(location, (list, tuple)):
        return np.nan
    latitude, longitude = location
    # Point takes (x, y), i.e. (longitude, latitude)
    hits = np.flatnonzero(map_data.contains(Point(longitude, latitude)))
    return hits[0] if hits.size else np.nan


collisions["DISTRICT"] = collisions["LOCATION"].apply(_district_index)

# Collisions per district, aligned on the index of map_data; a district without
# any collision gets 0 instead of NaN
district_counts = collisions["DISTRICT"].dropna().astype(int).value_counts()
map_data["COLLISIONS"] = district_counts.reindex(map_data.index, fill_value=0)

# Areas are measured after projecting to UTM zone 18N (EPSG:32618, metres):
# .area computed directly on longitude/latitude degrees is not a meaningful surface.
# NOTE(review): AREA is therefore expressed in square metres - confirm no consumer
# of the exported file relied on the previous values.
map_data["AREA"] = map_data["geometry"].to_crs(epsg=32618).area

map_data["AREA PROPORTION"] = map_data["AREA"] / map_data["AREA"].sum()

# Total area of NYC in km2, value found online (wikipedia), split proportionally
map_data["AREA KM2"] = 783.84 * map_data["AREA PROPORTION"]

map_data["COLLISIONS / KM2"] = map_data["COLLISIONS"] / map_data["AREA KM2"]



In [None]:
# Export the three processed datasets (paths are relative to the working directory)
collisions.to_csv("./processed-data/collisions.csv", index=False)
weather_grouped.to_csv("processed-data/weather.csv", index=False)
map_data.to_file("processed-data/map.geojson", driver="GeoJSON")