# Weather augmentation
Hourly weather is retrieved for one reference coordinate per New York City borough:
| Borough       | Latitude   | Longitude   |
|---------------|------------|-------------|
| The Bronx     | 40.8448° N | 73.8648° W  |
| Brooklyn      | 40.6782° N | 73.9442° W  |
| Manhattan     | 40.7831° N | 73.9712° W  |
| Queens        | 40.7282° N | 73.7949° W  |
| Staten Island | 40.5795° N | 74.1502° W  |


In [2]:
from datetime import datetime
import pandas as pd
import warnings

# Installs a global "ignore" filter, so every warning category (including pandas
# deprecation notices) is hidden for the rest of the kernel session.
# NOTE(review): consider narrowing this to the specific warnings known to be noise.
warnings.filterwarnings("ignore")

from meteostat import Hourly
from meteostat import Point

# Relative directory that the parquet datasets are read from and written to.
base_path = "../datasets"

In [47]:
# One Meteostat Point per borough, keyed by the lowercase borough label that is
# later written into the "borough" column. Coordinates are (latitude, longitude)
# in decimal degrees; western longitudes are negative.
boroughs_locations = {
    borough_name: Point(latitude, longitude)
    for borough_name, (latitude, longitude) in (
        ("bronx", (40.8448, -73.8648)),
        ("brooklyn", (40.6782, -73.9442)),
        ("manhattan", (40.7831, -73.9712)),
        ("queens", (40.7282, -73.7949)),
        ("staten_island", (40.5795, -74.1502)),
    )
}

### Retrieve weather data from the Meteostat API

In [48]:
# For each borough, fetch the hourly weather from 2013-01-01 up to now, tag the
# rows with the borough label, and persist the combined table as one parquet file.
# NOTE(review): Meteostat documents hourly timestamps as UTC unless a `timezone`
# is passed to Hourly — confirm this matches the clock of the data this table is
# later joined with.
borough_frames = []
for borough, location in boroughs_locations.items():
    hourly_query = Hourly(location, start=datetime(2013, 1, 1), end=datetime.now())
    # assign() returns a new frame, so the fetched data is not mutated in place.
    borough_frames.append(hourly_query.fetch().assign(borough=borough))

# Concatenate once: this avoids re-copying the accumulated rows on every iteration
# and does not depend on pandas discarding an empty seed DataFrame.
weather_data = pd.concat(borough_frames)

# Turn the "time" index into a column, then split it into a calendar date and an
# hour-of-day column (these two, together with "borough", form the join key).
weather_data = weather_data.reset_index()
weather_data = weather_data.rename(columns={"time": "date"})
weather_data["date"] = pd.to_datetime(weather_data["date"])
weather_data["hour"] = weather_data["date"].dt.hour
weather_data["date"] = weather_data["date"].dt.date

# Inspect the columns that are about to be dropped. In the recorded run all three
# contain only NaN, i.e. they carry no information (presumably the source has no
# observations for them, rather than "no snow" / "no wind gusts").
print(weather_data['snow'].value_counts(dropna=False))
print(weather_data['wpgt'].value_counts(dropna=False))
print(weather_data['tsun'].value_counts(dropna=False))

weather_data = weather_data.drop(columns=["snow", "wpgt", "tsun"])

weather_data.to_parquet(f"{base_path}/weather.parquet", index=False)

snow
NaN    507785
Name: count, dtype: int64
wpgt
NaN    507785
Name: count, dtype: int64
tsun
NaN    507785
Name: count, dtype: int64


In [49]:
# Display the combined weather table; pandas truncates the rendering to the
# first and last rows.
weather_data

Unnamed: 0,date,temp,dwpt,rhum,prcp,wdir,wspd,pres,coco,borough,hour
0,2013-01-01,2.8,-5.6,54.0,,230.0,16.6,1015.0,,bronx,0
1,2013-01-01,2.8,-6.1,52.0,0.0,240.0,14.8,1013.5,,bronx,1
2,2013-01-01,2.8,-6.1,52.0,0.0,240.0,16.6,1012.6,,bronx,2
3,2013-01-01,3.3,-6.1,50.0,0.0,250.0,14.8,1012.2,,bronx,3
4,2013-01-01,3.3,-5.6,52.0,0.0,250.0,20.5,1012.4,,bronx,4
...,...,...,...,...,...,...,...,...,...,...,...
507780,2024-08-02,23.9,21.2,85.0,0.0,271.0,8.3,1011.9,3.0,staten_island,8
507781,2024-08-02,23.3,21.2,88.0,0.0,279.0,7.2,1012.1,1.0,staten_island,9
507782,2024-08-02,23.2,21.5,90.0,0.0,324.0,4.7,1012.2,3.0,staten_island,10
507783,2024-08-02,24.3,22.0,87.0,0.0,350.0,5.0,1012.4,3.0,staten_island,11


### Try with sample data

In [3]:
# Reload the persisted weather table so this section can run without re-fetching.
weather_data = pd.read_parquet(f"{base_path}/weather.parquet")

# Load the cleaned sample and derive temporary join keys from the issue timestamp.
# The "_" prefix marks the helper columns so they can be removed after the merge.
sample = pd.read_parquet(f"{base_path}/sample_data_cleaned.parquet").assign(
    issue_date=lambda frame: pd.to_datetime(frame["issue_date"]),
    _date=lambda frame: frame["issue_date"].dt.date,
    _hour=lambda frame: frame["issue_date"].dt.hour,
)

In [4]:
# Attach to every sample row the weather observation that shares its date, hour
# and borough. how="left" keeps sample rows that have no matching observation.
# validate="many_to_one" makes pandas raise MergeError if the weather table holds
# more than one row per (date, hour, borough); without it, a duplicated key would
# silently multiply sample rows.
merged_data = pd.merge(
    sample,
    weather_data,
    left_on=["_date", "_hour", "violation_county"],
    right_on=["date", "hour", "borough"],
    how="left",
    validate="many_to_one",
)

# Remove the temporary "_"-prefixed helper columns and the weather-side join keys.
columns_to_drop = [col for col in merged_data.columns if col.startswith("_")] + ["date", "hour", "borough"]
merged_data = merged_data.drop(columns=columns_to_drop)

In [5]:
# Report the shapes before and after the merge; the row counts should be equal,
# since a left join on unique weather keys neither adds nor removes sample rows.
print(
    f"Sample data shape: {sample.shape}",
    f"Merged data shape: {merged_data.shape}",
    sep="\n",
)

Sample data shape: (1404261, 43)
Merged data shape: (1404261, 49)


In [6]:
# Print the first three values (and the dtype) of every merged column as a
# quick sanity check.
for col, values in merged_data.items():
    print(f"  {col}:\n{values.head(3)}\n")

  summons_number:
0    5069663409
1    7433097510
2    1358115771
Name: summons_number, dtype: int64

  plate_id:
0     HSP9388
1    T466979C
2     49722JG
Name: plate_id, dtype: string

  registration_state:
0    PA
1    NY
2    NY
Name: registration_state, dtype: string

  plate_type:
0    PAS
1    OMT
2    COM
Name: plate_type, dtype: string

  issue_date:
0   2013-07-20 18:09:00
1   2013-07-29 08:06:00
2   2013-07-10 09:44:00
Name: issue_date, dtype: datetime64[ns]

  violation_code:
0     7
1    21
2    19
Name: violation_code, dtype: int64

  vehicle_body_type:
0      CP
1    SUBN
2     VAN
Name: vehicle_body_type, dtype: string

  vehicle_make:
0    DODGE
1    TOYOT
2    MAZDA
Name: vehicle_make, dtype: string

  issuing_agency:
0    V
1    T
2    X
Name: issuing_agency, dtype: string

  street_code1:
0        0
1    51090
2        0
Name: street_code1, dtype: int64

  street_code2:
0        0
1    16490
2        0
Name: street_code2, dtype: int64

  street_code3:
0        0
1  