# Weather augmentation
Hourly weather is retrieved for one reference coordinate per NYC borough:
| Borough       | Latitude   | Longitude   |
|---------------|------------|-------------|
| The Bronx     | 40.8448° N | 73.8648° W  |
| Brooklyn      | 40.6782° N | 73.9442° W  |
| Manhattan     | 40.7831° N | 73.9712° W  |
| Queens        | 40.7282° N | 73.7949° W  |
| Staten Island | 40.5795° N | 74.1502° W  |


In [22]:
from datetime import datetime
import pandas as pd
import warnings

# Silence every warning for the rest of the session. This sits before the
# meteostat imports, presumably so import-time warnings are hidden as well
# -- TODO confirm; a narrower filter would hide less.
warnings.filterwarnings("ignore")

from meteostat import Hourly
from meteostat import Point

# Directory (relative to the notebook) where the parquet datasets are read and written.
base_path = "../datasets"

In [23]:
# (latitude, longitude) in decimal degrees for each borough; west longitudes are negative.
borough_coordinates = {
    "bronx": (40.8448, -73.8648),
    "brooklyn": (40.6782, -73.9442),
    "manhattan": (40.7831, -73.9712),
    "queens": (40.7282, -73.7949),
    "staten_island": (40.5795, -74.1502),
}

# One meteostat Point per borough, keyed by borough name.
boroughs_locations = {
    name: Point(lat, lon) for name, (lat, lon) in borough_coordinates.items()
}

### Retrieve weather data from the Meteostat API

In [24]:
# Fetch hourly weather for every borough from 2013-01-01 up to now, tag each
# frame with its borough name, and store the combined table in a single
# parquet file.
fetch_start = datetime(2013, 1, 1)
fetch_end = datetime.now()  # evaluated once so every borough covers the same window

borough_frames = []
for borough, location in boroughs_locations.items():
    data = Hourly(location, start=fetch_start, end=fetch_end).fetch()
    data["borough"] = borough
    borough_frames.append(data)

# Concatenate once: growing a DataFrame inside the loop re-copies all earlier
# rows on every iteration and starts from an empty frame.
weather_data = pd.concat(borough_frames)

# The code below expects the fetched index to be named "time"; expose it as a
# regular column and split it into a calendar date and an hour-of-day key.
weather_data = weather_data.reset_index()
weather_data = weather_data.rename(columns={"time": "date"})
weather_data["date"] = pd.to_datetime(weather_data["date"])
weather_data["hour"] = weather_data["date"].dt.hour
weather_data["date"] = weather_data["date"].dt.date

# In the recorded run these three columns are entirely NaN (missing, not
# zero), so they carry no information; print the counts as evidence, then drop.
print(weather_data['snow'].value_counts(dropna=False)) # all NaN: no snow data available
print(weather_data['wpgt'].value_counts(dropna=False)) # all NaN: no wind-gust data available
print(weather_data['tsun'].value_counts(dropna=False)) # all NaN: no sunshine data available

weather_data = weather_data.drop(columns=["snow", "wpgt", "tsun"])

# Numeric borough codes, stored as strings. `borough` is later used as a merge
# key against `violation_county`; presumably that column holds the same codes
# as strings -- TODO confirm against the cleaned dataset.
borough_to_code = {
    'manhattan': 1,
    'bronx': 2,
    'brooklyn': 3,
    'queens': 4,
    'staten_island': 5
}

weather_data['borough'] = weather_data['borough'].map(borough_to_code)
weather_data['borough'] = weather_data['borough'].astype(str)

weather_data.to_parquet(f"{base_path}/weather.parquet", index=False)

snow
NaN    509585
Name: count, dtype: int64
wpgt
NaN    509585
Name: count, dtype: int64
tsun
NaN    509585
Name: count, dtype: int64


In [25]:
weather_data

Unnamed: 0,date,temp,dwpt,rhum,prcp,wdir,wspd,pres,coco,borough,hour
0,2013-01-01,2.8,-5.6,54.0,,230.0,16.6,1015.0,,2,0
1,2013-01-01,2.8,-6.1,52.0,0.0,240.0,14.8,1013.5,,2,1
2,2013-01-01,2.8,-6.1,52.0,0.0,240.0,16.6,1012.6,,2,2
3,2013-01-01,3.3,-6.1,50.0,0.0,250.0,14.8,1012.2,,2,3
4,2013-01-01,3.3,-5.6,52.0,0.0,250.0,20.5,1012.4,,2,4
...,...,...,...,...,...,...,...,...,...,...,...
509580,2024-08-17,20.4,19.4,94.0,0.0,140.0,7.2,1015.8,3.0,5,8
509581,2024-08-17,21.0,19.5,91.0,0.0,128.0,7.2,1015.6,3.0,5,9
509582,2024-08-17,21.5,19.2,87.0,0.0,129.0,7.2,1015.6,3.0,5,10
509583,2024-08-17,21.9,19.1,84.0,0.0,141.0,8.3,1015.8,3.0,5,11


### Try with sample data

In [26]:
# Reload the weather table written earlier, plus the cleaned sample it will be joined to.
weather_data = pd.read_parquet(f"{base_path}/weather.parquet")
sample = pd.read_parquet(f"{base_path}/sample_cleaned_data.parquet")

# Parse the issue timestamp once, then derive the two temporary join keys from it.
issue_timestamp = pd.to_datetime(sample["issue_date"])
sample["issue_date"] = issue_timestamp
sample["_date"] = issue_timestamp.dt.date
# Rows without a timestamp get hour 0 so the column can be cast to int.
sample["_hour"] = issue_timestamp.dt.hour.fillna(0).astype(int)

In [28]:
# Left-join the weather observations onto the sample rows, matching on
# (date, hour, borough); sample rows without a weather match are kept.
merged_data = sample.merge(
    weather_data,
    how="left",
    left_on=["_date", "_hour", "violation_county"],
    right_on=["date", "hour", "borough"],
)

# Remove the temporary underscore-prefixed key columns and the weather-side join keys.
columns_to_drop = [col for col in merged_data.columns if col.startswith("_")]
columns_to_drop += ["date", "hour", "borough"]
merged_data = merged_data.drop(columns=columns_to_drop)

In [19]:
# Compare shapes before and after the merge; the row counts are equal when
# each sample row matched at most one weather row.
sample_shape = sample.shape
merged_shape = merged_data.shape
print(f"Sample data shape: {sample_shape}")
print(f"Merged data shape: {merged_shape}")

Sample data shape: (1356866, 44)
Merged data shape: (1356866, 50)


In [29]:
# Print the first three values of every merged column as a quick sanity check.
for col in merged_data.columns:
    preview = merged_data[col].head(3)
    print(f"  {col}:\n{preview}\n")

  summons_number:
0    7551971749
1    7228388483
2    7510733789
Name: summons_number, dtype: int64

  plate_id:
0    FXL4687
1    GHK7873
2    GAR2749
Name: plate_id, dtype: string

  registration_state:
0    NY
1    NY
2    NY
Name: registration_state, dtype: string

  plate_type:
0    PAS
1    PAS
2    PAS
Name: plate_type, dtype: string

  issue_date:
0   2013-07-29 21:22:00
1   2013-07-29 11:57:00
2   2013-07-29 07:59:00
Name: issue_date, dtype: datetime64[ns]

  violation_code:
0    38
1    50
2    71
Name: violation_code, dtype: int64

  vehicle_body_type:
0    4DSD
1    4DSD
2    SUBN
Name: vehicle_body_type, dtype: string

  vehicle_make:
0    ME/BE
1    HONDA
2    ROVER
Name: vehicle_make, dtype: string

  issuing_agency:
0    T
1    T
2    T
Name: issuing_agency, dtype: string

  street_code1:
0    90980
1        0
2    79730
Name: street_code1, dtype: int64

  street_code2:
0        0
1        0
2    93230
Name: street_code2, dtype: int64

  street_code3:
0        0
1     