# Weather augmentation
Weather is retrieved per borough, using the following reference coordinates:
| Borough       | Latitude   | Longitude   |
|---------------|------------|-------------|
| The Bronx     | 40.8448° N | 73.8648° W  |
| Brooklyn      | 40.6782° N | 73.9442° W  |
| Manhattan     | 40.7831° N | 73.9712° W  |
| Queens        | 40.7282° N | 73.7949° W  |
| Staten Island | 40.5795° N | 74.1502° W  |


In [2]:
from datetime import datetime
import pandas as pd
import warnings

# Blanket filter: suppresses every warning category for the rest of the session.
warnings.filterwarnings("ignore")

from meteostat import Daily
from meteostat import Point

# Directory (relative to the notebook) where the weather dataset is stored.
base_path = "../datasets"

In [3]:
# One reference point per borough, built from (latitude, longitude) in decimal
# degrees; the western longitudes are written as negative numbers.
boroughs_locations = dict(
    bronx=Point(40.8448, -73.8648),
    brooklyn=Point(40.6782, -73.9442),
    manhattan=Point(40.7831, -73.9712),
    queens=Point(40.7282, -73.7949),
    staten_island=Point(40.5795, -74.1502),
)

### Retrieve weather data from the Meteostat API

In [80]:
# For each borough, fetch the daily weather observations from 2013-01-01 up to
# now and tag every row with the borough name. The combined table is saved as
# a single parquet file under base_path.
borough_frames = []
for borough, location in boroughs_locations.items():
    data = Daily(location, start=datetime(2013, 1, 1), end=datetime.now()).fetch()
    borough_frames.append(data.assign(borough=borough))

# Concatenate once: growing a DataFrame inside the loop copies all previous
# rows on every iteration, and starting from an empty DataFrame makes the
# resulting index name depend on how pandas treats empty inputs.
weather_data = pd.concat(borough_frames)

# reset_index() turns the frames' index into a column, which is then renamed
# from "time" to "date".
weather_data = weather_data.reset_index().rename(columns={"time": "date"})
weather_data["date"] = pd.to_datetime(weather_data["date"])

weather_data.to_parquet(f"{base_path}/weather_data.parquet", index=False)

In [68]:
# Show the combined weather table; the notebook renders a truncated preview.
weather_data

Unnamed: 0,date,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun,borough
0,2013-01-01,2.4,-2.8,3.9,0.0,0.0,280.0,14.9,,1012.4,,bronx
1,2013-01-02,-2.6,-5.6,0.6,0.0,0.0,305.0,13.3,,1017.9,,bronx
2,2013-01-03,-1.8,-5.0,0.6,0.0,0.0,290.0,10.6,,1020.5,,bronx
3,2013-01-04,1.3,-0.6,3.3,0.0,0.0,249.0,16.4,,1016.4,,bronx
4,2013-01-05,2.1,-1.1,6.7,0.0,0.0,268.0,10.7,,1022.0,,bronx
...,...,...,...,...,...,...,...,...,...,...,...,...
21145,2024-07-27,24.6,18.6,29.7,0.0,,38.0,11.5,,1019.3,,staten_island
21146,2024-07-28,24.5,18.8,31.2,0.0,,120.0,12.3,,1018.9,,staten_island
21147,2024-07-29,24.7,21.4,29.3,1.4,,169.0,11.2,,1015.1,,staten_island
21148,2024-07-30,26.6,22.2,32.3,0.1,,197.0,17.4,,1013.5,,staten_island


### Try with sample data

In [4]:
# Load the weather table from the file the retrieval step actually writes
# (weather_data.parquet under base_path) instead of a separate CSV export, so
# this cell works after a fresh run of the notebook.
weather_data = pd.read_parquet(f"{base_path}/weather_data.parquet")
weather_data["date"] = pd.to_datetime(weather_data["date"])

# load the sample parquet file
sample = pd.read_parquet("sample_data.parquet", engine="pyarrow")
sample["issue_date"] = pd.to_datetime(sample["issue_date"], format="mixed")

# Year seems to be wrong
# Assignment instead of inplace=True keeps the step explicit and chainable.
sample = sample.drop(columns=["DataYear"])

NY       39778396
K        23622766
Q        21408979
BX       17444790
QN       12415086
BK       12060000
MN        3675481
ST        2427401
R         1456599
Kings      742802
Bronx      417356
Qns        234428
Rich         2999
QUEEN          34
QNS            33
KINGS          26
RICH           15
BRONX          11
MAN             4
108             2
K   F           2
KING            2
NEW Y           2
NEWY            2
NYC             2
QU              2
RC              2
00000           1
103             1
A               1
ABX             1
F               1
MH              1
MS              1
N               1
P               1
PBX             1
RICHM           1
USA             1
VINIS           1

In [16]:
# Translate the raw county codes to canonical borough names.
# Codes that cannot be attributed to a borough map to None; their rows are
# dropped below together with rows whose code is missing from this table.
county_to_borough = {
    # Canonical names map to themselves so that re-running this cell on an
    # already-translated column is a no-op instead of turning every value
    # into NaN and dropping the whole sample.
    "bronx": "bronx",
    "brooklyn": "brooklyn",
    "queens": "queens",
    "manhattan": "manhattan",
    "staten_island": "staten_island",
    # Bronx
    "BRONX": "bronx",
    "BX": "bronx",
    "Bronx": "bronx",
    "ABX": "bronx",
    # Brooklyn (Kings County)
    "BK": "brooklyn",
    "K": "brooklyn",
    "Kings": "brooklyn",
    "KINGS": "brooklyn",
    "KING": "brooklyn",
    # Queens
    "Q": "queens",
    "QN": "queens",
    "Qns": "queens",
    "QUEEN": "queens",
    "QUEENS": "queens",
    "QNS": "queens",
    "QU": "queens",
    # Manhattan (New York County)
    "NY": "manhattan",
    "MN": "manhattan",
    "MAN": "manhattan",
    "NEW Y": "manhattan",
    "NEWY": "manhattan",
    "NYC": "manhattan",
    "MH": "manhattan",
    # NOTE(review): the codes below are rare and their attribution to
    # Manhattan is a guess carried over unchanged -- confirm or mark unknown.
    "MS": "manhattan",
    "N": "manhattan",
    "P": "manhattan",
    "PBX": "manhattan",
    "USA": "manhattan",
    "VINIS": "manhattan",
    # Staten Island (Richmond County)
    "ST": "staten_island",
    "R": "staten_island",
    "Rich": "staten_island",
    "RICH": "staten_island",
    "RICHM": "staten_island",
    "RC": "staten_island",
    "103": "staten_island",  # zip code ?
    # Unknown / unattributable codes
    "A": None,
    "F": None,
    "108": None,
    "00000": None,
    "K   F": None,
}

# Anything not listed above, and every code mapped to None, becomes a missing
# value here; no intermediate "unknown" sentinel is needed.
sample['violation_county'] = sample['violation_county'].map(county_to_borough)

initial_sample_size = sample.shape[0]
print('Initial sample size:', initial_sample_size)

sample = sample.dropna(subset=['violation_county'])

final_sample_size = sample.shape[0]
rows_dropped = initial_sample_size - final_sample_size
# Guard the ratio so an empty sample reports 0.0 instead of raising
# ZeroDivisionError.
percentage_dropped = rows_dropped / initial_sample_size * 100 if initial_sample_size else 0.0

print('Final sample size:', final_sample_size)
print('Number of rows dropped:', rows_dropped)
print('Percentage of rows dropped:', percentage_dropped)

Initial sample size: 1404225
Final sample size: 1356850
Number of rows dropped: 47375
Percentage of rows dropped: 3.373747084690844


In [9]:
# merge the sample data with the weather data
merged_data = pd.merge(sample, weather_data, left_on=["issue_date", "violation_county"], right_on=["date", "borough"], how="left")

merged_data = merged_data.drop(columns=['date', 'borough'])

In [10]:
# count the merged data and data
print(f"Sample data shape: {sample.shape}")
print(f"Merged data shape: {merged_data.shape}")

Sample data shape: (1356850, 43)
Merged data shape: (1356850, 53)
