# Fetching DATA

### 1. Bike rides data  

Files are downloaded extracted and csvs are picked and data id extracted and stored in .parquet files


Parquet stores data column by column, allowing analytical queries to read only the required columns, drastically improving speed and reducing I/O compared to row-based CSVs. 


Its columnar format enables more effective compression, leading to much smaller file sizes and lower storage costs. 


In [9]:
from pathlib import Path
import requests
import zipfile
import pandas as pd
import numpy as np

In [31]:
def fetch_raw_trip_data(year: int, month: int) -> str:
    base_url = "https://s3.amazonaws.com/hubway-data"
    patterns = [
        f"{year}{month:02}-bluebikes-tripdata.csv.zip",
        f"{year}{month:02}-bluebikes-tripdata.zip",
    ]

    raw_dir = Path("..") / "data" / "raw"
    raw_dir.mkdir(parents=True, exist_ok=True)

    # 1) Download ZIP
    zip_path = None
    for fname in patterns:
        url = f"{base_url}/{fname}"
        resp = requests.get(url, stream=True)
        if resp.status_code == 200:
            zip_path = raw_dir / fname
            with open(zip_path, "wb") as f:
                for chunk in resp.iter_content(8_192):
                    f.write(chunk)
            print(f"Downloaded {url}")
            break
        else:
            print(f"{url} returned {resp.status_code}")
    if not zip_path:
        raise FileNotFoundError(f"No CSV ZIP found for {year}-{month:02}")

    # 2) Extract CSV(s)
    with zipfile.ZipFile(zip_path, "r") as z:
        z.extractall(raw_dir)
    print(f"Extracted to {raw_dir}")

    # 3) Collect CSV files
    csvs = list(raw_dir.glob(f"{year}{month:02}*-bluebikes-tripdata*.csv"))
    if not csvs:
        csvs = list(raw_dir.glob("*.csv"))
    folder = raw_dir / zip_path.stem
    if folder.is_dir():
        csvs += list(folder.glob("*.csv"))

    if not csvs:
        raise FileNotFoundError(f"No CSVs found after extracting {zip_path}")

    # 4) Read & concatenate
    dfs = []
    for csv in csvs:
        print(f"Reading {csv.relative_to(raw_dir)}")
        dfs.append(pd.read_csv(csv))
    df = pd.concat(dfs, ignore_index=True)

    print(f"DataFrame shape: {df.shape}")
    # 5) Clean & convert to Parquet

    df.dropna(subset=["start_station_id", "end_station_id"], inplace=True)
    
    print(f"DataFrame shape after cleaning: {df.shape}")

    out_path = raw_dir / f"rides_{year}_{month:02}.parquet"
    df.to_parquet(out_path, index=False)
    print(f"Converted to parquet: {out_path}")

    
    try:
        zip_path.unlink()
        for csv in csvs:
            csv.unlink()
        print("Cleaned up ZIP and CSV files")
    except Exception as e:
        print(f"Cleanup warning: {e}")

    return str(out_path)

In [77]:
for month in range(4, 13):
    try:
        fetch_raw_trip_data(2023, month)
    except Exception as e:
        print(f"Error processing {2023}-{month:02}: {e}")



https://s3.amazonaws.com/hubway-data/202304-bluebikes-tripdata.csv.zip returned 404
Downloaded https://s3.amazonaws.com/hubway-data/202304-bluebikes-tripdata.zip
Extracted to ../data/raw
Reading 202304-bluebikes-tripdata.csv
DataFrame shape: (296291, 13)
DataFrame shape after cleaning: (294553, 13)
Converted to parquet: ../data/raw/rides_2023_04.parquet
Cleaned up ZIP and CSV files
https://s3.amazonaws.com/hubway-data/202305-bluebikes-tripdata.csv.zip returned 404
Downloaded https://s3.amazonaws.com/hubway-data/202305-bluebikes-tripdata.zip
Extracted to ../data/raw
Reading 202305-bluebikes-tripdata.csv
DataFrame shape: (387593, 13)
DataFrame shape after cleaning: (385574, 13)
Converted to parquet: ../data/raw/rides_2023_05.parquet
Cleaned up ZIP and CSV files
https://s3.amazonaws.com/hubway-data/202306-bluebikes-tripdata.csv.zip returned 404
Downloaded https://s3.amazonaws.com/hubway-data/202306-bluebikes-tripdata.zip
Extracted to ../data/raw
Reading 202306-bluebikes-tripdata.csv
DataF

In [78]:
for month in range(1, 13):
    try:
        fetch_raw_trip_data(2024, month)
    except Exception as e:
        print(f"Error processing {2024}-{month:02}: {e}")

https://s3.amazonaws.com/hubway-data/202401-bluebikes-tripdata.csv.zip returned 404
Downloaded https://s3.amazonaws.com/hubway-data/202401-bluebikes-tripdata.zip
Extracted to ../data/raw
Reading 202401-bluebikes-tripdata.csv
DataFrame shape: (166699, 13)
DataFrame shape after cleaning: (166200, 13)
Converted to parquet: ../data/raw/rides_2024_01.parquet
Cleaned up ZIP and CSV files
https://s3.amazonaws.com/hubway-data/202402-bluebikes-tripdata.csv.zip returned 404
Downloaded https://s3.amazonaws.com/hubway-data/202402-bluebikes-tripdata.zip
Extracted to ../data/raw
Reading 202402-bluebikes-tripdata.csv
DataFrame shape: (231947, 13)
DataFrame shape after cleaning: (231163, 13)
Converted to parquet: ../data/raw/rides_2024_02.parquet
Cleaned up ZIP and CSV files
https://s3.amazonaws.com/hubway-data/202403-bluebikes-tripdata.csv.zip returned 404
Downloaded https://s3.amazonaws.com/hubway-data/202403-bluebikes-tripdata.zip
Extracted to ../data/raw
Reading 202403-bluebikes-tripdata.csv
DataF

In [79]:
for month in range(1, 9):
    try:
        fetch_raw_trip_data(2025, month)
    except Exception as e:
        print(f"Error processing {2025}-{month:02}: {e}")

https://s3.amazonaws.com/hubway-data/202501-bluebikes-tripdata.csv.zip returned 404
Downloaded https://s3.amazonaws.com/hubway-data/202501-bluebikes-tripdata.zip
Extracted to ../data/raw
Reading 202501-bluebikes-tripdata.csv
DataFrame shape: (162316, 13)
DataFrame shape after cleaning: (161926, 13)
Converted to parquet: ../data/raw/rides_2025_01.parquet
Cleaned up ZIP and CSV files
https://s3.amazonaws.com/hubway-data/202502-bluebikes-tripdata.csv.zip returned 404
Downloaded https://s3.amazonaws.com/hubway-data/202502-bluebikes-tripdata.zip
Extracted to ../data/raw
Reading 202502-bluebikes-tripdata.csv
DataFrame shape: (166022, 13)
DataFrame shape after cleaning: (165742, 13)
Converted to parquet: ../data/raw/rides_2025_02.parquet
Cleaned up ZIP and CSV files
https://s3.amazonaws.com/hubway-data/202503-bluebikes-tripdata.csv.zip returned 404
Downloaded https://s3.amazonaws.com/hubway-data/202503-bluebikes-tripdata.zip
Extracted to ../data/raw
Reading 202503-bluebikes-tripdata.csv
DataF

In [32]:
fetch_raw_trip_data(2025, 9)

https://s3.amazonaws.com/hubway-data/202509-bluebikes-tripdata.csv.zip returned 404
Downloaded https://s3.amazonaws.com/hubway-data/202509-bluebikes-tripdata.zip
Extracted to ../data/raw
Reading 202509-bluebikes-tripdata.csv
DataFrame shape: (586979, 13)
DataFrame shape after cleaning: (585931, 13)
Converted to parquet: ../data/raw/rides_2025_09.parquet
Cleaned up ZIP and CSV files


'../data/raw/rides_2025_09.parquet'

In [21]:
!pip install openpyxl

Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Downloading et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5


In [1]:
STATIONS_FILE = "/mnt/data/-External-_Bluebikes_Station_List.xlsx"

In [2]:
import pandas as pd
import requests
from datetime import datetime, timedelta
import time
import pyarrow.parquet as pq
import pyarrow as pa


API_KEY = "ceb8408f78806b389b91c54f68c364f6"   # <-- insert your key here
BASE_URL = "https://history.openweathermap.org/data/2.5/history/city"

STATIONS_FILE = "/Users/narendravarma/Documents/blue_bikes/data/raw/-External-_Bluebikes_Station_List.xlsx"
OUTPUT_FILE = "bluebikes_with_weather_2023.parquet"

# ================================
# WEATHER FETCH FUNCTION
# ================================
def fetch_openweather_history(lat: float, lon: float, start: datetime, end: datetime, units: str = "metric") -> pd.DataFrame:
    """
    Fetch hourly historical weather from OpenWeather for the given lat/lon and time window.
    Breaks into small chunks (5 days) due to API limits.
    """
    all_records = []
    window = timedelta(days=5)
    current_start = start

    while current_start < end:
        current_end = min(current_start + window, end)
        start_unix = int(current_start.timestamp())
        end_unix = int(current_end.timestamp())

        params = {
            "lat": lat,
            "lon": lon,
            "type": "hour",
            "start": start_unix,
            "end": end_unix,
            "units": units,
            "appid": API_KEY,
        }

        resp = requests.get(BASE_URL, params=params)
        if resp.status_code != 200:
            print(f"Error {resp.status_code} for {lat},{lon} from {current_start} to {current_end}")
        else:
            data = resp.json()
            if "list" in data:
                for entry in data["list"]:
                    rec = {
                        "timestamp": datetime.fromtimestamp(entry["dt"]),
                        "lat_round": round(lat, 2),
                        "lon_round": round(lon, 2),
                        "temp": entry["main"].get("temp"),
                        "humidity": entry["main"].get("humidity"),
                        "pressure": entry["main"].get("pressure"),
                        "wind_speed": entry.get("wind", {}).get("speed"),
                        "wind_deg": entry.get("wind", {}).get("deg"),
                        "precipitation": entry.get("rain", {}).get("1h", 0.0)
                    }
                    all_records.append(rec)

        current_start = current_end + timedelta(seconds=1)
        time.sleep(1)  # avoid hitting rate limits

    return pd.DataFrame(all_records)


# ================================
# 1. Load Stations
# ================================
stations = pd.read_excel(STATIONS_FILE, header=1)
stations.head()



Unnamed: 0,Number,NAME,Lat,Long,Seasonal Status,Municipality,Total Docks,Station ID (to match to historic system data)
0,L32001,Railroad Lot and Minuteman Bikeway,42.416065,-71.153366,Year Round,Arlington,11,461
1,L32002,Linwood St at Minuteman Bikeway,42.409354,-71.149065,Year Round,Arlington,11,462
2,L32005,Thorndike Field at Minuteman Bikeway,42.400168,-71.14457,Year Round,Arlington,11,480
3,L32003,Mass Ave at Grafton St,42.407261,-71.143821,Year Round,Arlington,11,464
4,L32004,Broadway at Grafton St,42.409942,-71.140093,Winter Storage,Arlington,11,465


In [24]:
stations = pd.read_excel(STATIONS_FILE, header=1)
stations.head()

Unnamed: 0,Number,NAME,Lat,Long,Seasonal Status,Municipality,Total Docks,Station ID (to match to historic system data)
0,L32001,Railroad Lot and Minuteman Bikeway,42.416065,-71.153366,Year Round,Arlington,11,461
1,L32002,Linwood St at Minuteman Bikeway,42.409354,-71.149065,Year Round,Arlington,11,462
2,L32005,Thorndike Field at Minuteman Bikeway,42.400168,-71.14457,Year Round,Arlington,11,480
3,L32003,Mass Ave at Grafton St,42.407261,-71.143821,Year Round,Arlington,11,464
4,L32004,Broadway at Grafton St,42.409942,-71.140093,Winter Storage,Arlington,11,465


In [29]:
stations.shape

(572, 10)

In [27]:
stations["lat_round"] = stations["Lat"].round(1)
stations["lon_round"] = stations["Long"].round(1)

print(stations.head())
unique_coords = stations[["lat_round", "lon_round"]].drop_duplicates()

unique_coords.shape, unique_coords.head()

   Number                                  NAME        Lat       Long  \
0  L32001    Railroad Lot and Minuteman Bikeway  42.416065 -71.153366   
1  L32002       Linwood St at Minuteman Bikeway  42.409354 -71.149065   
2  L32005  Thorndike Field at Minuteman Bikeway  42.400168 -71.144570   
3  L32003                Mass Ave at Grafton St  42.407261 -71.143821   
4  L32004                Broadway at Grafton St  42.409942 -71.140093   

  Seasonal Status Municipality  Total Docks  \
0      Year Round    Arlington           11   
1      Year Round    Arlington           11   
2      Year Round    Arlington           11   
3      Year Round    Arlington           11   
4  Winter Storage    Arlington           11   

  Station ID (to match to historic system data)  lat_round  lon_round  
0                                           461       42.4      -71.2  
1                                           462       42.4      -71.1  
2                                           480       42.4    

((7, 2),
     lat_round  lon_round
 0        42.4      -71.2
 1        42.4      -71.1
 6        42.4      -71.0
 10       42.3      -71.1
 51       42.3      -71.0)

In [28]:
unique_coords

Unnamed: 0,lat_round,lon_round
0,42.4,-71.2
1,42.4,-71.1
6,42.4,-71.0
10,42.3,-71.1
51,42.3,-71.0
67,42.3,-71.2
423,42.5,-70.9


In [6]:
unique_coords = unique_coords.drop(unique_coords.index[-1])

In [7]:
unique_coords

Unnamed: 0,lat_round,lon_round
0,42.4,-71.2
1,42.4,-71.1
6,42.4,-71.0
10,42.3,-71.1
51,42.3,-71.0
67,42.3,-71.2
423,42.5,-70.9


In [51]:
!pip install folium

Collecting folium
  Downloading folium-0.20.0-py2.py3-none-any.whl.metadata (4.2 kB)
Collecting branca>=0.6.0 (from folium)
  Using cached branca-0.8.1-py3-none-any.whl.metadata (1.5 kB)
Collecting xyzservices (from folium)
  Downloading xyzservices-2025.4.0-py3-none-any.whl.metadata (4.3 kB)
Downloading folium-0.20.0-py2.py3-none-any.whl (113 kB)
Using cached branca-0.8.1-py3-none-any.whl (26 kB)
Downloading xyzservices-2025.4.0-py3-none-any.whl (90 kB)
Installing collected packages: xyzservices, branca, folium
Successfully installed branca-0.8.1 folium-0.20.0 xyzservices-2025.4.0


In [None]:
import pandas as pd
import folium

# 1. Create a DataFrame from your coordinates
# Note: I'm converting the output you provided into a structured list of unique points
data = {
    'lat_round': [42.4, 42.4, 42.4, 42.3, 42.3, 42.3, 42.5],
    'lon_round': [-71.2, -71.1, -71.0, -71.1, -71.0, -71.2]
}
df = pd.DataFrame(data).drop_duplicates()

# 2. Determine the center point of your data (for initial map view)
center_lat = df['lat_round'].mean()
center_lon = df['lon_round'].mean()

# 3. Initialize the Folium Map
# The location (42.4, -71.1) is the center of your data cluster (Boston area)
m = folium.Map(location=[center_lat, center_lon], zoom_start=11)

# 4. Add a Marker for each unique rounded coordinate
for index, row in df.iterrows():
    # You can customize the marker with a popup label
    popup_text = f"Lat: {row['lat_round']}, Lon: {row['lon_round']}"
    
    folium.Marker(
        location=[row['lat_round'], row['lon_round']],
        popup=popup_text,
        tooltip='Station Cluster'
    ).add_to(m)

# 5. Save the map to an HTML file
# You can open this file in any web browser to view the interactive map
m.save('rounded_stations_map.html')

print("Map saved to rounded_stations_map.html")

Map saved to rounded_stations_map.html


In [10]:
raw_dir = Path("..") / "data" / "raw"
raw_dir.mkdir(parents=True, exist_ok=True)

In [None]:
weather_frames = []
month = 1
year = 2025
start_date = datetime(2025, 2, 1)
end_date = datetime(2025, 2, 31, 23, 59, 59)

for _, row in unique_coords.iterrows():
    lat, lon = row["lat_round"], row["lon_round"]
    print(f"Fetching weather for {lat},{lon}")
    df_weather = fetch_openweather_history(lat, lon, start_date, end_date, units="imperial")
    if not df_weather.empty:
        print(df_weather.head())
        weather_frames.append(df_weather)


weather_df = pd.concat(weather_frames, ignore_index=True)

# Ensure datetime types
weather_df["timestamp"] = pd.to_datetime(weather_df["timestamp"], errors="coerce", utc=True)
weather_df["hour"] = weather_df["timestamp"].dt.floor("H")

# Enforce numeric types
num_cols = ["temp", "humidity", "pressure", "wind_speed", "wind_deg", "precipitation"]
for col in num_cols:
    weather_df[col] = pd.to_numeric(weather_df[col], errors="coerce")

# Enforce float type for lat/lon
weather_df["lat_round"] = weather_df["lat_round"].astype("float32")
weather_df["lon_round"] = weather_df["lon_round"].astype("float32")

out_path1 = raw_dir / f"weather_{year}_{month:02}.parquet"

# Save clean parquet
weather_df.to_parquet(out_path1, index=False, engine="pyarrow")


Fetching weather for 42.4,-71.2
            timestamp  lat_round  lon_round   temp  humidity  pressure  \
0 2025-01-01 00:00:00       42.4      -71.2  40.82        85      1006   
1 2025-01-01 01:00:00       42.4      -71.2  42.57        83      1003   
2 2025-01-01 02:00:00       42.4      -71.2  43.59        85      1003   
3 2025-01-01 03:00:00       42.4      -71.2  44.11        88      1002   
4 2025-01-01 04:00:00       42.4      -71.2  44.53        88      1000   

   wind_speed  wind_deg  precipitation  
0        6.91        80           0.00  
1        8.05        50           0.00  
2       11.01       134           5.15  
3       14.97        70           5.91  
4       19.57        90           0.00  
Fetching weather for 42.4,-71.1
            timestamp  lat_round  lon_round   temp  humidity  pressure  \
0 2025-01-01 00:00:00       42.4      -71.1  41.04        86      1006   
1 2025-01-01 01:00:00       42.4      -71.1  43.03        84      1003   
2 2025-01-01 02:00:00  

  weather_df["hour"] = weather_df["timestamp"].dt.floor("H")


In [None]:
import pandas as pd
from datetime import datetime
from pathlib import Path
from src.config import RAW_DATA_DIR

# Configure paths
raw_dir = RAW_DATA_DIR  # Update this path


In [12]:
def download_weather_month(year, month, unique_coords, units="imperial"):
    """
    Download weather data for all coordinates for a specific month
    
    Parameters:
    - year: int (e.g., 2025)
    - month: int (1-12)
    - unique_coords: DataFrame with lat_round, lon_round columns
    - units: temperature units ("imperial" or "metric")
    
    Returns:
    - Path to saved file if successful, None otherwise
    """
    # Calculate proper start and end dates for the month
    if month == 12:
        end_year = year + 1
        end_month = 1
    else:
        end_year = year
        end_month = month + 1
    
    start_date = datetime(year, month, 1)
    end_date = datetime(end_year, end_month, 1)
    # Subtract 1 second to get last moment of the last day of target month
    end_date = end_date - pd.Timedelta(seconds=1)
    
    weather_frames = []
    
    print(f"Downloading weather data for {year}-{month:02d}...")
    print(f"Date range: {start_date.date()} to {end_date.date()}")
    print(f"Processing {len(unique_coords)} locations...")
    
    for _, row in unique_coords.iterrows():
        lat, lon = row["lat_round"], row["lon_round"]
        df_weather = fetch_openweather_history(lat, lon, start_date, end_date, units=units)
        
        if not df_weather.empty:
            weather_frames.append(df_weather)
    
    if not weather_frames:
        print("‚ö†Ô∏è  No weather data was fetched!")
        return None
    
    # Combine and process all data
    weather_df = pd.concat(weather_frames, ignore_index=True)
    
    # Data cleaning
    weather_df["timestamp"] = pd.to_datetime(weather_df["timestamp"], errors="coerce", utc=True)
    weather_df["hour"] = weather_df["timestamp"].dt.floor("H")
    
    # Enforce numeric types
    num_cols = ["temp", "humidity", "pressure", "wind_speed", "wind_deg", "precipitation"]
    for col in num_cols:
        weather_df[col] = pd.to_numeric(weather_df[col], errors="coerce")
    
    # Enforce float type for lat/lon
    weather_df["lat_round"] = weather_df["lat_round"].astype("float32")
    weather_df["lon_round"] = weather_df["lon_round"].astype("float32")
    
    # Save file
    out_path = raw_dir / f"weather_{year}_{month:02d}.parquet"
    weather_df.to_parquet(out_path, index=False, engine="pyarrow")
    
    print(f"‚úÖ Successfully saved {len(weather_df):,} records to: {out_path}")
    return out_path

In [13]:
# CONFIGURATION - Update these values for each download
YEAR = 2025
MONTH = 9  # February

# Download weather data
result_path = download_weather_month(
    year=YEAR, 
    month=MONTH, 
    unique_coords=unique_coords,  # Your coordinates DataFrame
    units="imperial"
)

if result_path:
    print(f"üéâ Download completed for {YEAR}-{MONTH:02d}")
    print(f"üìÅ File: {result_path}")
else:
    print(f"‚ùå Download failed for {YEAR}-{MONTH:02d}")

Downloading weather data for 2025-09...
Date range: 2025-09-01 to 2025-09-30
Processing 7 locations...
‚úÖ Successfully saved 5,040 records to: ../data/raw/weather_2025_09.parquet
üéâ Download completed for 2025-09
üìÅ File: ../data/raw/weather_2025_09.parquet


  weather_df["hour"] = weather_df["timestamp"].dt.floor("H")


In [14]:
# CONFIGURATION - Update these values for each download
YEAR = 2025
MONTH = 8  # February

# Download weather data
result_path = download_weather_month(
    year=YEAR, 
    month=MONTH, 
    unique_coords=unique_coords,  # Your coordinates DataFrame
    units="imperial"
)

if result_path:
    print(f"üéâ Download completed for {YEAR}-{MONTH:02d}")
    print(f"üìÅ File: {result_path}")
else:
    print(f"‚ùå Download failed for {YEAR}-{MONTH:02d}")

Downloading weather data for 2025-08...
Date range: 2025-08-01 to 2025-08-31
Processing 7 locations...
‚úÖ Successfully saved 5,208 records to: ../data/raw/weather_2025_08.parquet
üéâ Download completed for 2025-08
üìÅ File: ../data/raw/weather_2025_08.parquet


  weather_df["hour"] = weather_df["timestamp"].dt.floor("H")


In [15]:
# CONFIGURATION - Update these values for each download
YEAR = 2025
MONTH = 7  # February

# Download weather data
result_path = download_weather_month(
    year=YEAR, 
    month=MONTH, 
    unique_coords=unique_coords,  # Your coordinates DataFrame
    units="imperial"
)

if result_path:
    print(f"üéâ Download completed for {YEAR}-{MONTH:02d}")
    print(f"üìÅ File: {result_path}")
else:
    print(f"‚ùå Download failed for {YEAR}-{MONTH:02d}")

Downloading weather data for 2025-07...
Date range: 2025-07-01 to 2025-07-31
Processing 7 locations...
‚úÖ Successfully saved 5,208 records to: ../data/raw/weather_2025_07.parquet
üéâ Download completed for 2025-07
üìÅ File: ../data/raw/weather_2025_07.parquet


  weather_df["hour"] = weather_df["timestamp"].dt.floor("H")


In [16]:
# CONFIGURATION - Update these values for each download
YEAR = 2025
MONTH = 6  # February

# Download weather data
result_path = download_weather_month(
    year=YEAR, 
    month=MONTH, 
    unique_coords=unique_coords,  # Your coordinates DataFrame
    units="imperial"
)

if result_path:
    print(f"üéâ Download completed for {YEAR}-{MONTH:02d}")
    print(f"üìÅ File: {result_path}")
else:
    print(f"‚ùå Download failed for {YEAR}-{MONTH:02d}")

Downloading weather data for 2025-06...
Date range: 2025-06-01 to 2025-06-30
Processing 7 locations...
‚úÖ Successfully saved 5,040 records to: ../data/raw/weather_2025_06.parquet
üéâ Download completed for 2025-06
üìÅ File: ../data/raw/weather_2025_06.parquet


  weather_df["hour"] = weather_df["timestamp"].dt.floor("H")


In [17]:
# CONFIGURATION - Update these values for each download
YEAR = 2025
MONTH = 5  # February

# Download weather data
result_path = download_weather_month(
    year=YEAR, 
    month=MONTH, 
    unique_coords=unique_coords,  # Your coordinates DataFrame
    units="imperial"
)

if result_path:
    print(f"üéâ Download completed for {YEAR}-{MONTH:02d}")
    print(f"üìÅ File: {result_path}")
else:
    print(f"‚ùå Download failed for {YEAR}-{MONTH:02d}")

Downloading weather data for 2025-05...
Date range: 2025-05-01 to 2025-05-31
Processing 7 locations...
‚úÖ Successfully saved 5,208 records to: ../data/raw/weather_2025_05.parquet
üéâ Download completed for 2025-05
üìÅ File: ../data/raw/weather_2025_05.parquet


  weather_df["hour"] = weather_df["timestamp"].dt.floor("H")


In [18]:
# CONFIGURATION - Update these values for each download
YEAR = 2025
MONTH = 4  # February

# Download weather data
result_path = download_weather_month(
    year=YEAR, 
    month=MONTH, 
    unique_coords=unique_coords,  # Your coordinates DataFrame
    units="imperial"
)

if result_path:
    print(f"üéâ Download completed for {YEAR}-{MONTH:02d}")
    print(f"üìÅ File: {result_path}")
else:
    print(f"‚ùå Download failed for {YEAR}-{MONTH:02d}")

Downloading weather data for 2025-04...
Date range: 2025-04-01 to 2025-04-30
Processing 7 locations...
‚úÖ Successfully saved 5,040 records to: ../data/raw/weather_2025_04.parquet
üéâ Download completed for 2025-04
üìÅ File: ../data/raw/weather_2025_04.parquet


  weather_df["hour"] = weather_df["timestamp"].dt.floor("H")


In [19]:
# CONFIGURATION - Update these values for each download
YEAR = 2025
MONTH = 3  # February

# Download weather data
result_path = download_weather_month(
    year=YEAR, 
    month=MONTH, 
    unique_coords=unique_coords,  # Your coordinates DataFrame
    units="imperial"
)

if result_path:
    print(f"üéâ Download completed for {YEAR}-{MONTH:02d}")
    print(f"üìÅ File: {result_path}")
else:
    print(f"‚ùå Download failed for {YEAR}-{MONTH:02d}")

Downloading weather data for 2025-03...
Date range: 2025-03-01 to 2025-03-31
Processing 7 locations...
‚úÖ Successfully saved 5,187 records to: ../data/raw/weather_2025_03.parquet
üéâ Download completed for 2025-03
üìÅ File: ../data/raw/weather_2025_03.parquet


  weather_df["hour"] = weather_df["timestamp"].dt.floor("H")


In [20]:
# CONFIGURATION - Update these values for each download
YEAR = 2025
MONTH = 2  # February

# Download weather data
result_path = download_weather_month(
    year=YEAR, 
    month=MONTH, 
    unique_coords=unique_coords,  # Your coordinates DataFrame
    units="imperial"
)

if result_path:
    print(f"üéâ Download completed for {YEAR}-{MONTH:02d}")
    print(f"üìÅ File: {result_path}")
else:
    print(f"‚ùå Download failed for {YEAR}-{MONTH:02d}")

Downloading weather data for 2025-02...
Date range: 2025-02-01 to 2025-02-28
Processing 7 locations...
‚úÖ Successfully saved 4,704 records to: ../data/raw/weather_2025_02.parquet
üéâ Download completed for 2025-02
üìÅ File: ../data/raw/weather_2025_02.parquet


  weather_df["hour"] = weather_df["timestamp"].dt.floor("H")


In [None]:
# CONFIGURATION - Update these values for each download
YEAR = 2025
MONTH = 1  # February

# Download weather data
result_path = download_weather_month(
    year=YEAR, 
    month=MONTH, 
    unique_coords=unique_coords,  # Your coordinates DataFrame
    units="imperial"
)

if result_path:
    print(f"üéâ Download completed for {YEAR}-{MONTH:02d}")
    print(f"üìÅ File: {result_path}")
else:
    print(f"‚ùå Download failed for {YEAR}-{MONTH:02d}")

In [None]:
# CONFIGURATION - Update these values for each download
YEAR = 2024
MONTH = 10  # February

# Download weather data
result_path = download_weather_month(
    year=YEAR, 
    month=MONTH, 
    unique_coords=unique_coords,  # Your coordinates DataFrame
    units="imperial"
)

if result_path:
    print(f"üéâ Download completed for {YEAR}-{MONTH:02d}")
    print(f"üìÅ File: {result_path}")
else:
    print(f"‚ùå Download failed for {YEAR}-{MONTH:02d}")

In [None]:
# CONFIGURATION - Update these values for each download
YEAR = 2024
MONTH = 11  # February

# Download weather data
result_path = download_weather_month(
    year=YEAR, 
    month=MONTH, 
    unique_coords=unique_coords,  # Your coordinates DataFrame
    units="imperial"
)

if result_path:
    print(f"üéâ Download completed for {YEAR}-{MONTH:02d}")
    print(f"üìÅ File: {result_path}")
else:
    print(f"‚ùå Download failed for {YEAR}-{MONTH:02d}")

In [None]:
# CONFIGURATION - Update these values for each download
YEAR = 2024
MONTH = 12  # February

# Download weather data
result_path = download_weather_month(
    year=YEAR, 
    month=MONTH, 
    unique_coords=unique_coords,  # Your coordinates DataFrame
    units="imperial"
)

if result_path:
    print(f"üéâ Download completed for {YEAR}-{MONTH:02d}")
    print(f"üìÅ File: {result_path}")
else:
    print(f"‚ùå Download failed for {YEAR}-{MONTH:02d}")