In [None]:
import requests
import pandas as pd
import io
import os
from datetime import datetime, timedelta
from typing import Optional, List

# --- Configuration ---
# API keys are tried in list order; the download loop advances to the next
# entry whenever a chunk request fails or comes back empty.
# NOTE(review): the values below are placeholders. Real keys should be read
# from the environment (e.g. os.environ) rather than stored in the notebook.
API_KEYS = [
    "API_KEY_HERE_1",
    "API_KEY_HERE_2",
]

LOCATION = "New York, NY, United States"  # Specific for Visual Crossing
OVERALL_START_DATETIME_STR = "2019-01-01T00:00:00"
# We go a bit into the future to cover the full dataset range
OVERALL_END_DATETIME_STR = "2025-10-01T00:00:00"

# Width of one request window in hours. 930 hours is 38.75 days (not 15 days);
# a 15-day window would be 360.
# NOTE(review): confirm that a window of this size stays within the API limits.
CHUNK_SIZE_HOURS = 930
OUTPUT_FILENAME = "nyc_weather_hourly_2019_2025.csv"
# NOTE(review): BACKUP_FILENAME is not referenced by the code visible in this
# section of the notebook - no backup copy is written before cleanup.
BACKUP_FILENAME = "nyc_weather_hourly_2019_2025.csv.bak"

# The "High Impact" Features only
# Column names requested from the API; sent as the comma-joined "elements"
# query parameter.
ELEMENTS_HOURLY = [
    "datetime",
    "temp",
    "feelslike",
    "precip",
    "snow",
    "snowdepth",
    "windspeed",
    "visibility",
    "conditions",
    "icon",
]


In [2]:
def fetch_hourly_weather_data(
    api_key: str,
    location: str,
    start_datetime_str: str,
    end_datetime_str: str,
    elements: Optional[List[str]] = None,
    timeout: float = 60.0,
) -> Optional[pd.DataFrame]:
    """Download one window of hourly weather data as a DataFrame.

    Issues a single GET request to the Visual Crossing timeline endpoint,
    asking for hourly rows in CSV form with metric units, and parses the
    response body with pandas.

    Args:
        api_key: API key sent as the ``key`` query parameter.
        location: Free-text location placed in the URL path. It is
            percent-encoded (commas kept literal) so characters such as
            ``/``, ``?`` or ``#`` cannot alter the request path.
        start_datetime_str: Window start, inserted verbatim into the URL path.
        end_datetime_str: Window end, inserted verbatim into the URL path.
        elements: Optional column names; when given they are sent as the
            comma-joined ``elements`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller forever.

    Returns:
        The parsed CSV as a DataFrame, or ``None`` when the request fails for
        any reason (HTTP error status, network problem, timeout, unparsable
        body). Failures are reported with ``print`` and never raised.
    """
    # Local import: keeps this block self-contained without touching the
    # notebook's import cell.
    from urllib.parse import quote

    BASE_URL = "https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/"
    # Note: Visual Crossing uses dates in URL, so we format them cleanly.
    # Only the location is escaped; commas stay literal so a plain
    # "City, State, Country" value produces the same URL as before.
    safe_location = quote(location, safe=",")
    request_url = f"{BASE_URL}{safe_location}/{start_datetime_str}/{end_datetime_str}"

    params = {
        "unitGroup": "metric",  # CRITICAL: Ensures Celsius, KM, MM
        "include": "hours",
        "key": api_key,
        "contentType": "csv",
    }

    if elements:
        params["elements"] = ",".join(elements)

    try:
        response = requests.get(request_url, params=params, timeout=timeout)
        response.raise_for_status()
        return pd.read_csv(io.StringIO(response.text))
    except requests.exceptions.HTTPError as http_err:
        # Only the last four characters of the key are shown, to identify
        # which key failed without printing it in full.
        print(f"    -> HTTP Error: {http_err.response.status_code} with Key ending in ...{api_key[-4:]}")
        return None
    except Exception as err:
        # Deliberately broad: timeouts, connection errors and CSV parse
        # failures all make this chunk unusable, and the caller only checks
        # for None.
        print(f"    -> Unexpected error: {err}")
        return None


def clean_and_deduplicate(filename):
    """
    Post-processing to remove overlaps and sort.

    Reads the CSV at ``filename``, drops rows that repeat a timestamp, sorts
    by timestamp, replaces missing values in the snow/precip columns with 0,
    and overwrites the same file with the result. Prints a short summary.
    Returns None; if the file does not exist it prints a message and returns
    without doing anything.

    Timestamps are parsed BEFORE deduplication. This function rewrites the
    file with space-separated timestamps ("2019-01-01 00:00:00"), while rows
    appended by a later download use the "T" separator
    ("2019-01-01T00:00:00"). Comparing raw strings would treat those as
    different hours, and parsing a column that mixes both spellings can fail
    when pandas infers a single format from the first value.
    """
    print("\nüßπ Starting Post-Download Cleanup...")
    if not os.path.exists(filename):
        print("‚ùå File not found.")
        return

    df = pd.read_csv(filename)
    original_count = len(df)

    # 1. Normalise the date/time separator, then parse. Only the first "T" is
    #    replaced so anything after the time portion is left alone.
    normalized = df["datetime"].astype(str).str.replace("T", " ", n=1, regex=False)
    df["datetime"] = pd.to_datetime(normalized)

    # 2. Deduplicate on the parsed timestamp, keeping the earliest-written row
    df = df.drop_duplicates(subset=["datetime"], keep="first")

    # 3. Sort
    df = df.sort_values("datetime")

    # 4. Fill Nulls for Snow/Precip (Visual Crossing sometimes leaves 0 as Null)
    cols_to_fix = ["snow", "snowdepth", "precip"]
    for col in cols_to_fix:
        if col in df.columns:
            df[col] = df[col].fillna(0)

    removed = original_count - len(df)
    print(f"‚úÖ Cleanup Complete. Removed {removed} duplicate rows.")
    print(f"üìÖ Data Range: {df['datetime'].min()} to {df['datetime'].max()}")

    # Save back (overwrites the input file in place)
    df.to_csv(filename, index=False)
    print(f"üíæ Saved cleaned data to {filename}")


In [3]:
# 1. Resume Logic
# Resume only when the output file already holds at least one data row.
# A missing, zero-byte, or header-only file starts a fresh download instead
# of failing on the "last row" lookup below.
has_existing_file = os.path.exists(OUTPUT_FILENAME) and os.path.getsize(OUTPUT_FILENAME) > 0
df_existing = pd.read_csv(OUTPUT_FILENAME) if has_existing_file else None

if df_existing is not None and not df_existing.empty:
    print("üìÇ Existing data file found. Resuming...")
    # The whole file is loaded; the timestamp of its last row is the resume point
    last_datetime_str = df_existing["datetime"].iloc[-1]
    try:
        last_dt_obj = datetime.strptime(last_datetime_str, "%Y-%m-%dT%H:%M:%S")
    except ValueError:
        # Handle cases where seconds might be missing or format varies
        # (e.g. the space-separated form written by the cleanup step)
        last_dt_obj = datetime.fromisoformat(last_datetime_str)

    current_start_dt = last_dt_obj + timedelta(hours=1)
    print(f"‚è≠Ô∏è Resuming from: {current_start_dt}")
else:
    print("üÜï Starting new download.")
    current_start_dt = datetime.strptime(OVERALL_START_DATETIME_STR, "%Y-%m-%dT%H:%M:%S")

final_end_dt = datetime.strptime(OVERALL_END_DATETIME_STR, "%Y-%m-%dT%H:%M:%S")

print("=" * 60)

# 2. Download loop
# Walks forward in CHUNK_SIZE_HOURS windows until the overall end is reached.
# Keys are used in order and never revisited: any failed or empty chunk moves
# on to the next key, and the same window is retried with it.
key_index = 0

while current_start_dt < final_end_dt:
    if key_index >= len(API_KEYS):
        print("‚ùå All API keys exhausted for this session.")
        break

    current_key = API_KEYS[key_index]

    # Calculate chunk end, clamped to the overall end
    chunk_end_dt = current_start_dt + timedelta(hours=CHUNK_SIZE_HOURS)
    if chunk_end_dt > final_end_dt:
        chunk_end_dt = final_end_dt

    start_str = current_start_dt.strftime("%Y-%m-%dT%H:%M:%S")
    end_str = chunk_end_dt.strftime("%Y-%m-%dT%H:%M:%S")

    print(f"üì• Fetching: {start_str} -> {end_str} (Key #{key_index + 1})")

    df_chunk = fetch_hourly_weather_data(
        api_key=current_key,
        location=LOCATION,
        start_datetime_str=start_str,
        end_datetime_str=end_str,
        elements=ELEMENTS_HOURLY,
    )

    if df_chunk is not None and not df_chunk.empty:
        # Append immediately so progress survives an interruption. This is a
        # plain file append, not an atomic write. The header is written when
        # the file is missing or zero-byte, so the first rows are never
        # written without column names.
        header = not os.path.exists(OUTPUT_FILENAME) or os.path.getsize(OUTPUT_FILENAME) == 0
        df_chunk.to_csv(OUTPUT_FILENAME, mode="a", header=header, index=False)
        print(f"   ‚úÖ Saved {len(df_chunk)} rows.")

        # Advance time only after the chunk has been written
        current_start_dt = chunk_end_dt + timedelta(hours=1)
    else:
        print("   ‚ö†Ô∏è Chunk failed or Key limit reached. Switching key...")
        key_index += 1

print("=" * 60)
# Run Cleanup (also runs when the loop stopped early because keys ran out)
clean_and_deduplicate(OUTPUT_FILENAME)


üÜï Starting new download.
üì• Fetching: 2019-01-01T00:00:00 -> 2019-02-08T18:00:00 (Key #1)
   ‚úÖ Saved 936 rows.
üì• Fetching: 2019-02-08T19:00:00 -> 2019-03-19T13:00:00 (Key #1)
    -> HTTP Error: 429 with Key ending in ...62YZ
   ‚ö†Ô∏è Chunk failed or Key limit reached. Switching key...
üì• Fetching: 2019-02-08T19:00:00 -> 2019-03-19T13:00:00 (Key #2)
   ‚úÖ Saved 959 rows.
üì• Fetching: 2019-03-19T14:00:00 -> 2019-04-27T08:00:00 (Key #2)
    -> HTTP Error: 429 with Key ending in ...75V8
   ‚ö†Ô∏è Chunk failed or Key limit reached. Switching key...
üì• Fetching: 2019-03-19T14:00:00 -> 2019-04-27T08:00:00 (Key #3)
   ‚úÖ Saved 960 rows.
üì• Fetching: 2019-04-27T09:00:00 -> 2019-06-05T03:00:00 (Key #3)
    -> HTTP Error: 429 with Key ending in ...T6V6
   ‚ö†Ô∏è Chunk failed or Key limit reached. Switching key...
üì• Fetching: 2019-04-27T09:00:00 -> 2019-06-05T03:00:00 (Key #4)
   ‚úÖ Saved 960 rows.
üì• Fetching: 2019-06-05T04:00:00 -> 2019-07-13T22:00:00 (Key #4)
    -> H