### This notebook will be automatically filled in with your parameters.

In [None]:
import subprocess
import sys
import os
import datetime
import json
from typing import List, Dict, Any, Optional, Union
import datarobot as dr
from datarobot.models.use_cases.utils import UseCaseLike


# Import the third-party dependencies this notebook needs, installing any that
# are missing into the current interpreter's environment on first use.
try:
    import openmeteo_requests
    import requests_cache
    import pandas as pd
    from retry_requests import retry
    import pytz

except ImportError:
    import importlib

    print("Installing packages")

    # Probe each dependency on its own so only the ones that actually fail to
    # import are handed to pip.
    missing_packages = []
    for module_name in (
        'openmeteo_requests',
        'requests_cache',
        'pandas',
        'retry_requests',
        'pytz',
    ):
        try:
            importlib.import_module(module_name)
        except ImportError:
            missing_packages.append(module_name)

    if missing_packages:
        print(f"Installing missing packages: {' '.join(missing_packages)}")
        # Use the running interpreter's pip so the packages land in the
        # environment this kernel imports from.
        subprocess.check_call([sys.executable, '-m', 'pip', 'install'] + missing_packages)
        # Packages installed after interpreter start-up may be hidden by the
        # import system's cached directory listings; drop those caches so the
        # imports below can see the new installs.
        importlib.invalidate_caches()

    # Try importing again after installation
    import openmeteo_requests
    import requests_cache
    import pandas as pd
    from retry_requests import retry
    import pytz


In [None]:
def get_today_city_data(
        locations: Dict[str, Dict[str, float]],
        parameters: Dict[str, Any]
) -> pd.DataFrame:
    """Fetch hourly Open-Meteo forecast data for every city as one DataFrame.

    Args:
        locations: Mapping of city name to a dict holding that city's
            ``"Latitude"`` and ``"Longitude"`` values.
        parameters: Base query parameters for the forecast endpoint. The
            hourly values are read back by position, so the requested hourly
            variables are assumed to be ordered temperature,
            precipitation probability, precipitation, UV index
            (TODO: confirm against the configured ``parameters``).
            The dict passed in is left unmodified.

    Returns:
        A DataFrame with one row per city per hourly step and the columns
        ``date``, ``temperature``, ``uv_index``, ``precipitation_probability``,
        ``precipitation``, ``elevation``, ``longitude``, ``latitude`` and
        ``city``. ``date`` is a ``'%Y-%m-%d %H:%M:%S'`` string expressed in
        the timezone reported by the response.

    Raises:
        ValueError: If ``locations`` is empty.
    """
    if not locations:
        raise ValueError("locations must contain at least one city")

    # Setup the Open-Meteo API client with cache and retry on error
    cache_session = requests_cache.CachedSession('.cache', expire_after=3600)
    retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
    openmeteo = openmeteo_requests.Client(session=retry_session)

    url = "https://api.open-meteo.com/v1/forecast"

    city_names = list(locations)

    # Build the request on a copy so the caller's dict is not mutated.
    request_params = {
        **parameters,
        # This should be tied to when the notebook is scheduled to pull.
        "past_days": 1,
        "latitude": [locations[city]["Latitude"] for city in city_names],
        "longitude": [locations[city]["Longitude"] for city in city_names],
    }

    responses = openmeteo.weather_api(url, params=request_params)

    all_data = []
    # Responses are matched to cities by position, i.e. in the order the
    # coordinates were sent.
    for city_name, response in zip(city_names, responses):
        # TODO: Log this?
        print(f"Gathered weather data from {city_name}")

        # Process hourly data. The order of variables needs to be the same as requested.
        hourly = response.Hourly()
        hourly_temperature_2m = hourly.Variables(0).ValuesAsNumpy()
        hourly_precipitation_probability = hourly.Variables(1).ValuesAsNumpy()
        hourly_precipitation = hourly.Variables(2).ValuesAsNumpy()
        hourly_uv_index = hourly.Variables(3).ValuesAsNumpy()

        # Convert the response's epoch bounds into its own timezone so the
        # formatted timestamps are local to the city.
        timezone = pytz.timezone(response.Timezone())
        timestart = datetime.datetime.fromtimestamp(hourly.Time(), tz=timezone)
        timeend = datetime.datetime.fromtimestamp(hourly.TimeEnd(), tz=timezone)

        hourly_data = {
            "date": pd.date_range(
                start=timestart,
                end=timeend,
                freq=pd.Timedelta(seconds=hourly.Interval()),
                inclusive="left",
            ).strftime('%Y-%m-%d %H:%M:%S'),
            "temperature": hourly_temperature_2m,
            "uv_index": hourly_uv_index,
            "precipitation_probability": hourly_precipitation_probability,
            "precipitation": hourly_precipitation,
            "elevation": response.Elevation(),
            # Each coordinate column is filled from its matching accessor.
            "longitude": response.Longitude(),
            "latitude": response.Latitude(),
            "city": city_name,
        }
        all_data.append(pd.DataFrame(data=hourly_data))

    return pd.concat(all_data, ignore_index=True)

### Now that we have the dataframe, we append it to the existing data in our use case.

In [None]:
def _check_if_dataset_exists(
        name: str, 
        use_cases: Optional[UseCaseLike] = None
) -> Union[str, None]:
    """
    Look up a dataset by exact name among those listed for ``use_cases``.
    Returns:
        The id of the first dataset whose name equals ``name``, or None
        when no listed dataset has that name.
    """
    listed = dr.Dataset.list(use_cases=use_cases)
    for candidate in listed:
        if candidate.name == name:
            # First match wins; later datasets with the same name are ignored.
            return candidate.id
    return None

# Pipeline: pull the latest hourly weather rows, merge them with the existing
# modeling dataset, and upload the result as a new dataset version.

# Both settings arrive as JSON-encoded environment variables.
locations = json.loads(os.environ["locations"])
parameters = json.loads(os.environ["parameters"])

incoming_data = get_today_city_data(locations, parameters)

modeling_dataset_name = os.environ["modeling_dataset_name"]
use_case_id = os.environ["DATAROBOT_DEFAULT_USE_CASE"]

modeling_dataset_id = _check_if_dataset_exists(modeling_dataset_name, use_cases=use_case_id)
if modeling_dataset_id is None:
    # Fail with an actionable message instead of passing None to the API.
    raise ValueError(
        f"Dataset {modeling_dataset_name!r} was not found in use case {use_case_id!r}"
    )
modeling_df = dr.Dataset.get(modeling_dataset_id).get_as_dataframe()

# Existing rows come first so that, on a duplicate, the stored row is kept.
new_data = pd.concat([modeling_df, incoming_data])
new_data['date'] = pd.to_datetime(new_data['date'])

# Keep one row per (city, date). Deduplicating on both columns directly avoids
# groupby().apply(), which is deprecated when the function touches the
# grouping column.
new_data = new_data.drop_duplicates(subset=['city', 'date'])

dr.Dataset.create_version_from_in_memory_data(modeling_dataset_id, new_data)