In [22]:
import openmeteo_requests
import requests_cache
import pandas as pd
import numpy as np
from retry_requests import retry
from datetime import datetime, timedelta
import json

import json
import requests

# Load configuration from a JSON file hosted on GitHub
def load_config_from_github(raw_url):
    """
    Load a JSON configuration file from a GitHub raw URL.

    :param raw_url: The raw URL of the JSON file on GitHub
    :return: Parsed JSON data
    """
    try:
        response = requests.get(raw_url)
        response.raise_for_status()  # Raise an HTTPError for bad responses
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the configuration file: {e}")
        return None

# GitHub raw link for config.json
config_raw_url = "https://raw.githubusercontent.com/mattia-rampazzo/bdt_project/4724beda4def84536931119d552e1f772f8801de/Predictions/config.json"

# Load latitude and longitude from the config.json hosted on GitHub
config = load_config_from_github(config_raw_url)

# Debugging: Print the config to verify
if config:
    print("Configuration loaded successfully:")
    print(config)
else:
    print("Failed to load configuration.")

latitude = config["latitude"]
longitude = config["longitude"]

# Calculate dynamic dates
end_date = (datetime.today() - timedelta(days=1)).date()  # Today - 1 day
start_date = end_date - timedelta(days=60)  # End date - 2 months

# Setup the Open-Meteo API client with cache and retry on error
cache_session = requests_cache.CachedSession('.cache', expire_after=3600)
retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
openmeteo = openmeteo_requests.Client(session=retry_session)

# Fetch pollen data
pollen_params = {
    "latitude": latitude,
    "longitude": longitude,
    "hourly": [
        "alder_pollen", "birch_pollen", "grass_pollen",
        "mugwort_pollen", "olive_pollen", "ragweed_pollen"
    ],
    "start_date": str(start_date),
    "end_date": str(end_date)
}
pollen_response = openmeteo.weather_api("https://air-quality-api.open-meteo.com/v1/air-quality", params=pollen_params)[0]
pollen_hourly = pollen_response.Hourly()

# Build pollen data dictionary
pollen_data = {
    "time": pd.date_range(
        start=pd.to_datetime(pollen_hourly.Time(), unit="s", utc=True),
        end=pd.to_datetime(pollen_hourly.TimeEnd(), unit="s", utc=True),
        freq=pd.Timedelta(seconds=pollen_hourly.Interval()),
        inclusive="left"
    )
}
for i, var in enumerate(pollen_params["hourly"]):
    pollen_data[f"{var} (grains/m³)"] = pollen_hourly.Variables(i).ValuesAsNumpy()

pollen_df = pd.DataFrame(pollen_data)

# Fetch weather data
weather_params = {
    "latitude": latitude,
    "longitude": longitude,
    "start_date": str(start_date),
    "end_date": str(end_date),
    "hourly": [
        "temperature_2m", "relative_humidity_2m", "precipitation", "rain",
        "cloud_cover", "cloud_cover_low", "cloud_cover_mid", "cloud_cover_high",
        "wind_speed_10m", "soil_temperature_0_to_7cm"
    ]
}
weather_response = openmeteo.weather_api("https://archive-api.open-meteo.com/v1/archive", params=weather_params)[0]
weather_hourly = weather_response.Hourly()

# Build weather data dictionary
weather_data = {
    "time": pd.date_range(
        start=pd.to_datetime(weather_hourly.Time(), unit="s", utc=True),
        end=pd.to_datetime(weather_hourly.TimeEnd(), unit="s", utc=True),
        freq=pd.Timedelta(seconds=weather_hourly.Interval()),
        inclusive="left"
    )
}
for i, var in enumerate(weather_params["hourly"]):
    weather_data[f"{var}"] = weather_hourly.Variables(i).ValuesAsNumpy()

weather_df = pd.DataFrame(weather_data)

# Merge pollen and weather data
merged_df = pd.merge(weather_df, pollen_df, on="time", how="inner")

# Compute daily averages
merged_df["date"] = pd.to_datetime(merged_df["time"]).dt.date
daily_data = merged_df.groupby("date").mean().reset_index()

# Save only the daily data to a CSV file
output_file = "daily_data.csv"
daily_data.to_csv(output_file, index=False)

print(f"Daily data saved to {output_file}")


Configuration loaded successfully:
{'latitude': 46.0679, 'longitude': 11.1211, 'pollen_types': ['Alder pollen (grains/m³)', 'Birch pollen (grains/m³)', 'Grass pollen (grains/m³)', 'Mugwort pollen (grains/m³)', 'Olive pollen (grains/m³)', 'Ragweed pollen (grains/m³)'], 'selected_pollen': 'Birch pollen (grains/m³)'}
Daily data saved to daily_data.csv


In [29]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point, Polygon
import joblib
import os
import json
import gdown
import zipfile

# Mapping of valleys to their precise model filenames
valley_to_model_map = {
    "Valli Giudicarie": "Valli Giudicarie_models.pkl",
    "Val Badia": "Val Badia_models.pkl",
    "Alta Valsugana": "Alta Valsugana_models.pkl",
    "Alto Garda e Ledro": "Alto Garda e Ledro_models.pkl",
    "Val d'Adige": "Val d'Adige_models.pkl",
    "Valsugana": "Valsugana_models.pkl",
    "Valle di Primiero": "Valle di Primiero_models.pkl",
    "Val di Non": "Val di Non_models.pkl",
    "Salto-Sciliar": "Salto-Sciliar_models.pkl",
    "Burgraviato": "Burgraviato_models.pkl",
    "Val di Sole": "Val di Sole_models.pkl",
    "Valle di Fassa": "Valle di Fassa_models.pkl",
    "Vallagarina": "Vallagarina_models.pkl",
    "Valle di Fiemme": "Valle di Fiemme_models.pkl",
    "Val Venosta": "Val Venosta_models.pkl",
    "Val Pusteria": "Val Pusteria_models.pkl",
    "Oltradige-Bassa Atesina": "Oltradige-Bassa Atesina_models.pkl",
    "Valle di Cembra": "Valle di Cembra_models.pkl",
    "Valle dei Laghi": "Valle dei Laghi_models.pkl",
    "Val Rendena": "Val Rendena_models.pkl",
}

# Online sources
valley_boundaries_file_url = "https://raw.githubusercontent.com/mattia-rampazzo/bdt_project/0ffbd5ae549aa248e733ed77e7399d952abea591/Predictions/Valley_Boundaries.csv"
models_drive_file_url = "https://drive.google.com/file/d/171oVWCZK17jGUVWV3-p9Q3_qt-8fYzpk/view?usp=share_link"
models_output_directory = "./ValleysModels1"

# Function to download and extract Google Drive files
def download_and_extract_google_drive_file(drive_file_url, output_directory):
    file_id = drive_file_url.split("/d/")[1].split("/")[0]
    download_url = f"https://drive.google.com/uc?id={file_id}"
    zip_file_name = "downloaded_file.zip"
    
    print(f"Downloading file from Google Drive: {drive_file_url}")
    gdown.download(download_url, zip_file_name, quiet=False)

    if zipfile.is_zipfile(zip_file_name):
        print(f"Extracting {zip_file_name} to {output_directory}")
        with zipfile.ZipFile(zip_file_name, 'r') as zip_ref:
            zip_ref.extractall(output_directory)
        os.remove(zip_file_name)
        print(f"File extracted successfully to {output_directory}")
    else:
        print(f"Downloaded file is not a zip file. Saved as {zip_file_name}.")

# Ensure models folder exists
if not os.path.exists(models_output_directory):
    download_and_extract_google_drive_file(models_drive_file_url, models_output_directory)
else:
    print(f"The folder '{models_output_directory}' already exists. Skipping download.")

# Function to adjust model paths
def adjust_model_path(output_directory, valley_to_model_map):
    for root, dirs, files in os.walk(output_directory):
        if any(file.endswith(".pkl") for file in files):
            model_directory = root
            break
    else:
        raise FileNotFoundError("No .pkl model files found in the specified directory.")

    updated_model_map = {}
    for valley, model_filename in valley_to_model_map.items():
        model_path = os.path.join(model_directory, model_filename)
        if os.path.exists(model_path):
            updated_model_map[valley] = model_path
        else:
            print(f"Warning: Model file for '{valley}' not found. Skipping.")
    
    return updated_model_map

# Correct the models folder structure
valley_to_model_map = adjust_model_path(models_output_directory, valley_to_model_map)

# Load valley boundaries from GitHub
try:
    valley_boundaries = pd.read_csv(valley_boundaries_file_url)
    valley_boundaries['geometry'] = valley_boundaries['Boundary Coordinates'].apply(
        lambda x: Polygon(eval(x)) if pd.notnull(x) else None
    )
    valley_gdf = gpd.GeoDataFrame(valley_boundaries, geometry='geometry')
    print("Valley boundaries loaded successfully:")
    print(valley_gdf.head())
except Exception as e:
    print(f"Error loading the valley boundaries file: {e}")

# Function to determine the valley or closest valley
def get_valley_or_model(lat, lon):
    point = Point(lon, lat)
    for _, row in valley_gdf.iterrows():
        if row['geometry'] and row['geometry'].contains(point):
            valley_name = row['Valley']
            model_file = valley_to_model_map.get(valley_name, None)
            if model_file:
                return valley_name, model_file
            else:
                raise ValueError(f"No model file mapping found for valley: {valley_name}")
    valley_gdf['distance'] = valley_gdf['geometry'].apply(lambda geom: geom.centroid.distance(point) if geom else float('inf'))
    closest_valley = valley_gdf.loc[valley_gdf['distance'].idxmin()]
    valley_name = closest_valley['Valley']
    model_file = valley_to_model_map.get(valley_name, None)
    if model_file:
        return valley_name, model_file
    else:
        raise ValueError(f"No model file mapping found for valley: {valley_name}")

# Remaining Functions (unchanged)
def rename_columns(data):
    column_mapping = {
        "alder_pollen (grains/m³)": "Alder pollen (grains/m³)",
        "birch_pollen (grains/m³)": "Birch pollen (grains/m³)",
        "grass_pollen (grains/m³)": "Grass pollen (grains/m³)",
        "mugwort_pollen (grains/m³)": "Mugwort pollen (grains/m³)",
        "olive_pollen (grains/m³)": "Olive pollen (grains/m³)",
        "ragweed_pollen (grains/m³)": "Ragweed pollen (grains/m³)",
        "precipitation": "precipitation (mm)",
        "temperature_2m": "temperature_2m (°C)"
    }
    return data.rename(columns=column_mapping)

def generate_lagged_features(data, pollen_type):
    data[f"{pollen_type}_lag_1"] = data[pollen_type].shift(1)
    data[f"{pollen_type}_lag_2"] = data[pollen_type].shift(2)
    data[f"{pollen_type}_lag_3"] = data[pollen_type].shift(3)
    return data

def add_missing_features(data, required_features):
    for feature in required_features:
        if feature not in data.columns:
            data[feature] = 0
    return data

def load_model(model_path):
    if os.path.exists(model_path):
        return joblib.load(model_path)
    else:
        raise FileNotFoundError(f"Model file not found at: {model_path}")

def predict_pollen(model, data, features):
    X = data[features]
    return model.predict(X)

def main(config_file, historical_data_path):
    config = load_config_from_github(config_file)
    lat = config["latitude"]
    lon = config["longitude"]
    pollen_types = config["pollen_types"]
    selected_pollen = config["selected_pollen"]

    if selected_pollen not in pollen_types:
        raise ValueError(f"Invalid pollen type selected: {selected_pollen}. Valid options: {pollen_types}")

    valley_name, model_file = get_valley_or_model(lat, lon)
    print(f"Using model for valley: {valley_name} -> {model_file}")

    historical_data = pd.read_csv(historical_data_path)
    historical_data = rename_columns(historical_data)
    historical_data = generate_lagged_features(historical_data, selected_pollen)

    model_dict = load_model(model_file)
    if selected_pollen not in model_dict:
        raise ValueError(f"No model found for pollen type: {selected_pollen}")
    model = model_dict[selected_pollen]

    required_features = model.feature_names_in_
    historical_data = add_missing_features(historical_data, required_features)

    prepared_data = historical_data.dropna()

    today_data = prepared_data.iloc[-1:]
    today_prediction = predict_pollen(model, today_data, required_features)

    tomorrow_data = today_data.copy()
    tomorrow_data[selected_pollen] = today_prediction
    tomorrow_data = generate_lagged_features(tomorrow_data, selected_pollen).iloc[-1:]
    tomorrow_prediction = predict_pollen(model, tomorrow_data, required_features)

    return {
        "today_prediction": today_prediction[0],
        "tomorrow_prediction": tomorrow_prediction[0]
    }

if __name__ == "__main__":
    config_file = config_raw_url
    historical_data_file = "daily_data.csv"

    try:
        predictions = main(config_file, historical_data_file)
        print(f"Today's Prediction: {predictions['today_prediction']}")
        print(f"Tomorrow's Prediction: {predictions['tomorrow_prediction']}")
    except Exception as e:
        print(f"An error occurred: {e}")


The folder './ValleysModels1' already exists. Skipping download.
Valley boundaries loaded successfully:
                    Valley                               Boundary Coordinates  \
0           Alta Valsugana  [(45.908779905847766, 11.199106310628531), (46...   
1       Alto Garda e Ledro  [(45.95528476539058, 10.963022214633508), (45....   
2              Burgraviato  [(46.50883455076642, 11.131653482240315), (46....   
3  Oltradige-Bassa Atesina  [(46.467482496744346, 11.255986244457276), (46...   
4            Salto-Sciliar  [(46.51848007594064, 11.232025774529577), (46....   

                                            geometry  
0  POLYGON ((45.90878 11.19911, 46.10109 11.18055...  
1  POLYGON ((45.95528 10.96302, 45.85142 10.88187...  
2  POLYGON ((46.50883 11.13165, 46.69839 11.04435...  
3  POLYGON ((46.46748 11.25599, 46.3595 11.39256,...  
4  POLYGON ((46.51848 11.23203, 46.58579 11.26315...  
Using model for valley: Valle di Primiero -> ./ValleysModels1/ValleysModels1/Va

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
