In [None]:
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timedelta
from pathlib import Path
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

In [None]:
def download_file(url, file_path):
    """
    Download a file from a URL and save it to a specified path.

    The response body is streamed to disk in 8192-byte chunks. The parent
    directory of ``file_path`` is created when it is missing. Request errors
    (``requests.exceptions.RequestException``) are reported with ``print``
    instead of being raised. If the transfer fails after writing has started,
    the truncated file is removed so it cannot be mistaken for a complete
    download.

    This function reads the module-level names ``REQUEST_TIMEOUT`` and
    ``HEADERS``; they are not defined here and must exist before the call.

    :param url: URL to download
    :param file_path: Local path to save the downloaded file
    """
    target_dir = os.path.dirname(file_path)
    partial = False  # True while file_path may hold an incomplete download
    try:
        with requests.get(url, stream=True, timeout=REQUEST_TIMEOUT, headers=HEADERS) as response:
            response.raise_for_status()  # Raise an error for bad status
            # os.makedirs('') raises, so skip it for a bare file name
            # (the file then lands in the current working directory).
            if target_dir:
                os.makedirs(target_dir, exist_ok=True)
            with open(file_path, 'wb') as file:
                partial = True
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        file.write(chunk)
            # Only mark the file complete once it has been closed (flushed).
            partial = False
        print(f"Downloaded: {file_path}")
    except requests.exceptions.RequestException as e:
        print(f"Failed to download {url}: {e}")
    finally:
        if partial:
            # Best-effort cleanup of the truncated file; a failure to delete
            # must not hide the original error.
            try:
                os.remove(file_path)
            except OSError:
                pass

def download_nc_files_from_directory(base_url, day_dir):
    """
    Download all .nc files from a given directory using parallel downloads.

    The HTML listing at ``base_url`` is fetched and every ``<a href>`` that
    ends with 'LV1.nc' is downloaded into ``day_dir`` using up to 10 worker
    threads. Failures are reported with ``print`` rather than raised.

    :param base_url: Base URL for the directory containing .nc files (an HTML listing)
    :param day_dir: Local directory to save the downloaded files

    NOTE:
    - By default this filters for files that end with 'LV1.nc'. If your server uses a
      different pattern, change the check below (e.g. to '.nc').
    - Links are resolved against the listing URL with ``urljoin``, so relative,
      root-relative ('/path/file.nc') and absolute hrefs all produce a valid URL.
    - If several links map to the same local file name, only the first is downloaded.
    """
    try:
        response = requests.get(base_url, timeout=REQUEST_TIMEOUT, headers=HEADERS)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Failed to access {base_url}: {e}")
        return

    soup = BeautifulSoup(response.text, 'html.parser')

    # A trailing slash makes urljoin treat base_url as a directory, so plain
    # relative hrefs are appended instead of replacing the last path segment.
    listing_url = base_url.rstrip('/') + '/'

    # Map local path -> remote URL. Keying by local path drops duplicate links,
    # which would otherwise make two threads write the same file at once.
    targets = {}
    for link in soup.find_all('a'):
        href = link.get('href')
        if not href:
            continue
        # Adjust this line if you want all .nc files:
        #   if href.endswith('.nc'):
        # For Cloud Radar "LV1.nc" only:
        if href.endswith('LV1.nc'):
            # Prevent path traversal; only use the basename for local writes
            name = os.path.basename(href)
            file_path = os.path.join(day_dir, name)
            targets.setdefault(file_path, urljoin(listing_url, href))

    # Download files in parallel
    with ThreadPoolExecutor(max_workers=10) as executor:
        future_to_url = {
            executor.submit(download_file, url, path): url
            for path, url in targets.items()
        }
        for future in as_completed(future_to_url):
            url = future_to_url[future]
            try:
                future.result()
            except Exception as e:
                # Anything download_file did not handle itself (e.g. OSError)
                print(f"Error downloading {url}: {e}")

def download_files_for_month(base_url_template, start_date, end_date, base_dir):
    """
    Fetch the files of every calendar day from start_date through end_date (inclusive).

    For each day the URL template is filled in, a local target directory
    ``<base_dir>/<YYYY-MM>/<YYYY-MM-DD>`` is derived, and the work is handed to
    ``download_nc_files_from_directory``.

    :param base_url_template: URL template with placeholders {year}, {month}, {day}
                              Example: "https://YOUR_PRIVATE_HOST/path/Y{year}/M{month}/D{day}"
    :param start_date: First day to download (format: 'YYYY-MM-DD')
    :param end_date: Last day to download (format: 'YYYY-MM-DD')
    :param base_dir: Base local directory under which the per-day folders live

    HOW TO USE base_url_template:
    - Substitute YOUR_PRIVATE_HOST and the path with your own server layout.
    - Leave the {year}/{month}/{day} placeholders exactly as written.
    - A public-safe placeholder looks like:
      "https://example.org/dataset/Y{year}/M{month}/D{day}"
    """
    first_day = datetime.strptime(start_date, '%Y-%m-%d')
    last_day = datetime.strptime(end_date, '%Y-%m-%d')

    # Both bounds are parsed from date-only strings, so the difference is a whole
    # number of days. A start after the end yields an empty range (no downloads).
    total_days = (last_day - first_day).days + 1

    for offset in range(total_days):
        day = first_day + timedelta(days=offset)
        date_str = f"{day:%Y-%m-%d}"

        # Fill the caller's template with zero-padded date components
        url_fields = {
            'year': f"{day:%Y}",
            'month': f"{day:%m}",
            'day': f"{day:%d}",
        }
        base_url = base_url_template.format(**url_fields)

        day_dir = os.path.join(base_dir, f"{day:%Y-%m}", date_str)
        print(f"Downloading files for {date_str} from {base_url} ...")
        download_nc_files_from_directory(base_url, day_dir)

# -------------------------
# EXAMPLES
# -------------------------
# 1) Public-safe placeholder (edit to your server before running):
# base_url_template = "https://example.org/dataset/Y{year}/M{month}/D{day}"
#
# 2) If you need Cloud Radar with 'LV1.nc' suffix, keep the default filter above.
#    To download ALL .nc files instead, change `href.endswith('LV1.nc')` to `href.endswith('.nc')`.
#
# 3) Choose a neutral, repo-local output directory:
# base_dir = "./data/cloud_radar"
#
# 4) Pick dates:
# start_date = "2024-03-21"
# end_date   = "2024-03-31"
#
# 5) Run:
# download_files_for_month(base_url_template, start_date, end_date, base_dir)


In [None]:
def download_files_from_directory(base_url, day_dir, file_extension):
    """
    Download all files with a given extension from a directory using parallel downloads.

    The HTML listing at ``base_url`` is fetched and every ``<a href>`` that
    ends with ``file_extension`` is downloaded into ``day_dir`` using up to 10
    worker threads. Failures are reported with ``print`` rather than raised.

    :param base_url: Base URL for the directory containing the files (HTML listing page)
    :param day_dir: Local directory to save the downloaded files
    :param file_extension: File extension to filter (e.g., '.nc', '.dat')

    NOTE: Keep your real server URL private. In public code, pass it at runtime or
    store it in an environment variable.

    Links are resolved against the listing URL with ``urljoin``, so relative,
    root-relative ('/path/file.dat') and absolute hrefs all produce a valid URL.
    If several links map to the same local file name, only the first is downloaded.
    """
    try:
        response = requests.get(base_url, timeout=REQUEST_TIMEOUT, headers=HEADERS)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Failed to access {base_url}: {e}")
        return

    soup = BeautifulSoup(response.text, 'html.parser')

    # A trailing slash makes urljoin treat base_url as a directory, so plain
    # relative hrefs are appended instead of replacing the last path segment.
    listing_url = base_url.rstrip('/') + '/'

    # Map local path -> remote URL. Keying by local path drops duplicate links,
    # which would otherwise make two threads write the same file at once.
    targets = {}
    for link in soup.find_all('a'):
        href = link.get('href')
        if not href:
            continue
        if href.endswith(file_extension):
            # Only the basename is used locally, which prevents path traversal
            name = os.path.basename(href)
            file_path = os.path.join(day_dir, name)
            targets.setdefault(file_path, urljoin(listing_url, href))

    # Download files in parallel
    with ThreadPoolExecutor(max_workers=10) as executor:
        future_to_url = {
            executor.submit(download_file, url, path): url
            for path, url in targets.items()
        }
        for future in as_completed(future_to_url):
            url = future_to_url[future]
            try:
                future.result()
            except Exception as e:
                # Anything download_file did not handle itself (e.g. OSError)
                print(f"Error downloading {url}: {e}")


def download_sonic_anemometer_files_for_month(base_url_template, start_date, end_date, base_dir):
    """
    Download all Sonic Anemometer files with '.dat' extension for each day in [start_date, end_date].

    For each day the URL template is filled in, the local directory
    ``<base_dir>/<YYYY-MM>/<YYYY-MM-DD>`` is created, and the download is
    delegated to ``download_files_from_directory`` with the '.dat' filter.
    The day directory is created before the listing is fetched, so it exists
    even when no file ends up being downloaded for that day.

    :param base_url_template: URL template with placeholders {year}, {month}, {day}
                              Example (keep private; don't commit your real host):
                              "https://YOUR_PRIVATE_HOST/path/Y{year}/M{month}/D{day}"
    :param start_date: Start date (format: 'YYYY-MM-DD')
    :param end_date: End date (format: 'YYYY-MM-DD')
    :param base_dir: Base directory where files should be stored locally

    HOW TO USE base_url_template (without exposing private URLs in the repo):
    - In code (private script) or via an environment variable at runtime.
    - Keep the {year}/{month}/{day} placeholders unchanged.
    """
    current_date = datetime.strptime(start_date, '%Y-%m-%d')
    end_date_obj = datetime.strptime(end_date, '%Y-%m-%d')

    # If start_date is after end_date the loop body never runs.
    while current_date <= end_date_obj:
        date_str = current_date.strftime('%Y-%m-%d')
        year_month = current_date.strftime('%Y-%m')
        day_str = current_date.strftime('%d')
        month_str = current_date.strftime('%m')

        base_url = base_url_template.format(
            year=current_date.strftime('%Y'),
            month=month_str,
            day=day_str
        )

        day_dir = os.path.join(base_dir, year_month, date_str)
        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test.
        os.makedirs(day_dir, exist_ok=True)

        print(f"Downloading Sonic Anemometer files for {date_str}...")
        download_files_from_directory(base_url, day_dir, '.dat')

        current_date += timedelta(days=1)


# -------------------------
# EXAMPLES 
# -------------------------
# Do NOT hard-code private hosts or personal paths in your public repo.
# Instead, set them at runtime or via environment variables.

# Example (placeholder) — replace at runtime, not in the repo:
# base_url_template_sonic = "https://example.org/Sonic_Anemometer/Y{year}/M{month}/D{day}"
#
# Use a neutral local directory inside the repo. Committing the empty directory
# is fine, but add 'data/' to .gitignore so downloaded files are never committed:
# base_dir_sonic = "./data/sonic"
#
# start_date_sonic = "2024-07-01"
# end_date_sonic   = "2024-07-10"
#
# To run privately (e.g., setting the real host via env var):
#   import os
#   base_url_template_sonic = os.getenv("SONIC_URL_TEMPLATE", "https://example.org/Sonic_Anemometer/Y{year}/M{month}/D{day}")
#   download_sonic_anemometer_files_for_month(
#       base_url_template_sonic, start_date_sonic, end_date_sonic, base_dir_sonic
#   )
