<a href="https://colab.research.google.com/github/machiwao/CCTHESS1-CCTHESS2-Dev-and-Docs/blob/jessy/Albedo_data_scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
import ee
import geemap
import pandas as pd
from scipy.signal import savgol_filter

In [15]:
# Authenticate and initialize Earth Engine
cloud_project = 'heat-index-forecasting'

try:
  ee.Initialize(project=cloud_project)
except Exception:
  # No usable credentials yet: run the interactive auth flow, then retry.
  # `Exception` (not a bare `except`) so Ctrl-C / kernel interrupts still
  # propagate instead of triggering an authentication prompt.
  ee.Authenticate()
  ee.Initialize(project=cloud_project)

# Radius (in metres) of the circular region sampled around each station.
station_buffer_m = 25000

# Station name -> [longitude, latitude] of the station.
station_coordinates = {
  "Sinait": [120.459762, 17.89019],
  "Tayabas": [121.596575, 14.018428],
  "Tanay": [121.36927, 14.581167],
  "Tuguegarao": [121.758469, 17.647678],
  "Virac": [124.209834, 13.576558],
}

# Station name -> buffered ee.Geometry used for filtering and region reduction.
stations = {
  name: ee.Geometry.Point(lon_lat).buffer(station_buffer_m)
  for name, lon_lat in station_coordinates.items()
}

# Date window passed to ImageCollection.filterDate (end date is exclusive there).
start_date = "2014-01-01"
end_date   = "2024-01-01"

In [37]:
# Shortwave broadband-albedo weights for the (blue, red, NIR, SWIR1, SWIR2) bands.
_ALBEDO_WEIGHTS = (0.356, 0.130, 0.373, 0.085, 0.072)
_ALBEDO_OFFSET = 0.0018
# Sum of the weights above; dividing by it normalises the weighted sum.
_ALBEDO_NORMALIZER = 1.016

# (collection id, band names in the same order as _ALBEDO_WEIGHTS)
_HLS_SOURCES = (
    ("NASA/HLS/HLSL30/v002", ("B2", "B4", "B5", "B6", "B7")),     # Landsat
    ("NASA/HLS/HLSS30/v002", ("B2", "B4", "B8", "B11", "B12")),   # Sentinel
)

# Spline order used for the "Albedo_spline" column.
_SPLINE_ORDER = 2


def _albedo_features(collection_id, bands, geometry, start_date, end_date):
    """Return a FeatureCollection of (date, mean Albedo) for one HLS collection."""

    def to_feature(img):
        # Weighted sum of the five bands, built in the same order as the weights.
        total = img.select(bands[0]).multiply(_ALBEDO_WEIGHTS[0])
        for band, weight in zip(bands[1:], _ALBEDO_WEIGHTS[1:]):
            total = total.add(img.select(band).multiply(weight))

        albedo = (
            total.subtract(_ALBEDO_OFFSET)
            .divide(_ALBEDO_NORMALIZER)
            .rename("Albedo")
        )

        mean_albedo = albedo.reduceRegion(
            reducer=ee.Reducer.mean(),
            geometry=geometry,
            bestEffort=True,
            maxPixels=1e9,
        ).get("Albedo")

        return ee.Feature(
            None,
            {"date": img.date().format("YYYY-MM-dd"), "Albedo": mean_albedo},
        )

    images = (
        ee.ImageCollection(collection_id)
        .filterDate(start_date, end_date)
        .filterBounds(geometry)
    )

    # Drop images whose region mean is null (no valid pixels inside the
    # geometry) so they cannot reach the client as missing values.
    return ee.FeatureCollection(images.map(to_feature)).filter(
        ee.Filter.notNull(["Albedo"])
    )


def process_station(station_name, geometry, start_date, end_date):
    """
    Extract a daily Albedo time series from HLS (HLSL30 for Landsat, HLSS30 for
    Sentinel), interpolate the gaps, and save the result to CSV.

    The same formula is applied to both sensors:
    (weighted band sum - 0.0018) / 1.016. Previously only the Landsat branch
    applied the 1.016 normalisation, so the merged series mixed two scalings.

    Parameters
    ----------
    station_name : str
        Used for log messages, the "station" column and the CSV file name.
    geometry : ee.Geometry
        Region used both to filter images and to average the albedo.
    start_date, end_date : str
        Date window passed to ``filterDate`` and used to build the daily index.

    Returns
    -------
    pandas.DataFrame or None
        Columns: date, Albedo_original, Albedo_linear, Albedo_spline, station.
        ``None`` (and no CSV) when no observation is available.

    Side effects
    ------------
    Writes ``"{station_name}_Albedo.csv"`` to the current working directory.
    """

    print(f"Processing Albedo for {station_name}...")

    # NOTE(review): no cloud/shadow masking (e.g. via the HLS Fmask band) is
    # applied before averaging -- confirm whether that is intended.
    features = None
    for collection_id, bands in _HLS_SOURCES:
        part = _albedo_features(collection_id, bands, geometry, start_date, end_date)
        features = part if features is None else features.merge(part)

    # Check emptiness on the server first: converting an empty collection to a
    # DataFrame is not guaranteed to succeed.
    if features.size().getInfo() == 0:
        print(f"No data for {station_name}")
        return None

    # Convert to pandas
    df = geemap.ee_to_df(features)

    if df.empty or "Albedo" not in df.columns:
        print(f"No data for {station_name}")
        return None

    # Process time series: one value per calendar day (mean of all sensors/tiles).
    df["date"] = pd.to_datetime(df["date"])
    df["Albedo"] = pd.to_numeric(df["Albedo"], errors="coerce")
    daily = df.groupby("date")["Albedo"].mean().dropna().rename("Albedo_original")

    if daily.empty:
        print(f"No data for {station_name}")
        return None

    # Reindex to daily frequency. The range includes end_date itself, which
    # filterDate excludes, so the last row is always an interpolated value.
    full_range = pd.date_range(start=start_date, end=end_date, freq="D")
    df = daily.reindex(full_range).rename_axis("date").reset_index()

    # Interpolation
    df["Albedo_linear"] = df["Albedo_original"].interpolate(
        method="linear", limit_direction="both"
    )
    if df["Albedo_original"].notna().sum() > _SPLINE_ORDER:
        df["Albedo_spline"] = df["Albedo_original"].interpolate(
            method="spline", order=_SPLINE_ORDER, limit_direction="both"
        )
    else:
        # Too few observations to fit a spline of this order.
        print(f"Too few observations for spline at {station_name}; using linear values")
        df["Albedo_spline"] = df["Albedo_linear"]

    # Metadata
    df["station"] = station_name

    # Save
    filename = f"{station_name}_Albedo.csv"
    df.to_csv(filename, index=False)
    print(f"Saved {filename} with {df.shape[0]} rows")

    return df

In [38]:
# Station name -> processed DataFrame. Stations for which process_station
# returns None (no data) are left out so later cells never dereference None.
all_dfs = {}
for station, geom in stations.items():
    station_df = process_station(station, geom, start_date, end_date)
    if station_df is not None:
        all_dfs[station] = station_df

Processing Albedo for Sinait...
Saved Sinait_Albedo.csv with 3653 rows
Processing Albedo for Tayabas...
Saved Tayabas_Albedo.csv with 3653 rows
Processing Albedo for Tanay...
Saved Tanay_Albedo.csv with 3653 rows
Processing Albedo for Tuguegarao...
Saved Tuguegarao_Albedo.csv with 3653 rows
Processing Albedo for Virac...
Saved Virac_Albedo.csv with 3653 rows


In [None]:
# Check the first few rows of one of the dataframes.
# Entries may be None when a station produced no data, so only consider
# stations that actually have a DataFrame.
available_dfs = {name: frame for name, frame in all_dfs.items() if frame is not None}
if available_dfs:
    first_station = next(iter(available_dfs))
    print(f"First values for {first_station}:")
    display(available_dfs[first_station].head())
else:
    print("No dataframes were processed.")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

def plot_values(csv_file, station_name):
    """
    Plot the raw and interpolated Albedo series stored in a station CSV.

    Parameters
    ----------
    csv_file : str
        Path to a CSV with columns: date, Albedo_original, Albedo_linear,
        Albedo_spline.
    station_name : str
        Station label shown in the figure title.
    """
    df = pd.read_csv(csv_file, parse_dates=["date"])

    # Raw observations are sparse (NaN on most days). A plain line plot only
    # draws between consecutive non-NaN samples, which hides isolated points,
    # so the observations are drawn as markers instead.
    observed = df.dropna(subset=["Albedo_original"])

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(
        observed["date"],
        observed["Albedo_original"],
        linestyle="none",
        marker="o",
        markersize=3,
        alpha=0.6,
        zorder=3,  # keep the observations visible on top of the curves
        label="Albedo (raw)",
    )
    ax.plot(df["date"], df["Albedo_linear"], label="Albedo (linear interpolation)", linewidth=2)
    ax.plot(df["date"], df["Albedo_spline"], label="Albedo (spline interpolation)", linewidth=2)

    ax.set_xlabel("Date")
    ax.set_ylabel("Albedo")
    ax.set_title(f"Albedo vs. Interpolated Albedo - {station_name}")
    ax.legend()
    ax.grid(True)
    fig.tight_layout()
    plt.show()
    # Release the figure so looping over many stations does not accumulate memory.
    plt.close(fig)

# Render one figure per station from the CSV written for that station.
station_names = ["Sinait", "Tanay", "Tayabas", "Tuguegarao", "Virac"]
for station, csv_file in ((name, f"{name}_Albedo.csv") for name in station_names):
    plot_values(csv_file, station)