<a href="https://colab.research.google.com/github/machiwao/CCTHESS1-CCTHESS2-Dev-and-Docs/blob/jessy/NDBI_data_scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [30]:
# Uncomment on a fresh runtime (e.g. Colab) to install the notebook's dependencies:
# %pip install -q earthengine-api geemap scipy

In [31]:
import ee
import geemap
import pandas as pd
from scipy.signal import savgol_filter
import os

In [32]:
# Google Cloud project that Earth Engine requests are issued against.
cloud_project = 'heat-index-forecasting'

# Try to initialize with whatever credentials are already available; only
# fall back to the interactive authentication flow when that fails.
# `except Exception` (rather than a bare `except`) lets KeyboardInterrupt /
# SystemExit propagate so the cell can still be interrupted.
try:
    ee.Initialize(project=cloud_project)
except Exception:
    ee.Authenticate()
    ee.Initialize(project=cloud_project)

# Distance passed to `buffer()` for every station (Earth Engine interprets
# it in meters by default), i.e. a 25 km radius around each station.
BUFFER_RADIUS_M = 25000

# Station coordinates are given as [longitude, latitude].
# NOTE: despite the `_point` suffix, each variable holds the *buffered*
# geometry (a disc around the station), not the point itself.
sinait_point = ee.Geometry.Point([120.459762, 17.89019]).buffer(BUFFER_RADIUS_M)
tayabas_point = ee.Geometry.Point([121.596575, 14.018428]).buffer(BUFFER_RADIUS_M)
tanay_point = ee.Geometry.Point([121.36927, 14.581167]).buffer(BUFFER_RADIUS_M)
tuguegarao_point = ee.Geometry.Point([121.758469, 17.647678]).buffer(BUFFER_RADIUS_M)
virac_point = ee.Geometry.Point([124.209834, 13.576558]).buffer(BUFFER_RADIUS_M)

# Station name -> buffered region used for the spatial mean of NDBI.
stations = {
    "Sinait": sinait_point,
    "Tanay": tanay_point,
    "Tayabas": tayabas_point,
    "Tuguegarao": tuguegarao_point,
    "Virac Synop": virac_point,
}

In [33]:
def addNDBI_L8(img):
    """Return a single-band 'NDBI' image for a Landsat 8 Collection 2 L2 scene.

    NDBI = (SWIR1 - NIR) / (SWIR1 + NIR), using SR_B6 (SWIR1) and SR_B5 (NIR).

    Collection 2 Level-2 surface reflectance is stored as scaled integers
    (reflectance = DN * 0.0000275 - 0.2). Because of the additive offset,
    the normalized difference of the raw values is NOT the normalized
    difference of the reflectances, so both bands are rescaled first.

    Args:
        img: Image from 'LANDSAT/LC08/C02/T1_L2' with SR_B5 and SR_B6 bands.

    Returns:
        The result of `copyProperties`: an image containing only the 'NDBI'
        band, carrying the source image's 'system:time_start'.
    """
    # Convert the two bands to physical surface reflectance.
    reflectance = img.select(['SR_B6', 'SR_B5']).multiply(0.0000275).add(-0.2)
    nd = reflectance.normalizedDifference(['SR_B6', 'SR_B5']).rename('NDBI')
    # Go through addBands/select on the source image (as the original code
    # did) and copy the timestamp needed for date-based sorting/formatting.
    return img.addBands(nd).select('NDBI').copyProperties(img, ['system:time_start'])

def addNDBI_S2(img):
    """Return a single-band 'NDBI' image for a Sentinel-2 scene.

    The index is the normalized difference of bands B11 and B8,
    (B11 - B8) / (B11 + B8), renamed to 'NDBI'.

    Args:
        img: Image exposing 'B11' and 'B8' bands.

    Returns:
        The result of `copyProperties`: an image reduced to the 'NDBI' band
        with the source image's 'system:time_start' copied over.
    """
    ndbi_band = img.normalizedDifference(['B11', 'B8']).rename('NDBI')
    with_index = img.addBands(ndbi_band)
    ndbi_only = with_index.select('NDBI')
    # Keep the acquisition timestamp so the collection can be sorted by time.
    return ndbi_only.copyProperties(img, ['system:time_start'])

In [34]:
# Overall study window. `filterDate` treats the start as inclusive and the
# end as exclusive.
start_date = '2014-01-01'
end_date = '2024-01-01'

# Single hand-over date between the two sensors: Landsat 8 is used strictly
# before it and Sentinel-2 from it onwards, so the two halves never overlap.
# Keeping it in one place prevents the two filters from drifting apart.
s2_start_date = '2017-03-28'   # Sentinel-2 availability

# Landsat 8 surface reflectance -> NDBI, for the pre-Sentinel-2 period.
l8 = (ee.ImageCollection('LANDSAT/LC08/C02/T1_L2')
      .filterDate(start_date, s2_start_date)
      .map(addNDBI_L8))

# Sentinel-2 surface reflectance -> NDBI, for the remainder of the window.
s2 = (ee.ImageCollection('COPERNICUS/S2_SR_HARMONIZED')
      .filterDate(s2_start_date, end_date)
      .map(addNDBI_S2))

# One chronological collection of single-band 'NDBI' images.
combined = l8.merge(s2).sort('system:time_start')

In [None]:
# Extract one raw (unsmoothed) NDBI time series per station and save it as
# "<station>_NDBI_Raw.csv" inside `output_dir`.
output_dir = "/content/NDBI_raw"
os.makedirs(output_dir, exist_ok=True)

# Station name -> raw DataFrame (columns come from the feature properties
# set below: 'date', 'NDBI', 'station').
all_raw = {}

for station_name, geom in stations.items():
    print(f"\n--- Extracting raw NDBI for {station_name} ---")

    def to_feature(img):
        """Reduce one NDBI image to a geometry-less feature for this station."""
        # `geom` / `station_name` are read from the current loop iteration;
        # this is safe because `map` below is invoked inside the same
        # iteration that defines this function.
        mean_ndbi = img.reduceRegion(
            reducer=ee.Reducer.mean(),
            geometry=geom,
            scale=100,
            bestEffort=True
        ).get('NDBI')
        return ee.Feature(
            None, {
                'date': img.date().format('YYYY-MM-dd'),
                'NDBI': mean_ndbi,
                'station': station_name
            }
        )

    # `combined` is not spatially filtered, so restrict it to scenes that
    # intersect this station's buffer before reducing; otherwise the
    # reduction is requested for every scene in the archive. Features whose
    # reduction produced no value (no valid pixels in the buffer) are
    # dropped so every exported row has an NDBI value.
    ts = (combined
          .filterBounds(geom)
          .map(to_feature)
          .filter(ee.Filter.notNull(['NDBI'])))

    ts_fc = ee.FeatureCollection(ts)

    # Convert to DataFrame
    df = geemap.ee_to_df(ts_fc)
    if df.empty:
        print(f"No data extracted for {station_name}")
        continue

    # Parse dates, drop unparseable rows and order chronologically.
    df['date'] = pd.to_datetime(df['date'], errors='coerce')
    df = df.dropna(subset=['date']).sort_values('date').reset_index(drop=True)

    # Save raw CSV
    raw_file = os.path.join(output_dir, f"{station_name}_NDBI_Raw.csv")
    df.to_csv(raw_file, index=False)
    print(f"Saved {raw_file} with {len(df)} records")

    all_raw[station_name] = df

print("\nFinished exporting raw NDBI CSVs for all stations!")


--- Extracting raw NDBI for Sinait ---


In [None]:
# Post-process the raw per-station NDBI CSVs into gap-free, smoothed daily
# series and write them to Google Drive.

# 1. Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import glob
import os
from scipy.signal import savgol_filter

# 2. Set your Google Drive paths
# Change these paths to where your raw CSVs are located
input_folder  = "/content/drive/MyDrive/NDBI_raw"
output_folder = "/content/drive/MyDrive/NDBI_processed"

# Make sure output folder exists
os.makedirs(output_folder, exist_ok=True)

# 3. Define full daily range (both endpoints included by `date_range`)
start_date = "2014-01-01"
end_date   = "2024-01-01"
full_range = pd.date_range(start_date, end_date, freq='D')

# 4. Process each CSV
# Station name -> processed daily DataFrame.
all_dataframes = {}

csv_files = sorted(glob.glob(os.path.join(input_folder, "*.csv")))
if not csv_files:
    # Previously an empty folder silently reported success.
    print(f"No CSV files found in {input_folder} - nothing to process")

for file in csv_files:
    # File names look like "<station>_NDBI_Raw.csv"; the station is the part
    # before the first underscore.
    station_name = os.path.basename(file).split("_")[0]
    print(f"\n--- Processing {station_name} ---")

    # Load exported CSV
    df = pd.read_csv(file)

    missing_cols = [col for col in ('date', 'NDBI') if col not in df.columns]
    if missing_cols:
        print(f"Skipping {file}: missing column(s) {missing_cols}")
        continue

    # Coerce types; rows whose date cannot be parsed are dropped.
    df['date'] = pd.to_datetime(df['date'], errors='coerce')
    df['NDBI'] = pd.to_numeric(df['NDBI'], errors='coerce')
    df = df.dropna(subset=['date'])

    # Several scenes can fall on the same calendar day (e.g. adjacent tiles
    # covering the buffer). Duplicate index labels make `reindex` raise, so
    # collapse each day to one row: mean NDBI, first value for other columns.
    # `groupby` also returns the days in chronological order.
    how = {col: 'first' for col in df.columns if col != 'date'}
    how['NDBI'] = 'mean'
    daily = df.groupby('date').agg(how)

    # Reindex to full daily range
    g = daily.reindex(full_range)
    g['station'] = station_name
    g['NDBI'] = g['NDBI'].astype(float)

    # Fill missing values: linear interpolation between observations, then
    # forward/backward fill for any gaps left at the edges of the range.
    g['NDBI'] = g['NDBI'].interpolate(method='linear').ffill().bfill()

    # Apply Savitzky–Golay smoothing (31-day window, quadratic fit)
    try:
        g['NDBI_smooth'] = savgol_filter(g['NDBI'], window_length=31, polyorder=2)
    except Exception as err:
        # Keep the best-effort fallback, but say why it was taken instead of
        # hiding the error behind a bare `except`.
        print(f"Smoothing failed for {station_name} ({err}); using unsmoothed NDBI")
        g['NDBI_smooth'] = g['NDBI']

    g.index.name = 'date'
    g = g.reset_index()

    # Save processed CSV into Drive
    output_file = os.path.join(output_folder, f"{station_name}_NDBI_Daily.csv")
    g.to_csv(output_file, index=False)
    print(f"✅ Saved {output_file} with {len(g)} daily records")

    all_dataframes[station_name] = g

print("\nAll stations processed and saved to Google Drive")