<a href="https://colab.research.google.com/github/machiwao/CCTHESS1-CCTHESS2-Dev-and-Docs/blob/jessy/NDVI_data_scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [34]:
import ee
import geemap
import pandas as pd

In [35]:
cloud_project = 'heat-index-forecasting'

try:
  ee.Initialize(project=cloud_project)
except:
  ee.Authenticate()
  ee.Initialize(project=cloud_project)

sinait_point = ee.Geometry.Point([120.459762, 17.89019]).buffer(25000)
tayabas_point = ee.Geometry.Point([121.596575, 14.018428]).buffer(25000)
tanay_point = ee.Geometry.Point([121.36927, 14.581167]).buffer(25000)
tuguegarao_point = ee.Geometry.Point([121.758469, 17.647678]).buffer(25000)
virac_point = ee.Geometry.Point([124.209834, 13.576558]).buffer(25000)

stations = {
    "Sinait": sinait_point,
    "Tanay": tanay_point,
    "Tayabas": tayabas_point,
    "Tuguegarao": tuguegarao_point,
    "Virac Synop": virac_point,
}

In [36]:
# Define date range
start_date = '2014-01-01'
end_date = '2024-01-01'

# Load the NOAA CDR VIIRS NDVI dataset
viirs_ndvi = ee.ImageCollection("NOAA/CDR/VIIRS/NDVI/V1").filterDate(start_date, end_date)

sample_image = viirs_ndvi.first()
band_names = sample_image.bandNames().getInfo()
print(f"Available bands: {band_names}")

Available bands: ['NDVI', 'TIMEOFDAY', 'QA']


In [37]:
def process_station_data(collection, geometry, station_name):
    """
    Process the collection to extract all bands with proper scaling
    """
    print(f"Processing data for {station_name}...")

    def extract_scaled_values(img):
        # Get the date without time
        date = ee.Date(img.get('system:time_start')).format('YYYY-MM-dd')

        # Apply scale factors according to dataset documentation
        ndvi_scaled = img.select('NDVI').multiply(0.0001)  # Scale factor: 0.0001
        timeofday_scaled = img.select('TIMEOFDAY')
        qa = img.select('QA')  # No scaling needed for QA

        # Mask invalid NDVI values (keep values between -1 and 1)
        valid_ndvi = ndvi_scaled.mask(ndvi_scaled.gte(-1).And(ndvi_scaled.lte(1)))

        # Create a new image with scaled bands
        scaled_img = ee.Image.cat([
            valid_ndvi.rename('NDVI_scaled'),
            timeofday_scaled.rename('TIMEOFDAY'),
            qa.rename('QA')
        ])

        # Extract values for all bands at the geometry
        stats = scaled_img.reduceRegion(
            reducer=ee.Reducer.mean(),
            geometry=geometry,
            scale=5566,  # Native resolution of VIIRS NDVI
            bestEffort=True,
            maxPixels=1e9
        )

        # Add date info
        return ee.Feature(None, stats).set('date', date)

    # Map the extraction function over the collection
    features = collection.map(extract_scaled_values)

    return ee.FeatureCollection(features)

In [38]:
all_dataframes = {}

for station_name, geometry in stations.items():
    print(f"\n--- Processing {station_name} ---")

    try:
        # Extract features for this station
        station_features = process_station_data(viirs_ndvi, geometry, station_name)

        # Convert to DataFrame
        print(f"Converting {station_name} data to DataFrame...")
        df = geemap.ee_to_df(station_features)

        if not df.empty:
            # Clean up the DataFrame
            df['date'] = pd.to_datetime(df['date'])
            df = df.sort_values('date').reset_index(drop=True)

            # Remove rows where all NDVI-related columns are null
            ndvi_columns = [col for col in df.columns if 'NDVI' in col.upper()]
            df = df.dropna(subset=ndvi_columns, how='all')

            # Store in dictionary
            all_dataframes[station_name] = df

            # Display info about the dataset
            print(f"{station_name} dataset shape: {df.shape}")
            print(f"Date range: {df['date'].min()} to {df['date'].max()}")
            print(f"Columns: {list(df.columns)}")

            # Save to CSV
            csv_filename = f'{station_name}_NDVI_Data.csv'
            df.to_csv(csv_filename, index=False)
            print(f"Saved {csv_filename}")

            # Display first few rows
            print(f"\nFirst 5 rows for {station_name}:")
            print(df.head())

        else:
            print(f"Warning: No data extracted for {station_name}")

    except Exception as e:
        print(f"Error processing {station_name}: {str(e)}")
        continue


--- Processing Sinait ---
Processing data for Sinait...
Converting Sinait data to DataFrame...
Sinait dataset shape: (3627, 4)
Date range: 2014-01-01 00:00:00 to 2023-12-31 00:00:00
Columns: ['NDVI_scaled', 'QA', 'TIMEOFDAY', 'date']
Saved Sinait_NDVI_Data.csv

First 5 rows for Sinait:
   NDVI_scaled          QA   TIMEOFDAY       date
0     0.713275   77.360005  454.000000 2014-01-01
1     0.709804   84.309817  436.000000 2014-01-02
2     0.660702  136.711794  437.094836 2014-01-03
3     0.335103  481.405549  536.000000 2014-01-04
4     0.711326   83.879264  518.000000 2014-01-05

--- Processing Tanay ---
Processing data for Tanay...
Converting Tanay data to DataFrame...
Tanay dataset shape: (3628, 4)
Date range: 2014-01-01 00:00:00 to 2023-12-31 00:00:00
Columns: ['NDVI_scaled', 'QA', 'TIMEOFDAY', 'date']
Saved Tanay_NDVI_Data.csv

First 5 rows for Tanay:
   NDVI_scaled          QA   TIMEOFDAY       date
0     0.693900  351.587880  454.000000 2014-01-01
1     0.534223  504.175880  43

In [39]:
print(f"Successfully processed {len(all_dataframes)} stations:")
for station_name, df in all_dataframes.items():
    print(f"- {station_name}: {df.shape[0]} records, {df.shape[1]} columns")

# Download all CSV files (for Google Colab)
print(f"\nDownloading CSV files...")
from google.colab import files
for station_name in all_dataframes.keys():
    csv_filename = f'VIIRS_NDVI_{station_name}.csv'
    try:
        files.download(csv_filename)
        print(f"Downloaded {csv_filename}")
    except:
        print(f"Could not download {csv_filename}")

print("Processing complete!")

Successfully processed 5 stations:
- Sinait: 3627 records, 4 columns
- Tanay: 3628 records, 4 columns
- Tayabas: 3628 records, 4 columns
- Tuguegarao: 3628 records, 4 columns
- Virac Synop: 3622 records, 4 columns

Downloading CSV files...
Could not download VIIRS_NDVI_Sinait.csv
Could not download VIIRS_NDVI_Tanay.csv
Could not download VIIRS_NDVI_Tayabas.csv
Could not download VIIRS_NDVI_Tuguegarao.csv
Could not download VIIRS_NDVI_Virac Synop.csv
Processing complete!
