<a href="https://colab.research.google.com/github/melkatewabe10/Machine-learning_LST-Estimation-/blob/main/Validation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Author: Tewabe Melkamu**

Date: 24/3/2025

Validation: pixel-wise Pearson correlation between yearly LST rasters and the SM, ET, P and NP rasters.

In [None]:
!pip install rasterio
!pip install scipy
!pip install tqdm

**Script**

In [None]:
# Import necessary libraries
import numpy as np
import rasterio
import glob
import os
import re
from scipy.stats import pearsonr
from tqdm import tqdm

# ---------------------------
# Define the folder path at the beginning
# ---------------------------
# Read as a module-level global by main(), which passes it to read_stack()
# for every variable. read_stack() looks for '<variable>_*.tif' inside it.
folder_path = 'validation'  # Folder containing the TIFF files

def extract_year_from_filename(filename):
    """
    Return the first run of four digits in the file's base name as an int.

    Only the base name is searched, so digits in parent directories are
    ignored. For example, 'LST_2000.tif' gives 2000. When the base name
    holds no four consecutive digits, None is returned.
    """
    base_name = os.path.basename(filename)
    match = re.search(r'(\d{4})', base_name)
    if match is None:
        return None
    return int(match.group(1))

def read_stack(variable, folder_path, start_year=2000, end_year=2025):
    """
    Read and stack all TIFF files for a given variable (e.g., 'LST')
    from the specified folder. Files are matched with the glob pattern
    '<variable>_*.tif' (e.g. 'LST_2000.tif', 'LST_2001.tif', ...) and
    ordered by the 4-digit year found in their base name.

    Files matching the pattern but carrying no 4-digit year are ignored,
    so the returned stack and time vector always line up one-to-one.

    Args:
        variable (str): Filename prefix of the variable to load.
        folder_path (str): Folder containing the TIFF files.
        start_year (int): First year that must be present (default 2000).
        end_year (int): Last year that must be present (default 2025).

    Returns:
        stack (np.ndarray): 3D array with dimensions (time, rows, cols)
        meta (dict): Metadata from the first file (assumed consistent)
        nodata_value (float or None): nodata value of the first file
        time_vector (np.ndarray): 1D float32 array of years

    Raises:
        FileNotFoundError: If no file matches the pattern.
        ValueError: If no file name contains a year, if the years found are
            not exactly start_year..end_year (one file per year, no gaps or
            duplicates), or if a file's band shape differs from the first
            file's.
    """
    pattern = os.path.join(folder_path, f"{variable}_*.tif")
    file_list = glob.glob(pattern)
    if not file_list:
        raise FileNotFoundError(f"No files found for variable '{variable}' in {folder_path}")

    # Pair every file with its year and drop the ones without a year *before*
    # sorting: a None key cannot be ordered against an int, and keeping such
    # files would desynchronise the stack from the time vector.
    dated_files = [(extract_year_from_filename(f), f) for f in file_list]
    dated_files = sorted((year, f) for year, f in dated_files if year is not None)
    if not dated_files:
        raise ValueError(f"No valid year information found in files for variable '{variable}'.")

    years = [year for year, _ in dated_files]
    file_list = [f for _, f in dated_files]

    # Require exactly one file per year over the whole range. Comparing with
    # the full expected sequence also catches gaps and duplicated years, which
    # a first/last/count check would let through.
    expected_years = list(range(start_year, end_year + 1))
    if years != expected_years:
        raise ValueError(
            f"Expected years from {start_year} to {end_year} "
            f"({len(expected_years)} files) for variable '{variable}', but got: {years}"
        )
    time_vector = np.array(years, dtype=np.float32)

    # Read the first file to obtain metadata and dimensions
    with rasterio.open(file_list[0]) as src:
        meta = src.meta.copy()
        nodata_value = src.nodata
        first_band = src.read(1)
        rows, cols = first_band.shape
        stack = [first_band]

    # Read and stack the remaining files (band 1 only)
    for file in file_list[1:]:
        with rasterio.open(file) as src:
            band = src.read(1)
            if band.shape != (rows, cols):
                raise ValueError(f"Dimension mismatch in file: {file}")
            stack.append(band)

    # Stack into a 3D numpy array: (time, rows, cols)
    stack = np.stack(stack, axis=0)
    return stack, meta, nodata_value, time_vector

def _is_usable_series(series, nodata):
    """
    Tell whether a pixel time series can be correlated.

    Returns False when the series contains the given nodata value, contains a
    non-finite value (NaN/inf), or is constant (zero standard deviation);
    the Pearson correlation is undefined in each of those cases.
    """
    if nodata is not None and np.any(series == nodata):
        return False
    # A NaN nodata value never compares equal to anything, so the test above
    # cannot catch it; non-finite samples must be rejected explicitly.
    if not np.all(np.isfinite(series)):
        return False
    return bool(np.std(series) != 0)


def main():
    """
    Correlate LST with SM, ET, P and NP pixel by pixel and write the results.

    Reads the yearly stacks of all five variables from the module-level
    ``folder_path``, computes the Pearson correlation between the LST series
    and each other variable's series at every pixel, and keeps only the
    significant results (p < 0.05). Everything else stays NaN.

    For each variable two single-band float32 GeoTIFFs are written to the
    current working directory: 'LST_<var>_correlation.tif' and
    'LST_<var>_pvalue.tif'. NaN is declared as their nodata value.

    Raises:
        ValueError: If the stacks do not all share the same shape (in
            addition to whatever read_stack raises).
    """
    variables = ["SM", "ET", "P", "NP"]

    # Read the data stacks for LST and the other variables: SM, ET, P, NP.
    lst_stack, meta, lst_nodata, _ = read_stack("LST", folder_path)
    stacks = {}
    nodata_values = {}
    for var in variables:
        var_stack, _, var_nodata, _ = read_stack(var, folder_path)
        stacks[var] = var_stack
        # Each raster set may declare its own nodata value; fall back to the
        # LST one only when the variable's files declare none.
        nodata_values[var] = var_nodata if var_nodata is not None else lst_nodata

    # Verify that all stacks share the same dimensions: (time, rows, cols)
    shape = lst_stack.shape
    if any(stacks[var].shape != shape for var in variables):
        raise ValueError("Mismatch in stack dimensions among variables.")

    _, rows, cols = shape

    # Prepare dictionaries to store output correlation and p-value maps for each variable.
    corr_maps = {var: np.full((rows, cols), np.nan, dtype=np.float32) for var in variables}
    p_maps    = {var: np.full((rows, cols), np.nan, dtype=np.float32) for var in variables}

    # Loop over every pixel in the spatial domain.
    for i in tqdm(range(rows), desc="Processing rows"):
        for j in range(cols):
            # Extract LST time series for the current pixel; without a usable
            # LST series no correlation can be computed for this pixel.
            lst_series = lst_stack[:, i, j]
            if not _is_usable_series(lst_series, lst_nodata):
                continue

            # For each variable, compute the Pearson correlation with LST.
            for var in variables:
                series = stacks[var][:, i, j]
                if not _is_usable_series(series, nodata_values[var]):
                    continue
                r, p = pearsonr(lst_series, series)
                # Only store significant correlations (p < 0.05).
                if p < 0.05:
                    corr_maps[var][i, j] = r
                    p_maps[var][i, j] = p

    # Metadata for a single-band float32 output. The maps use NaN for "no
    # result", so NaN is declared as nodata instead of inheriting the source
    # raster's value. Work on a copy so the input metadata is left untouched.
    out_meta = meta.copy()
    out_meta.update(count=1, dtype=rasterio.float32, nodata=np.nan)

    # Save output maps as GeoTIFFs for each variable pairing (LST vs. SM, ET, P, NP).
    for var in variables:
        out_corr = f"LST_{var}_correlation.tif"
        out_pval = f"LST_{var}_pvalue.tif"

        with rasterio.open(out_corr, 'w', **out_meta) as dst:
            dst.write(corr_maps[var], 1)

        with rasterio.open(out_pval, 'w', **out_meta) as dst:
            dst.write(p_maps[var], 1)

    print("GeoTIFF files for correlation coefficients and p-values have been generated for SM, ET, P, and NP.")

# Run the analysis only when executed as a script (or as a notebook cell,
# where __name__ is also '__main__'), not when imported as a module.
if __name__ == '__main__':
    main()
