<a href="https://colab.research.google.com/github/melkatewabe10/Machine-learning_LST-Estimation-/blob/main/Temporal_correlation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Author: Tewabe Melkamu**

Date: 24/3/2025

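Temporal correlation: pixel-wise Pearson correlation (and p-value) between year and land surface temperature (LST), computed from a folder of yearly GeoTIFF files. Three alternative implementations are given below.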

In [None]:
!pip install rasterio
!pip install scipy
!pip install tqdm

**Option one** — pixel-by-pixel loop with `scipy.stats.pearsonr` over every `*.tif` file in a folder

In [None]:
# Import necessary libraries
import numpy as np
import rasterio
import glob
import os
import re
from scipy.stats import pearsonr
from tqdm import tqdm  # Provides a progress bar for loops

def extract_year_from_filename(filename):
    """
    Return the first run of four consecutive digits in a file name as a year.

    Parameters:
        filename (str): The file name to search.

    Returns:
        int or None: The first four-digit match converted to an integer, or
        None when the name contains no four consecutive digits.
    """
    found = re.search(r'(\d{4})', filename)
    if found is None:
        return None
    return int(found.group(1))

def load_tif_files(folder_path):
    """
    Retrieve all TIFF files in a folder, sorted by the year found in each file name.

    Parameters:
        folder_path (str): Path to the folder containing TIFF files.

    Returns:
        list: File paths sorted by ascending year. Empty when the folder
        contains no '*.tif' files.

    Raises:
        ValueError: If any matching file name contains no four-digit year.
    """
    tif_paths = glob.glob(os.path.join(folder_path, '*.tif'))

    # Resolve every year once. A file without a year would otherwise put a
    # None among the sort keys, and comparing None with an int (or with
    # another None) fails with an uninformative TypeError.
    year_of = {
        path: extract_year_from_filename(os.path.basename(path))
        for path in tif_paths
    }
    undated = sorted(path for path, year in year_of.items() if year is None)
    if undated:
        raise ValueError(
            f"No 4-digit year found in the following file names: {undated}"
        )

    return sorted(tif_paths, key=year_of.get)

def read_data_stack(file_list):
    """
    Load band 1 of every TIFF file and combine the bands into one 3D array.

    Parameters:
        file_list (list): TIFF file paths, in the order they should appear
            along the first (time) axis.

    Returns:
        tuple: (data_stack, meta, nodata_value)
            - data_stack (numpy.ndarray): 3D array of shape (time, rows, cols).
            - meta (dict): Copy of the metadata of the first file.
            - nodata_value (float or None): The nodata value of the first file.

    Raises:
        ValueError: If a later file's band shape differs from the first file's.
    """
    first_path, later_paths = file_list[0], file_list[1:]

    # The first raster is the reference for metadata, nodata and shape.
    with rasterio.open(first_path) as reference:
        meta = reference.meta.copy()
        nodata_value = reference.nodata
        reference_band = reference.read(1)
    expected_shape = reference_band.shape
    layers = [reference_band]

    # Every other raster must match the reference grid exactly.
    for file in later_paths:
        with rasterio.open(file) as dataset:
            band = dataset.read(1)
        if band.shape != expected_shape:
            raise ValueError(f"Dimension mismatch in file {file}")
        layers.append(band)

    return np.stack(layers, axis=0), meta, nodata_value

def compute_pixel_correlation(time_vector, pixel_series, nodata_value):
    """
    Compute the Pearson correlation coefficient and p-value for a pixel's time series.

    Parameters:
        time_vector (numpy.ndarray): 1D array of time points (years).
        pixel_series (numpy.ndarray): 1D array of LST values for a single pixel.
        nodata_value (float or None): Value representing missing data.

    Returns:
        tuple: (r, p) where r is the correlation coefficient and p is the p-value.
               Returns (np.nan, np.nan) when the series is empty, contains the
               nodata value, contains any NaN/inf, or is constant.
    """
    series = np.asarray(pixel_series)

    # Compare against the nodata value in the series' own dtype. A NaN nodata
    # value can never match with ==, so it is left to the finiteness test below.
    if nodata_value is not None and not np.isnan(nodata_value):
        if np.any(series == nodata_value):
            return np.nan, np.nan

    # Any NaN or inf makes the correlation undefined; depending on the SciPy
    # version, passing such values to pearsonr yields NaN or raises.
    if series.size == 0 or not np.all(np.isfinite(series)):
        return np.nan, np.nan

    # Exact equality detects constant series reliably; np.std of a constant
    # float series can be a tiny non-zero number because of rounding.
    if np.all(series == series[0]):
        return np.nan, np.nan

    r, p = pearsonr(time_vector, series)
    return r, p

def main(folder_path='path_to_your_folder',
         output_corr_tif='output_correlation.tif',
         output_pvalue_tif='output_pvalue.tif'):
    """
    Compute per-pixel Pearson correlation between year and LST and write two GeoTIFFs.

    Parameters:
        folder_path (str): Folder containing the yearly LST TIFF files.
            Update the default to your folder.
        output_corr_tif (str): Output path for the correlation coefficients.
        output_pvalue_tif (str): Output path for the p-values.

    Raises:
        FileNotFoundError: If the folder contains no TIFF files.
        ValueError: If any file name has no year, or rasters differ in shape.
    """
    # Load the sorted list of TIFF files from the folder
    file_list = load_tif_files(folder_path)
    if not file_list:
        raise FileNotFoundError("No TIFF files found in the specified folder.")

    # Take exactly one year per file, in file order, so that the time vector
    # lines up with the first axis of the raster stack. Dropping files without
    # a year here would silently misalign the two.
    years = [extract_year_from_filename(os.path.basename(path)) for path in file_list]
    if any(year is None for year in years):
        raise ValueError("No valid year information found in file names.")
    time_vector = np.array(years)

    # Load all TIFF files and stack them into a 3D array
    data, meta, nodata_value = read_data_stack(file_list)
    _, rows, cols = data.shape

    # Outputs start as NaN; pixels that cannot be evaluated stay NaN
    corr_array = np.full((rows, cols), np.nan, dtype=np.float32)
    pvalue_array = np.full((rows, cols), np.nan, dtype=np.float32)

    # compute_pixel_correlation returns (nan, nan) for nodata or constant
    # pixels, so no separate skip logic is needed in the loop
    for i in tqdm(range(rows), desc="Processing rows"):
        for j in range(cols):
            r, p = compute_pixel_correlation(time_vector, data[:, i, j], nodata_value)
            corr_array[i, j] = r
            pvalue_array[i, j] = p

    # Single-band float32 output. Unevaluated pixels hold NaN, so NaN (not the
    # input rasters' nodata value) is declared as the output nodata value.
    meta.update(count=1, dtype=rasterio.float32, nodata=np.nan)

    # Write the correlation coefficients to a new GeoTIFF file
    with rasterio.open(output_corr_tif, 'w', **meta) as dst:
        dst.write(corr_array, 1)

    # Write the p-values to a separate GeoTIFF file
    with rasterio.open(output_pvalue_tif, 'w', **meta) as dst:
        dst.write(pvalue_array, 1)

    print("The correlation and p-value GeoTIFF files have been generated successfully.")

if __name__ == '__main__':
    main()


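**Option two** — same pixel-by-pixel loop, with the `LST_*.tif` files discovered and sorted at the top of the cell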

In [None]:
# Import necessary libraries for numerical operations, file handling, and geospatial data processing.
import numpy as np
import rasterio
import glob
import os
import re
from scipy.stats import pearsonr
from tqdm import tqdm  # For displaying progress in loops

# ---------------------------
# File discovery (runs as soon as the cell is executed)
# ---------------------------

# Directory that holds the yearly LST rasters.
folder_path = 'taiwan _correlation'  # Adjust as needed

# Collect every file in that directory named "LST_*.tif" and stop right away
# when the search comes back empty.
search_pattern = os.path.join(folder_path, 'LST_*.tif')
file_list = glob.glob(search_pattern)
if not file_list:
    raise FileNotFoundError(f"No TIFF files found in folder: {folder_path}")

def extract_year(filename):
    """
    Return the first four-digit number in the base name of a path.

    Only the final path component is searched, so digits in directory names
    are ignored.

    Parameters:
        filename (str): File name or path (e.g., 'LST_2000.tif').

    Returns:
        int or None: The first four consecutive digits as an integer, or None
        when the base name contains none.
    """
    base_name = os.path.basename(filename)
    found = re.search(r'(\d{4})', base_name)
    if found is None:
        return None
    return int(found.group(1))

# Resolve the year of every file once. A file without a year would put None
# among the sort keys (which fails with a TypeError), and filtering such files
# out of the year list alone would leave the time vector shorter than the
# raster stack, so they are reported explicitly instead.
year_by_file = {path: extract_year(path) for path in file_list}
undated_files = sorted(path for path, year in year_by_file.items() if year is None)
if undated_files:
    raise ValueError(f"No valid year information found in file names: {undated_files}")

# Sort the file list by year and build the time vector in the same order, so
# that entry k of the time vector belongs to file k.
file_list = sorted(file_list, key=year_by_file.get)
years = [year_by_file[path] for path in file_list]
time_vector = np.array(years)

# ---------------------------
# Function Definitions
# ---------------------------

def read_data_stack(file_list):
    """
    Read band 1 of each TIFF file and assemble the bands into a 3D numpy array.

    Parameters:
        file_list (list): TIFF file paths, in the order wanted along the time axis.

    Returns:
        tuple: (data_stack, meta, nodata_value)
            - data_stack (np.ndarray): 3D array with dimensions (time, rows, cols).
            - meta (dict): Copy of the first file's metadata.
            - nodata_value (float or None): The first file's nodata value.

    Raises:
        ValueError: If a later file's band shape differs from the first file's.
    """
    # The first file defines the metadata, nodata value and reference shape.
    with rasterio.open(file_list[0]) as src:
        first_band = src.read(1)
        meta = src.meta.copy()
        nodata_value = src.nodata
    bands = [first_band]

    # Append the remaining files, rejecting any whose grid differs.
    for file in file_list[1:]:
        with rasterio.open(file) as src:
            bands.append(src.read(1))
        if bands[-1].shape != first_band.shape:
            raise ValueError(f"Dimension mismatch in file: {file}")

    # Combine into (time, rows, cols).
    return np.stack(bands, axis=0), meta, nodata_value

def compute_pixel_correlation(time_vector, pixel_series, nodata_value):
    """
    Compute the Pearson correlation coefficient and p-value for a pixel's time series.

    Parameters:
        time_vector (np.ndarray): 1D array of time points (years).
        pixel_series (np.ndarray): 1D array of LST values for one pixel over time.
        nodata_value (float or None): Value representing missing data.

    Returns:
        tuple: (r, p) where r is the correlation coefficient and p is the p-value.
               Returns (np.nan, np.nan) if the series is empty, contains the
               nodata value, contains any NaN/inf, or is constant.
    """
    values = np.asarray(pixel_series)
    invalid = (np.nan, np.nan)

    # A NaN nodata value never compares equal to anything, so only non-NaN
    # nodata values are matched with ==; NaN samples are caught just below.
    has_nodata = (
        nodata_value is not None
        and not np.isnan(nodata_value)
        and bool(np.any(values == nodata_value))
    )
    if has_nodata:
        return invalid

    # Non-finite samples make the correlation undefined (and, depending on the
    # SciPy version, make pearsonr return NaN or raise).
    if values.size == 0 or not np.all(np.isfinite(values)):
        return invalid

    # Test constancy by equality: np.std of a constant float series is not
    # guaranteed to be exactly zero.
    if np.all(values == values[0]):
        return invalid

    r, p = pearsonr(time_vector, values)
    return r, p

# ---------------------------
# Main Processing Function
# ---------------------------

def main():
    """
    Compute per-pixel correlation between the module-level time vector and the
    rasters in the module-level file list, then write two GeoTIFF files.

    Raises:
        ValueError: If the number of rasters differs from the number of time
            points, or if the rasters differ in shape.
    """
    # Stack all the TIFF files into a 3D array.
    data, meta, nodata_value = read_data_stack(file_list)
    num_years, rows, cols = data.shape

    # Each raster layer needs exactly one time point.
    if len(time_vector) != num_years:
        raise ValueError(
            f"Time vector has {len(time_vector)} entries but {num_years} rasters were loaded."
        )

    # Outputs start as NaN; pixels that cannot be evaluated stay NaN.
    corr_array = np.full((rows, cols), np.nan, dtype=np.float32)
    pvalue_array = np.full((rows, cols), np.nan, dtype=np.float32)

    # compute_pixel_correlation already returns (nan, nan) for nodata or
    # constant pixels, so the loop does not repeat those checks.
    for i in tqdm(range(rows), desc="Processing rows"):
        for j in range(cols):
            r, p = compute_pixel_correlation(time_vector, data[:, i, j], nodata_value)
            corr_array[i, j] = r
            pvalue_array[i, j] = p

    # Single-band float32 output. Unevaluated pixels hold NaN, so NaN (not the
    # input rasters' nodata value) is declared as the output nodata value.
    meta.update(count=1, dtype=rasterio.float32, nodata=np.nan)
    output_corr_tif = 'taiwan_correlation_coefficient.tif'
    output_pvalue_tif = 'taiwan_correlation_pvalue.tif'

    # Write the correlation coefficients to a GeoTIFF file.
    with rasterio.open(output_corr_tif, 'w', **meta) as dst:
        dst.write(corr_array, 1)

    # Write the p-values to a separate GeoTIFF file.
    with rasterio.open(output_pvalue_tif, 'w', **meta) as dst:
        dst.write(pvalue_array, 1)

    print("GeoTIFF files for correlation coefficients and p-values have been generated successfully.")

# Run the main function when the script is executed.
if __name__ == '__main__':
    main()


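**Option three** — vectorised NumPy computation over the whole raster stack (no per-pixel loop), reading from and writing to Google Drive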

In [None]:
# Import required libraries
import os
import glob
import numpy as np
import rasterio
from scipy import stats

# Google Colab only: make Google Drive available under /content/drive.
# In any other environment the import (or the mount) fails; that failure is
# reported with a message and otherwise ignored.
try:
    from google.colab import drive
    drive.mount('/content/drive')
    print("Google Drive mounted successfully.")
except Exception:
    print("Google Drive mount not available or not required in this environment.")

def extract_year_from_filename(filename):
    """
    Extract the year from the given filename.
    Expected filename format: LST_YYYY.tif (e.g., LST_2000.tif).

    The year is the text between the first underscore and the next underscore
    or dot of the base name.

    Parameters:
        filename (str): Full path or filename.

    Returns:
        int: The year extracted from the filename.

    Raises:
        ValueError: If the base name has no underscore, or the text after the
            first underscore is not an integer.
    """
    message = f"Filename {filename} does not follow the expected pattern 'LST_YYYY.tif'."

    parts = os.path.basename(filename).split('_')
    if len(parts) < 2:
        raise ValueError(message)

    # Remove file extension and convert year string to integer.
    year_str = parts[1].split('.')[0]
    try:
        return int(year_str)
    except ValueError:
        # Report the offending file instead of int()'s bare "invalid literal".
        raise ValueError(message) from None

def load_lst_data(folder_path, pattern="LST_*.tif"):
    """
    Load LST (Land Surface Temperature) data from TIFF files in the specified folder.

    The function assumes that files are named in the format LST_YYYY.tif.
    Pixels equal to a file's declared nodata value are stored as NaN so that
    they are not treated as real temperatures by later computations.

    Parameters:
        folder_path (str): Directory where the LST TIFF files are stored.
        pattern (str): Pattern to match file names (default is "LST_*.tif").

    Returns:
        tuple: A tuple containing:
            - years (np.ndarray): Array of years extracted from file names.
            - lst_stack (np.ndarray): float32 array with shape (n_years, rows, cols).
            - meta (dict): Metadata from the first TIFF file (used for saving outputs).

    Raises:
        FileNotFoundError: If no file matches the pattern.
        ValueError: If a file name does not yield a year, or a raster's shape
            differs from the first raster's.
    """
    # Build the search pattern
    search_pattern = os.path.join(folder_path, pattern)
    file_list = glob.glob(search_pattern)

    # Check if any files were found
    if not file_list:
        raise FileNotFoundError("No TIFF files were found in the specified folder with the given pattern.")

    # Sort files by the year extracted from their names
    file_list.sort(key=extract_year_from_filename)

    # Extract years from filenames
    years = np.array([extract_year_from_filename(f) for f in file_list])
    print(f"Found {len(years)} files corresponding to years: {years}")

    # Take metadata and dimensions from the first file's header; the pixel
    # data itself is read once, in the loop below.
    with rasterio.open(file_list[0]) as src:
        meta = src.meta.copy()
        height, width = src.height, src.width
        print(f"Raster dimensions determined from sample: width = {width}, height = {height}")

    # Pre-allocate an array to store LST data from all years
    n_years = len(years)
    lst_stack = np.empty((n_years, height, width), dtype=np.float32)

    # Loop over each file, read the data, and store in the array
    for idx, filepath in enumerate(file_list):
        with rasterio.open(filepath) as src:
            data = src.read(1)
            nodata = src.nodata

        # Without this check a mismatched raster either fails with a generic
        # broadcasting error or is silently broadcast into the layer.
        if data.shape != (height, width):
            raise ValueError(
                f"Dimension mismatch in file {filepath}: expected {(height, width)}, got {data.shape}"
            )

        layer = data.astype(np.float32)
        # Match nodata on the raster's native dtype, before the float32 cast.
        if nodata is not None and not np.isnan(nodata):
            layer[data == nodata] = np.nan
        lst_stack[idx, :, :] = layer
        print(f"Loaded data for year {years[idx]} from file: {filepath}")

    return years, lst_stack, meta

def compute_pixelwise_correlation(years, lst_stack):
    """
    Compute the pixel-wise Pearson correlation coefficient and corresponding p-value between time and LST.

    For each pixel (i,j) across the time series, the function calculates:
      - r: the Pearson correlation coefficient between the vector of years and the LST values.
      - p: the two-tailed p-value testing the null hypothesis of no correlation.

    Pixels whose values never change, or that contain NaN at any time point,
    receive NaN for both r and p.

    Parameters:
        years (np.ndarray): 1D array of years (shape: (n_years,)).
        lst_stack (np.ndarray): 3D array of LST values (shape: (n_years, rows, cols)).

    Returns:
        tuple: A tuple containing two 2D arrays (rows, cols):
            - r (np.ndarray): Pearson correlation coefficient for each pixel.
            - p_values (np.ndarray): p-value for each pixel.

    Raises:
        ValueError: If years is not 1D, lst_stack is not 3D, their first
            dimensions differ, or fewer than two time points are given.
    """
    # Work in float64: sums over many float32 values lose enough precision
    # for |r| to drift slightly past 1.
    x = np.asarray(years, dtype=np.float64)
    stack = np.asarray(lst_stack)

    if x.ndim != 1 or stack.ndim != 3 or stack.shape[0] != x.shape[0]:
        raise ValueError(
            "years must be 1D and lst_stack must be 3D with one layer per year; "
            f"got shapes {x.shape} and {stack.shape}."
        )
    n_years = x.shape[0]
    if n_years < 2:
        raise ValueError("At least two time points are required to compute a correlation.")

    # Compute mean and standard deviation of the time vector
    mean_x = np.mean(x)
    std_x = np.std(x, ddof=1)  # ddof=1 gives the sample standard deviation
    print(f"Time vector: mean = {mean_x}, standard deviation = {std_x}")

    # Centre both variables; x is reshaped to (n_years, 1, 1) for broadcasting
    x_centered = (x - mean_x)[:, np.newaxis, np.newaxis]
    lst_centered = stack - np.mean(stack, axis=0, dtype=np.float64)

    # All divisions happen inside errstate so that flat or NaN pixels do not
    # emit RuntimeWarnings before they are masked.
    with np.errstate(divide='ignore', invalid='ignore'):
        # A pixel is constant when its maximum equals its minimum; this is
        # exact, unlike comparing a computed standard deviation with zero.
        no_variation = np.max(stack, axis=0) == np.min(stack, axis=0)

        # r = sum(dx*dy) / sqrt(sum(dx^2) * sum(dy^2)); the (n_years - 1)
        # factors of covariance and standard deviations cancel.
        sum_xy = np.sum(x_centered * lst_centered, axis=0)
        sum_xx = np.sum(x_centered ** 2)
        sum_yy = np.sum(lst_centered ** 2, axis=0)
        r = sum_xy / np.sqrt(sum_xx * sum_yy)

        # Remove rounding overshoot, then mask constant pixels
        r = np.clip(r, -1.0, 1.0)
        r = np.where(no_variation, np.nan, r)

        # t = r * sqrt((n_years - 2) / (1 - r^2)); the clip keeps |r| == 1 finite
        t_stat = r * np.sqrt((n_years - 2) / np.clip(1 - r**2, 1e-10, None))

    # Calculate two-tailed p-values using the t-distribution
    p_values = 2 * stats.t.sf(np.abs(t_stat), df=n_years - 2)
    p_values = np.where(no_variation, np.nan, p_values)

    print("Computed pixel-wise Pearson correlation coefficients and p-values.")
    return r, p_values

def save_tiff(output_path, data, meta):
    """
    Save a 2D numpy array as a single-band float32 GeoTIFF file.

    Parameters:
        output_path (str): Path for the output TIFF file.
        data (np.ndarray): 2D array (rows x cols) to be saved.
        meta (dict): Metadata dictionary (typically from an input file) that is updated here
            in place: dtype becomes float32, count becomes 1 and nodata becomes NaN.
    """
    # The maps written with this function mark invalid pixels with NaN, so NaN
    # is declared as the nodata value. Keeping the nodata value inherited from
    # the input rasters would leave those NaN pixels unflagged.
    meta.update(dtype=rasterio.float32, count=1, nodata=np.nan)
    with rasterio.open(output_path, 'w', **meta) as dst:
        dst.write(data.astype(np.float32), 1)
    print(f"Saved output TIFF file: {output_path}")

def main(folder_path="/content/drive/MyDrive/Taiwan_corelation", output_dir=None):
    """
    Main function to execute the following steps:
      1. Load LST data from a series of GeoTIFF files.
      2. Compute the pixel-wise Pearson correlation coefficient and p-value between time and LST.
      3. Save the correlation coefficient and p-value maps as separate TIFF files.

    Parameters:
        folder_path (str): Folder where the LST TIFF files are stored.
            Adjust the default as needed.
        output_dir (str or None): Folder for the two output maps. Defaults to
            a "correlation_outputs" subfolder of folder_path.
    """
    # Load LST data from the TIFF files.
    years, lst_stack, meta = load_lst_data(folder_path)

    # Compute the pixel-wise correlation and p-values.
    correlation, p_values = compute_pixelwise_correlation(years, lst_stack)

    # The outputs are named "LST_*.tif". Written next to the inputs, they would
    # be picked up by the "LST_*.tif" search on the next run and fail year
    # extraction, so they go into a separate folder instead.
    if output_dir is None:
        output_dir = os.path.join(folder_path, "correlation_outputs")
    os.makedirs(output_dir, exist_ok=True)

    # Define the output file paths.
    output_corr_path = os.path.join(output_dir, "LST_correlation.tif")
    output_pval_path = os.path.join(output_dir, "LST_p_value.tif")

    # Save the correlation coefficient map.
    save_tiff(output_corr_path, correlation, meta)

    # Save the p-value map.
    save_tiff(output_pval_path, p_values, meta)

    print("Processing complete. Both correlation and p-value maps have been saved.")

# Execute the main function when the script is run.
if __name__ == "__main__":
    main()
