<a href="https://colab.research.google.com/github/melkatewabe10/Machine-learning_LST-Estimation-/blob/main/SWAT_Calculation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Index calculations and conversion**

In [None]:
!pip install rasterio
!pip install rioxarray

# Data conversion: NetCDF to tif

In [None]:
import xarray as xr
import rioxarray as rio
import rasterio
import os

# -------------------- CONFIGURATION --------------------
# Source NetCDF, the GeoTIFF whose grid the outputs must match, and the
# folder that receives the exported GeoTIFFs.
file_path = '/content/drive/MyDrive/NEW DEMO/median2016_2022.nc'
reference_tif_path = '/content/drive/MyDrive/NEW FOLDER/NEWTRANING/ALB_2001_01.tif'
output_dir = '/content/drive/MyDrive/NEW FOLDER/TVPDI' # Define an output directory

# -------------------- LOAD DATA --------------------
ds = xr.open_dataset(file_path)

# Rename lat/lon to y/x only when those names are actually present.
# Dataset.rename raises if asked to rename a name that does not exist, so an
# unconditional rename would fail on files that already use x/y.
coord_renames = {
    old_name: new_name
    for old_name, new_name in (('lat', 'y'), ('lon', 'x'))
    if old_name in ds.dims or old_name in ds.variables
}
if coord_renames:
    ds = ds.rename(coord_renames)

# Extract 'rain' variable
rain = ds['rain']

# Tell rioxarray which dimensions are spatial and tag the data as WGS84
rain = rain.rio.set_spatial_dims(x_dim="x", y_dim="y")
rain = rain.rio.write_crs("EPSG:4326")  # WGS84

# -------------------- READ REFERENCE TIF --------------------
# Only the grid definition (CRS, affine transform, height/width) of the
# reference raster is kept; its pixel values are never read.
with rasterio.open(reference_tif_path) as ref:
    ref_crs = ref.crs
    ref_transform = ref.transform
    ref_shape = (ref.height, ref.width)

# -------------------- CREATE OUTPUT DIR --------------------
os.makedirs(output_dir, exist_ok=True)  # No error if the directory already exists

# -------------------- LOOP THROUGH 4D DATA --------------------
# One GeoTIFF is written per (year, season) combination found on the
# 'year' and 'season' coordinates of `rain`.
for year in rain['year'].values:
    for season in rain['season'].values:
        # Select the slice for this year/season
        slice_2d = rain.sel(year=year, season=season)

        # Resample (bilinear) onto the reference raster's CRS, transform and shape
        reprojected = slice_2d.rio.reproject(
            dst_crs=ref_crs,
            transform=ref_transform,
            shape=ref_shape,
            resampling=rasterio.enums.Resampling.bilinear
        )

        # Save as an LZW-compressed GeoTIFF named prepmd_<year>_<season>.tif
        output_filename = f"prepmd_{year}_{season}.tif"
        output_path = os.path.join(output_dir, output_filename) # Written into output_dir
        reprojected.rio.to_raster(output_path, compress='LZW')

        print(f"✅ Saved: {output_path}")

print("🎉 Done converting all slices.")

# Z_score outlier mask method

In [None]:
import os
import numpy as np
import rasterio
import matplotlib.pyplot as plt

def mask_outliers_zscore(data, threshold=3.0):
    """
    Replace Z-score outliers in an array with NaN.

    The mean and standard deviation are computed while ignoring NaNs. Every
    element whose absolute Z-score exceeds ``threshold`` is set to NaN.

    Returns a ``(masked_data, mask)`` tuple, where ``mask`` is True at the
    positions that were flagged as outliers.

    Raises ValueError when the standard deviation is zero or NaN.
    """
    center = np.nanmean(data)
    spread = np.nanstd(data)

    if np.isnan(spread) or spread == 0:
        raise ValueError("Standard deviation is zero or NaN. Cannot compute Z-score.")

    # NaN inputs yield NaN Z-scores, which compare False and so are never flagged.
    mask = np.abs((data - center) / spread) > threshold

    return np.where(mask, np.nan, data), mask

def process_single_raster(input_path, output_path, mask_path=None, threshold=3.0, show_hist=False):
    """
    Apply Z-score outlier masking to band 1 of a single raster.

    Parameters:
        input_path: raster to read (band 1 only).
        output_path: where the masked raster is written (float32, nodata=NaN).
        mask_path: optional path for the outlier mask (1 = outlier, 0 = kept).
        threshold: absolute Z-score above which a pixel is masked.
        show_hist: when True, plot histograms of the data before and after masking.

    Raises:
        ValueError: propagated from mask_outliers_zscore when the standard
            deviation of the valid pixels is zero or NaN.
    """
    with rasterio.open(input_path) as src:
        data = src.read(1).astype(float)
        profile = src.profile
        nodata = src.nodata

    # Convert the declared nodata sentinel to NaN *before* computing the
    # statistics. Otherwise a value such as -9999 is counted as real data and
    # distorts the mean/std that every Z-score is based on.
    if nodata is not None and not np.isnan(nodata):
        data[data == nodata] = np.nan

    masked_data, outlier_mask = mask_outliers_zscore(data, threshold=threshold)

    # Outputs are float32 with NaN as nodata so masked pixels stay missing.
    profile.update(dtype='float32', nodata=np.nan)

    with rasterio.open(output_path, 'w', **profile) as dst:
        dst.write(masked_data.astype(np.float32), 1)

    if mask_path:
        with rasterio.open(mask_path, 'w', **profile) as dst:
            dst.write(outlier_mask.astype(np.uint8), 1)

    if show_hist:
        plt.figure(figsize=(12, 4))
        plt.subplot(1, 2, 1)
        plt.hist(data[~np.isnan(data)].flatten(), bins=50, color='green')
        plt.title(f'Original NDVI: {os.path.basename(input_path)}')

        plt.subplot(1, 2, 2)
        plt.hist(masked_data[~np.isnan(masked_data)].flatten(), bins=50, color='red')
        plt.title('Masked NDVI (Z-Score)')
        plt.tight_layout()
        plt.show()

def process_ndvi_files(input_folder, output_folder, mask_folder=None, threshold=3.0, show_hist=False, prefix='ET'):
    """
    Batch-apply Z-score outlier masking to the '.tif' files in a folder.

    Only files whose name starts with ``prefix`` (case-insensitive) are
    processed. The default 'ET' keeps the selection this function always used.

    Parameters:
        input_folder: folder scanned for input rasters.
        output_folder: folder receiving ``masked_<filename>`` outputs.
        mask_folder: optional folder receiving ``mask_<filename>`` outlier masks;
            no masks are written when it is None or empty.
        threshold: absolute Z-score above which a pixel is masked.
        show_hist: forwarded to process_single_raster.
        prefix: filename prefix selecting which rasters to process.

    A failure on one file is reported and does not stop the batch.
    """
    os.makedirs(output_folder, exist_ok=True)
    if mask_folder:
        os.makedirs(mask_folder, exist_ok=True)

    wanted_prefix = prefix.upper()

    for filename in sorted(os.listdir(input_folder)):
        if not (filename.endswith('.tif') and filename.upper().startswith(wanted_prefix)):
            continue

        input_path = os.path.join(input_folder, filename)
        # Write into the folders the caller asked for. Previously both paths
        # were built from input_folder, so output_folder and mask_folder were
        # created but never used.
        output_path = os.path.join(output_folder, f"masked_{filename}")
        mask_path = os.path.join(mask_folder, f"mask_{filename}") if mask_folder else None

        print(f"Processing {prefix} file: {filename}")
        try:
            process_single_raster(input_path, output_path, mask_path, threshold, show_hist)
        except Exception as e:
            # Best-effort batch: report the failure and move on to the next file.
            print(f"⚠️ Error processing {filename}: {e}")

# ======= USER PARAMETERS =======
# Folder scanned for inputs, folder for masked outputs, and (optionally) a
# folder for the outlier masks.
input_folder = "/content/drive/MyDrive/NEW FOLDER/TVPDI"
output_folder = "/content/drive/MyDrive/NEW FOLDER/TVPDI"
mask_folder = "/content/drive/MyDrive/NEW FOLDER/TVPDI/Mask_ET"  # Optional
zscore_threshold = 3.0
show_histogram = True  # Set to False to disable histogram display

# ======= RUN PROCESSING =======
# Collect the keyword arguments first so the call itself stays on one line.
zscore_run_options = {
    'input_folder': input_folder,
    'output_folder': output_folder,
    'mask_folder': mask_folder,
    'threshold': zscore_threshold,
    'show_hist': show_histogram,
}
process_ndvi_files(**zscore_run_options)

# Season-wise global min–max normalization

In [None]:
import os
import numpy as np
import rasterio
from rasterio.enums import Resampling
from math import sqrt

# Folder that holds the masked inputs; outputs are written to the same folder.
data_folder = '/content/drive/MyDrive/NEW FOLDER/TVPDI'
# output_folder='/content/drive/MyDrive/NEW FOLDER/TVPDI' # Outputting to the same folder for now
# os.makedirs(output_folder, exist_ok=True) # Ensure output directory exists

# Season codes in processing order: 01 Winter, 02 Spring, 03 Summer, 04 Autumn
_season_codes = ('01', '02', '03', '04')

# Season code -> code used in the output filename (an identity mapping)
season_to_month = {code: code for code in _season_codes}

# --- Step 1: Categorize files by season and initialize min/max for each season ---
# Per-season accumulator:
#   {season_code: {'files': [full paths], 'min': running min, 'max': running max}}
# min/max start at +inf/-inf so the first real value always replaces them.
season_data = {
    code: {'files': [], 'min': np.inf, 'max': -np.inf}
    for code in _season_codes
}

print("Scanning files and categorizing by season...")
for filename in os.listdir(data_folder):
    # Only the masked ET rasters take part in the normalization.
    if not (filename.endswith('.tif') and filename.startswith('masked_ET_')):
        continue

    # Expected name: masked_ET_YYYY_SS.tif -> ['masked', 'ET', 'YYYY', 'SS.tif']
    parts = filename.split('_')
    if len(parts) < 4:
        print(f"Warning: Filename '{filename}' does not match expected format (e.g., masked_NDLI_YYYY_SS.tif). Skipping.")
        continue

    year = parts[2]
    season_code = parts[3].split('.')[0]

    if season_code not in season_data:
        print(f"Warning: Unexpected season code '{season_code}' found in '{filename}'. Skipping.")
        continue

    input_path = os.path.join(data_folder, filename)
    season_entry = season_data[season_code]
    season_entry['files'].append(input_path)

    # --- Step 2: Find global min and max values for each season ---
    with rasterio.open(input_path) as src:
        data = src.read(1)
        # Treat both the declared nodata value and NaN as missing
        data = np.where(np.isnan(data) | (data == src.nodata), np.nan, data)

        # A raster with no valid pixel contributes nothing to the range
        if np.all(np.isnan(data)):
            continue

        local_min = np.nanmin(data)
        local_max = np.nanmax(data)
        season_entry['min'] = min(season_entry['min'], local_min)
        season_entry['max'] = max(season_entry['max'], local_max)


print("\nGlobal Min/Max values for each season:")
for season_code, s_data in season_data.items():
    if not s_data['files']:
        print(f"  No files found for Season {season_code}.")
        continue

    print(f"  Season {season_code} (Month: {season_to_month.get(season_code)}):")
    print(f"    Min: {s_data['min']}, Max: {s_data['max']}")
    if s_data['max'] == s_data['min']:
        print(f"    Warning: Min and Max are identical for this season. Normalization will not change values.")


# --- Step 3: Apply season-specific global min-max normalization to each file ---
print("\nApplying season-specific global min-max normalization to files...")
for season_code, s_data in season_data.items():
    if not s_data['files']:
        print(f"Skipping normalization for Season {season_code} (no files).")
        continue

    current_global_min, current_global_max = s_data['min'], s_data['max']

    # A zero-width range cannot be divided by; it is handled per file below.
    degenerate_range = current_global_max == current_global_min
    if degenerate_range:
        print(f"  Warning: Global min and max are identical for Season {season_code}. Normalization will set values to 0 if min=0, otherwise it will remain unchanged where min and max are same for these files.")

    # Same for every file of this season
    month_code = season_to_month.get(season_code, '00')

    for input_path in s_data['files']:
        filename = os.path.basename(input_path)
        print(f"Processing: {filename}")

        # masked_ET_YYYY_SS.tif -> the year sits at index 2
        parts = filename.split('_')
        year = parts[2]

        output_filename = f'ETn_{year}_{month_code}.tif'
        output_path = os.path.join(data_folder, output_filename)  # same folder as the inputs

        with rasterio.open(input_path) as src:
            profile = src.profile
            data = src.read(1)

            # Treat both the declared nodata value and NaN as missing
            data = np.where(np.isnan(data) | (data == src.nodata), np.nan, data)

            if not degenerate_range:
                # Regular case: scale with the season-wide min and max
                normalized_data = (data - current_global_min) / (current_global_max - current_global_min)
                # normalized_data = normalized_data * sqrt(3) / 3 # Apply your specific scaling
            elif current_global_min == 0:
                # Constant-zero season: write zeros but keep missing pixels missing
                normalized_data = np.zeros_like(data, dtype=rasterio.float32)
                normalized_data[np.isnan(data)] = np.nan
            else:
                # Constant non-zero season: values are written unchanged
                normalized_data = data

            # Output is float32 with NaN as nodata
            profile.update(dtype=rasterio.float32, nodata=np.nan)

            with rasterio.open(output_path, 'w', **profile) as dst:
                dst.write(normalized_data.astype(rasterio.float32), 1)

        print(f"Saved as: {output_filename}")

print("\nAll files processed successfully with season-specific global min-max normalization.")


# SWATI vs. TVPDI calculation

In [None]:
import os
import numpy as np
import rasterio
from math import sqrt

# Folder holding the normalized NDLIn/NDVIn/Tcn rasters, and the folder that
# receives the SWATI outputs.
data_folder = '/content/drive/MyDrive/NEW FOLDER/SWATI'
output_folder = '/content/drive/MyDrive/NEW FOLDER/DELETE STAT'
os.makedirs(output_folder, exist_ok=True)

# Season code -> code used in the output filename
season_to_month = {
    '01': '01',  # Winter
    '02': '02',  # Spring
    '03': '03',  # Summer
    '04': '04'   # Autumn
}

# Constant √3/3, the reference value NDLI and NDVI are measured against
SQRT3_DIV3 = sqrt(3) / 3

# Every NDLIn_<year>_<season>.tif is the trigger file for one output raster.
for filename in sorted(os.listdir(data_folder)):

    if not (filename.endswith('.tif') and filename.startswith('NDLIn_')):
        continue

    print(f"Processing: {filename}")

    # Parse year and season code from the NDLIn filename. A name with too few
    # '_' separated parts used to raise IndexError and abort the whole batch;
    # such files are now reported and skipped.
    parts = filename.split('_')
    if len(parts) < 3:
        print(f"  Warning: Unexpected filename format '{filename}'. Skipping...")
        continue

    year = parts[1]
    season_code = parts[2].split('.')[0]
    month_code = season_to_month.get(season_code, '00')  # Default '00' if missing

    # The three inputs that share this year/season
    ndli_path = os.path.join(data_folder, f'NDLIn_{year}_{season_code}.tif')
    ndvi_path = os.path.join(data_folder, f'NDVIn_{year}_{season_code}.tif')
    lst_path = os.path.join(data_folder, f'Tcn_{year}_{season_code}.tif')

    # All three rasters are required
    if not (os.path.exists(ndli_path) and os.path.exists(ndvi_path) and os.path.exists(lst_path)):
        print(f"  Warning: Missing ndli, NDVIn, or Tcn for {year} Season {season_code}. Skipping...")
        continue

    with rasterio.open(ndli_path) as ndli_src, \
         rasterio.open(ndvi_path) as ndvi_src, \
         rasterio.open(lst_path) as lst_src:

        ndli = ndli_src.read(1)
        ndvi = ndvi_src.read(1)
        lst = lst_src.read(1)

        # Treat both the declared nodata value and NaN as missing
        ndli = np.where((ndli == ndli_src.nodata) | (np.isnan(ndli)), np.nan, ndli)
        ndvi = np.where((ndvi == ndvi_src.nodata) | (np.isnan(ndvi)), np.nan, ndvi)
        lst = np.where((lst == lst_src.nodata) | (np.isnan(lst)), np.nan, lst)

        # SWATI: Euclidean distance built from (√3/3 - NDLI), (√3/3 - NDVI)
        # and LST. Missing inputs propagate as NaN.
        swati = np.sqrt(
            (SQRT3_DIV3 - ndli) ** 2 +
            (SQRT3_DIV3 - ndvi) ** 2 +
            (lst) ** 2
        )
        # Alternative formulations kept for reference:
        # swati = np.sqrt(
        #       ((1 - ndli) ** 2 + (1 - ndvi) ** 2 + (lst) ** 2) / 3
        #   )
        # swati = np.sqrt(
        #     (1 - ndli) ** 2 +
        #     (1 - ndvi) ** 2 +
        #     (lst) ** 2
        # )

        # Save output on the NDLIn raster's grid, as float32 with NaN nodata
        output_filename = f'SWATIl_{year}_{month_code}.tif'
        output_path = os.path.join(output_folder, output_filename)

        profile = ndli_src.profile
        profile.update(
            dtype=rasterio.float32,
            nodata=np.nan
        )

        with rasterio.open(output_path, 'w', **profile) as dst:
            dst.write(swati.astype(rasterio.float32), 1)

    print(f"  Saved SWATI file: {output_filename}")

print("\nAll SWATI files processed successfully.")

# New SWATI vs. TVPDI calculation

In [None]:
import os
import numpy as np
import rasterio
from math import sqrt

# --- Configuration ---
# Inputs are read from, and outputs written to, this one folder.
data_folder = '/content/drive/MyDrive/NEW FOLDER/TVPDI'
# output_folder = '/content/drive/MyDrive/NEW FOLDER/DELETE FOLDER'
# os.makedirs(output_folder, exist_ok=True)

# Season code -> code used in the output filename (adjust if needed)
season_to_month = {
    '01': '01',
    '02': '02',
    '03': '03',
    '04': '04'
}

# Reference value that the NDVIn and prepn layers are measured against
SQRT3_DIV3 = sqrt(3) / 3

# --- Processing Loop ---
# Every Tcn_<year>_<season>.tif triggers one TVPDI_<year>_<season>.tif output.
for filename in sorted(os.listdir(data_folder)):
    if not (filename.endswith('.tif') and filename.startswith('Tcn_')):
        continue

    print(f"Processing file: {filename}")

    # Tcn_<year>_<season>.tif -> exactly three '_' separated parts
    parts = filename.replace('.tif', '').split('_')
    if len(parts) != 3:
        print(f"  ⚠️ Skipping invalid filename format: {filename}")
        continue

    year = parts[1]
    season_code = parts[2]
    month_code = season_to_month.get(season_code, '00')

    # The three inputs sharing this year/season. The variable is called
    # ndli_path but the file it points to is the 'prepn_' raster.
    lst_path   = os.path.join(data_folder, f'Tcn_{year}_{season_code}.tif')
    ndvi_path  = os.path.join(data_folder, f'NDVIn_{year}_{season_code}.tif')
    ndli_path  = os.path.join(data_folder, f'prepn_{year}_{season_code}.tif')

    # All three files are required
    if not all(os.path.exists(p) for p in (lst_path, ndvi_path, ndli_path)):
        print(f" Missing files for {year} season {season_code}. Skipping...")
        continue

    # --- Read and mask raster data ---
    with rasterio.open(lst_path) as lst_src, \
         rasterio.open(ndvi_path) as ndvi_src, \
         rasterio.open(ndli_path) as ndli_src:

        # Work in float so NaN can mark missing pixels
        lst  = lst_src.read(1).astype(float)
        ndvi = ndvi_src.read(1).astype(float)
        ndli = ndli_src.read(1).astype(float)

        # Declared nodata values become NaN
        lst[lst == lst_src.nodata]     = np.nan
        ndvi[ndvi == ndvi_src.nodata]  = np.nan
        ndli[ndli == ndli_src.nodata]  = np.nan

        # A pixel is usable only when all three layers have a value
        valid_mask = ~(np.isnan(lst) | np.isnan(ndvi) | np.isnan(ndli))

        # Start from an all-NaN raster and fill in the usable pixels only
        swati = np.full_like(lst, np.nan, dtype=float)

        lst_term = (lst[valid_mask])**2
        ndvi_term = (SQRT3_DIV3 - ndvi[valid_mask])**2
        ndli_term = (SQRT3_DIV3 - ndli[valid_mask])**2
        swati[valid_mask] = np.sqrt(lst_term + ndvi_term + ndli_term)

        # --- Save output raster ---
        output_filename = f'TVPDI_{year}_{month_code}.tif'
        output_path = os.path.join(data_folder, output_filename)

        # Reuse the Tcn raster's metadata, switching to float32 with NaN nodata
        profile = lst_src.profile.copy()
        profile.update(dtype=rasterio.float32, nodata=np.nan)

        with rasterio.open(output_path, 'w', **profile) as dst:
            dst.write(swati.astype(rasterio.float32), 1)

        print(f"  ✅ SWATI saved: {output_filename}")

print("\n✅ All SWATI files computed successfully.")


# TVPDI cdd based

In [None]:
import os
import numpy as np
import rasterio
from math import sqrt

# Inputs are read from, and outputs written to, this one folder.
data_folder = '/content/drive/MyDrive/NEW FOLDER/TVPDI'
# output_folder = '/content/drive/MyDrive/SEASON_THREE_SWATIdd'
# os.makedirs(output_folder, exist_ok=True)

# Season code -> code used in the output filename
season_to_month = {
    '01': '01',  # Winter (Dec–Feb)
    '02': '02',  # Spring (Mar–May)
    '03': '03',  # Summer (Jun–Aug)
    '04': '04'   # Autumn (Sep–Nov)
}

# Constant √3/3, the reference value NDVI is measured against
SQRT3_DIV3 = sqrt(3) / 3

# Every Tcn_<year>_<season>.tif is the trigger file for one output raster.
for filename in sorted(os.listdir(data_folder)):

    if not (filename.endswith('.tif') and filename.startswith('Tcn_')):
        continue

    print(f"Processing: {filename}")

    # Parse year and season code. A name with too few '_' separated parts
    # used to raise IndexError and abort the whole batch; such files are now
    # reported and skipped.
    parts = filename.split('_')
    if len(parts) < 3:
        print(f"  Warning: Unexpected filename format '{filename}'. Skipping...")
        continue

    year = parts[1]
    season_code = parts[2].split('.')[0]
    month_code = season_to_month.get(season_code, '00')  # Default '00' if mapping fails

    # The three inputs that share this year/season
    tcn_path = os.path.join(data_folder, f'Tcn_{year}_{season_code}.tif')
    ndvi_path = os.path.join(data_folder, f'NDVIn_{year}_{season_code}.tif')
    cdd_path = os.path.join(data_folder, f'cddn_{year}_{season_code}.tif')

    # NDVIn and cddn must exist alongside the Tcn trigger file
    if not (os.path.exists(ndvi_path) and os.path.exists(cdd_path)):
        print(f"  Warning: Missing NDVIn or cddn for {year} Season {season_code}. Skipping...")
        continue

    with rasterio.open(tcn_path) as tcn_src, \
         rasterio.open(ndvi_path) as ndvi_src, \
         rasterio.open(cdd_path) as cdd_src:

        tcn = tcn_src.read(1)
        ndvi = ndvi_src.read(1)
        cdd = cdd_src.read(1)

        # Treat both the declared nodata value and NaN as missing
        tcn = np.where((tcn == tcn_src.nodata) | (np.isnan(tcn)), np.nan, tcn)
        ndvi = np.where((ndvi == ndvi_src.nodata) | (np.isnan(ndvi)), np.nan, ndvi)
        cdd = np.where((cdd == cdd_src.nodata) | (np.isnan(cdd)), np.nan, cdd)

        # TVPDI: Euclidean distance built from (√3/3 - NDVI), cdd and Tcn.
        # Missing inputs propagate as NaN.
        tvpdi = np.sqrt(
            (SQRT3_DIV3 - ndvi) ** 2 +
            (cdd) ** 2 +
            (tcn) ** 2
        )

        # Save output on the Tcn raster's grid, as float32 with NaN nodata
        output_filename = f'TVPDIC_{year}_{month_code}.tif'
        output_path = os.path.join(data_folder, output_filename)

        profile = tcn_src.profile
        profile.update(
            dtype=rasterio.float32,
            nodata=np.nan
        )

        with rasterio.open(output_path, 'w', **profile) as dst:
            dst.write(tvpdi.astype(rasterio.float32), 1)

    print(f"  ✅ Saved TVPDI file: {output_filename}")

print("\n✅ All TVPDI files processed successfully.")


In [None]:
!pip install rasterio

Collecting rasterio
  Downloading rasterio-1.4.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.1 kB)
Collecting affine (from rasterio)
  Downloading affine-2.4.0-py3-none-any.whl.metadata (4.0 kB)
Collecting cligj>=0.5 (from rasterio)
  Downloading cligj-0.7.2-py3-none-any.whl.metadata (5.0 kB)
Collecting click-plugins (from rasterio)
  Downloading click_plugins-1.1.1.2-py2.py3-none-any.whl.metadata (6.5 kB)
Downloading rasterio-1.4.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (22.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m22.2/22.2 MB[0m [31m41.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cligj-0.7.2-py3-none-any.whl (7.1 kB)
Downloading affine-2.4.0-py3-none-any.whl (15 kB)
Downloading click_plugins-1.1.1.2-py2.py3-none-any.whl (11 kB)
Installing collected packages: cligj, click-plugins, affine, rasterio
Successfully installed affine-2.4.0 click-plugins-1.1.1.2 cligj-0.7.2 rasterio-1.4.3


# Histogram plot

In [None]:
# prompt: please help me plot a histogram of each cddn file in the folder: /content/drive/MyDrive/NEW FOLDER/TVPDI

import numpy as np
import matplotlib.pyplot as plt
import rasterio
import os

# Folder to scan and the filename prefix selecting which rasters to plot.
# The prefix is kept in one variable so the filter and the status messages
# cannot disagree (the messages used to say 'cddn_' while the filter matched
# 'NDVI_').
cddn_folder = '/content/drive/MyDrive/NEW FOLDER/SWATI'
file_prefix = 'NDVI_'

# List all '.tif' files in the folder that start with the prefix
cddn_files = [f for f in os.listdir(cddn_folder) if f.startswith(file_prefix) and f.endswith('.tif')]

if not cddn_files:
    print(f"No files starting with '{file_prefix}' found in {cddn_folder}")
else:
    print(f"Found {len(cddn_files)} '{file_prefix}' files. Plotting histograms...")
    # One histogram per file, in sorted order for a reproducible sequence
    for filename in sorted(cddn_files):
        file_path = os.path.join(cddn_folder, filename)
        print(f"Generating histogram for: {filename}")

        try:
            with rasterio.open(file_path) as src:
                data = src.read(1) # Read the first band

                # Mask the declared nodata value, if the raster has one
                if src.nodata is not None:
                    data = np.where(data == src.nodata, np.nan, data)

                # Drop NaN values before plotting
                valid_data = data[~np.isnan(data)]

                if valid_data.size > 0:
                    plt.figure(figsize=(10, 6))
                    plt.hist(valid_data.flatten(), bins=50, color='skyblue', edgecolor='black')
                    plt.title(f'Histogram of {filename}')
                    plt.xlabel('Pixel Value')
                    plt.ylabel('Frequency')
                    plt.grid(axis='y', alpha=0.75)
                    plt.show()
                else:
                    print(f"  Warning: No valid data found in {filename} to plot histogram.")

        except rasterio.errors.RasterioIOError as e:
            print(f"  Error opening or reading raster file {filename}: {e}")
        except Exception as e:
            # Keep going with the remaining files after reporting the failure
            print(f"  An unexpected error occurred while processing {filename}: {e}")

print("\nHistogram plotting complete.")


In [None]:
!pip install rasterio

# Data exploration and checking

In [None]:
import os
import glob
import rasterio
from rasterio.features import shapes
from shapely.geometry import shape
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt # Import for plotting
import seaborn as sns # Import for potentially nicer plots
from google.colab import drive # Assuming you are running this in Google Colab

# --- Configuration ---
# !!! IMPORTANT: Update this path to your folder in Google Drive !!!
# Folder that is searched for the input rasters.
DRIVE_FOLDER_PATH = '/content/drive/MyDrive/SEASON_DELETE2' # Updated path
# Folder for the CSV and plots; created here if it does not exist yet.
OUTPUT_DIR = '/content/drive/MyDrive/NEW FOLDER/DELETE STAT'
os.makedirs(OUTPUT_DIR, exist_ok=True)


# Inclusive range of years to process, and the single season code to extract.
START_YEAR = 2001
END_YEAR = 2024
TARGET_SEASON = '02' # Season code to process

# Full path of the output CSV; the season code is embedded in the filename.
OUTPUT_CSV_NAME = os.path.join(OUTPUT_DIR, f'demoTcn_NDLI_NDVI_data_{TARGET_SEASON}.csv')


# --- Helper function to get coordinates ---
def get_coordinates(transform, rows, cols):
    """Return the (x, y) coordinates of the given pixel rows and columns.

    The values are whatever ``rasterio.transform.xy`` yields for ``transform``.
    They are longitude/latitude only if the raster is in a geographic CRS such
    as WGS84 (EPSG:4326); no reprojection is done here.
    """
    x_coords, y_coords = rasterio.transform.xy(transform, rows, cols)
    return x_coords, y_coords  # typically longitude, latitude

# --- Main Processing Logic ---
# One dict per valid pixel per year is appended here.
all_pixel_data = []

print(f"Processing files from {START_YEAR} to {END_YEAR} for season {TARGET_SEASON}...")

for year in range(START_YEAR, END_YEAR + 1):
    year_str = str(year)
    print(f"\nProcessing year: {year_str}")

    # Glob patterns for the three layers of this year/season.
    # NOTE(review): all three patterns are the same 'TVPDICC_' pattern, so the
    # Tcn, NDLIn and NDVIn columns end up holding values from the same raster.
    # Confirm whether this is intended.
    tcn_file_pattern = os.path.join(DRIVE_FOLDER_PATH, f'TVPDICC_{year_str}_{TARGET_SEASON}*.tif')
    ndli_file_pattern = os.path.join(DRIVE_FOLDER_PATH, f'TVPDICC_{year_str}_{TARGET_SEASON}*.tif')
    ndvi_file_pattern = os.path.join(DRIVE_FOLDER_PATH, f'TVPDICC_{year_str}_{TARGET_SEASON}*.tif')

    # glob tolerates extra characters after the season code
    tcn_files = glob.glob(tcn_file_pattern)
    ndli_files = glob.glob(ndli_file_pattern)
    ndvi_files = glob.glob(ndvi_file_pattern)

    # --- File Existence Checks ---
    # Report the first missing layer (checked in Tcn, NDLIn, NDVIn order)
    missing_layer = next(
        (label
         for label, matches in (('Tcn', tcn_files), ('NDLIn', ndli_files), ('NDVIn', ndvi_files))
         if not matches),
        None,
    )
    if missing_layer is not None:
        print(f"  {missing_layer} file not found for {year_str}_{TARGET_SEASON}. Skipping year...")
        continue

    # Use the first match of each pattern
    tcn_file_path = tcn_files[0]
    ndli_file_path = ndli_files[0]
    ndvi_file_path = ndvi_files[0]

    print(f"  Found Tcn:  {os.path.basename(tcn_file_path)}")
    print(f"  Found NDLI: {os.path.basename(ndli_file_path)}")
    print(f"  Found NDVI: {os.path.basename(ndvi_file_path)}")


    try:
        with rasterio.open(tcn_file_path) as tcn_src, \
             rasterio.open(ndli_file_path) as ndli_src, \
             rasterio.open(ndvi_file_path) as ndvi_src:

            # The Tcn raster decides which pixels count as valid
            tcn_data = tcn_src.read(1)
            transform = tcn_src.transform  # affine transform for the coordinate lookup
            nodata_val_tcn = tcn_src.nodata

            if nodata_val_tcn is None:
                # No declared nodata value: every finite pixel is valid
                mask = np.isfinite(tcn_data)
            else:
                # Valid where the pixel is neither the nodata value nor NaN
                mask = (tcn_data != nodata_val_tcn) & (~np.isnan(tcn_data))

            ndli_data_raw = ndli_src.read(1)
            ndvi_data_raw = ndvi_src.read(1)

            # The same mask indexes all three rasters, so shapes must agree
            if not (tcn_data.shape == ndli_data_raw.shape == ndvi_data_raw.shape):
                print(f"  ERROR: Raster dimensions do not match for year {year_str}. Skipping year.")
                continue

            # Row/column indices of the valid pixels
            row_indices, col_indices = np.where(mask)

            if row_indices.size == 0:
                print(f"  No valid data pixels found in Tcn for {year_str}_{TARGET_SEASON} based on the mask. Skipping year.")
                continue

            # Values at the valid pixels
            tcn_values = tcn_data[mask]
            ndli_values = ndli_data_raw[mask]
            ndvi_values = ndvi_data_raw[mask]

            # Coordinates of the valid pixels
            longitudes, latitudes = get_coordinates(transform, row_indices, col_indices)

            # One record per valid pixel for this year
            all_pixel_data.extend(
                {
                    'lat': lat,
                    'long': lon,
                    'year': year,
                    'Tcn': tcn_value,
                    'NDLIn': ndli_value,
                    'NDVIn': ndvi_value,
                }
                for lat, lon, tcn_value, ndli_value, ndvi_value
                in zip(latitudes, longitudes, tcn_values, ndli_values, ndvi_values)
            )
            print(f"  Processed {len(longitudes)} pixels for {year_str}_{TARGET_SEASON}.")

    except rasterio.errors.RasterioIOError as e:
        print(f"  Error opening one of the files for year {year_str}: {e}. Skipping year.")
    except Exception as e:
        print(f"  An unexpected error occurred processing files for year {year_str}: {e}")
# Convert collected data to a Pandas DataFrame
df = pd.DataFrame(all_pixel_data)

if df.empty:
    print("\nNo data was extracted. CSV file not created, and no plots will be generated.")
else:
    # OUTPUT_CSV_NAME already holds the full path of the CSV
    df.to_csv(OUTPUT_CSV_NAME, index=False)
    print(f"\nSuccessfully extracted data and saved to: {OUTPUT_CSV_NAME}")

    # --- Grouping and Plotting ---
    print("\nStarting data aggregation and plotting...")

    # Columns that are averaged and plotted; 'lat' and 'long' are coordinates
    # and are left out on purpose.
    indices_to_plot = ['Tcn', 'NDLIn', 'NDVIn']

    # Mean of each index per year
    df_grouped_by_year = df.groupby('year')[indices_to_plot].mean().reset_index()

    print("\nAggregated Data (Mean per Year):")
    print(df_grouped_by_year.head())

    # One line plot per index: yearly mean against year
    for index_col in indices_to_plot:
        plt.figure(figsize=(10, 6))
        sns.lineplot(data=df_grouped_by_year, x='year', y=index_col, marker='o')
        plt.title(f'Mean {index_col} Over Years (Season {TARGET_SEASON})')
        plt.xlabel('Year')
        plt.ylabel(f'Mean {index_col}')
        plt.grid(True)
        # A tick for every year, rotated so the labels stay readable
        plt.xticks(df_grouped_by_year['year'].unique(), rotation=45)
        plt.tight_layout()

        # Save the figure before showing it
        plot_filename = os.path.join(OUTPUT_DIR, f'{index_col}_vs_Year_Season_{TARGET_SEASON}.png')
        plt.savefig(plot_filename)
        print(f"Plot saved to: {plot_filename}")
        plt.show()

print("\nProcessing complete.")

In [None]:
!pip install matplotlib rasterio
!pip install rasterio

# TEMP_DATA ORGANIZATION

In [None]:
import os
import rasterio
import numpy as np
from rasterio.enums import Resampling

# Set folders
src_folder = "/content/drive/MyDrive/NEW FOLDER/TVPDI2"  # Change to your folder
dst_folder = "/content/drive/MyDrive/NEW FOLDER/TVPDI"
os.makedirs(dst_folder, exist_ok=True)

# Season recoding: three-letter season name -> two-digit season code
season_map = {'DJF': '01', 'MAM': '02', 'JJA': '03', 'SON': '04'}

# Rewrite every 'cdd_<year>_<SEASON>.tif' from src_folder as a float32
# 'cdd_<year>_<code>.tif' in dst_folder. Files whose name does not fit that
# pattern, or whose year is outside 2001-2022, are skipped.
for fname in os.listdir(src_folder):
    if not (fname.endswith(".tif") and fname.startswith("cdd_")):
        continue

    parts = fname.replace(".tif", "").split("_")
    if len(parts) != 3:
        continue

    _, year_str, season = parts
    if season not in season_map:
        continue

    # A non-numeric year token must not abort the whole batch.
    try:
        year = int(year_str)
    except ValueError:
        print(f"Skipping {fname}: '{year_str}' is not a valid year")
        continue

    if not 2001 <= year <= 2022:
        continue

    new_name = f"cdd_{year}_{season_map[season]}.tif"
    src_path = os.path.join(src_folder, fname)
    dst_path = os.path.join(dst_folder, new_name)

    with rasterio.open(src_path) as src:
        profile = src.profile
        data = src.read(1).astype(np.float32)

    # Only band 1 is written, so declare a single float32 band in the output
    # instead of inheriting the source band count.
    profile.update(dtype=rasterio.float32, count=1)

    with rasterio.open(dst_path, 'w', **profile) as dst:
        dst.write(data, 1)

    print(f"✅ Converted and saved: {new_name}")



# Validation of LST: Tc,To,Ta

In [None]:
import os
import glob
import rasterio
from rasterio.features import shapes
from shapely.geometry import shape
import numpy as np
import pandas as pd
from google.colab import drive # Assuming you are running this in Google Colab

# --- Configuration ---
# Inclusive range of years to scan, and the season code to extract.
START_YEAR = 2001
END_YEAR = 2024
TARGET_SEASON = '04'

# !!! IMPORTANT: point this at the Google Drive folder holding the input rasters !!!
# (for example '/content/drive/MyDrive/TemperatureData')
DRIVE_FOLDER_PATH = '/content/drive/MyDrive/NEW FOLDER/NEWTRANING'

# Folder that receives the exported CSV; created on the spot if it is missing.
OUTPUT_DIR = '/content/drive/MyDrive/NEW FOLDER/STAT'
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Full path of the CSV, with the season code embedded in the file name.
OUTPUT_CSV_NAME = os.path.join(OUTPUT_DIR, f'TEMP_data_{TARGET_SEASON}.csv')


# --- Helper function to get coordinates ---
def get_coordinates(transform, rows, cols):
    """Map pixel row/column indices to map coordinates.

    Args:
        transform: Affine transform of the raster the indices refer to.
        rows: Row indices of the pixels.
        cols: Column indices of the pixels, paired element-wise with ``rows``.

    Returns:
        The ``(xs, ys)`` pair produced by ``rasterio.transform.xy``. These are
        longitude/latitude only when the raster uses a geographic CRS such as
        EPSG:4326; no reprojection is performed here.
    """
    x_coords, y_coords = rasterio.transform.xy(transform, rows, cols)
    return x_coords, y_coords

# --- Main Processing Logic ---
# For every year, sample the Tc, LST and (optional) Ta rasters at each valid Tc
# pixel. Each pixel becomes one dict in `all_pixel_data`, which is converted to
# a DataFrame after the loop.
all_pixel_data = []

print(f"Processing files from {START_YEAR} to {END_YEAR} for season {TARGET_SEASON}...")

for year in range(START_YEAR, END_YEAR + 1):
    year_str = str(year)
    print(f"\nProcessing year: {year_str}")

    # Filename patterns for this year/season; the trailing '*' tolerates extra
    # characters between the season code and '.tif'.
    tsc_file_pattern = os.path.join(DRIVE_FOLDER_PATH, f'Tc_{year_str}_{TARGET_SEASON}*.tif')
    tso_file_pattern = os.path.join(DRIVE_FOLDER_PATH, f'LST_{year_str}_{TARGET_SEASON}*.tif')
    ts_file_pattern = os.path.join(DRIVE_FOLDER_PATH, f'Ta_{year_str}_{TARGET_SEASON}*.tif')

    # glob.glob() returns matches in arbitrary order; sorting makes the
    # "take the first match" choice below reproducible between runs.
    tsc_files = sorted(glob.glob(tsc_file_pattern))
    tso_files = sorted(glob.glob(tso_file_pattern))
    ts_files = sorted(glob.glob(ts_file_pattern))

    # Tc and LST are mandatory for a year to be processed; Ta is optional.
    if not tsc_files:
        print(f"  Tsc file not found for {year_str}_{TARGET_SEASON}. Skipping year...")
        continue
    if not tso_files:
        print(f"  Tso file not found for {year_str}_{TARGET_SEASON}. Skipping year...")
        continue

    tsc_file_path = tsc_files[0]
    tso_file_path = tso_files[0]
    ts_file_path = ts_files[0] if ts_files else None

    print(f"  Found Tsc: {os.path.basename(tsc_file_path)}")
    print(f"  Found Tso: {os.path.basename(tso_file_path)}")
    if ts_file_path:
        print(f"  Found Ts:  {os.path.basename(ts_file_path)}")
    else:
        print(f"  Ts file not found for {year_str}_{TARGET_SEASON}. Ts_T values will be NaN.")

    try:
        with rasterio.open(tsc_file_path) as tsc_src, \
             rasterio.open(tso_file_path) as tso_src:

            # The Tc raster is the reference: it supplies the transform used
            # for coordinates and the validity mask applied to all rasters.
            tsc_data = tsc_src.read(1)
            transform = tsc_src.transform
            nodata_val_tsc = tsc_src.nodata

            # mask is True where Tc holds real data.
            if nodata_val_tsc is not None:
                mask = (tsc_data != nodata_val_tsc) & (~np.isnan(tsc_data))
            else:
                # No declared nodata value: treat every finite value as valid.
                mask = np.isfinite(tsc_data)

            tso_data_raw = tso_src.read(1)

            # Only the array shape is compared; transform and CRS are assumed
            # to match between rasters.
            if tsc_data.shape != tso_data_raw.shape:
                print(f"  ERROR: Tsc and Tso raster dimensions do not match for year {year_str}. Skipping year.")
                continue

            # Stays None when Ta is missing, unreadable or differently shaped.
            ts_data_raw = None
            if ts_file_path:
                try:
                    with rasterio.open(ts_file_path) as ts_src:
                        current_ts_data = ts_src.read(1)
                    if current_ts_data.shape != tsc_data.shape:
                        print(f"  WARNING: Ts raster dimensions ({current_ts_data.shape}) do not match Tsc ({tsc_data.shape}) for year {year_str}. Ts_T values for this year will be NaN.")
                    else:
                        ts_data_raw = current_ts_data
                except rasterio.errors.RasterioIOError as e:
                    print(f"  WARNING: Could not open or read Ts file {os.path.basename(ts_file_path)} for year {year_str}: {e}. Ts_T values for this year will be NaN.")

            # Row/column indices of the valid Tc pixels.
            row_indices, col_indices = np.where(mask)

            if row_indices.size == 0:
                print(f"  No valid data pixels found in Tsc for {year_str}_{TARGET_SEASON} based on the mask. Skipping year.")
                continue

            # NOTE(review): the nodata values of the LST and Ta rasters are not
            # masked, so they reach the CSV unchanged - confirm this is intended.
            tsc_temps = tsc_data[mask]
            tso_temps = tso_data_raw[mask]

            if ts_data_raw is not None:
                ts_temps_masked = ts_data_raw[mask]
            else:
                # NaN placeholder column; reuse the Tc dtype when it is a float
                # type, otherwise fall back to float32 so NaN is representable.
                nan_dtype = tsc_temps.dtype if np.issubdtype(tsc_temps.dtype, np.floating) else np.float32
                ts_temps_masked = np.full_like(tsc_temps, np.nan, dtype=nan_dtype)

            longitudes, latitudes = get_coordinates(transform, row_indices, col_indices)

            # One record per valid pixel; all five sequences have equal length.
            all_pixel_data.extend(
                {
                    'lat': lat,
                    'long': lon,
                    'year': year,
                    'Tc': tc_value,
                    'Tso': tso_value,
                    'Ta': ta_value,  # NaN when Ta data was unavailable/invalid
                }
                for lat, lon, tc_value, tso_value, ta_value in zip(
                    latitudes, longitudes, tsc_temps, tso_temps, ts_temps_masked
                )
            )
            print(f"  Processed {len(longitudes)} pixels for {year_str}_{TARGET_SEASON}.")

    except rasterio.errors.RasterioIOError as e:
        print(f"  Error opening Tsc or Tso file for year {year_str}: {e}. Skipping year.")
    except Exception as e:
        # Keep going with the next year, but show where the failure happened.
        print(f"  An unexpected error occurred processing files for year {year_str}: {e}")
        import traceback
        traceback.print_exc()

# Assemble every collected pixel record into a single table.
df = pd.DataFrame(all_pixel_data)

# Only write a CSV when at least one pixel was extracted.
if df.empty:
    print("\nNo data was extracted. CSV file not created.")
else:
    # OUTPUT_CSV_NAME already holds the full destination path.
    df.to_csv(OUTPUT_CSV_NAME, index=False)
    print(f"\nSuccessfully extracted data and saved to: {OUTPUT_CSV_NAME}")

print("\nProcessing complete.")

# All_indices_data_csv export

In [None]:
import os
import glob
import rasterio
import numpy as np
import pandas as pd
# from google.colab import drive # Assuming you are running this in Google Colab

# --- Configuration ---
# Inclusive range of years to scan (2024 so every SWATI file is covered) and
# the season code to extract (e.g. '01' for DJF).
START_YEAR = 2001
END_YEAR = 2024
TARGET_SEASON = '04'

# !!! IMPORTANT: point these at your data folders in Google Drive !!!
# Folder holding the primary SWATI rasters (2001-2024).
SWATI_FOLDER_PATH = '/content/drive/MyDrive/NEW FOLDER/SWATI'

# Folder holding all secondary rasters (TVPDIC, cdd, prep, TVPDI).
DRIVE_FOLDER_PATH = '/content/drive/MyDrive/NEW FOLDER/TVPDI'

# Folder that receives the exported CSV; created on the spot if it is missing.
OUTPUT_DIR = '/content/drive/MyDrive/NEW FOLDER/STAT'
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Full path of the CSV, with the season code embedded in the file name.
OUTPUT_CSV_NAME = os.path.join(OUTPUT_DIR, f'All_indices_{TARGET_SEASON}.csv')

# --- Helper function to get coordinates (reusable) ---
def get_coordinates(transform, rows, cols):
    """Map pixel row/column indices to map coordinates.

    Args:
        transform: Affine transform of the raster the indices refer to.
        rows: Row indices of the pixels.
        cols: Column indices of the pixels, paired element-wise with ``rows``.

    Returns:
        The ``(xs, ys)`` pair produced by ``rasterio.transform.xy``. These are
        longitude/latitude only when the raster uses a geographic CRS; no
        reprojection is performed here.
    """
    x_coords, y_coords = rasterio.transform.xy(transform, rows, cols)
    return x_coords, y_coords

# --- Main Processing Logic ---
# Secondary rasters sampled at each valid SWATI pixel, as label -> filename prefix.
# The lower-cased label names the CSV column; note that the 'prep' column is
# read from files named 'meanr_*'.
SECONDARY_PREFIXES = {'TVPDIC': 'TVPDIC', 'TVPDI': 'TVPDI', 'cdd': 'cdd', 'prep': 'meanr'}


def _first_match(folder, prefix, year_str, season):
    """Return the first '<prefix>_<year>_<season>*.tif' path in *folder*, or None.

    glob.glob() returns matches in arbitrary order, so they are sorted first to
    make the choice reproducible when several files match.
    """
    matches = sorted(glob.glob(os.path.join(folder, f'{prefix}_{year_str}_{season}*.tif')))
    return matches[0] if matches else None


def _read_band_if_aligned(path, label, ref_shape):
    """Read band 1 of a secondary raster if it has the reference shape.

    Args:
        path: Raster path, or None when no file was found.
        label: Name used in log messages; its lower-case form is the column name.
        ref_shape: Shape of the primary raster's band array.

    Returns:
        The band-1 array, or None when *path* is None, the file cannot be
        opened/read, or its shape differs from *ref_shape*. A warning is
        printed in the last two cases.
    """
    if not path:
        return None
    column = label.lower()
    try:
        with rasterio.open(path) as src:
            if src.shape != ref_shape:
                print(f"  WARNING: {label} raster dimensions ({src.shape}) do not match primary SWATI ({ref_shape}). {column} values will be NaN.")
                return None
            return src.read(1)
    except rasterio.errors.RasterioIOError as e:
        print(f"  WARNING: Could not open/read {label} file {os.path.basename(path)}: {e}. {column} values will be NaN.")
        return None


# One dict per valid SWATI pixel per year; converted to a DataFrame after the loop.
all_pixel_data = []

print(f"Processing files from {START_YEAR} to {END_YEAR} for season {TARGET_SEASON}...")
print(f"Primary reference file: SWATI_YYYY_SS*.tif (from {SWATI_FOLDER_PATH})")
print(f"Secondary files (2001-2022): TVPDIC_*, TVPDI_*, cdd_*, prep_* (from {DRIVE_FOLDER_PATH})")
print(f"For 2023-2024, only SWATI data will be extracted; other values will be NaN.")

for year in range(START_YEAR, END_YEAR + 1):
    year_str = str(year)
    print(f"\nProcessing year: {year_str}, Season: {TARGET_SEASON}")

    # The primary SWATI raster is mandatory; without it the year is skipped.
    swati_file_path = _first_match(SWATI_FOLDER_PATH, 'SWATI', year_str, TARGET_SEASON)
    if swati_file_path is None:
        print(f"  Primary SWATI file not found for {year_str}_{TARGET_SEASON}. Skipping year/season...")
        continue

    # Secondary rasters are optional; a missing one yields a NaN column.
    secondary_paths = {
        label: _first_match(DRIVE_FOLDER_PATH, prefix, year_str, TARGET_SEASON)
        for label, prefix in SECONDARY_PREFIXES.items()
    }

    print(f"  Found primary SWATI: {os.path.basename(swati_file_path)}")
    for label, path in secondary_paths.items():
        if path:
            print(f"  Found {label}: {os.path.basename(path)}")
        else:
            print(f"  {label} file not found for {year_str}_{TARGET_SEASON}. {label.lower()} values will be NaN.")

    try:
        # The primary raster supplies the transform used for coordinates and
        # the validity mask applied to every raster.
        with rasterio.open(swati_file_path) as swati_src:
            swati_data = swati_src.read(1)
            transform = swati_src.transform
            nodata_val_swati = swati_src.nodata

        # mask is True where SWATI holds real data.
        if nodata_val_swati is not None:
            mask = (swati_data != nodata_val_swati) & (~np.isnan(swati_data))
        else:
            mask = np.isfinite(swati_data)

        # Only the array shape is compared with the primary raster; transform
        # and CRS are assumed to match.
        secondary_bands = {
            label: _read_band_if_aligned(path, label, swati_data.shape)
            for label, path in secondary_paths.items()
        }

        row_indices, col_indices = np.where(mask)

        if row_indices.size == 0:
            print(f"  No valid data pixels found in primary SWATI for {year_str}_{TARGET_SEASON}. Skipping.")
            continue

        swati_values_at_mask = swati_data[mask]
        # NaN placeholders reuse the SWATI dtype when it is a float type,
        # otherwise float32 so NaN is representable.
        nan_dtype = swati_values_at_mask.dtype if np.issubdtype(swati_values_at_mask.dtype, np.floating) else np.float32

        # Column name -> values at the valid pixels, in output column order.
        # NOTE(review): nodata values of the secondary rasters are not masked
        # and reach the CSV unchanged - confirm this is intended.
        columns = {'swati': swati_values_at_mask}
        for label, band in secondary_bands.items():
            if band is None:
                columns[label.lower()] = np.full_like(swati_values_at_mask, np.nan, dtype=nan_dtype)
            else:
                columns[label.lower()] = band[mask]

        longitudes, latitudes = get_coordinates(transform, row_indices, col_indices)

        # One record per valid pixel; every sequence has the same length.
        column_names = list(columns)
        for lat, lon, *values in zip(latitudes, longitudes, *columns.values()):
            record = {'lat': lat, 'long': lon, 'year': year}
            record.update(zip(column_names, values))
            all_pixel_data.append(record)
        print(f"  Processed {len(longitudes)} pixels for {year_str}_{TARGET_SEASON}.")

    except rasterio.errors.RasterioIOError as e:
        print(f"  Error opening primary SWATI file {os.path.basename(swati_file_path)}: {e}. Skipping.")
    except Exception as e:
        # Keep going with the next year, but show where the failure happened.
        print(f"  An unexpected error occurred processing files for {year_str}_{TARGET_SEASON}: {e}")
        import traceback
        traceback.print_exc()

# Write the collected pixel records to CSV, unless nothing was extracted.
if not all_pixel_data:
    print("\nNo data was extracted. CSV file not created.")
else:
    df = pd.DataFrame(all_pixel_data)
    df.to_csv(OUTPUT_CSV_NAME, index=False)
    print(f"\nSuccessfully extracted data and saved to: {OUTPUT_CSV_NAME}")

print("\nProcessing complete.")

# SWATI_mean rain_ Organization

In [None]:
import os
import shutil

# Source folder with the 'meanr_*' rasters and the folder they are copied into.
src = '/content/drive/MyDrive/PREP_VALIDATION'
dst = '/content/drive/MyDrive/NEW FOLDER/TVPDI'

os.makedirs(dst, exist_ok=True)

# Copy 'meanr_<year>_<season>.tif' for every year 2001-2024 and every season
# code, reporting each file that is missing from the source folder.
for year in range(2001, 2025):
    for season in ['01', '02', '03', '04']:
        file_name = f'meanr_{year}_{season}.tif'  # change extension if needed
        src_path = os.path.join(src, file_name)
        dst_path = os.path.join(dst, file_name)

        if not os.path.isfile(src_path):
            print(f' Not found: {file_name}')
            continue

        # copy2 also carries over the file metadata (e.g. timestamps).
        shutil.copy2(src_path, dst_path)
        print(f'✅ Copied file: {file_name}')



# SWATI_Cdd_ Organization

In [None]:
import os
import shutil

# Define source and destination directories
src = '/content/drive/MyDrive/NEW FOLDER/SWATI'
dst = '/content/drive/MyDrive/NEW FOLDER/TVPDI'

# Create the destination up front: shutil.copy2 fails when the target folder
# does not exist.
os.makedirs(dst, exist_ok=True)

# Source season suffix -> destination season suffix. This is currently an
# identity mapping, so files keep their names; edit it to recode seasons.
season_map = {
    '01': '01',
    '02': '02',
    '03': '03',
    '04': '04'
}

# Copy 'SWATI_<year>_<season>.tif' for every year 2001-2024 and every mapped
# season, reporting each file that is missing from the source folder.
for year in range(2001, 2025):
    for season_str, season_code in season_map.items():
        # Name of the file in the source folder
        original_name = f'SWATI_{year}_{season_str}.tif'
        src_path = os.path.join(src, original_name)

        # Name of the copy in the destination folder
        renamed_name = f'SWATI_{year}_{season_code}.tif'
        dst_path = os.path.join(dst, renamed_name)

        if os.path.isfile(src_path):
            shutil.copy2(src_path, dst_path)
            print(f'✅ Copied and renamed: {original_name} ➜ {renamed_name}')
        else:
            print(f'❌ File not found: {original_name}')


In [None]:
!pip install rasterio

# SWATI_cdd_TVPDI_prep

In [None]:
import os
import glob
import rasterio
import numpy as np
import pandas as pd
# from google.colab import drive # Assuming you are running this in Google Colab

# --- Configuration ---
# Inclusive range of years to scan (2024 so every SWATI file is covered) and
# the season code to extract (e.g. '01' for DJF).
START_YEAR = 2001
END_YEAR = 2024
TARGET_SEASON = '04'

# !!! IMPORTANT: point these at your data folders in Google Drive !!!
# Folder holding the primary SWATI rasters (2001-2024).
SWATI_FOLDER_PATH = '/content/drive/MyDrive/NEW FOLDER/SWATI'

# Folder holding the TVPDIC, cdd and prep rasters (2001-2022).
DRIVE_FOLDER_PATH = '/content/drive/MyDrive/NEW FOLDER/TVPDI'

# Folder that receives the exported CSV; created on the spot if it is missing.
OUTPUT_DIR = '/content/drive/MyDrive/NEW FOLDER/STAT'
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Full path of the CSV, with the season code embedded in the file name.
OUTPUT_CSV_NAME = os.path.join(OUTPUT_DIR, f'SWATI_TVPDI_pcd_{TARGET_SEASON}.csv')

# --- Helper function to get coordinates (reusable) ---
def get_coordinates(transform, rows, cols):
    """Map pixel row/column indices to map coordinates.

    Args:
        transform: Affine transform of the raster the indices refer to.
        rows: Row indices of the pixels.
        cols: Column indices of the pixels, paired element-wise with ``rows``.

    Returns:
        The ``(xs, ys)`` pair produced by ``rasterio.transform.xy``. These are
        longitude/latitude only when the raster uses a geographic CRS; no
        reprojection is performed here.
    """
    x_coords, y_coords = rasterio.transform.xy(transform, rows, cols)
    return x_coords, y_coords

# --- Main Processing Logic ---
# Secondary rasters sampled at each valid SWATI pixel, as label -> filename prefix.
# The lower-cased label names the CSV column.
SECONDARY_PREFIXES = {'TVPDIC': 'TVPDIC', 'cdd': 'cdd', 'prep': 'prep'}


def _first_match(folder, prefix, year_str, season):
    """Return the first '<prefix>_<year>_<season>*.tif' path in *folder*, or None.

    glob.glob() returns matches in arbitrary order, so they are sorted first to
    make the choice reproducible when several files match.
    """
    matches = sorted(glob.glob(os.path.join(folder, f'{prefix}_{year_str}_{season}*.tif')))
    return matches[0] if matches else None


def _read_band_if_aligned(path, label, ref_shape):
    """Read band 1 of a secondary raster if it has the reference shape.

    Args:
        path: Raster path, or None when no file was found.
        label: Name used in log messages; its lower-case form is the column name.
        ref_shape: Shape of the primary raster's band array.

    Returns:
        The band-1 array, or None when *path* is None, the file cannot be
        opened/read, or its shape differs from *ref_shape*. A warning is
        printed in the last two cases.
    """
    if not path:
        return None
    column = label.lower()
    try:
        with rasterio.open(path) as src:
            if src.shape != ref_shape:
                print(f"  WARNING: {label} raster dimensions ({src.shape}) do not match primary SWATI ({ref_shape}). {column} values will be NaN.")
                return None
            return src.read(1)
    except rasterio.errors.RasterioIOError as e:
        print(f"  WARNING: Could not open/read {label} file {os.path.basename(path)}: {e}. {column} values will be NaN.")
        return None


# One dict per valid SWATI pixel per year; converted to a DataFrame after the loop.
all_pixel_data = []

print(f"Processing files from {START_YEAR} to {END_YEAR} for season {TARGET_SEASON}...")
print(f"Primary reference file: SWATI_YYYY_SS*.tif (from {SWATI_FOLDER_PATH})")
print(f"Secondary files (2001-2022): TVPDIC_*, cdd_*, prep_* (from {DRIVE_FOLDER_PATH})")
print(f"For 2023-2024, only SWATI data will be extracted; other values will be NaN.")

for year in range(START_YEAR, END_YEAR + 1):
    year_str = str(year)
    print(f"\nProcessing year: {year_str}, Season: {TARGET_SEASON}")

    # The primary raster (files named 'SWATI2_*') is mandatory; without it the
    # year is skipped.
    swati_file_path = _first_match(SWATI_FOLDER_PATH, 'SWATI2', year_str, TARGET_SEASON)
    if swati_file_path is None:
        print(f"  Primary SWATI file not found for {year_str}_{TARGET_SEASON}. Skipping year/season...")
        continue

    # Secondary rasters are optional; a missing one yields a NaN column.
    secondary_paths = {
        label: _first_match(DRIVE_FOLDER_PATH, prefix, year_str, TARGET_SEASON)
        for label, prefix in SECONDARY_PREFIXES.items()
    }

    print(f"  Found primary SWATI: {os.path.basename(swati_file_path)}")
    for label, path in secondary_paths.items():
        if path:
            print(f"  Found {label}: {os.path.basename(path)}")
        else:
            print(f"  {label} file not found for {year_str}_{TARGET_SEASON}. {label.lower()} values will be NaN.")

    try:
        # The primary raster supplies the transform used for coordinates and
        # the validity mask applied to every raster.
        with rasterio.open(swati_file_path) as swati_src:
            swati_data = swati_src.read(1)
            transform = swati_src.transform
            nodata_val_swati = swati_src.nodata

        # mask is True where SWATI holds real data.
        if nodata_val_swati is not None:
            mask = (swati_data != nodata_val_swati) & (~np.isnan(swati_data))
        else:
            mask = np.isfinite(swati_data)

        # Only the array shape is compared with the primary raster; transform
        # and CRS are assumed to match.
        secondary_bands = {
            label: _read_band_if_aligned(path, label, swati_data.shape)
            for label, path in secondary_paths.items()
        }

        row_indices, col_indices = np.where(mask)

        if row_indices.size == 0:
            print(f"  No valid data pixels found in primary SWATI for {year_str}_{TARGET_SEASON}. Skipping.")
            continue

        swati_values_at_mask = swati_data[mask]
        # NaN placeholders reuse the SWATI dtype when it is a float type,
        # otherwise float32 so NaN is representable.
        nan_dtype = swati_values_at_mask.dtype if np.issubdtype(swati_values_at_mask.dtype, np.floating) else np.float32

        # Column name -> values at the valid pixels, in output column order.
        # NOTE(review): nodata values of the secondary rasters are not masked
        # and reach the CSV unchanged - confirm this is intended.
        columns = {'swati2': swati_values_at_mask}
        for label, band in secondary_bands.items():
            if band is None:
                columns[label.lower()] = np.full_like(swati_values_at_mask, np.nan, dtype=nan_dtype)
            else:
                columns[label.lower()] = band[mask]

        longitudes, latitudes = get_coordinates(transform, row_indices, col_indices)

        # One record per valid pixel; every sequence has the same length.
        column_names = list(columns)
        for lat, lon, *values in zip(latitudes, longitudes, *columns.values()):
            record = {'lat': lat, 'long': lon, 'year': year}
            record.update(zip(column_names, values))
            all_pixel_data.append(record)
        print(f"  Processed {len(longitudes)} pixels for {year_str}_{TARGET_SEASON}.")

    except rasterio.errors.RasterioIOError as e:
        print(f"  Error opening primary SWATI file {os.path.basename(swati_file_path)} for {year_str}_{TARGET_SEASON}: {e}. Skipping.")
    except Exception as e:
        # Keep going with the next year, but show where the failure happened.
        print(f"  An unexpected error occurred processing files for {year_str}_{TARGET_SEASON}: {e}")
        import traceback
        traceback.print_exc()

# Write the collected pixel records to CSV, unless nothing was extracted.
if not all_pixel_data:
    print("\nNo data was extracted. CSV file not created.")
else:
    df = pd.DataFrame(all_pixel_data)
    df.to_csv(OUTPUT_CSV_NAME, index=False)
    print(f"\nSuccessfully extracted data and saved to: {OUTPUT_CSV_NAME}")

print("\nProcessing complete.")

# Visualization

In [None]:
# prompt: PLEASE HELP ME TO DISPLAY ALL TVPDI MAPS AND HELP ME TO SEE

import matplotlib.pyplot as plt
import rasterio
import os

data_folder = '/content/drive/MyDrive/NEW DEMO/PREP'

# Show every 'prep_*.tif' raster in the folder, one figure per file, in name order.
for filename in sorted(os.listdir(data_folder)):
    if not (filename.endswith('.tif') and filename.startswith('prep_')):
        continue

    print(f"Displaying: {filename}")
    filepath = os.path.join(data_folder, filename)

    # masked=True hides nodata pixels, so a fill value (e.g. -9999) cannot
    # stretch the colour scale and wash out the real data.
    with rasterio.open(filepath) as src:
        tvpdi_data = src.read(1, masked=True)

    plt.figure(figsize=(10, 8))
    plt.imshow(tvpdi_data, cmap='viridis')  # You can change the colormap
    # NOTE(review): the label says 'TVPDI' although the files shown are 'prep_*' - confirm
    plt.colorbar(label='TVPDI')
    plt.title(filename)
    plt.show()


# Trend analysis

In [None]:
# prompt: plot rastrio :/content/drive/MyDrive/PREP_VALIDATION/rain_2021_04.tif
import rasterio
import numpy as np
import matplotlib.pyplot as plt
import numpy as np
file_path = '/content/drive/MyDrive/PREP_VALIDATION/rain_2021_04.tif'

# Plot band 1 of a single raster, blanking out its nodata pixels.
try:
    with rasterio.open(file_path) as src:
        nodata_value = src.nodata
        raster_data = src.read(1)  # first band; the file is assumed to be single-band

        # Swap the declared nodata value (if any) for NaN before plotting.
        if nodata_value is not None:
            is_nodata = raster_data == nodata_value
            raster_data = np.where(is_nodata, np.nan, raster_data)

        plt.imshow(raster_data, cmap='viridis')  # You can change the colormap
        plt.colorbar(label='Pixel Value')
        plt.title(f'Raster Data from {file_path}')
        plt.show()

except rasterio.errors.RasterioIOError as e:
    # The file is missing or could not be opened/read by rasterio.
    print(f"Error opening or reading the raster file: {e}")
except Exception as e:
    print(f"An unexpected error occurred: {e}")
