## 1. General File functionality

### 1.1 Display all subfolders with number of files

In [None]:
import os

def count_files_in_subfolders(root_dir):
    """
    Map every folder under ``root_dir`` that directly contains files to its file count.

    Args:
        root_dir (str): Directory to walk recursively.

    Returns:
        dict: ``{path relative to root_dir: number of files in that folder}``.
        Folders holding no files are left out; files located directly in
        ``root_dir`` are reported under the key ``"."``.
    """
    return {
        os.path.relpath(current_dir, root_dir): len(file_names)
        for current_dir, _dir_names, file_names in os.walk(root_dir)
        if file_names  # skip folders that hold no files themselves
    }

# Example usage
# The path is relative, so it is resolved against the current working directory.
root_directory = "ChangeDetectionMergedDividedSplit-DW-tif"  # Replace with your directory path
file_counts = count_files_in_subfolders(root_directory)

# Print the relative path and file count for folders with files
# (folders without files are not present in file_counts at all).
for folder, count in file_counts.items():
    print(f"{folder}: {count}")

### 1.2 View any image details

In [None]:
import rasterio
import numpy as np
import matplotlib.pyplot as plt

image_path = 'ChangeDetectionMerged-tif/cd2_Output/cd2_m_SouthDakota_SiouxFalls_E.tif'

# Open the image using rasterio and read only the first band
with rasterio.open(image_path) as src:
    image = src.read(1)

# Optional normalisation (assumes uint8 input, scales to 0-1). Left disabled so
# that the raw pixel values are what gets displayed and reported below.
#image = image.astype(np.float32) / 255.0

# Display the image
plt.imshow(image, cmap='turbo')
plt.axis('off')
plt.show()

# Print metadata. The distinct values are computed once and reused, since
# np.unique has to sort the whole raster.
unique_values = np.unique(image)
print("SHAPE:", image.shape)
print("UNIQUE VALUES:", unique_values)
print("NUMBER OF CLASSES:", len(unique_values))

### 1.4 Convert single TIF to PNG

In [None]:
import rasterio
import numpy as np
from PIL import Image

def tif_to_png(tif_path, png_path):
    """
    Convert a .tif image to a min-max normalised 8-bit .png image.

    Three-band rasters are reordered to (H, W, C) and single-band rasters are
    squeezed to (H, W). Any other band count is passed on unchanged.
    NOTE(review): Pillow may reject arrays with other band layouts - confirm
    before using this on such files.

    Args:
        tif_path (str): Path to the input .tif file.
        png_path (str): Path to save the output .png file.
    """
    # Read the .tif file using rasterio; src.read() returns (bands, H, W)
    with rasterio.open(tif_path) as src:
        array = src.read()

    if array.shape[0] == 3:  # RGB image
        array = np.moveaxis(array, 0, -1)  # Reorder dimensions to (H, W, C)
    elif array.shape[0] == 1:  # Grayscale image
        array = array[0]  # Remove the single-band dimension

    # Normalise in float64 so that subtracting the minimum cannot wrap around
    # in the input's integer dtype.
    array = array.astype(np.float64)
    array -= array.min()
    value_range = array.max()
    if value_range > 0:
        array = array / value_range * 255
    # A constant image has zero range: it is left as all zeros instead of
    # dividing by zero (which would yield NaN and an undefined uint8 cast).
    array = array.astype(np.uint8)

    # Save the NumPy array as a .png image using Pillow
    img = Image.fromarray(array)
    img.save(png_path)
    print(f"Saved {png_path}")

# Example usage:
# Both paths are relative to the current working directory.
tif_path = "RGB.tif"
png_path = "RGB_Charlotte.png"
tif_to_png(tif_path, png_path)

## 3. Raw G-Drive output to Raw TIF dataset 

### 3.1 Separate Image and Masks into folders
- Keep all the images in `ChangeDetection/Images`

In [None]:
import os
import shutil

def move_mask_files(source_folder, destination_folder):
    """
    Relocate every entry whose name begins with 'm_' into the destination folder.

    The destination folder is created when it does not exist yet, and one
    "Moved: ..." line is printed per relocated entry.

    :param source_folder: Path to the folder containing the files.
    :param destination_folder: Path to the folder where the files will be moved.
    """
    os.makedirs(destination_folder, exist_ok=True)

    # Snapshot the matching names first; the folder is modified while moving.
    mask_names = [name for name in os.listdir(source_folder) if name.startswith("m_")]

    for mask_name in mask_names:
        shutil.move(
            os.path.join(source_folder, mask_name),
            os.path.join(destination_folder, mask_name),
        )
        print(f"Moved: {mask_name}")

# Example usage:
# Entries named 'm_*' are moved out of the Images folder into the Masks folder.
source_folder = "ChangeDetectionInd/Images"
destination_folder = "ChangeDetectionInd/Masks"

move_mask_files(source_folder, destination_folder)

### 3.2 Separate 2019 and 2024
- Keep all the images in `ChangeDetection/Images/T2019`

In [None]:
import os
import shutil

def move_mask_files(source_folder, destination_folder, suffix="2024.tif"):
    """
    Move all files whose name ends with ``suffix`` from the source folder to
    the destination folder.

    With the default suffix this separates the 2024 rasters from a folder
    that holds both years. The destination folder is created if needed and
    one "Moved: ..." line is printed per moved file.

    :param source_folder: Path to the folder containing the files.
    :param destination_folder: Path to the folder where the files will be moved.
    :param suffix: File-name ending that selects the files to move
        (defaults to "2024.tif").
    """
    # Ensure the destination folder exists
    os.makedirs(destination_folder, exist_ok=True)

    # os.listdir returns a list, so moving entries while looping is safe
    for file_name in os.listdir(source_folder):
        if not file_name.endswith(suffix):
            continue

        source_path = os.path.join(source_folder, file_name)
        destination_path = os.path.join(destination_folder, file_name)

        shutil.move(source_path, destination_path)
        print(f"Moved: {file_name}")

# Example usage:
# Files ending in "2024.tif" are moved out of the T2019 folder into T2024.
source_folder = "ChangeDetectionInd/Masks/T2019"
destination_folder = "ChangeDetectionInd/Masks/T2024"

move_mask_files(source_folder, destination_folder)

### 3.3 Generating Change Detection ground truths

#### Case 1: MCD - Vegetation Increase and Decrease (3 classes) `cd1_Output`
- class 0 - no change, dense<->sparse
- class 1 - building to dense, building to sparse
- class 2 - dense to building, sparse to building

In [None]:
import os
from collections import Counter
import numpy as np
import rasterio
import csv

def classify_veg_increase_decrease_classes(mask_2019_path, mask_2024_path, output_path, csv_writer):
    """
    Build the 3-class vegetation change mask for one 2019/2024 mask pair.

    Output classes (input codes per this notebook's legend: 1 = building,
    2 = sparse vegetation, 3 = dense vegetation):
        0 - everything else (no change, dense <-> sparse, other codes)
        1 - vegetation increase: building -> sparse/dense
        2 - vegetation decrease: sparse/dense -> building

    The change mask is written to ``output_path`` as a single-band uint8
    GeoTIFF that reuses the 2019 raster's profile, and one row
    ``[mask name, class 0 area, class 1 area, class 2 area]`` (areas in km^2)
    is appended through ``csv_writer``.

    Args:
        mask_2019_path (str): Path of the 2019 mask (first band is used).
        mask_2024_path (str): Path of the 2024 mask (first band is used).
        output_path (str): Destination path of the change mask.
        csv_writer: Object with a ``writerow`` method, e.g. ``csv.writer``.

    Raises:
        ValueError: If the two masks do not have the same shape.
    """
    # Read 2019 mask; its profile is the template for the output file
    with rasterio.open(mask_2019_path) as src_2019:
        mask_2019 = src_2019.read(1)
        profile = src_2019.profile

    # Read 2024 mask
    with rasterio.open(mask_2024_path) as src_2024:
        mask_2024 = src_2024.read(1)

    # Fail early with a readable message instead of a broadcasting error
    if mask_2019.shape != mask_2024.shape:
        raise ValueError(
            f"Mask shapes differ: {mask_2019_path} {mask_2019.shape} vs "
            f"{mask_2024_path} {mask_2024.shape}"
        )

    # Start from all zeros, so class 0 needs no explicit assignment
    output_mask = np.zeros_like(mask_2019, dtype=np.uint8)

    vegetation = (2, 3)  # sparse, dense
    output_mask[(mask_2019 == 1) & np.isin(mask_2024, vegetation)] = 1  # Vegetation increase
    output_mask[np.isin(mask_2019, vegetation) & (mask_2024 == 1)] = 2  # Vegetation decrease

    # Count pixels per class in one vectorised pass
    class_counts = np.bincount(output_mask.ravel(), minlength=3)

    # Each pixel represents 100 m^2 (10 m x 10 m), converted to km^2
    pixel_area_km2 = (10 * 10) / (1000 * 1000)
    # Only classes that occur get an entry; absent classes are reported as 0
    area_by_class = {
        class_id: int(count) * pixel_area_km2
        for class_id, count in enumerate(class_counts)
        if count
    }

    # Prepare data for CSV
    mask_name = os.path.basename(mask_2019_path).replace("_2019.tif", "")
    csv_writer.writerow([mask_name] + [area_by_class.get(class_id, 0) for class_id in range(3)])

    # Save the output classification mask to a new TIFF file
    profile.update(dtype=rasterio.uint8, count=1)
    with rasterio.open(output_path, 'w', **profile) as dst:
        dst.write(output_mask, 1)

    print(f"Processed {mask_name}: Saved output mask and areas.")

def process_all_masks(input_folder_1, input_folder_2, output_folder, csv_file):
    """
    Build the cd1 change mask for every 2019 mask that has a 2024 counterpart.

    A file ``<name>_2019.tif`` in ``input_folder_1`` is paired with
    ``<name>_2024.tif`` in ``input_folder_2``; 2019 masks without a partner
    are skipped silently. Each pair is handed to
    ``classify_veg_increase_decrease_classes``, which writes
    ``cd1_<name>.tif`` into ``output_folder`` and appends one area row to the
    CSV. Masks are processed in sorted name order so the CSV is reproducible.

    Args:
        input_folder_1 (str): Folder with the ``*_2019.tif`` masks.
        input_folder_2 (str): Folder with the ``*_2024.tif`` masks.
        output_folder (str): Folder for the change masks (created if missing).
        csv_file (str): Path of the area summary CSV (overwritten).
    """
    suffix_2019 = "_2019.tif"
    suffix_2024 = "_2024.tif"

    # Ensure output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # Open CSV file for writing
    with open(csv_file, mode="w", newline="") as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(["Mask Name", "Class 0 Area", "Class 1 Area", "Class 2 Area"])  # Header row

        # Match on the real suffix so sidecar files (e.g. *.tif.aux.xml) are
        # ignored; keep the 2024 names in a set for O(1) lookups.
        mask_files_2019 = sorted(f for f in os.listdir(input_folder_1) if f.endswith(suffix_2019))
        mask_files_2024 = {f for f in os.listdir(input_folder_2) if f.endswith(suffix_2024)}

        # Match 2019 and 2024 masks by state and city and direction
        for mask_2019 in mask_files_2019:
            city_state_direction = mask_2019[:-len(suffix_2019)]
            mask_2024 = f"{city_state_direction}{suffix_2024}"
            if mask_2024 not in mask_files_2024:
                continue

            mask_2019_path = os.path.join(input_folder_1, mask_2019)
            mask_2024_path = os.path.join(input_folder_2, mask_2024)
            output_path = os.path.join(output_folder, f"cd1_{city_state_direction}.tif")

            classify_veg_increase_decrease_classes(mask_2019_path, mask_2024_path, output_path, csv_writer)

# Example usage
# 2019 and 2024 masks are read from separate folders; the change masks go to
# cd1_Output and the per-mask areas to the CSV next to it.
input_folder_1 = "ChangeDetectionInd/Masks/T2019"
input_folder_2 = "ChangeDetectionInd/Masks/T2024"
output_folder = "ChangeDetectionInd/cd1_Output"
csv_file = "ChangeDetectionInd/cd1_Ind_area_summary.csv"
process_all_masks(input_folder_1, input_folder_2, output_folder, csv_file)


#### Case 2: Building <-> Dense vegetation (3 classes) `cd2_Output` (not used)
- class 0  - no change, dense<->sparse, building<->sparse
- class 1 - building to dense
- class 2 - dense to building

In [None]:
import os
from collections import Counter
import numpy as np
import rasterio
import csv

def classify_veg_increase_decrease_classes(mask_2019_path, mask_2024_path, output_path, csv_writer):
    """
    Build the 3-class building <-> dense vegetation change mask for one mask pair.

    Output classes (input codes per this notebook's legend: 1 = building,
    2 = sparse vegetation, 3 = dense vegetation):
        0 - everything else (no change, dense <-> sparse, building <-> sparse,
            other codes)
        1 - building -> dense vegetation
        2 - dense vegetation -> building

    The change mask is written to ``output_path`` as a single-band uint8
    GeoTIFF that reuses the 2019 raster's profile, and one row
    ``[mask name, class 0 area, class 1 area, class 2 area]`` (areas in km^2)
    is appended through ``csv_writer``.

    Args:
        mask_2019_path (str): Path of the 2019 mask (first band is used).
        mask_2024_path (str): Path of the 2024 mask (first band is used).
        output_path (str): Destination path of the change mask.
        csv_writer: Object with a ``writerow`` method, e.g. ``csv.writer``.

    Raises:
        ValueError: If the two masks do not have the same shape.
    """
    # Read 2019 mask; its profile is the template for the output file
    with rasterio.open(mask_2019_path) as src_2019:
        mask_2019 = src_2019.read(1)
        profile = src_2019.profile

    # Read 2024 mask
    with rasterio.open(mask_2024_path) as src_2024:
        mask_2024 = src_2024.read(1)

    # Fail early with a readable message instead of a broadcasting error
    if mask_2019.shape != mask_2024.shape:
        raise ValueError(
            f"Mask shapes differ: {mask_2019_path} {mask_2019.shape} vs "
            f"{mask_2024_path} {mask_2024.shape}"
        )

    # Start from all zeros, so class 0 needs no explicit assignment
    output_mask = np.zeros_like(mask_2019, dtype=np.uint8)

    output_mask[(mask_2019 == 1) & (mask_2024 == 3)] = 1  # Building to dense vegetation
    output_mask[(mask_2019 == 3) & (mask_2024 == 1)] = 2  # Dense vegetation to building

    # Count pixels per class in one vectorised pass
    class_counts = np.bincount(output_mask.ravel(), minlength=3)

    # Each pixel represents 100 m^2 (10 m x 10 m), converted to km^2
    pixel_area_km2 = (10 * 10) / (1000 * 1000)
    # Only classes that occur get an entry; absent classes are reported as 0
    area_by_class = {
        class_id: int(count) * pixel_area_km2
        for class_id, count in enumerate(class_counts)
        if count
    }

    # Prepare data for CSV
    mask_name = os.path.basename(mask_2019_path).replace("_2019.tif", "")
    csv_writer.writerow([mask_name] + [area_by_class.get(class_id, 0) for class_id in range(3)])

    # Save the output classification mask to a new TIFF file
    profile.update(dtype=rasterio.uint8, count=1)
    with rasterio.open(output_path, 'w', **profile) as dst:
        dst.write(output_mask, 1)

    print(f"Processed {mask_name}: Saved output mask and areas.")

def process_all_masks(input_folder_1, input_folder_2, output_folder, csv_file):
    """
    Build the cd2 change mask for every 2019 mask that has a 2024 counterpart.

    A file ``<name>_2019.tif`` in ``input_folder_1`` is paired with
    ``<name>_2024.tif`` in ``input_folder_2``; 2019 masks without a partner
    are skipped silently. Each pair is handed to
    ``classify_veg_increase_decrease_classes``, which writes
    ``cd2_<name>.tif`` into ``output_folder`` and appends one area row to the
    CSV. Masks are processed in sorted name order so the CSV is reproducible.

    Args:
        input_folder_1 (str): Folder with the ``*_2019.tif`` masks.
        input_folder_2 (str): Folder with the ``*_2024.tif`` masks.
        output_folder (str): Folder for the change masks (created if missing).
        csv_file (str): Path of the area summary CSV (overwritten).
    """
    suffix_2019 = "_2019.tif"
    suffix_2024 = "_2024.tif"

    # Ensure output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # Open CSV file for writing
    with open(csv_file, mode="w", newline="") as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(["Mask Name", "Class 0 Area", "Class 1 Area", "Class 2 Area"])  # Header row

        # Match on the real suffix so sidecar files (e.g. *.tif.aux.xml) are
        # ignored; keep the 2024 names in a set for O(1) lookups.
        mask_files_2019 = sorted(f for f in os.listdir(input_folder_1) if f.endswith(suffix_2019))
        mask_files_2024 = {f for f in os.listdir(input_folder_2) if f.endswith(suffix_2024)}

        # Match 2019 and 2024 masks by state and city and direction
        for mask_2019 in mask_files_2019:
            city_state_direction = mask_2019[:-len(suffix_2019)]
            mask_2024 = f"{city_state_direction}{suffix_2024}"
            if mask_2024 not in mask_files_2024:
                continue

            mask_2019_path = os.path.join(input_folder_1, mask_2019)
            mask_2024_path = os.path.join(input_folder_2, mask_2024)
            output_path = os.path.join(output_folder, f"cd2_{city_state_direction}.tif")

            classify_veg_increase_decrease_classes(mask_2019_path, mask_2024_path, output_path, csv_writer)

# Example usage
# Same input folders as Case 1; results go to cd2_Output and a separate CSV.
input_folder_1 = "ChangeDetectionInd/Masks/T2019"
input_folder_2 = "ChangeDetectionInd/Masks/T2024"
output_folder = "ChangeDetectionInd/cd2_Output"
csv_file = "ChangeDetectionInd/cd2_Ind_area_summary.csv"
process_all_masks(input_folder_1, input_folder_2, output_folder, csv_file)

#### Case 3: All classes (7 classes) `cd3_Output` (not used)

In [None]:
import os
from collections import Counter
import numpy as np
import rasterio
import csv

def classify_veg_increase_decrease_classes(mask_2019_path, mask_2024_path, output_path, csv_writer):
    """
    Build the 7-class change mask for one 2019/2024 mask pair.

    Output classes (input codes per this notebook's legend: 1 = building,
    2 = sparse vegetation, 3 = dense vegetation):
        0 - everything else (no change, or any other code pair)
        1 - building -> sparse      4 - sparse -> building
        2 - building -> dense       5 - dense -> sparse
        3 - sparse -> dense         6 - dense -> building

    The change mask is written to ``output_path`` as a single-band uint8
    GeoTIFF that reuses the 2019 raster's profile, and one row with the mask
    name followed by the seven class areas (km^2) is appended through
    ``csv_writer``.

    Args:
        mask_2019_path (str): Path of the 2019 mask (first band is used).
        mask_2024_path (str): Path of the 2024 mask (first band is used).
        output_path (str): Destination path of the change mask.
        csv_writer: Object with a ``writerow`` method, e.g. ``csv.writer``.

    Raises:
        ValueError: If the two masks do not have the same shape.
    """
    # (2019 code, 2024 code) -> output class
    transitions = {
        (1, 2): 1,  # Building to sparse veg
        (1, 3): 2,  # Building to dense veg
        (2, 3): 3,  # Sparse to dense
        (2, 1): 4,  # Sparse to building
        (3, 2): 5,  # Dense to sparse
        (3, 1): 6,  # Dense to building
    }
    num_classes = 7

    # Read 2019 mask; its profile is the template for the output file
    with rasterio.open(mask_2019_path) as src_2019:
        mask_2019 = src_2019.read(1)
        profile = src_2019.profile

    # Read 2024 mask
    with rasterio.open(mask_2024_path) as src_2024:
        mask_2024 = src_2024.read(1)

    # Fail early with a readable message instead of a broadcasting error
    if mask_2019.shape != mask_2024.shape:
        raise ValueError(
            f"Mask shapes differ: {mask_2019_path} {mask_2019.shape} vs "
            f"{mask_2024_path} {mask_2024.shape}"
        )

    # Start from all zeros, so class 0 needs no explicit assignment
    output_mask = np.zeros_like(mask_2019, dtype=np.uint8)
    for (code_2019, code_2024), change_class in transitions.items():
        output_mask[(mask_2019 == code_2019) & (mask_2024 == code_2024)] = change_class

    # Count pixels per class in one vectorised pass
    class_counts = np.bincount(output_mask.ravel(), minlength=num_classes)

    # Each pixel represents 100 m^2 (10 m x 10 m), converted to km^2
    pixel_area_km2 = (10 * 10) / (1000 * 1000)
    # Only classes that occur get an entry; absent classes are reported as 0
    area_by_class = {
        class_id: int(count) * pixel_area_km2
        for class_id, count in enumerate(class_counts)
        if count
    }

    # Prepare data for CSV
    mask_name = os.path.basename(mask_2019_path).replace("_2019.tif", "")
    csv_writer.writerow(
        [mask_name] + [area_by_class.get(class_id, 0) for class_id in range(num_classes)]
    )

    # Save the output classification mask to a new TIFF file
    profile.update(dtype=rasterio.uint8, count=1)
    with rasterio.open(output_path, 'w', **profile) as dst:
        dst.write(output_mask, 1)

    print(f"Processed {mask_name}: Saved output mask and areas.")

def process_all_masks(input_folder_1, input_folder_2, output_folder, csv_file):
    """
    Build the cd3 change mask for every 2019 mask that has a 2024 counterpart.

    A file ``<name>_2019.tif`` in ``input_folder_1`` is paired with
    ``<name>_2024.tif`` in ``input_folder_2``; 2019 masks without a partner
    are skipped silently. Each pair is handed to
    ``classify_veg_increase_decrease_classes``, which writes
    ``cd3_<name>.tif`` into ``output_folder`` and appends one area row to the
    CSV. Masks are processed in sorted name order so the CSV is reproducible.

    Args:
        input_folder_1 (str): Folder with the ``*_2019.tif`` masks.
        input_folder_2 (str): Folder with the ``*_2024.tif`` masks.
        output_folder (str): Folder for the change masks (created if missing).
        csv_file (str): Path of the area summary CSV (overwritten).
    """
    suffix_2019 = "_2019.tif"
    suffix_2024 = "_2024.tif"

    # Ensure output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # Open CSV file for writing
    with open(csv_file, mode="w", newline="") as csvfile:
        csv_writer = csv.writer(csvfile)
        # Header row: "Class 0 Area" ... "Class 6 Area"
        csv_writer.writerow(["Mask Name"] + [f"Class {class_id} Area" for class_id in range(7)])

        # Match on the real suffix so sidecar files (e.g. *.tif.aux.xml) are
        # ignored; keep the 2024 names in a set for O(1) lookups.
        mask_files_2019 = sorted(f for f in os.listdir(input_folder_1) if f.endswith(suffix_2019))
        mask_files_2024 = {f for f in os.listdir(input_folder_2) if f.endswith(suffix_2024)}

        # Match 2019 and 2024 masks by state and city and direction
        for mask_2019 in mask_files_2019:
            city_state_direction = mask_2019[:-len(suffix_2019)]
            mask_2024 = f"{city_state_direction}{suffix_2024}"
            if mask_2024 not in mask_files_2024:
                continue

            mask_2019_path = os.path.join(input_folder_1, mask_2019)
            mask_2024_path = os.path.join(input_folder_2, mask_2024)
            output_path = os.path.join(output_folder, f"cd3_{city_state_direction}.tif")

            classify_veg_increase_decrease_classes(mask_2019_path, mask_2024_path, output_path, csv_writer)

# Example usage
# This case reads the masks under ChangeDetectionMerged-DW-tif and writes
# the change masks to cd3_Output.
input_folder_1 = "ChangeDetectionMerged-DW-tif/Masks/T2019"
input_folder_2 = "ChangeDetectionMerged-DW-tif/Masks/T2024"
output_folder = "ChangeDetectionMerged-DW-tif/cd3_Output"
csv_file = "ChangeDetectionMerged-DW-tif/cd3_area_summary.csv"
process_all_masks(input_folder_1, input_folder_2, output_folder, csv_file)

#### Case 4: SCD - Semantic Change Mask (13 classes) (Merged into cd2)
- all to all

In [None]:
import os
from collections import Counter
import numpy as np
import rasterio
import csv

def classify_veg_increase_decrease_classes(mask_2019_path, mask_2024_path, output_path, csv_writer):
    """
    Build the 13-class semantic change mask for one 2019/2024 mask pair.

    Input codes: 0 = water, 1 = building, 2 = sparse, 3 = dense.

    Output classes:
        0      - no change (or any code pair outside 0-3)
        1-3    - water    -> building, sparse, dense
        4-6    - building -> water, sparse, dense
        7-9    - sparse   -> water, building, dense
        10-12  - dense    -> water, building, sparse

    The change mask is written to ``output_path`` as a single-band uint8
    GeoTIFF that reuses the 2019 raster's profile, and one row with the mask
    name followed by the thirteen class areas (km^2) is appended through
    ``csv_writer``.

    Args:
        mask_2019_path (str): Path of the 2019 mask (first band is used).
        mask_2024_path (str): Path of the 2024 mask (first band is used).
        output_path (str): Destination path of the change mask.
        csv_writer: Object with a ``writerow`` method, e.g. ``csv.writer``.

    Raises:
        ValueError: If the two masks do not have the same shape.
    """
    num_input_codes = 4   # water, building, sparse, dense
    num_classes = 13      # 1 no-change class + 4 * 3 transitions

    # Read 2019 mask; its profile is the template for the output file
    with rasterio.open(mask_2019_path) as src_2019:
        mask_2019 = src_2019.read(1)
        profile = src_2019.profile

    # Read 2024 mask
    with rasterio.open(mask_2024_path) as src_2024:
        mask_2024 = src_2024.read(1)

    # Fail early with a readable message instead of a broadcasting error
    if mask_2019.shape != mask_2024.shape:
        raise ValueError(
            f"Mask shapes differ: {mask_2019_path} {mask_2019.shape} vs "
            f"{mask_2024_path} {mask_2024.shape}"
        )

    # Start from all zeros, so class 0 (no change) needs no explicit assignment
    output_mask = np.zeros_like(mask_2019, dtype=np.uint8)

    # Number the (from, to) pairs 1..12 in row-major order, skipping from == to:
    # (0,1)=1, (0,2)=2, (0,3)=3, (1,0)=4, ... (3,2)=12
    code_pairs = [
        (code_from, code_to)
        for code_from in range(num_input_codes)
        for code_to in range(num_input_codes)
        if code_from != code_to
    ]
    for change_class, (code_from, code_to) in enumerate(code_pairs, start=1):
        output_mask[(mask_2019 == code_from) & (mask_2024 == code_to)] = change_class

    # Count pixels per class in one vectorised pass
    class_counts = np.bincount(output_mask.ravel(), minlength=num_classes)

    # Each pixel represents 100 m^2 (10 m x 10 m), converted to km^2
    pixel_area_km2 = (10 * 10) / (1000 * 1000)
    # Only classes that occur get an entry; absent classes are reported as 0
    area_by_class = {
        class_id: int(count) * pixel_area_km2
        for class_id, count in enumerate(class_counts)
        if count
    }

    # Prepare data for CSV
    mask_name = os.path.basename(mask_2019_path).replace("_2019.tif", "")
    csv_writer.writerow(
        [mask_name] + [area_by_class.get(class_id, 0) for class_id in range(num_classes)]
    )

    # Save the output classification mask to a new TIFF file
    profile.update(dtype=rasterio.uint8, count=1)
    with rasterio.open(output_path, 'w', **profile) as dst:
        dst.write(output_mask, 1)

    print(f"Processed {mask_name}: Saved output mask and areas.")

def process_all_masks(input_folder_1, input_folder_2, output_folder, csv_file):
    """
    Build the 13-class semantic change mask for every matched 2019/2024 pair.

    A file ``<name>_2019.tif`` in ``input_folder_1`` is paired with
    ``<name>_2024.tif`` in ``input_folder_2``; 2019 masks without a partner
    are skipped silently. Each pair is handed to
    ``classify_veg_increase_decrease_classes``, which writes
    ``cd2_<name>.tif`` into ``output_folder`` and appends one area row to the
    CSV. Masks are processed in sorted name order so the CSV is reproducible.

    Args:
        input_folder_1 (str): Folder with the ``*_2019.tif`` masks.
        input_folder_2 (str): Folder with the ``*_2024.tif`` masks.
        output_folder (str): Folder for the change masks (created if missing).
        csv_file (str): Path of the area summary CSV (overwritten).
    """
    suffix_2019 = "_2019.tif"
    suffix_2024 = "_2024.tif"

    # Ensure output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # Open CSV file for writing
    with open(csv_file, mode="w", newline="") as csvfile:
        csv_writer = csv.writer(csvfile)
        # Header row: "Class 0 Area" ... "Class 12 Area"
        csv_writer.writerow(["Mask Name"] + [f"Class {class_id} Area" for class_id in range(13)])

        # Match on the real suffix so sidecar files (e.g. *.tif.aux.xml) are
        # ignored; keep the 2024 names in a set for O(1) lookups.
        mask_files_2019 = sorted(f for f in os.listdir(input_folder_1) if f.endswith(suffix_2019))
        mask_files_2024 = {f for f in os.listdir(input_folder_2) if f.endswith(suffix_2024)}

        # Match 2019 and 2024 masks by state and city and direction
        for mask_2019 in mask_files_2019:
            city_state_direction = mask_2019[:-len(suffix_2019)]
            mask_2024 = f"{city_state_direction}{suffix_2024}"
            if mask_2024 not in mask_files_2024:
                continue

            mask_2019_path = os.path.join(input_folder_1, mask_2019)
            mask_2024_path = os.path.join(input_folder_2, mask_2024)
            # The "cd2_" prefix is kept as in the original cell, whose heading
            # says this case was merged into cd2.
            output_path = os.path.join(output_folder, f"cd2_{city_state_direction}.tif")

            classify_veg_increase_decrease_classes(mask_2019_path, mask_2024_path, output_path, csv_writer)

# Example usage
# NOTE: the masks are written to a "cd2_Output_non-noise" folder while the
# area summary is named cd4_area_summary.csv.
input_folder_1 = "ChangeDetectionMerged-tif/Masks/T2019"
input_folder_2 = "ChangeDetectionMerged-tif/Masks/T2024"
output_folder = "ChangeDetectionMerged-tif/cd2_Output_non-noise"
csv_file = "ChangeDetectionMerged-tif/cd4_area_summary.csv"
process_all_masks(input_folder_1, input_folder_2, output_folder, csv_file)

#### Case 5: MCD for Dynamic-World
- class 0: No change (same class, water transitions, crop transitions, tree-sparse transitions, built-bare transitions, any transition involving 'others')
- class 1: Vegetation increase (built/bare -> trees/sparse)
- class 2: Vegetation decrease (trees/sparse -> built/bare)

In [None]:
import os
from collections import Counter
import numpy as np
import rasterio
import csv

def classify_veg_increase_decrease_classes(mask_2019_path, mask_2024_path, output_path, csv_writer):
    """
    Classify land-use changes into 3 classes, calculate area in km^2, and save to a CSV file.

    Classes:
    0 - No change (same class, water transitions, crop transitions, tree-sparse transitions,
        built-bare transitions, ANY transition involving 'others')
    1 - Vegetation increase (built/bare -> trees/sparse)
    2 - Vegetation decrease (trees/sparse -> built/bare)

    Class mapping:
    0: water
    1: built
    2: bare
    3: sparse
    4: trees
    5: crops
    6: others

    The change mask is written to ``output_path`` as a single-band uint8
    GeoTIFF that reuses the 2019 raster's profile, and one row
    ``[mask name, class 0 area, class 1 area, class 2 area]`` is appended
    through ``csv_writer``.

    Args:
        mask_2019_path (str): Path of the 2019 mask (first band is used).
        mask_2024_path (str): Path of the 2024 mask (first band is used).
        output_path (str): Destination path of the change mask.
        csv_writer: Object with a ``writerow`` method, e.g. ``csv.writer``.

    Raises:
        ValueError: If the two masks do not have the same shape.
    """
    # Read 2019 mask; its profile is the template for the output file
    with rasterio.open(mask_2019_path) as src_2019:
        mask_2019 = src_2019.read(1)
        profile = src_2019.profile

    # Read 2024 mask
    with rasterio.open(mask_2024_path) as src_2024:
        mask_2024 = src_2024.read(1)

    # Fail early with a readable message instead of a broadcasting error
    if mask_2019.shape != mask_2024.shape:
        raise ValueError(
            f"Mask shapes differ: {mask_2019_path} {mask_2019.shape} vs "
            f"{mask_2024_path} {mask_2024.shape}"
        )

    # Start from all zeros: every pixel that is neither an increase nor a
    # decrease stays in class 0, so no explicit "no change" assignment is needed
    output_mask = np.zeros_like(mask_2019, dtype=np.uint8)

    # Define vegetation classes and other classifications
    vegetation_classes = [3, 4]  # sparse and trees
    built_bare = [1, 2]  # built, bare

    # Vegetation increase: built/bare -> trees/sparse
    output_mask[np.isin(mask_2019, built_bare) & np.isin(mask_2024, vegetation_classes)] = 1

    # Vegetation decrease: trees/sparse -> built/bare
    output_mask[np.isin(mask_2019, vegetation_classes) & np.isin(mask_2024, built_bare)] = 2

    # Count pixels per class in one vectorised pass
    class_counts = np.bincount(output_mask.ravel(), minlength=3)

    # Each pixel represents 100 m^2 (10 m x 10 m), converted to km^2
    pixel_area_km2 = (10 * 10) / (1000 * 1000)
    # Only classes that occur get an entry; absent classes are reported as 0
    area_by_class = {
        class_id: int(count) * pixel_area_km2
        for class_id, count in enumerate(class_counts)
        if count
    }

    # Prepare data for CSV
    mask_name = os.path.basename(mask_2019_path).replace("_2019.tif", "")
    class_0_area = area_by_class.get(0, 0)
    class_1_area = area_by_class.get(1, 0)
    class_2_area = area_by_class.get(2, 0)

    csv_writer.writerow([mask_name, class_0_area, class_1_area, class_2_area])

    # Save the output classification mask to a new TIFF file
    profile.update(dtype=rasterio.uint8, count=1)
    with rasterio.open(output_path, 'w', **profile) as dst:
        dst.write(output_mask, 1)

    print(f"Processed {mask_name}: No Change={class_0_area:.2f}km², "
          f"Vegetation Increase={class_1_area:.2f}km², "
          f"Vegetation Decrease={class_2_area:.2f}km²")

def process_all_masks(input_folder_1, input_folder_2, output_folder, csv_file):
    """
    Build the Dynamic-World cd1 change mask for every matched 2019/2024 pair.

    A file ``<name>_2019.tif`` in ``input_folder_1`` is paired with
    ``<name>_2024.tif`` in ``input_folder_2``; 2019 masks without a partner
    are skipped silently. Each pair is handed to
    ``classify_veg_increase_decrease_classes``, which writes
    ``cd1_<name>.tif`` into ``output_folder`` and appends one area row to the
    CSV. Masks are processed in sorted name order so the CSV is reproducible.

    Args:
        input_folder_1 (str): Folder with the ``*_2019.tif`` masks.
        input_folder_2 (str): Folder with the ``*_2024.tif`` masks.
        output_folder (str): Folder for the change masks (created if missing).
        csv_file (str): Path of the area summary CSV (overwritten).
    """
    suffix_2019 = "_2019.tif"
    suffix_2024 = "_2024.tif"

    # Ensure output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # Open CSV file for writing
    with open(csv_file, mode="w", newline="") as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(["Mask Name", "No Change Area", "Vegetation Increase Area", "Vegetation Decrease Area"])  # Header row

        # Match on the real suffix so sidecar files (e.g. *.tif.aux.xml) are
        # ignored; keep the 2024 names in a set for O(1) lookups.
        mask_files_2019 = sorted(f for f in os.listdir(input_folder_1) if f.endswith(suffix_2019))
        mask_files_2024 = {f for f in os.listdir(input_folder_2) if f.endswith(suffix_2024)}

        # Match 2019 and 2024 masks by state and city and direction
        for mask_2019 in mask_files_2019:
            city_state_direction = mask_2019[:-len(suffix_2019)]
            mask_2024 = f"{city_state_direction}{suffix_2024}"
            if mask_2024 not in mask_files_2024:
                continue

            mask_2019_path = os.path.join(input_folder_1, mask_2019)
            mask_2024_path = os.path.join(input_folder_2, mask_2024)
            output_path = os.path.join(output_folder, f"cd1_{city_state_direction}.tif")

            classify_veg_increase_decrease_classes(mask_2019_path, mask_2024_path, output_path, csv_writer)

# Example usage
# Reads the masks under ChangeDetectionMerged-DW-tif and writes the 3-class
# change masks to cd1_Output.
input_folder_1 = "ChangeDetectionMerged-DW-tif/Masks/T2019"
input_folder_2 = "ChangeDetectionMerged-DW-tif/Masks/T2024"
output_folder = "ChangeDetectionMerged-DW-tif/cd1_Output"
csv_file = "ChangeDetectionMerged-DW-tif/cd1_area_summary.csv"
process_all_masks(input_folder_1, input_folder_2, output_folder, csv_file)

#### Case 6: SCD Mask for Dynamic World
- all to all (31 classes)

In [None]:
import os
from collections import Counter
import numpy as np
import rasterio
import csv

def classify_semantic_changes(mask_2019_path, mask_2024_path, output_path, csv_writer):
    """
    Classify land-use changes into semantic change classes, calculate area in km^2, and save to a CSV file.

    Class mapping:
    0: water
    1: built
    2: bare
    3: sparse
    4: trees
    5: crops
    6: others

    Output classes:
    0: No change (all same-class transitions and ANY transition involving 'others')
    1-30: Transitions between different classes (excluding 'others'), numbered
          in row-major (from, to) order: (0,1)=1, (0,2)=2, ..., (5,4)=30

    The change mask is written to ``output_path`` as a single-band uint8
    GeoTIFF that reuses the 2019 raster's profile, and one row with the mask
    name followed by the 31 class areas is appended through ``csv_writer``.

    Args:
        mask_2019_path (str): Path of the 2019 mask (first band is used).
        mask_2024_path (str): Path of the 2024 mask (first band is used).
        output_path (str): Destination path of the change mask.
        csv_writer: Object with a ``writerow`` method, e.g. ``csv.writer``.

    Raises:
        ValueError: If the two masks do not have the same shape.
    """
    num_land_cover_codes = 6  # codes 0-5; code 6 ('others') never gets a transition class
    num_classes = 31          # 1 no-change class + 6 * 5 transitions

    # Read 2019 mask; its profile is the template for the output file
    with rasterio.open(mask_2019_path) as src_2019:
        mask_2019 = src_2019.read(1)
        profile = src_2019.profile

    # Read 2024 mask
    with rasterio.open(mask_2024_path) as src_2024:
        mask_2024 = src_2024.read(1)

    # Fail early with a readable message instead of a broadcasting error
    if mask_2019.shape != mask_2024.shape:
        raise ValueError(
            f"Mask shapes differ: {mask_2019_path} {mask_2019.shape} vs "
            f"{mask_2024_path} {mask_2024.shape}"
        )

    # Start from all zeros: same-class pixels and anything involving 'others'
    # match no transition below and therefore stay in class 0
    output_mask = np.zeros_like(mask_2019, dtype=np.uint8)

    # Enumerate the (from, to) pairs between codes 0-5, skipping from == to
    code_pairs = [
        (class_from, class_to)
        for class_from in range(num_land_cover_codes)
        for class_to in range(num_land_cover_codes)
        if class_from != class_to
    ]

    # Apply transitions to output mask; class values start at 1
    for class_value, (class_from, class_to) in enumerate(code_pairs, start=1):
        output_mask[(mask_2019 == class_from) & (mask_2024 == class_to)] = class_value

    # Count pixels per class in one vectorised pass
    class_counts = np.bincount(output_mask.ravel(), minlength=num_classes)

    # Each pixel represents 100 m^2 (10 m x 10 m), converted to km^2
    pixel_area_km2 = (10 * 10) / (1000 * 1000)
    # Only classes that occur get an entry; absent classes are reported as 0
    area_by_class = {
        class_id: int(count) * pixel_area_km2
        for class_id, count in enumerate(class_counts)
        if count
    }

    # Prepare data for CSV: mask name followed by the areas of classes 0-30
    mask_name = os.path.basename(mask_2019_path).replace("_2019.tif", "")
    row = [mask_name] + [area_by_class.get(class_id, 0) for class_id in range(num_classes)]
    csv_writer.writerow(row)

    # Save the output classification mask to a new TIFF file
    profile.update(dtype=rasterio.uint8, count=1)
    with rasterio.open(output_path, 'w', **profile) as dst:
        dst.write(output_mask, 1)

    print(f"Processed {mask_name}")

def process_all_masks(input_folder_1, input_folder_2, output_folder, csv_file):
    """
    Classify changes for every 2019/2024 mask pair and log the results to a CSV.

    Args:
        input_folder_1 (str): Folder listed for names containing "_2019.tif".
        input_folder_2 (str): Folder listed for names containing "_2024.tif".
        output_folder (str): Folder receiving the "cd2_<name>.tif" outputs;
            created if it does not exist.
        csv_file (str): Path of the CSV file to (over)write. It gets one header
            row, and `classify_semantic_changes` is handed the writer for every
            matched pair.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Column descriptions: class 0 is "no change", followed by every ordered
    # pair of distinct land-cover classes.
    land_cover_classes = ["water", "built", "bare", "sparse", "trees", "crops"]
    descriptions = ["No Change (including all 'others' transitions)"]
    descriptions += [
        f"{source_name} to {target_name}"
        for source_pos, source_name in enumerate(land_cover_classes)
        for target_pos, target_name in enumerate(land_cover_classes)
        if source_pos != target_pos
    ]
    header_row = ["Mask Name"] + [
        f"Class {number}: {text}" for number, text in enumerate(descriptions)
    ]

    with open(csv_file, mode="w", newline="") as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(header_row)

        names_2019 = [name for name in os.listdir(input_folder_1) if "_2019.tif" in name]
        names_2024 = [name for name in os.listdir(input_folder_2) if "_2024.tif" in name]

        # Pair masks through the shared name that remains once the year suffix is removed
        for name_2019 in names_2019:
            city_state_direction = name_2019.replace("_2019.tif", "")
            name_2024 = f"{city_state_direction}_2024.tif"
            if name_2024 not in names_2024:
                continue

            classify_semantic_changes(
                os.path.join(input_folder_1, name_2019),
                os.path.join(input_folder_2, name_2024),
                os.path.join(output_folder, f"cd2_{city_state_direction}.tif"),
                csv_writer,
            )

# Example usage: run the class-transition (cd2) change detection on the merged dataset.
# Folder that is searched for the "*_2019.tif" masks
input_folder_1 = "ChangeDetectionMerged-DW-tif/Masks/T2019"
# Folder that is searched for the matching "*_2024.tif" masks
input_folder_2 = "ChangeDetectionMerged-DW-tif/Masks/T2024"
# Destination of the "cd2_<name>.tif" rasters (created if missing)
output_folder = "ChangeDetectionMerged-DW-tif/cd2_Output"
# CSV file that is overwritten with the header row and the per-mask areas
csv_file = "ChangeDetectionMerged-DW-tif/cd2_area_summary.csv"
process_all_masks(input_folder_1, input_folder_2, output_folder, csv_file)

## 4. Main (raw TIF dataset to final dataset)

### 4.1 Remove whitespaces from filenames

In [None]:
import os

def remove_whitespaces_from_filenames(folder_path):
    """
    Rename the entries of a folder so that their names contain no spaces.

    Only the space character " " is removed; other whitespace is left as is.
    Every entry returned by ``os.listdir`` is considered, so sub-directories
    are renamed as well. An entry is skipped (and reported) instead of renamed
    when the space-free name is empty or already taken, so existing files are
    never overwritten.

    Parameters:
        folder_path (str): Path to the folder containing files to be renamed.
    """
    for file_name in os.listdir(folder_path):
        new_file_name = file_name.replace(" ", "")

        # Nothing to do for names that are already free of spaces
        if file_name == new_file_name:
            continue

        old_file_path = os.path.join(folder_path, file_name)
        new_file_path = os.path.join(folder_path, new_file_name)

        # os.rename silently replaces an existing file on POSIX; refuse to
        # clobber data and also refuse names that consist of spaces only.
        if not new_file_name or os.path.exists(new_file_path):
            print(f"Skipped (target name unavailable): {old_file_path} -> {new_file_path}")
            continue

        os.rename(old_file_path, new_file_path)
        print(f"Renamed: {old_file_path} -> {new_file_path}")

# Example usage
root = 'ChangeDetectionMerged-DW-tif'

# Clean the names in every dataset subfolder, in this order
for subfolder in (
    'cd1_Output',
    'cd2_Output',
    'cd3_Output',
    'Images/T2019',
    'Images/T2024',
    'Masks/T2019',
    'Masks/T2024',
):
    remove_whitespaces_from_filenames(f'{root}/{subfolder}')

### 4.3 Morphological Noise filtering for cd_Outputs (FOR `TIFs`)
- Convert images from cd_Output to morphology-noise-reduced images 
- Runs only on folders named `cd1_Output`,`cd2_Output`,`cd3_Output`

In [None]:
import os
import numpy as np
from scipy import ndimage
import rasterio
import shutil
import tempfile

def reduce_noise_morphology(image_array, kernel_size=3):
    """
    Applies morphological opening to reduce noise in grayscale images.

    The opening is computed by ``scipy.ndimage.grey_opening`` over a flat
    ``kernel_size`` x ``kernel_size`` window.

    Args:
        image_array: Single-band grayscale image array
        kernel_size: Size of the morphological kernel
    Returns:
        Denoised image array with the same dtype as ``image_array``
    """
    # NOTE(review): grey opening compares pixel values numerically; if the
    # input holds categorical class ids, confirm that this ordering is intended.
    opened = ndimage.grey_opening(image_array, size=(kernel_size, kernel_size))

    # Ensure output has same dtype as input
    return opened.astype(image_array.dtype)

def process_images_in_folder(base_folder, subfolder_names, kernel_size=3):
    """
    Applies morphological opening denoising to grayscale TIF images, in place.

    Every ``.tif`` file found below ``base_folder`` in a directory whose path
    contains one of ``subfolder_names`` is read (band 1 only), denoised and
    overwritten. The result is first written to a temporary file in the same
    directory and then swapped in with ``os.replace``, so a failed write never
    leaves a half-written original. Errors are reported per file and do not
    stop the run.

    Args:
        base_folder (str): Path to the base folder containing subfolders and images
        subfolder_names (list): List of subfolder names to apply processing on
            (matched as substrings of the directory path)
        kernel_size (int): Size of the structuring element for morphological opening
    """
    for root, _dirs, files in os.walk(base_folder):
        # Only process images in the specified subfolders
        if not any(subfolder in root for subfolder in subfolder_names):
            continue

        for file in files:
            if not file.lower().endswith('.tif'):
                continue

            file_path = os.path.join(root, file)
            print(f"Processing: {file_path}")

            # Reset per file so the cleanup below never sees a stale path
            temp_path = None
            try:
                # Same directory as the target keeps the final replace on one
                # filesystem; the ".tmp" suffix keeps a leftover file from
                # being picked up as a TIF by later steps.
                temp_fd, temp_path = tempfile.mkstemp(suffix='.tmp', dir=root)
                os.close(temp_fd)  # rasterio reopens the path itself

                # Read the single band and the profile, then release the source
                with rasterio.open(file_path) as src:
                    image = src.read(1)
                    profile = src.profile.copy()

                # Apply morphological noise reduction
                denoised_image = reduce_noise_morphology(image, kernel_size=kernel_size)

                # Write to temporary file first
                with rasterio.open(temp_path, 'w', **profile) as dst:
                    dst.write(denoised_image, 1)

                # Only after successful write, replace the original file
                os.replace(temp_path, file_path)

            except Exception as e:
                print(f"Error processing {file_path}: {e}")
                # Best-effort removal of the temporary file, if one was created
                if temp_path is not None and os.path.exists(temp_path):
                    try:
                        os.remove(temp_path)
                    except OSError as cleanup_error:
                        print(f"Could not remove temporary file {temp_path}: {cleanup_error}")

if __name__ == "__main__":
    # Example usage: denoise the change-detection outputs in place.
    # NOTE: process_images_in_folder overwrites every matching .tif file.
    base_folder_path = "ChangeDetectionMerged-DW-tif"  # Replace with your base folder path
    # Alternative selection covering all three outputs:
    #subfolders_to_process = ['cd1_Output', 'cd2_Output', 'cd3_Output']  # Subfolders to process
    subfolders_to_process = ['cd2_Output']  # Subfolders to process
    kernel_size = 3  # Adjust kernel size for noise reduction as needed
    
    process_images_in_folder(base_folder_path, subfolders_to_process, kernel_size=kernel_size)

### 4.4 Divide TIF images
- Each image divided into 4 parts `[1,2,3,4]`
- From here onwards, filenames carry the `part` number as a suffix (e.g. `Alabama_Birmingham_C_2019_1.tif`)

In [None]:
import os
from PIL import Image
import gc

def divide_and_save(image_path, output_dir):
    """
    Split one TIF image into four quadrants and save each as its own file.

    The quadrants are numbered 1 (top-left), 2 (top-right), 3 (bottom-left) and
    4 (bottom-right); each one is written to ``output_dir`` as
    ``<original stem>_<number>.tif``. Any exception is caught and reported with
    a printed message instead of being raised.
    """
    try:
        with Image.open(image_path) as opened:
            full_image = opened.copy()
            width, height = full_image.size
            half_w, half_h = width // 2, height // 2

            # Ensure the output directory exists
            os.makedirs(output_dir, exist_ok=True)

            stem = os.path.splitext(os.path.basename(image_path))[0]

            # Crop boxes as (left, upper, right, lower)
            boxes = (
                (0, 0, half_w, half_h),          # Top-left
                (half_w, 0, width, half_h),      # Top-right
                (0, half_h, half_w, height),     # Bottom-left
                (half_w, half_h, width, height)  # Bottom-right
            )

            part = 1
            for box in boxes:
                quadrant = full_image.crop(box)
                target_path = os.path.join(output_dir, f"{stem}_{part}.tif")

                try:
                    quadrant.save(target_path)
                finally:
                    # Release the crop even when saving failed
                    quadrant.close()
                part += 1

            # Drop the references before forcing a collection
            del quadrant
            del full_image
            gc.collect()

    except Exception as e:
        print(f"Error processing {image_path}: {e}")

def process_files_batch(file_list, input_directory, output_directory):
    """
    Divide every TIF file of a batch into quadrants.

    Names in ``file_list`` that do not end in ".tif" (case-insensitive) are
    ignored. Each remaining file is passed to ``divide_and_save``; the output
    sub-directory mirrors the file's location relative to ``input_directory``.
    """
    tif_names = (name for name in file_list if name.lower().endswith('.tif'))

    for name in tif_names:
        source_path = os.path.join(input_directory, name)
        source_subdir = os.path.relpath(os.path.dirname(source_path), start=input_directory)

        divide_and_save(source_path, os.path.join(output_directory, source_subdir))
        # Collect after every image to keep memory usage down
        gc.collect()

def process_directory_in_batches(input_directory, output_directory, batch_no):
    """
    Process one half of the TIF files found in the input directory.

    The files are split at the midpoint of the directory listing;
    ``batch_no == 1`` processes the first half, ``batch_no == 2`` the second
    half. Any other value processes nothing.
    """
    tif_files = [name for name in os.listdir(input_directory) if name.lower().endswith('.tif')]
    total_files = len(tif_files)

    if not tif_files:
        print("No TIF files found in the directory.")
        return

    # Split the listing at its midpoint into the two batches
    half = total_files // 2
    first_batch, second_batch = tif_files[:half], tif_files[half:]

    print(f"Total files: {total_files}")

    if batch_no == 1:
        print(f"Processing first batch of {len(first_batch)} files...")
        process_files_batch(first_batch, input_directory, output_directory)

    print("\nClearing memory between batches...")
    gc.collect()

    if batch_no == 2:
        print(f"\nProcessing second batch of {len(second_batch)} files...")
        process_files_batch(second_batch, input_directory, output_directory)

if __name__ == "__main__":
    # Input and Output directories
    DIR = 'cd2_Output' # Subfolder to divide; set to 'cd1_Output' or 'cd3_Output' for the other outputs
    input_directory = f'ChangeDetectionMerged-DW-tif/{DIR}'
    output_directory = f'ChangeDetectionMergedDivided-DW-tif/{DIR}'

    try:
        # Only one half of the files is processed per run:
        # 1 = first half of the directory listing, 2 = second half.
        batch_no = 2 # Set to 1 for the first batch
        process_directory_in_batches(input_directory, output_directory, batch_no)
        print("\nDivision complete!")
    except Exception as e:
        print(f"Process failed: {e}")
    finally:
        gc.collect()

### 4.5 Histogram generation and delete too-noisy/no-change files 

#### 4.5.1 Histogram plot and calculate

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
import rasterio

def calculate_zero_pixel_percentage_and_plot(folder_path, lower_threshold=80, upper_threshold=99.5):
    """
    Calculates the percentage of pixels with a value of 0 for each TIF image in the folder,
    plots a histogram of the distribution, and prints filenames of images with pixel percentage 
    less than the lower threshold or greater than the upper threshold.

    Only the first band of each image is read. Files that cannot be read are
    reported and left out of the statistics.

    Parameters:
        folder_path (str): Path to the folder containing TIF images.
        lower_threshold (float): The lower threshold percentage for zero pixels to filter images.
        upper_threshold (float): The upper threshold percentage for zero pixels to filter images.

    Returns:
        list: Names of the files whose zero-pixel percentage lies outside
        [lower_threshold, upper_threshold]; empty when nothing could be processed.
    """
    # List all TIF files in the folder
    tif_files = [f for f in os.listdir(folder_path) if f.lower().endswith('.tif')]

    # Percentage of every readable image, plus (name, percentage) of the flagged ones
    zero_pixel_percentages = []
    flagged = []

    for file_name in tif_files:
        file_path = os.path.join(folder_path, file_name)

        try:
            # Open and read the first band of the TIF file
            with rasterio.open(file_path) as src:
                image_array = src.read(1)

            # Share of pixels whose value is exactly 0
            pixel_0_count = np.sum(image_array == 0)
            percentage_0 = (pixel_0_count / image_array.size) * 100
            zero_pixel_percentages.append(percentage_0)

            # Remember the value so the report below needs no second read
            if percentage_0 > upper_threshold or percentage_0 < lower_threshold:
                flagged.append((file_name, percentage_0))

        except Exception as e:
            print(f"Error processing {file_name}: {e}")
            continue

    filtered_files = [file_name for file_name, _ in flagged]

    if not zero_pixel_percentages:
        print("No files were successfully processed.")
        return filtered_files

    # Plot the histogram
    plt.figure(figsize=(10, 6))
    plt.hist(zero_pixel_percentages, bins=30, color='skyblue', edgecolor='black', alpha=0.7)
    plt.title('Distribution of Zero-Pixel Percentages', fontsize=16)
    plt.xlabel('Percentage of Pixels with Value 0', fontsize=14)
    plt.ylabel('Number of Images', fontsize=14)
    plt.grid(axis='y', alpha=0.75)
    plt.show()

    # Print statistics (the count covers successfully read images only)
    print("\nAnalysis Results:")
    print(f"Total images processed: {len(zero_pixel_percentages)}")
    print(f"Mean percentage of zero pixels: {np.mean(zero_pixel_percentages):.2f}%")
    print(f"Median percentage of zero pixels: {np.median(zero_pixel_percentages):.2f}%")
    print(f"Number of images with zero pixels >{upper_threshold}% or <{lower_threshold}%: {len(filtered_files)}")

    if flagged:
        print(f"\nFilenames of images with >{upper_threshold}% zero pixels or <{lower_threshold}% zero pixels:")
        for file_name, percentage in flagged:
            print(f"{file_name}: {percentage:.2f}% zeros")

    return filtered_files

if __name__ == "__main__":
    # Folders containing the divided change-detection TIF masks
    folder_path1 = 'ChangeDetectionMergedDivided-DW-tif/cd1_Output'
    folder_path2 = 'ChangeDetectionMergedDivided-DW-tif/cd2_Output'

    # Calculate zero pixel percentages, plot histogram, and print filenames
    # (cd1 outputs are flagged below 80% zeros, cd2 outputs below 70%)
    filtered_files1 = calculate_zero_pixel_percentage_and_plot(folder_path1, 
                                                            lower_threshold=80, 
                                                            upper_threshold=99.5)
    
    filtered_files2 = calculate_zero_pixel_percentage_and_plot(folder_path2, 
                                                            lower_threshold=70, 
                                                            upper_threshold=99.5)
    
    # Union of both result lists; `filtered_files` is read by the next cell (4.5.2)
    filtered_files = filtered_files1 + filtered_files2
    filtered_files = list(set(filtered_files))  # Remove duplicates
    print(f"\nTotal filtered files: {len(filtered_files)}")

#### 4.5.2 Extract the `City_State_Direction_Part` name

In [None]:
import re

def extract_city_state_direction_part(file_name):
    """
    Return the leading underscore-separated identifier of a file name.

    The match is anchored at the start of ``file_name`` and covers letters,
    digits and underscores, with at least two underscores inside; it therefore
    stops at the first other character (such as the "." of the extension).
    Returns None when the name does not start with such an identifier.
    """
    # Adjust this pattern if the file naming convention changes
    pattern = r'([a-zA-Z0-9_]+_[a-zA-Z0-9_]+_[a-zA-Z0-9_]+)'
    found = re.match(pattern, file_name)
    return found.group(1) if found else None

# Extract city_state_direction_part from each item in filtered_files
city_state_direction_parts = list(map(extract_city_state_direction_part, filtered_files))

# Strip the cd1/cd2 prefixes; names the pattern did not match become "Not Found"
city_state_direction_parts2 = [
    part.replace('cd1_m_', '').replace('cd2_m_', '') if part else "Not Found"
    for part in city_state_direction_parts
]

# Unique names in sorted order
city_state_direction_parts3 = sorted(set(city_state_direction_parts2))

# Print the extracted parts followed by how many there are
for party in city_state_direction_parts3:
    print(party)
print(len(city_state_direction_parts3))

#### 4.5.3 Delete the files from all folders 

In [None]:
import os

def delete_matching_files(folder_path, file_list, specific_part1, specific_part2, if_mask):
    """
    Deletes files in the folder that match entries in the provided file_list.

    For every file, the extension is dropped and all occurrences of
    ``specific_part1`` are removed from the name. In mask mode the leading
    ``specific_part2`` prefix is removed as well. The file is deleted when the
    remaining name is contained in ``file_list``.

    Parameters:
        folder_path (str): Path to the folder containing files to be deleted.
        file_list (list): List of filenames to delete (matching city_state_direction_part).
        specific_part1 (str): Text removed from the file name before comparing
            (e.g. "_2019" or "cd1_m_").
        specific_part2 (str): Prefix that marks a mask file (e.g. "m_"); only
            used when ``if_mask`` is true.
        if_mask (bool): Whether the folder holds mask files carrying the
            ``specific_part2`` prefix.
    """
    # Create a set for faster lookup
    file_set = set(file_list)

    # Loop through all files in the folder
    for file_name in os.listdir(folder_path):
        # File name without its extension and without specific_part1
        key = file_name.rsplit('.', 1)[0].replace(specific_part1, '')

        if if_mask:
            # Never guess for files without the mask prefix: chopping a fixed
            # number of characters could make an unrelated file match.
            if not key.startswith(specific_part2):
                continue
            key = key[len(specific_part2):]

        if key in file_set:
            # Full file path to delete
            file_path = os.path.join(folder_path, file_name)
            os.remove(file_path)
            print(f"Deleted: {file_path}")

# Example usage
ROOT = 'ChangeDetectionMergedDivided-DW-tif'
TTV = ''

# Images and masks of both years: the year suffix is removed from every name,
# and masks additionally carry the 'm_' prefix.
for DIR in ('2019', '2024'):
    folder_path = f'{ROOT}/{TTV}/Images/T{DIR}'
    specific_part = f'_{DIR}'
    delete_matching_files(folder_path, city_state_direction_parts2, specific_part, specific_part, False)

    folder_path = f'{ROOT}/{TTV}/Masks/T{DIR}'
    specific_part1 = f'_{DIR}'
    specific_part2 = 'm_'
    delete_matching_files(folder_path, city_state_direction_parts2, specific_part1, specific_part2, True)

# Change-detection outputs: only the '<cdN>_m_' prefix is removed before matching
for SPECIFICITY in ('cd1', 'cd2', 'cd3'):
    folder_path = f'{ROOT}/{TTV}/{SPECIFICITY}_Output'
    specific_part = f'{SPECIFICITY}_m_'
    delete_matching_files(folder_path, city_state_direction_parts2, specific_part, specific_part, False)

## 5. Split dataset into Train-Test-Val

### 5.1 Split `TIFs`

#### File structures:
1. Before split - 
```
Dataset ->{ Images->{T2019,T2024}, Masks->{T2019,T2024}, cd1_Output, cd2_Output, cd3_Output}
```

2. After split - 
```
SplitDataset ->{train->Dataset, test->Dataset, val->Dataset}
WHERE
Dataset->{ Images->{T2019,T2024}, Masks->{T2019,T2024}, cd1_Output, cd2_Output, cd3_Output}
```

- Filenames <br>
    - Images = `Alabama_Birmingham_C_2019_1.tif` <br>
    - Masks = `m_Alabama_Birmingham_C_2019_1.tif` <br>
    - cd1_Output = `cd1_m_Alabama_Birmingham_C_1.tif` <br>

In [None]:
import os
import shutil
from collections import defaultdict
import random
import rasterio

def split_dataset_tif(dataset_dir, output_dir, train_count, val_count, test_count):
    """
    Split a divided TIF dataset into train/val/test folders, city by city.

    The files in ``Images/T2019`` are grouped by their name without the last
    two underscore-separated fields (year and part), so all parts of one city
    stay in the same split. Cities are shuffled and assigned to train, val and
    test as long as the requested file counts are not exceeded; every city ends
    up in at most one split. For each selected 2019 image, the matching 2024
    image, both masks and the cd1/cd2/cd3 outputs are copied as well.

    Args:
        dataset_dir (str): Root of the dataset (Images, Masks, cd*_Output).
        output_dir (str): Root that receives the train/val/test folders.
        train_count (int): Maximum number of 2019 image files in train.
        val_count (int): Maximum number of 2019 image files in val.
        test_count (int): Maximum number of 2019 image files in test.
    """
    # Define directories
    images_2019 = os.path.join(dataset_dir, "Images", "T2019")
    images_2024 = os.path.join(dataset_dir, "Images", "T2024")
    masks_2019 = os.path.join(dataset_dir, "Masks", "T2019")
    masks_2024 = os.path.join(dataset_dir, "Masks", "T2024")
    cd1_output = os.path.join(dataset_dir, "cd1_Output")
    cd2_output = os.path.join(dataset_dir, "cd2_Output")
    cd3_output = os.path.join(dataset_dir, "cd3_Output")

    # Group files by city (excluding year, part and extension)
    city_files = defaultdict(list)
    for filename in os.listdir(images_2019):
        if filename.endswith(".tif"):
            # Extract the city name (e.g., "Alabama_Birmingham_C")
            city = "_".join(filename.split("_")[:-2])
            city_files[city].append(filename)

    # Shuffle cities for randomization
    cities = list(city_files.keys())
    random.shuffle(cities)

    # First pass: hand out cities in order until one fits into no split
    train_cities, val_cities, test_cities = [], [], []
    train_files, val_files, test_files = 0, 0, 0

    for city in cities:
        city_file_count = len(city_files[city])
        if train_files + city_file_count <= train_count:
            train_cities.append(city)
            train_files += city_file_count
        elif val_files + city_file_count <= val_count:
            val_cities.append(city)
            val_files += city_file_count
        elif test_files + city_file_count <= test_count:
            test_cities.append(city)
            test_files += city_file_count
        else:
            break

    # Second pass: top up the splits, drawing ONLY from cities that have not
    # been assigned yet. Drawing from the full list would put a city into a
    # split twice or into two different splits (train/val/test leakage).
    assigned = set(train_cities) | set(val_cities) | set(test_cities)
    unassigned = [city for city in cities if city not in assigned]

    def adjust_split(target_count, current_count, source_split, target_split):
        for city in source_split[:]:
            city_file_count = len(city_files[city])
            if current_count + city_file_count <= target_count:
                target_split.append(city)
                source_split.remove(city)
                current_count += city_file_count
            if current_count == target_count:
                break
        return current_count

    train_files = adjust_split(train_count, train_files, unassigned, train_cities)
    val_files = adjust_split(val_count, val_files, unassigned, val_cities)
    test_files = adjust_split(test_count, test_files, unassigned, test_cities)

    # Helper function to copy files using rasterio
    def copy_files(cities, split_type):
        for city in cities:
            for filename in city_files[city]:
                city_base = "_".join(filename.split("_")[:-2])
                # Last field of the name, i.e. part number plus extension ("1.tif")
                part_suffix = filename.split('_')[-1]
                filename_2024 = filename.replace("_2019", "_2024")

                files_to_copy = [
                    (os.path.join(images_2019, filename), os.path.join(output_dir, split_type, "Images", "T2019")),
                    (os.path.join(images_2024, filename_2024), os.path.join(output_dir, split_type, "Images", "T2024")),
                    (os.path.join(masks_2019, f"m_{filename}"), os.path.join(output_dir, split_type, "Masks", "T2019")),
                    (os.path.join(masks_2024, f"m_{filename_2024}"), os.path.join(output_dir, split_type, "Masks", "T2024")),
                    (os.path.join(cd1_output, f"cd1_m_{city_base}_{part_suffix}"), os.path.join(output_dir, split_type, "cd1_Output")),
                    (os.path.join(cd2_output, f"cd2_m_{city_base}_{part_suffix}"), os.path.join(output_dir, split_type, "cd2_Output")),
                    (os.path.join(cd3_output, f"cd3_m_{city_base}_{part_suffix}"), os.path.join(output_dir, split_type, "cd3_Output")),
                ]

                for src, dest_dir in files_to_copy:
                    try:
                        if os.path.exists(src):
                            os.makedirs(dest_dir, exist_ok=True)
                            dest_path = os.path.join(dest_dir, os.path.basename(src))
                            # Re-write the raster with the metadata of the source
                            with rasterio.open(src) as src_data:
                                with rasterio.open(dest_path, 'w', **src_data.meta) as dest_data:
                                    dest_data.write(src_data.read())
                        else:
                            print(f"File not found: {src}")
                    except Exception as e:
                        print(f"Error copying {src}: {e}")

    # Copy files to train, val, and test folders
    copy_files(train_cities, "train")
    copy_files(val_cities, "val")
    copy_files(test_cities, "test")
    print("Dataset split complete.")

# Example usage
dataset_dir = "ChangeDetectionMergedDivided-DW-tif"
output_dir = "ChangeDetectionMergedDividedSplit-DW-tif"
# NOTE(review): presumably the number of files in Images/T2019 - confirm before running
Total = 6064
# 70/15/15 split; the "+ 1" on val and test compensates for int() truncation,
# so the three counts add up to Total again (4244 + 910 + 910 = 6064).
train_count = int(Total * 0.70)
val_count = int(Total * 0.15) + 1
test_count = int(Total * 0.15) + 1
print(train_count, val_count, test_count)
print(train_count + val_count + test_count)
split_dataset_tif(dataset_dir, output_dir, train_count, val_count, test_count)

### 4.2 Convert an entire folder from TIF to PNG

In [None]:
import os
import rasterio
import numpy as np
from PIL import Image

def tif_to_png(tif_path, png_path):
    """
    Convert a .tif image to a .png image.

    The pixel values are min-max scaled to the range [0, 255] before saving.
    An image in which all values are equal is written as an all-zero image.

    Args:
        tif_path (str): Path to the input .tif file.
        png_path (str): Path to save the output .png file.
    """
    # Read the .tif file using rasterio
    with rasterio.open(tif_path) as src:
        # Read the image data into a NumPy array of shape (bands, H, W)
        array = src.read()
        if array.shape[0] == 3:  # RGB image
            array = np.moveaxis(array, 0, -1)  # Reorder dimensions to (H, W, C)
        elif array.shape[0] == 1:  # Grayscale image
            array = array[0]  # Remove the single-band dimension

    # Normalize the array to range [0, 255] for saving as PNG
    array = array - array.min()
    peak = array.max()
    if peak == 0:
        # Constant image: dividing by the zero range would produce NaNs
        array = np.zeros(array.shape, dtype=np.uint8)
    else:
        array = (array / peak * 255).astype(np.uint8)

    # Save the NumPy array as a .png image using Pillow
    img = Image.fromarray(array)
    img.save(png_path)
    print(f"Saved {png_path}")

def convert_all_tifs_in_folder(input_folder, output_folder):
    """
    Convert every ".tif" file of a folder to a PNG file in another folder.

    The output folder is created if needed. Only names ending in the
    lower-case ".tif" are converted; each PNG keeps the name of its source
    with ".tif" replaced by ".png".
    """
    os.makedirs(output_folder, exist_ok=True)

    tif_names = list(filter(lambda name: name.endswith('.tif'), os.listdir(input_folder)))
    if len(tif_names) == 0:
        print(f"No .tif files found in {input_folder}")
        return

    for tif_name in tif_names:
        png_name = tif_name.replace(".tif", ".png")
        tif_to_png(
            os.path.join(input_folder, tif_name),
            os.path.join(output_folder, png_name),
        )

# Example usage:
DIRECTORY1 = "ChangeDetectionMergedDivided-tif"  # Root folder containing the .tif files
DIRECTORY2 = "ChangeDetectionMergedDivided-png"  # Root folder to save the .png files

# Sub-paths converted one after another. The paths are built with os.path.join
# because backslashes inside a normal (non-raw) string literal are invalid
# escape sequences and only work as separators on Windows.
SUBFOLDERS_TO_CONVERT = [
    ("Images", "T2019"),
    ("Images", "T2024"),
    ("Masks", "T2019"),
    ("Masks", "T2024"),
    ("cd1_Output",),
    ("cd2_Output",),
    ("cd3_Output",),
]

for sub_path in SUBFOLDERS_TO_CONVERT:
    input_folder = os.path.join(DIRECTORY1, *sub_path)
    output_folder = os.path.join(DIRECTORY2, *sub_path)
    convert_all_tifs_in_folder(input_folder, output_folder)

## 6. Display the files in Dataset (Check if they match)

###  Display the files in Dataset (to check if they match) for `TIFs` (not divided)

In [None]:
import os
import matplotlib.pyplot as plt
import rasterio
import random
import numpy as np

def _read_first_band(path):
    """Return band 1 of the raster stored at ``path``."""
    with rasterio.open(path) as src:
        return src.read(1)


def _read_rgb_for_display(path):
    """Return bands 1-3 of the raster at ``path`` stacked depth-wise and scaled by their maximum."""
    with rasterio.open(path) as src:
        rgb = np.dstack([src.read(band) for band in (1, 2, 3)])
    peak = rgb.max()
    # Dividing an all-zero tile by its maximum would produce NaNs; keep it black instead.
    return rgb / peak if peak != 0 else rgb.astype(float)


def fetch_and_display_images(real_folder_2019, real_folder_2024,
                             input_folder_2019, input_folder_2024,
                             cd1_output_folder, cd2_output_folder,
                             cd3_output_folder, index):
    """
    Fetch and display images: mask_2019, mask_2024, cd1, cd2, cd3, real_2019, real_2024.

    The file at position ``index`` of the sorted listing of ``real_folder_2019``
    selects the tile; all other file names are derived from it by stripping the
    ``_2019.tif`` suffix and adding the ``m_`` / ``cdN_m_`` prefixes.

    Args:
        real_folder_2019 (str): Folder with the 2019 images.
        real_folder_2024 (str): Folder with the 2024 images.
        input_folder_2019 (str): Folder with the 2019 masks.
        input_folder_2024 (str): Folder with the 2024 masks.
        cd1_output_folder (str): Folder with the ``cd1_m_*`` rasters.
        cd2_output_folder (str): Folder with the ``cd2_m_*`` rasters.
        cd3_output_folder (str): Folder with the ``cd3_m_*`` rasters.
        index (int): Position in the sorted listing of ``real_folder_2019``.

    Returns:
        None. A message is printed and nothing is plotted when a folder is
        empty, ``index`` is out of range, or one of the seven files is missing.
    """
    if not os.listdir(input_folder_2019) or not os.listdir(input_folder_2024):
        print("No matching mask files found in the input folder.")
        return

    # os.listdir() returns entries in arbitrary order; sorting makes a given
    # index refer to the same file on every run.
    real_2019_files = sorted(os.listdir(real_folder_2019))
    if not real_2019_files or not os.listdir(real_folder_2024):
        print("No matching real files found in the input folder.")
        return

    # Use the indexed file
    try:
        city_state = real_2019_files[index].replace("_2019.tif", "")
    except IndexError:
        print(f"Index {index} is out of range: {real_folder_2019} holds {len(real_2019_files)} files")
        return

    mask_2019_file = f"m_{city_state}_2019.tif"
    mask_2024_file = f"m_{city_state}_2024.tif"
    cd1_output = f"cd1_m_{city_state}.tif"
    cd2_output = f"cd2_m_{city_state}.tif"
    cd3_output = f"cd3_m_{city_state}.tif"
    real_2019_file = f"{city_state}_2019.tif"
    real_2024_file = f"{city_state}_2024.tif"

    mask_2019_path = os.path.join(input_folder_2019, mask_2019_file)
    mask_2024_path = os.path.join(input_folder_2024, mask_2024_file)
    cd1_output_path = os.path.join(cd1_output_folder, cd1_output)
    cd2_output_path = os.path.join(cd2_output_folder, cd2_output)
    cd3_output_path = os.path.join(cd3_output_folder, cd3_output)
    real_2019_path = os.path.join(real_folder_2019, real_2019_file)
    real_2024_path = os.path.join(real_folder_2024, real_2024_file)

    if not all(os.path.exists(p) for p in (cd1_output_path, cd2_output_path, cd3_output_path)):
        print(f"Output mask files not found: {cd1_output}, {cd2_output}, or {cd3_output}")
        return

    # The masks and the 2024 image are derived names too, so verify them
    # before opening anything instead of failing halfway through.
    missing_inputs = [p for p in (mask_2019_path, mask_2024_path, real_2019_path, real_2024_path)
                      if not os.path.exists(p)]
    if missing_inputs:
        print("Input files not found: " + ", ".join(missing_inputs))
        return

    # (row, column, image, title, colormap) for every populated grid cell
    panels = [
        (0, 0, _read_rgb_for_display(real_2019_path), real_2019_file, None),
        (0, 1, _read_rgb_for_display(real_2024_path), real_2024_file, None),
        (1, 0, _read_first_band(mask_2019_path), mask_2019_file, "viridis"),
        (1, 1, _read_first_band(mask_2024_path), mask_2024_file, "viridis"),
        (0, 2, _read_first_band(cd1_output_path), cd1_output, "turbo"),
        (0, 3, _read_first_band(cd2_output_path), cd2_output, "turbo"),
        (1, 2, _read_first_band(cd3_output_path), cd3_output, "turbo"),
    ]

    fig, axs = plt.subplots(2, 4, figsize=(20, 10))
    for row, col, image, title, cmap in panels:
        axs[row, col].imshow(image, cmap=cmap)
        axs[row, col].set_title(title)
        axs[row, col].axis("off")

    # Seven images on a 2x4 grid: hide the unused last cell.
    axs[1, 3].axis("off")

    plt.tight_layout()
    plt.show()

# Example usage:
base_folder = "ChangeDetectionUSA"
real_folder_2019 = f"{base_folder}/Images/T2019"
real_folder_2024 = f"{base_folder}/Images/T2024"
input_folder_2019 = f"{base_folder}/Masks/T2019"
input_folder_2024 = f"{base_folder}/Masks/T2024"
cd1_output_folder = f"{base_folder}/cd1_Output"
cd2_output_folder = f"{base_folder}/cd2_Output"
cd3_output_folder = f"{base_folder}/cd3_Output"

# fetch_and_display_images() picks its tile by indexing the listing of
# real_folder_2019, so the random index must be drawn from that folder's size
# (the mask folder may hold a different number of entries). The listing is
# taken once instead of on every iteration.
num_candidates = len(os.listdir(real_folder_2019))

if num_candidates == 0:
    # random.randint(0, -1) would raise ValueError on an empty folder.
    print(f"No files found in {real_folder_2019}; nothing to display.")
else:
    for i in range(10):
        j = random.randrange(num_candidates)
        fetch_and_display_images(real_folder_2019, real_folder_2024,
                                 input_folder_2019, input_folder_2024,
                                 cd1_output_folder, cd2_output_folder,
                                 cd3_output_folder, j)

### Display All images of a single City and save to `Organized` folder
- All = image2019, image2024, mask2019, mask2024, cd1, cd2, cd3

In [None]:
import os
import shutil
import matplotlib.pyplot as plt
import rasterio
import numpy as np

def fetch_display_and_copy_images(city_state, real_folder_2019, real_folder_2024,
                                  input_folder_2019, input_folder_2024,
                                  cd1_output_folder, cd2_output_folder, cd3_output_folder,
                                  output_folder_base):
    """
    Show the seven rasters of one city tile and copy them into a per-city folder.

    The rasters are the 2019/2024 images, the 2019/2024 masks and the cd1/cd2/cd3
    outputs; their file names are derived from ``city_state``. If any of them is
    missing, the first missing path is printed and nothing else happens.
    Otherwise the files are plotted and then copied to
    ``<output_folder_base>/<city_state>/``.
    """
    # (source folder, file name, is_rgb) in display order
    sources = [
        (real_folder_2019, f"{city_state}_2019.tif", True),
        (real_folder_2024, f"{city_state}_2024.tif", True),
        (input_folder_2019, f"m_{city_state}_2019.tif", False),
        (input_folder_2024, f"m_{city_state}_2024.tif", False),
        (cd1_output_folder, f"cd1_m_{city_state}.tif", False),
        (cd2_output_folder, f"cd2_m_{city_state}.tif", False),
        (cd3_output_folder, f"cd3_m_{city_state}.tif", False),
    ]
    # (full path, plot title, is_rgb)
    files_and_titles = [
        (os.path.join(folder, name), name, is_rgb) for folder, name, is_rgb in sources
    ]

    # Stop at the first file that is not on disk
    first_missing = next(
        (path for path, _, _ in files_and_titles if not os.path.exists(path)), None
    )
    if first_missing is not None:
        print(f"File not found: {first_missing}")
        return

    # Destination folder for this city
    city_output_folder = os.path.join(output_folder_base, city_state)
    os.makedirs(city_output_folder, exist_ok=True)

    # Plot everything on a 4x4 subplot grid (only the first seven cells are used)
    plt.figure(figsize=(16, 16))
    for position, (file_path, title, is_rgb) in enumerate(files_and_titles, start=1):
        plt.subplot(4, 4, position)
        with rasterio.open(file_path) as src:
            if is_rgb:
                # Bands 1-3 stacked depth-wise, scaled by the maximum for display
                image = np.dstack([src.read(band) for band in (1, 2, 3)])
                image = image / image.max()
            else:
                image = src.read(1)

        plt.imshow(image, cmap=None if is_rgb else "turbo")
        plt.title(title)
        plt.axis("off")

    plt.tight_layout()
    plt.show()

    # Copy the originals next to each other in the city folder
    for file_path, _, _ in files_and_titles:
        destination = os.path.join(city_output_folder, os.path.basename(file_path))
        shutil.copy(file_path, destination)

    print(f"All files for {city_state} have been copied to {city_output_folder}")

# Example usage: display and archive every raster of a single city tile.
base_folder = "ChangeDetectionUSA"
real_folder_2019 = base_folder + "/Images/T2019"
real_folder_2024 = base_folder + "/Images/T2024"
input_folder_2019 = base_folder + "/Masks/T2019"
input_folder_2024 = base_folder + "/Masks/T2024"
cd1_output_folder = base_folder + "/cd1_Output"
cd2_output_folder = base_folder + "/cd2_Output"
cd3_output_folder = base_folder + "/cd3_Output"
output_folder_base = "Organized"

city_name = "California_SanFrancisco_SW"  # alternative: "Washington_Seattle_SW"
fetch_display_and_copy_images(
    city_name,
    real_folder_2019=real_folder_2019,
    real_folder_2024=real_folder_2024,
    input_folder_2019=input_folder_2019,
    input_folder_2024=input_folder_2024,
    cd1_output_folder=cd1_output_folder,
    cd2_output_folder=cd2_output_folder,
    cd3_output_folder=cd3_output_folder,
    output_folder_base=output_folder_base,
)

## 7. Utilities

### GDAL commands
1. Convert a single one-band image (e.g. a mask) to PNG
```
gdal_translate -of PNG -scale -co worldfile=no mask2.tif output6.png
```

### Compare folders

In [None]:
import os

folder1 = "ChangeDetectionMergedDivided-DW-tif/Images/T2024"
folder2 = "ChangeDetectionMergedDivided-DW-tif/Masks/T2024"

# Image file names exactly as they are on disk (extension included)
files1 = set()
for entry in os.listdir(folder1):
    if os.path.isfile(os.path.join(folder1, entry)):
        files1.add(entry)

# Mask file names with the leading "m_" dropped, so they are directly
# comparable to the image names; entries without that prefix are ignored
files2 = set()
for entry in os.listdir(folder2):
    if entry.startswith("m_") and os.path.isfile(os.path.join(folder2, entry)):
        files2.add(entry[2:])

# Images for which no "m_<image name>" mask exists
missing_files = files1.difference(files2)

print("Files in folder1 without matching m_ files in folder2:")
for file in missing_files:
    print(file)

print(len(missing_files))

### Replace filenames

In [None]:
import os

# Define the folder containing the files
folder_path = "ChangeDetectionMerged-DW-tif/cd1_Output"

# Dictionary with incorrect-to-correct replacements
replacements = {
    'Alabama_Dayton': 'Ohio_Dayton',
    'Alabama_Cincinnati': 'Ohio_Cincinnati',
    'Alabama_Toledo': 'Ohio_Toledo',
    'Illinois_Greenbay': 'Wisconsin_Greenbay',
    'Illinois_FortWayne': 'Indiana_FortWayne',
    'Illinois_Southbend': 'Indiana_Southbend',
    'Oklahoma_SanAntonio': 'Texas_SanAntonio',
    'Oklahoma_Dallas': 'Texas_Dallas',
    'Oklahoma_Austin': 'Texas_Austin',
    'Alabama_Cleveland': 'Ohio_Cleveland',
}

# List all files in the folder
for filename in os.listdir(folder_path):
    old_path = os.path.join(folder_path, filename)

    # Only regular files are renamed
    if not os.path.isfile(old_path):
        continue

    # Build the final name first and rename once. Renaming inside the
    # replacement loop would leave old_path pointing at a file that no longer
    # exists as soon as a second entry matched the same name.
    new_filename = filename
    for wrong, correct in replacements.items():
        new_filename = new_filename.replace(wrong, correct)

    if new_filename == filename:
        continue

    new_path = os.path.join(folder_path, new_filename)
    if os.path.exists(new_path):
        # os.rename() silently overwrites an existing file on POSIX systems;
        # never destroy data here.
        print(f"Skipped: {filename} -> {new_filename} (target already exists)")
        continue

    os.rename(old_path, new_path)
    print(f"Renamed: {filename} -> {new_filename}")

### Rename files in a folder

In [None]:
import os

def rename_files_in_folder(folder_path, old_str, new_str):
    """
    Rename every entry of a folder by replacing ``old_str`` with ``new_str`` in its name.

    Args:
        folder_path (str): Folder whose entries are renamed.
        old_str (str): Substring to look for in each name.
        new_str (str): Text that replaces every occurrence of ``old_str``.

    Entries whose name does not change are left untouched. An entry is skipped
    (with a message) when the new name already belongs to a different file, so
    existing data is never overwritten.
    """
    for filename in os.listdir(folder_path):
        new_filename = filename.replace(old_str, new_str)
        if new_filename == filename:
            continue

        old_file = os.path.join(folder_path, filename)
        new_file = os.path.join(folder_path, new_filename)

        # os.rename() silently replaces an existing target on POSIX systems.
        # samefile() keeps case-only renames working on case-insensitive
        # file systems, where the "existing" target is the source itself.
        if os.path.exists(new_file) and not os.path.samefile(old_file, new_file):
            print(f"Skipped {filename}: {new_filename} already exists")
            continue

        os.rename(old_file, new_file)

# Example usage: files copied from the cd2 output still carry the "cd2" tag
folder_path = 'ChangeDetectionMergedDivided-DW-tif/cd3_Output'
old_str, new_str = 'cd2', 'cd3'
rename_files_in_folder(folder_path, old_str=old_str, new_str=new_str)

In [None]:
import os
import rasterio
import numpy as np
from tqdm import tqdm

def replace_255_with_6(input_folder, output_folder=None):
    """
    Process a folder of TIF files, replacing pixels with value 255 with value 6.

    Args:
        input_folder (str): Path to the folder containing TIF files
        output_folder (str, optional): Path to save the processed files.
                                      If None, will modify files in place.

    With an output folder, every TIF is written there (changed or not). In
    place, only files that actually contain 255 are rewritten; the new content
    goes to a ``temp_<name>`` file first and then replaces the original.
    """
    in_place = not output_folder

    # Create output folder if specified and doesn't exist
    if not in_place:
        os.makedirs(output_folder, exist_ok=True)

    # Get list of TIF files in the input folder
    tif_files = [f for f in os.listdir(input_folder) if f.endswith('.tif')]

    if not tif_files:
        print(f"No TIF files found in {input_folder}")
        return

    print(f"Found {len(tif_files)} TIF files to process.")

    for filename in tqdm(tif_files, desc="Processing files"):
        input_path = os.path.join(input_folder, filename)

        # Load everything needed and close the source before writing, so the
        # original is no longer open when it gets replaced (replacing an open
        # file typically fails on Windows).
        with rasterio.open(input_path) as src:
            data = src.read()
            profile = src.profile

        to_replace = data == 255
        has_255 = bool(to_replace.any())

        if has_255:
            # Assigning in place keeps the array's dtype unchanged
            data[to_replace] = 6
            print(f"Modified {filename}: 255 -> 6")
        elif in_place:
            # Nothing to change: writing a temp_ copy here would only leave a
            # stray file behind in the input folder.
            continue

        if in_place:
            output_path = os.path.join(input_folder, f"temp_{filename}")
        else:
            output_path = os.path.join(output_folder, filename)

        with rasterio.open(output_path, 'w', **profile) as dst:
            dst.write(data)

        if in_place:
            # Swap the finished temporary file in for the original
            os.replace(output_path, input_path)

    print("Processing complete!")

if __name__ == "__main__":
    # Mask folder to clean up and the folder that receives the cleaned copies
    # (replace both with your own paths)
    input_folder = "ChangeDetectionMergedDivided-DW-tif/Masks/T2024"
    output_folder = "ChangeDetectionMergedDivided-DW-tif/Masks/T2024_modified"

    # Save to a new folder. To modify the files in place instead, call
    # replace_255_with_6(input_folder) without the second argument.
    replace_255_with_6(input_folder, output_folder=output_folder)

## 8. Class distribution in TIFs (before converting to PNGs)
- Find unexpected classes

In [None]:
import os
import rasterio
import numpy as np
from collections import Counter

def analyze_class_combinations(masks_folder):
    """
    Report which sets of pixel values occur in the ``.tif`` masks of a folder.

    For every ``.tif`` file (visited in sorted name order) the distinct values
    of band 1 are collected; the function then prints how many files share each
    combination of values.

    Parameters:
    masks_folder (str): Path to folder containing mask files

    Returns:
    dict with the keys 'file_combinations' (file name -> sorted list of values),
    'combination_counts' (tuple of values -> number of files), 'total_files'
    and 'unique_combinations'.
    """
    # file name -> sorted distinct values of band 1
    file_combinations = {}
    for filename in sorted(os.listdir(masks_folder)):
        if not filename.endswith('.tif'):
            continue
        with rasterio.open(os.path.join(masks_folder, filename)) as src:
            file_combinations[filename] = sorted(np.unique(src.read(1)))

    # How many files share each combination
    combination_counter = Counter(
        tuple(classes) for classes in file_combinations.values()
    )
    total_files = len(file_combinations)
    separator = "-" * 50

    # Per-file section: only the heading is printed, the listing is switched off
    print("\nClass combinations per file:")
    print(separator)
    # for filename, classes in file_combinations.items():
    #     print(f"{filename}: {classes}")

    print("\nSummary of unique combinations:")
    print(separator)
    for combination, count in combination_counter.most_common():
        percentage = (count / total_files) * 100
        print(f"Classes {list(combination)}: {count} files ({percentage:.1f}%)")

    print(f"\nTotal number of files analyzed: {total_files}")
    print(f"Number of unique combinations found: {len(combination_counter)}")

    return {
        'file_combinations': file_combinations,
        'combination_counts': dict(combination_counter),
        'total_files': total_files,
        'unique_combinations': len(combination_counter),
    }

# Example usage:
# print("Analyzing 2019 masks...")
# masks_folder_2019 = "ChangeDetectionMergedDivided-DW-tif/Masks/T2019"
# stats_2019 = analyze_class_combinations(masks_folder_2019)

# print("\nAnalyzing 2024 masks...")
# masks_folder_2024 = "ChangeDetectionMergedDivided-DW-tif/Masks/T2024"
# stats_2024 = analyze_class_combinations(masks_folder_2024)


# Change-detection outputs; the variable names are reused from the mask
# analysis above, so after the loop they hold the cd2 folder and statistics.
for heading, masks_folder_2024 in (
    ("\nAnalyzing cd1...", "ChangeDetectionMergedDivided-DW-tif/cd1_Output"),
    ("\nAnalyzing cd2...", "ChangeDetectionMergedDivided-DW-tif/cd2_Output"),
):
    print(heading)
    stats_2024 = analyze_class_combinations(masks_folder_2024)

In [None]:
import os
import rasterio
import numpy as np
from collections import Counter

def find_unexpected_class_combinations(masks_folder, expected_combinations=None):
    """
    Find ``.tif`` masks whose set of pixel values is not one of the expected combinations.

    Parameters:
    masks_folder (str): Path to folder containing mask files
    expected_combinations (iterable of sequences, optional): Accepted
        combinations of class values, each in ascending order. Defaults to
        [1,2,3] and [0,1,2,3].

    Returns:
    tuple: (unexpected_files, all_combinations)
        - unexpected_files: dict mapping filenames to their unexpected class combinations
        - all_combinations: dict mapping filenames to their class combinations
      Combinations are tuples of plain Python numbers in ascending order.
    """
    if expected_combinations is None:
        expected = {(1, 2, 3), (0, 1, 2, 3)}
    else:
        expected = {tuple(combo) for combo in expected_combinations}

    # Store results
    unexpected_files = {}
    all_combinations = {}
    combination_counter = Counter()

    # Process each file
    print(f"\nAnalyzing files in {masks_folder}")
    print("-" * 50)

    for filename in sorted(os.listdir(masks_folder)):
        if not filename.endswith('.tif'):
            continue

        with rasterio.open(os.path.join(masks_folder, filename)) as src:
            mask_data = src.read(1)

        # np.unique() already returns ascending values; tolist() turns the
        # NumPy scalars into Python numbers so the printed lists stay readable
        # (NumPy 2 would otherwise show entries such as np.uint8(1)).
        unique_classes = tuple(np.unique(mask_data).tolist())

        all_combinations[filename] = unique_classes
        combination_counter[unique_classes] += 1

        if unique_classes not in expected:
            unexpected_files[filename] = unique_classes

    # Print summary statistics
    total_files = len(all_combinations)
    unexpected_count = len(unexpected_files)

    print("\nSummary:")
    print(f"Total files analyzed: {total_files}")
    print(f"Files with unexpected class combinations: {unexpected_count}")

    print("\nAll combinations found:")
    for combo, count in combination_counter.most_common():
        percentage = (count / total_files) * 100
        print(f"Classes {list(combo)}: {count} files ({percentage:.1f}%)")

    if unexpected_files:
        print("\nFiles with unexpected class combinations:")
        for filename, classes in unexpected_files.items():
            print(f"{filename}: {list(classes)}")
    else:
        print("\nNo files with unexpected class combinations found.")

    return unexpected_files, all_combinations

# Example usage: check the 2019 masks first, then the 2024 masks.
masks_folder_2019 = "ChangeDetectionMergedDivided-tif/Masks/T2019"
masks_folder_2024 = "ChangeDetectionMergedDivided-tif/Masks/T2024"

print("Analyzing 2019 masks...")
unexpected_2019, all_2019 = find_unexpected_class_combinations(masks_folder=masks_folder_2019)

print("\nAnalyzing 2024 masks...")
unexpected_2024, all_2024 = find_unexpected_class_combinations(masks_folder=masks_folder_2024)

In [None]:
# Every file name flagged in at least one of the two years (no duplicates)
all_problematic_files = list(set(unexpected_2019) | set(unexpected_2024))

# Optional debugging output, disabled by default:
# print("\nAll unique problematic files:")
# print("-" * 50)
# for filename in sorted(all_problematic_files):
#     classes_2019 = list(unexpected_2019.get(filename, []))  # Empty list if file not in 2019 
#     classes_2024 = list(unexpected_2024.get(filename, []))  # Empty list if file not in 2024
#     print(f"{filename}:")
#     if classes_2019:
#         print(f"  2019 classes: {classes_2019}")
#     if classes_2024:
#         print(f"  2024 classes: {classes_2024}")

#print(f"\nTotal unique problematic files: {len(all_problematic_files)}")


import re

def extract_city_state_direction(file_name):
    """
    Reduce a dataset file name to its ``<word>_<word>_<LETTERS>_<1-4>`` identifier.

    Every ``.tif`` is dropped, ``_2019`` / ``_2024`` tokens are removed, and a
    leading ``m_`` or ``cdN_m_`` (N = 1..3) prefix is stripped. Returns the
    identifier found at the start of what remains, or None when the name does
    not begin with that pattern.
    """
    year_token = r'_20(19|24)'
    known_prefix = r'^(m_|cd[1-3]_m_)'
    # two alphanumeric words, an upper-case block, then the part number 1-4
    location_with_part = r'([a-zA-Z0-9]+_[a-zA-Z0-9]+_[A-Z]+_[1-4])'

    stem = file_name.replace('.tif', '')
    stem = re.sub(year_token, '', stem)
    stem = re.sub(known_prefix, '', stem)

    found = re.match(location_with_part, stem)
    return found.group(1) if found else None

# Group the problematic files by the location extracted from their name.
# Using the extracted identifier for the grouping (rather than a substring
# test of the location against each file name) keeps the per-location listing
# consistent with extract_city_state_direction(): a name whose year token sits
# between the direction and the part number is still assigned to its location,
# and a location can never pick up files of another, longer-named location.
files_by_location = {}
for filename in all_problematic_files:
    location = extract_city_state_direction(filename)
    if location:
        files_by_location.setdefault(location, []).append(filename)

# Get unique locations from the problematic files list
unique_locations = set(files_by_location)

print("\nUnique problematic locations with parts:")
print("-" * 50)
for location in sorted(unique_locations):
    print(location)

print(f"\nTotal unique problematic locations: {len(unique_locations)}")

# Print the files that belong to each location
print("\nFiles for each location:")
print("-" * 50)
for location in sorted(unique_locations):
    print(f"\n{location}:")
    matching_files = files_by_location[location]
    for file in sorted(matching_files):
        print(f"  {file}")

In [None]:
import os

def delete_matching_files(folder_path, file_list, specific_part1, specific_part2, if_mask):
    """
    Deletes files in the folder that match entries in the provided file_list.

    Each file name is reduced to a key: the extension is dropped and every
    occurrence of ``specific_part1`` is removed; for masks the leading
    ``specific_part2`` is stripped as well. The file is deleted when that key
    is contained in ``file_list``.

    Parameters:
        folder_path (str): Path to the folder containing files to be deleted.
        file_list (iterable): Keys to delete (matching city_state_direction_part).
        specific_part1 (str): Text removed from the name before the comparison
            (for example ``'_2019'`` or ``'cd1_m_'``).
        specific_part2 (str): Prefix carried by mask files (for example ``'m_'``).
            Only used when ``if_mask`` is true.
        if_mask (bool): Whether the folder holds mask files. When true, files
            that do not start with ``specific_part2`` are never deleted.
    """
    # Create a set for faster lookup
    file_set = set(file_list)

    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)

        # os.remove() cannot delete directories; only look at regular files
        if not os.path.isfile(file_path):
            continue

        # Name without extension and without the year / output tag
        key = file_name.rsplit('.', 1)[0].replace(specific_part1, '')

        if if_mask:
            # Strip the actual mask prefix. Cutting two characters blindly
            # would also turn unrelated names into matching keys and delete
            # files that are not masks.
            if not key.startswith(specific_part2):
                continue
            key = key[len(specific_part2):]

        if key in file_set:
            os.remove(file_path)
            print(f"Deleted: {file_path}")

# Example usage: remove every tile of the problematic locations from the dataset
ROOT = 'ChangeDetectionMergedDivided-tif'
TTV = ''

city_state_direction_parts2 = unique_locations

# Images and masks of both acquisition years
for DIR in ('2019', '2024'):
    folder_path = f'{ROOT}/{TTV}/Images/T{DIR}'
    specific_part = f'_{DIR}'
    delete_matching_files(folder_path, city_state_direction_parts2,
                          specific_part, specific_part, if_mask=False)

    folder_path = f'{ROOT}/{TTV}/Masks/T{DIR}'
    specific_part1 = f'_{DIR}'
    specific_part2 = 'm_'
    delete_matching_files(folder_path, city_state_direction_parts2,
                          specific_part1, specific_part2, if_mask=True)

# The three change-detection output folders
for SPECIFICITY in ('cd1', 'cd2', 'cd3'):
    folder_path = f'{ROOT}/{TTV}/{SPECIFICITY}_Output'
    specific_part = f'{SPECIFICITY}_m_'
    delete_matching_files(folder_path, city_state_direction_parts2,
                          specific_part, specific_part, if_mask=False)

In [None]:
import os
import numpy as np
from PIL import Image
from collections import Counter
import re

def analyze_png_class_distribution(masks_folder, expected_combinations=None):
    """
    Find PNG masks whose set of pixel values is not one of the expected combinations.

    Parameters:
    masks_folder (str): Path to folder containing PNG mask files
    expected_combinations (iterable of sequences, optional): Accepted
        combinations of class values, each in ascending order. Defaults to
        [1,2,3] and [0,1,2,3].

    Returns:
    tuple: (unexpected_files, all_combinations)
        - unexpected_files: dict mapping filenames to their unexpected class combinations
        - all_combinations: dict mapping filenames to their class combinations
      Combinations are tuples of plain Python numbers in ascending order.
      Files that cannot be read are reported and left out of both dicts.
    """
    if expected_combinations is None:
        expected = {(1, 2, 3), (0, 1, 2, 3)}
    else:
        expected = {tuple(combo) for combo in expected_combinations}

    # Store results
    unexpected_files = {}
    all_combinations = {}
    combination_counter = Counter()

    # Process each file
    print(f"\nAnalyzing PNG files in {masks_folder}")
    print("-" * 50)

    for filename in sorted(os.listdir(masks_folder)):
        if not filename.endswith('.png'):
            continue

        file_path = os.path.join(masks_folder, filename)
        try:
            # Read the PNG mask using PIL
            with Image.open(file_path) as img:
                mask_data = np.array(img)
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole scan
            print(f"Error processing {filename}: {str(e)}")
            continue

        # np.unique() already returns ascending values; tolist() turns the
        # NumPy scalars into Python numbers so the printed lists stay readable
        # (NumPy 2 would otherwise show entries such as np.uint8(1)).
        unique_classes = tuple(np.unique(mask_data).tolist())

        all_combinations[filename] = unique_classes
        combination_counter[unique_classes] += 1

        if unique_classes not in expected:
            unexpected_files[filename] = unique_classes

    # Print summary statistics
    total_files = len(all_combinations)
    unexpected_count = len(unexpected_files)

    print("\nSummary:")
    print(f"Total files analyzed: {total_files}")
    print(f"Files with unexpected class combinations: {unexpected_count}")

    print("\nAll combinations found:")
    for combo, count in combination_counter.most_common():
        percentage = (count / total_files) * 100
        print(f"Classes {list(combo)}: {count} files ({percentage:.1f}%)")

    # Per-file details are not printed here; inspect the returned
    # unexpected_files dict for the individual files.
    return unexpected_files, all_combinations

# def extract_city_state_direction(file_name):
#     """Extract location identifier from filename."""
#     # Remove .png extension
#     file_name = file_name.replace('.png', '')
    
#     # Remove year numbers (2019/2024)
#     file_name = re.sub(r'_20(19|24)', '', file_name)
    
#     # Remove any prefixes like m_ or cd1_m_
#     file_name = re.sub(r'^(m_|cd[1-3]_m_)', '', file_name)
    
#     # Keep the part number at the end
#     match = re.match(r'([a-zA-Z0-9]+_[a-zA-Z0-9]+_[A-Z]+_[1-4])', file_name)
#     if match:
#         return match.group(1)
#     return None

# Example usage: same check as for the TIFs, run on the converted PNG masks.
masks_folder_2019 = "ChangeDetectionMergedDivided-png/Masks/T2019"
masks_folder_2024 = "ChangeDetectionMergedDivided-png/Masks/T2024"

print("Analyzing 2019 masks...")
unexpected_2019, all_2019 = analyze_png_class_distribution(masks_folder=masks_folder_2019)

print("\nAnalyzing 2024 masks...")
unexpected_2024, all_2024 = analyze_png_class_distribution(masks_folder=masks_folder_2024)

# # Get unique problematic locations
# all_problematic_files = list(set(unexpected_2019.keys()) | set(unexpected_2024.keys()))
# unique_locations = set()

# for filename in all_problematic_files:
#     location = extract_city_state_direction(filename)
#     if location:
#         unique_locations.add(location)

# # print("\nUnique problematic locations with parts:")
# # print("-" * 50)
# # for location in sorted(unique_locations):
# #    print(location)

# print(f"\nTotal unique problematic locations: {len(unique_locations)}")

### Class pixel distributions

In [None]:
import os
import rasterio
import numpy as np
from collections import Counter

def count_pixels_by_class(tif_directory):
    """
    Count pixels by class for all TIF files in a directory.

    Only files directly inside ``tif_directory`` whose name ends with
    ``.tif`` are considered (sub-directories are not searched). For each
    file the first band is read and its distinct pixel values are tallied.

    Args:
        tif_directory: Path to directory containing TIF files

    Returns:
        Dictionary mapping each pixel value found (as a native Python
        scalar) to the total number of pixels with that value across all
        files. Empty if the directory contains no ``.tif`` files.

    Raises:
        OSError: If ``tif_directory`` cannot be listed (e.g. it does not
            exist).
    """
    # Running totals across every file in the directory
    total_pixel_counts = Counter()

    # Get list of TIF files
    tif_files = [f for f in os.listdir(tif_directory) if f.endswith('.tif')]

    # Process each file
    for tif_file in tif_files:
        file_path = os.path.join(tif_directory, tif_file)

        # Read the first band, then release the file handle before counting
        with rasterio.open(file_path) as src:
            data = src.read(1)

        # Vectorized tally: np.unique does the counting in native code instead
        # of iterating over every pixel in Python as Counter(data.flatten())
        # would. tolist() converts NumPy scalars to plain Python numbers so
        # the keys are ordinary ints/floats.
        values, counts = np.unique(data, return_counts=True)
        total_pixel_counts.update(dict(zip(values.tolist(), counts.tolist())))

    # Return the total counts as a plain dict
    return dict(total_pixel_counts)

# Run the analysis: aggregate per-class pixel counts over the train/val/test
# splits and print the overall class distribution.
if __name__ == "__main__":
    # Build the split directories with os.path.join rather than hard-coded
    # "\\" separators, so the paths resolve on any OS (on Windows the
    # resulting strings are the same as before).
    dataset_root = "ChangeDetectionMergedDividedSplit-tif3"
    directories = {
        "train": os.path.join(dataset_root, "train", "cd1_Output"),
        "val": os.path.join(dataset_root, "val", "cd1_Output"),
        "test": os.path.join(dataset_root, "test", "cd1_Output"),
    }

    # Merge the per-split counts into a single counter
    combined_class_counts = Counter()
    for directory in directories.values():
        class_counts = count_pixels_by_class(directory)
        combined_class_counts.update(class_counts)

    # Every pixel belongs to exactly one class, so the grand total is the
    # sum of the merged per-class counts.
    total_pixels = sum(combined_class_counts.values())

    # Print results
    print("\nOVERALL RESULTS:")
    print("=" * 50)
    print(f"Total number of classes: {len(combined_class_counts)}")
    print(f"Total number of pixels: {total_pixels}")
    print("\nCLASS DISTRIBUTION:")
    print("-" * 50)
    print(f"{'Class':<8} {'Count':<12} {'Percentage':<10}")
    print("-" * 50)
    # If no pixels were found the counter is empty and this loop is skipped,
    # so the division below never sees a zero total.
    for class_id, count in sorted(combined_class_counts.items()):
        percentage = (count / total_pixels) * 100
        print(f"{class_id:<8} {count:<12} {percentage:.2f}%")

    