### Identifying missing SNIC pseudomasks

In [1]:
import os 
tif_folder = "/Volumes/USB/Orig_200m_TIFS/rgb"
jpg_folder = "/Volumes/USB/Pseudomasks/orig_SNIC_pseudomasks/images"

orig_files = [file for file in os.listdir(tif_folder) if not file.endswith('_aug.tif') and not file.startswith('._')]
thresholded_cams = [file for file in os.listdir(jpg_folder) if not file.startswith('._')]

# List to store TIF files without corresponding JPG files
missing_jpgs = []

# Check for TIF files in the folder
for tif_file in orig_files:
    if tif_file.endswith('.tif'):
        jpg_file = tif_file.replace('.tif', '.jpg')
        if not os.path.exists(os.path.join(jpg_folder, jpg_file)):
            missing_jpgs.append(tif_file)

# Print the count and list of missing JPG files
print(f"Number of TIF files without corresponding JPGs: {len(missing_jpgs)}")
print("Missing JPG files:", missing_jpgs)

Number of TIF files without corresponding JPGs: 577
Missing JPG files: ['763_72_5025_2018_negcrop_2.tif', '751_65_5050_2015_negcrop_7.tif', '758_66_5075_2010_crop_133.tif', '760_73_7500_2018_negcrop_11.tif', '761_75_7500_2015_negcrop_23.tif', '759_72_2575_2013_negcrop_13.tif', '760_71_7550_2018_crop_104.tif', '759_76_7575_2013_negcrop_1.tif', '761_73_0050_2018_crop_101.tif', '759_71_7550_2013_negcrop_1.tif', '758_73_0075_2013_crop_135.tif', '758_70_5075_2013_crop_2.tif', '761_72_2550_2018_crop_13.tif', '758_70_0050_2013_crop_105.tif', '754_68_5050_2015_crop_48.tif', '761_74_2500_2018_crop_123.tif', '749_61_0000_2019_crop_62.tif', '748_62_0025_2019_negcrop_9.tif', '761_73_5025_2018_crop_143.tif', '765_72_5025_2016_crop_40.tif', '765_73_5000_2016_crop_38.tif', '763_72_5050_2018_crop_57.tif', '757_72_7525_2013_negcrop_17.tif', '763_75_7525_2015_negcrop_7.tif', '761_71_0025_2018_crop_27.tif', '760_74_7500_2018_negcrop_42.tif', '756_69_7575_2014_crop_21.tif', '762_72_0075_2018_crop_31.tif',

In [2]:
import pandas as pd 

labels_file = "/Volumes/USB/Orig_200m_TIFS/new_palsa_labels.csv"
all_files = pd.read_csv(labels_file, index_col=0)

# Keep only the rows whose index name is in missing_jpgs (with the .tif extension)
missing_jpgs = all_files.loc[all_files.index.isin([file.replace('.tif', '') for file in missing_jpgs])]
missing_jpgs.head()


Unnamed: 0,palsa_percentage
760_73_5025_2018_crop_32,2.25
760_73_5025_2018_negcrop_9,0.0
760_73_5025_2018_negcrop_33,0.0
752_65_7550_2015_negcrop_3,0.0
765_73_5000_2016_crop_38,0.25


In [3]:
len(missing_jpgs)

577

In [4]:
############
# Imports #
############

import json
import os

import wandb
import pandas as pd
import rasterio
import torch
import torchvision.utils as vutils
import numpy as np

from torch.utils.data import DataLoader
from PIL import Image

from model.cnn_classifier import model_4D
from model.pseudomask import Pseudomasks
from utils.data_modules import ImageDataset, TestSet, filter_dataset

##################
## load configs ##
##################

rgb_dir = "/Volumes/USB/Orig_200m_TIFS/rgb"
hs_dir = "/Volumes/USB/Orig_200m_TIFS/hs"

# assign model
artifact_path = "nadjaflechner/VGG_CAMs/classification_model:v61"
batch_size = 20
im_size = 200
min_palsa_positive_samples = 0
augment = False
normalize = False
depth_layer = 'hs'

# assign pseudomasks configs
cam_threshold_factor = 0.87
overlap_threshold = 0.46
snic_seeds = 100
snic_compactness = 4
finetune = False
std_from_mean = 0

# choose depth data based on configs
depth_dir = hs_dir
# Create the dataset and loaders for the entire dataset.
dataset = ImageDataset(depth_dir, rgb_dir, missing_jpgs, im_size, normalize)
loader = DataLoader(dataset, batch_size=1, shuffle=True)

#############
# Init model: 

api = wandb.Api()
artifact = api.artifact(artifact_path, type='model')
artifact_dir = artifact.download()
state_dict = torch.load(f"{artifact_dir}/model.pth", map_location=torch.device('cpu'))
model = model_4D()
model.load_state_dict(state_dict)
model.eval()

#############################
# generate all pseudolabels #
#############################

pseudomask_generator = Pseudomasks(
    dataset, cam_threshold_factor, overlap_threshold,
    snic_seeds, snic_compactness, finetuned = finetune, std_from_mean=0
    )
pseudomask_generator.model = model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))


[34m[1mwandb[0m:   1 of 1 files downloaded.  


In [37]:
files_generated = os.listdir("/Volumes/USB/Pseudomasks/orig_SNIC_pseudomasks/newly_generated/images")
missing_jpgs = missing_jpgs[~missing_jpgs.index.isin([f[:-4] for f in files_generated if f.endswith('.jpg')])]

dataset = ImageDataset(depth_dir, rgb_dir, missing_jpgs, im_size, normalize)
loader = DataLoader(dataset, batch_size=1, shuffle=True)
pseudomask_generator = Pseudomasks(
    dataset, cam_threshold_factor, overlap_threshold,
    snic_seeds, snic_compactness, finetuned = finetune, std_from_mean=0
    )
pseudomask_generator.model = model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

len(missing_jpgs)

3

In [62]:
print(missing_jpgs)

                           palsa_percentage
764_74_7500_2018_crop_123              2.50
759_76_5025_2013_crop_133              2.25
760_77_0000_2015_crop_23               3.25


In [70]:
counter = 0
# files_generated = os.listdir("/Volumes/USB/Pseudomasks/orig_SNIC_pseudomasks/newly_generated/images")
for im,binary_label,_,img_name in loader:

    try:
        # if not f"{img_name}.jpg" in files_generated:

        # output paths:
        output_path_mask = os.path.join("/Volumes/USB/Pseudomasks/orig_SNIC_pseudomasks/newly_generated/masks", f"{img_name[0]}.png") 
        output_path_jpg = os.path.join("/Volumes/USB/Pseudomasks/orig_SNIC_pseudomasks/newly_generated/images", f"{img_name[0]}.jpg") 

        # Save RGB image
        vutils.save_image((im[:,:3,:,:]/255).cpu(), output_path_jpg, normalize = False)

        # Generate pseudomask 
        if binary_label == 0: 
            pseudomask = np.full((400, 400), False, dtype=bool)
        else:
            pseudomask = pseudomask_generator.generate_mask(im, None, save_plot=False)

        # Save pseudomask
        binary_pseudomask = np.squeeze(pseudomask.astype(np.uint8)) # convert bool to binary mask
        pseudomask_img = Image.fromarray(binary_pseudomask) # convert to pil img
        pseudomask_img.save(output_path_mask) # save as binary png

        counter+= 1

        if counter%10 ==0:
            print(f"{counter} images produced")

    except:
        pass

RasterioIOError: '/Volumes/USB/Orig_200m_TIFS/rgb/764_74_7500_2018_crop_123.tif' not recognized as a supported file format.

### Generate thresholded CAMs that were not generated

Reset kernel before attempting this!|

In [None]:
import os 
tif_folder = "/Volumes/USB/Orig_200m_TIFS/rgb"
jpg_folder = "/Volumes/USB/Pseudomasks/Thresholded_CAM/images"

orig_files = [file for file in os.listdir(tif_folder) if not file.endswith('_aug.tif') and not file.startswith('._')]
thresholded_cams = [file for file in os.listdir(jpg_folder) if not file.startswith('._')]

# List to store TIF files without corresponding JPG files
missing_jpgs = []

# Check for TIF files in the folder
for tif_file in orig_files:
    if tif_file.endswith('.tif'):
        jpg_file = tif_file.replace('.tif', '.jpg')
        if not os.path.exists(os.path.join(jpg_folder, jpg_file)):
            missing_jpgs.append(tif_file)

# Print the count and list of missing JPG files
print(f"Number of TIF files without corresponding JPGs: {len(missing_jpgs)}")
print("Missing JPG files:", missing_jpgs)

In [None]:
import pandas as pd 

labels_file = "/Volumes/USB/Orig_200m_TIFS/new_palsa_labels.csv"
all_files = pd.read_csv(labels_file, index_col=0)

# Keep only the rows whose index name is in missing_jpgs (with the .tif extension)
missing_jpgs = all_files.loc[all_files.index.isin([file.replace('.tif', '') for file in missing_jpgs])]
missing_jpgs.head()


Check that the df is the same length as number of missing files. 

In [None]:
len(missing_jpgs)

In [None]:
############
# Imports #
############

import json
import os

import wandb
import pandas as pd
import rasterio
import torch
import torchvision.utils as vutils
import numpy as np

from torch.utils.data import DataLoader
from PIL import Image

from model.cnn_classifier import model_4D
from model.pseudomask import Pseudomasks
from utils.data_modules import ImageDataset, TestSet, filter_dataset

##################
## load configs ##
##################

rgb_dir = "/Volumes/USB/Orig_200m_TIFS/rgb"
hs_dir = "/Volumes/USB/Orig_200m_TIFS/hs"

# assign model
artifact_path = "nadjaflechner/VGG_CAMs/classification_model:v61"
batch_size = 20
im_size = 200
min_palsa_positive_samples = 0
augment = False
normalize = False
depth_layer = 'hs'

# assign pseudomasks configs
cam_threshold_factor = 0.87
overlap_threshold = 0.46
snic_seeds = 100
snic_compactness = 4
finetune = False
std_from_mean = 0

# choose depth data based on configs
depth_dir = hs_dir
# Create the dataset and loaders for the entire dataset.
dataset = ImageDataset(depth_dir, rgb_dir, missing_jpgs, im_size, normalize)
loader = DataLoader(dataset, batch_size=1, shuffle=True)

#############
# Init model: 

api = wandb.Api()
artifact = api.artifact(artifact_path, type='model')
artifact_dir = artifact.download()
state_dict = torch.load(f"{artifact_dir}/model.pth", map_location=torch.device('cpu'))
model = model_4D()
model.load_state_dict(state_dict)
model.eval()

#############################
# generate all pseudolabels #
#############################

pseudomask_generator = Pseudomasks(
    dataset, cam_threshold_factor, overlap_threshold,
    snic_seeds, snic_compactness, finetuned = finetune, std_from_mean=0
    )
pseudomask_generator.model = model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))


In [None]:
for i in range(30):
    try: 
        counter = 0
        files_generated = os.listdir("/Volumes/USB/Pseudomasks/Thresholded_CAM/newly_generated/images")
        for im,binary_label,_,img_name in loader:

            if not f"{img_name}.jpg" in files_generated:

                # output paths:
                output_path_mask = os.path.join("/Volumes/USB/Pseudomasks/Thresholded_CAM/newly_generated/masks", f"{img_name[0]}.png") 
                output_path_jpg = os.path.join("/Volumes/USB/Pseudomasks/Thresholded_CAM/newly_generated/images", f"{img_name[0]}.jpg") 

                # Save RGB image
                vutils.save_image((im[:,:3,:,:]/255).cpu(), output_path_jpg, normalize = False)

                # Generate pseudomask 
                if binary_label == 0: 
                    pseudomask = np.full((400, 400), False, dtype=bool)
                else:
                    pseudomask = pseudomask_generator.generate_thresholded_CAM(im, None, save_plot=False)

                # Save pseudomask
                binary_pseudomask = np.squeeze(pseudomask.astype(np.uint8)) # convert bool to binary mask
                pseudomask_img = Image.fromarray(binary_pseudomask) # convert to pil img
                pseudomask_img.save(output_path_mask) # save as binary png

                counter+= 1

                if counter%10 ==0:
                    print(f"{counter} images produced")
    except: 
        files_generated = os.listdir("/Volumes/USB/Pseudomasks/Thresholded_CAM/newly_generated/images")
        missing_jpgs = missing_jpgs[~missing_jpgs.index.isin([f[:-4] for f in files_generated if f.endswith('.jpg')])]

        dataset = ImageDataset(depth_dir, rgb_dir, missing_jpgs, im_size, normalize)
        loader = DataLoader(dataset, batch_size=1, shuffle=True)
        pseudomask_generator = Pseudomasks(
            dataset, cam_threshold_factor, overlap_threshold,
            snic_seeds, snic_compactness, finetuned = finetune, std_from_mean=0
            )
        pseudomask_generator.model = model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

        len(missing_jpgs)