### Identifying missing SNIC pseudomasks

In [2]:
import os 
tif_folder = "/Volumes/USB/Orig_200m_TIFS/rgb"
jpg_folder = "/Volumes/USB/Pseudomasks/Thresholded_CAM_1.6/TRAIN/images"
jpg_folder2 = "/Volumes/USB/Pseudomasks/Thresholded_CAM_1.6/TRAIN/masks"

orig_files = [file for file in os.listdir(tif_folder) if not file.endswith('_aug.tif') and not file.startswith('._')]
thresholded_cams = [file for file in os.listdir(jpg_folder) if not file.startswith('._')]

# List to store TIF files without corresponding JPG files
missing_jpgs = []

# Check for TIF files in the folder
for tif_file in orig_files:
    if tif_file.endswith('.tif'):
        jpg_file = tif_file.replace('.tif', '.jpg')
        if not os.path.exists(os.path.join(jpg_folder, jpg_file)):
            missing_jpgs.append(tif_file)

# Print the count and list of missing JPG files
print(f"Number of TIF files without corresponding JPGs: {len(missing_jpgs)}")
# print("Missing JPG files:", missing_jpgs)

Number of TIF files without corresponding JPGs: 8807


In [2]:
import os 

images = "/Volumes/USB/Pseudomasks/Thresholded_CAM_1.6/TEST/images"
masks = "/Volumes/USB/Pseudomasks/Thresholded_CAM_1.6/TEST/masks"

masks_list = [file[:-4] for file in os.listdir(masks)]
missing_masks = [img for img in os.listdir(images) if img[:-4] not in masks_list]

print(f"Number of images without corresponding masks: {len(missing_masks)}")
print("Missing masks:", missing_masks)


Number of images without corresponding masks: 0
Missing masks: []


In [25]:
import pandas as pd 

labels_file = "/Volumes/USB/Orig_200m_TIFS/new_palsa_labels.csv"
all_files = pd.read_csv(labels_file, index_col=0)

# Keep only the rows whose index name is in missing_jpgs (with the .tif extension)
missing_masks_DF = all_files.loc[all_files.index.isin([file.replace('.jpg', '') for file in missing_masks])]
missing_masks_DF.head()


Unnamed: 0,palsa_percentage
760_72_2550_2018_crop_68,3.75
763_72_7500_2018_crop_71,2.5


In [26]:
missing_masks_DF

Unnamed: 0,palsa_percentage
760_72_2550_2018_crop_68,3.75
763_72_7500_2018_crop_71,2.5


In [30]:
############
# Imports #
############

import json
import os

import wandb
import pandas as pd
import rasterio
import torch
import torchvision.utils as vutils
import numpy as np

from torch.utils.data import DataLoader
from PIL import Image

from model.cnn_classifier import model_4D
from model.pseudomask import Pseudomasks
from utils.data_modules import ImageDataset, TestSet, filter_dataset

##################
## load configs ##
##################

rgb_dir = "/Volumes/USB/Orig_200m_TIFS/rgb"
hs_dir = "/Volumes/USB/Orig_200m_TIFS/hs"

# assign model
artifact_path = "nadjaflechner/VGG_CAMs/classification_model:v61"
batch_size = 2
im_size = 200
min_palsa_positive_samples = 0
augment = False
normalize = False
depth_layer = 'hs'

# assign pseudomasks configs
cam_threshold_factor = 1.6
overlap_threshold = 0.46
snic_seeds = 100
snic_compactness = 4
finetune = False
std_from_mean = 0

# choose depth data based on configs
depth_dir = hs_dir
# Create the dataset and loaders for the entire dataset.
dataset = ImageDataset(depth_dir, rgb_dir, missing_masks_DF, im_size, normalize)
loader = DataLoader(dataset, batch_size=1)

#############
# Init model: 

api = wandb.Api()
artifact = api.artifact(artifact_path, type='model')
artifact_dir = artifact.download()
state_dict = torch.load(f"{artifact_dir}/model.pth", map_location=torch.device('cpu'))
model = model_4D()
model.load_state_dict(state_dict)
model.eval()

#############################
# generate all pseudolabels #
#############################

pseudomask_generator = Pseudomasks(
    dataset, cam_threshold_factor, overlap_threshold,
    snic_seeds, snic_compactness, finetuned = finetune, std_from_mean=0
    )
pseudomask_generator.model = model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))


[34m[1mwandb[0m:   1 of 1 files downloaded.  


In [32]:
files_generated = os.listdir("/Volumes/USB/Pseudomasks/Thresholded_CAM_1.6/TRAIN/images")
missing_jpgs = missing_jpgs[~missing_jpgs.index.isin([f[:-4] for f in files_generated if f.endswith('.jpg')])]

dataset = ImageDataset(depth_dir, rgb_dir, missing_masks_DF, im_size, normalize)
loader = DataLoader(dataset, batch_size=1)
pseudomask_generator = Pseudomasks(
    dataset, cam_threshold_factor, overlap_threshold,
    snic_seeds, snic_compactness, finetuned = finetune, std_from_mean=0
    )
pseudomask_generator.model = model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

print(f"remaining images = {len(missing_jpgs)}")

for i in range(35):
    
    counter = 0
    for im,binary_label,_,img_name in loader:

        try:
            # output paths:
            output_path_mask = os.path.join("/Volumes/USB/Pseudomasks/Thresholded_CAM_1.6/TRAIN/masks", f"{img_name[0]}.png") 
            output_path_jpg = os.path.join("/Volumes/USB/Pseudomasks/Thresholded_CAM_1.6/TRAIN/images", f"{img_name[0]}.jpg") 

            # Save RGB image
            vutils.save_image((im[:,:3,:,:]/255).cpu(), output_path_jpg, normalize = False)

            # Generate pseudomask 
            if binary_label == 0: 
                pseudomask = np.full((400, 400), False, dtype=bool)
            else:
                pseudomask = pseudomask_generator.generate_thresholded_CAM(im, None, save_plot=False)

            # Save pseudomask
            binary_pseudomask = np.squeeze(pseudomask.astype(np.uint8)) # convert bool to binary mask
            pseudomask_img = Image.fromarray(binary_pseudomask) # convert to pil img
            pseudomask_img.save(output_path_mask) # save as binary png

            counter+= 1

            if counter%10 ==0:
                print(f"{counter} images produced")
        except: 

            files_generated = os.listdir("/Volumes/USB/Pseudomasks/Thresholded_CAM_1.6/TRAIN/images")
            missing_jpgs = missing_jpgs[~missing_jpgs.index.isin([f[:-4] for f in files_generated if f.endswith('.jpg')])]

            dataset = ImageDataset(depth_dir, rgb_dir, missing_masks_DF, im_size, normalize)
            loader = DataLoader(dataset, batch_size=1)
            pseudomask_generator = Pseudomasks(
                dataset, cam_threshold_factor, overlap_threshold,
                snic_seeds, snic_compactness, finetuned = finetune, std_from_mean=0
                )
            pseudomask_generator.model = model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

            print(f"remaining images = {len(missing_jpgs)}")
            pass

remaining images = 0
remaining images = 0


In [140]:
missing_jpgs

Unnamed: 0,palsa_percentage
763_73_5000_2018_negcrop_8,0.0
761_72_0050_2018_negcrop_30,0.0
760_74_7525_2018_negcrop_40,0.0
764_74_7500_2018_crop_123,2.5
749_61_0000_2019_crop_88,0.0
761_73_0025_2018_crop_37,2.5
760_71_0075_2018_negcrop_4,0.0
755_70_7550_2010_negcrop_0,0.0
764_74_2575_2018_crop_106,46.0
759_76_5025_2013_crop_133,2.25


In [5]:
files_generated = os.listdir("/Volumes/USB/Pseudomasks/Thresholded_CAM_1.6/TRAIN/images")
missing_jpgs = missing_jpgs[~missing_jpgs.index.isin([f[:-4] for f in files_generated if f.endswith('.jpg')])]

dataset = ImageDataset(depth_dir, rgb_dir, missing_jpgs, im_size, normalize)
loader = DataLoader(dataset, batch_size=1, shuffle=True)
pseudomask_generator = Pseudomasks(
    dataset, cam_threshold_factor, overlap_threshold,
    snic_seeds, snic_compactness, finetuned = finetune, std_from_mean=0
    )
pseudomask_generator.model = model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

len(missing_jpgs)

NameError: name 'ImageDataset' is not defined

In [62]:
print(missing_jpgs)

                           palsa_percentage
764_74_7500_2018_crop_123              2.50
759_76_5025_2013_crop_133              2.25
760_77_0000_2015_crop_23               3.25


In [87]:
counter = 0
# files_generated = os.listdir("/Volumes/USB/Pseudomasks/orig_SNIC_pseudomasks/newly_generated/images")
for im,binary_label,_,img_name in loader:

    try:
        # if not f"{img_name}.jpg" in files_generated:

        # output paths:
        output_path_mask = os.path.join("/Volumes/USB/Pseudomasks/orig_SNIC_pseudomasks/newly_generated/masks", f"{img_name[0]}.png") 
        output_path_jpg = os.path.join("/Volumes/USB/Pseudomasks/orig_SNIC_pseudomasks/newly_generated/images", f"{img_name[0]}.jpg") 

        # Save RGB image
        vutils.save_image((im[:,:3,:,:]/255).cpu(), output_path_jpg, normalize = False)

        # Generate pseudomask 
        if binary_label == 0: 
            pseudomask = np.full((400, 400), False, dtype=bool)
        else:
            pseudomask = pseudomask_generator.generate_mask(im, None, save_plot=False)

        # Save pseudomask
        binary_pseudomask = np.squeeze(pseudomask.astype(np.uint8)) # convert bool to binary mask
        pseudomask_img = Image.fromarray(binary_pseudomask) # convert to pil img
        pseudomask_img.save(output_path_mask) # save as binary png

        counter+= 1

        if counter%10 ==0:
            print(f"{counter} images produced")

    except:
        pass

RasterioIOError: '/Volumes/USB/Orig_200m_TIFS/rgb/760_77_0000_2015_crop_23.tif' not recognized as a supported file format.

### Generate thresholded CAMs that were not generated

Reset kernel before attempting this!|

In [2]:
import os 
tif_folder = "/Volumes/USB/Orig_200m_TIFS/rgb"

orig_files = [file for file in os.listdir(tif_folder) if not file.endswith('_aug.tif') and not file.startswith('._')]
thresholded_cams_train = [file for file in os.listdir(jpg_train_folder) if not file.startswith('._')]
thresholded_cams_test = [file for file in os.listdir(jpg_test_folder) if not file.startswith('._')]
thresholded_cams = thresholded_cams_train + thresholded_cams_test

# List to store TIF files without corresponding JPG files
missing_jpgs = []

# Check for TIF files in the folder
for tif_file in orig_files:
    if tif_file.endswith('.tif'):
        jpg_file = tif_file.replace('.tif', '.jpg')
        if not jpg_file in thresholded_cams:
            missing_jpgs.append(tif_file)

# Print the count and list of missing JPG files
print(f"Number of TIF files without corresponding JPGs: {len(missing_jpgs)}")
print("Missing JPG files:", missing_jpgs)

Number of TIF files without corresponding JPGs: 5559
Missing JPG files: ['761_73_0025_2018_negcrop_10.tif', '763_72_5025_2018_negcrop_2.tif', '754_67_7525_2015_crop_105.tif', '760_72_0025_2018_negcrop_28.tif', '760_73_5075_2018_crop_104.tif', '758_70_2575_2013_negcrop_50.tif', '758_70_2575_2013_negcrop_44.tif', '759_68_0075_2014_negcrop_1.tif', '761_73_2550_2018_negcrop_39.tif', '763_75_7575_2015_negcrop_0.tif', '761_71_7550_2018_crop_53.tif', '750_65_7500_2015_crop_104.tif', '764_75_0000_2015_negcrop_16.tif', '759_73_0075_2013_negcrop_20.tif', '760_72_2500_2018_negcrop_1.tif', '754_67_7575_2015_crop_35.tif', '758_66_5025_2010_negcrop_5.tif', '761_72_0025_2018_negcrop_3.tif', '758_72_0075_2013_negcrop_18.tif', '760_72_0025_2018_negcrop_14.tif', '761_75_2575_2015_negcrop_1.tif', '764_73_0000_2018_crop_110.tif', '755_68_5025_2016_negcrop_5.tif', '761_73_2550_2018_crop_52.tif', '760_74_7525_2018_crop_10.tif', '765_73_0050_2016_negcrop_5.tif', '761_73_7500_2018_crop_135.tif', '761_72_2575_

In [3]:
import pandas as pd 

labels_file = "/Volumes/USB/Orig_200m_TIFS/new_palsa_labels.csv"
all_files = pd.read_csv(labels_file, index_col=0)

# Keep only the rows whose index name is in missing_jpgs (with the .tif extension)
missing_jpgs = all_files.loc[all_files.index.isin([file.replace('.tif', '') for file in missing_jpgs])]
missing_jpgs.head()


Unnamed: 0,palsa_percentage
760_73_5025_2018_crop_14,7.25
760_73_5025_2018_crop_15,0.75
760_73_5025_2018_crop_16,3.25
760_73_5025_2018_crop_20,4.0
760_73_5025_2018_crop_26,5.5


Check that the df is the same length as number of missing files. 

In [4]:
len(missing_jpgs)

5559

In [5]:
############
# Imports #
############

import json
import os

import wandb
import pandas as pd
import rasterio
import torch
import torchvision.utils as vutils
import numpy as np

from torch.utils.data import DataLoader
from PIL import Image

from model.cnn_classifier import model_4D
from model.pseudomask import Pseudomasks
from utils.data_modules import ImageDataset, TestSet, filter_dataset

##################
## load configs ##
##################

rgb_dir = "/Volumes/USB/Orig_200m_TIFS/rgb"
hs_dir = "/Volumes/USB/Orig_200m_TIFS/hs"

# assign model
artifact_path = "nadjaflechner/VGG_CAMs/classification_model:v61"
batch_size = 20
im_size = 200
min_palsa_positive_samples = 0
augment = False
normalize = False
depth_layer = 'hs'

# assign pseudomasks configs
cam_threshold_factor = 1.6
overlap_threshold = 0.46
snic_seeds = 100
snic_compactness = 4
finetune = False
std_from_mean = 0

# choose depth data based on configs
depth_dir = hs_dir
# Create the dataset and loaders for the entire dataset.
dataset = ImageDataset(depth_dir, rgb_dir, missing_jpgs, im_size, normalize)
loader = DataLoader(dataset, batch_size=1, shuffle=True)

#############
# Init model: 

api = wandb.Api()
artifact = api.artifact(artifact_path, type='model')
artifact_dir = artifact.download()
state_dict = torch.load(f"{artifact_dir}/model.pth", map_location=torch.device('cpu'))
model = model_4D()
model.load_state_dict(state_dict)
model.eval()

#############################
# generate all pseudolabels #
#############################

pseudomask_generator = Pseudomasks(
    dataset, cam_threshold_factor, overlap_threshold,
    snic_seeds, snic_compactness, finetuned = finetune, std_from_mean=0
    )
pseudomask_generator.model = model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))


[34m[1mwandb[0m:   1 of 1 files downloaded.  


In [16]:
files_generated = os.listdir("/Volumes/USB/Pseudomasks/Thresholded_CAM/newly_generated/images")
missing_jpgs = missing_jpgs[~missing_jpgs.index.isin([f[:-4] for f in files_generated if f.endswith('.jpg')])]

dataset = ImageDataset(depth_dir, rgb_dir, missing_jpgs, im_size, normalize)
loader = DataLoader(dataset, batch_size=1, shuffle=True)
pseudomask_generator = Pseudomasks(
    dataset, cam_threshold_factor, overlap_threshold,
    snic_seeds, snic_compactness, finetuned = finetune, std_from_mean=0
    )
pseudomask_generator.model = model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

print(f"remaining images = {len(missing_jpgs)}")

for i in range(30):
    counter = 0
    files_generated = os.listdir("/Volumes/USB/Pseudomasks/Thresholded_CAM/newly_generated/images")
    for im,binary_label,_,img_name in loader:

        if not f"{img_name}.jpg" in files_generated:

            try:

                # output paths:
                output_path_mask = os.path.join("/Volumes/USB/Pseudomasks/Thresholded_CAM/newly_generated/masks", f"{img_name[0]}.png") 
                output_path_jpg = os.path.join("/Volumes/USB/Pseudomasks/Thresholded_CAM/newly_generated/images", f"{img_name[0]}.jpg") 

                # Save RGB image
                vutils.save_image((im[:,:3,:,:]/255).cpu(), output_path_jpg, normalize = False)

                # Generate pseudomask 
                if binary_label == 0: 
                    pseudomask = np.full((400, 400), False, dtype=bool)
                else:
                    pseudomask = pseudomask_generator.generate_thresholded_CAM(im, None, save_plot=False)

                # Save pseudomask
                binary_pseudomask = np.squeeze(pseudomask.astype(np.uint8)) # convert bool to binary mask
                pseudomask_img = Image.fromarray(binary_pseudomask) # convert to pil img
                pseudomask_img.save(output_path_mask) # save as binary png

                counter+= 1

                if counter%10 ==0:
                    print(f"{counter} images produced")
            except: 
                pass

remaining images = 13


RasterioIOError: '/Volumes/USB/Orig_200m_TIFS/rgb/749_61_0000_2019_crop_88.tif' not recognized as a supported file format.