In [7]:
#############
## imports ##
#############

# libraries 
import geopandas as gpd
import numpy as np 
import pandas as pd
import rasterio
import matplotlib.pyplot as plt
from rasterio.plot import show
from rasterio.mask import mask
import os
import json
from shapely.geometry import box, Polygon
import logging

##########
## code ##
##########

def tif_from_ruta(ruta_geometry):
    minx_ruta = ruta_geometry.bounds[0]
    miny_ruta = ruta_geometry.bounds[1]

    miny = str(miny_ruta)[0:3]
    minx = str(minx_ruta)[0:2]

    if 0 <= int(str(miny_ruta)[3:5]) < 25:
        km_siffran_y = '00'
    elif 25 <= int(str(miny_ruta)[3:5]) < 50:
        km_siffran_y = '25'
    elif 50 <= int(str(miny_ruta)[3:5]) < 75:
        km_siffran_y = '50'
    elif 75 <= int(str(miny_ruta)[3:5]) < 100:
        km_siffran_y = '75'

    if 0 <= int(str(minx_ruta)[3:5]) < 25:
        km_siffran_x = '00'
    elif 25 <= int(str(minx_ruta)[3:5]) < 50:
        km_siffran_x = '25'
    elif 50 <= int(str(minx_ruta)[3:5]) < 75:
        km_siffran_x = '50'
    elif 75 <= int(str(minx_ruta)[3:5]) < 100:
        km_siffran_x = '75'

    year = 2018 # WHICH YEAR SHOULD IT BE??

    filename = f"{miny}_{minx}_{km_siffran_y}{km_siffran_x}_{year}.tif"
    return filename


def filter_imgs(all_rutor_path, dir_files):
    all_rutor = gpd.read_file(all_rutor_path)
    all_rutor['in_tif'] = all_rutor['geometry'].map(tif_from_ruta)
    uniques = all_rutor.in_tif.unique()

    only_tifs = [filename for filename in dir_files if filename[-4:] == ".tif"]

    # compare such only the part without the year. 
    only_tifs_noyear = [filename[:-8] for filename in only_tifs]
    uniques_noyear = [filename[:-8] for filename in list(uniques)]

    # check that all uniques are in only tifs
    if not (set(list(uniques_noyear)).issubset(set(only_tifs_noyear))):
        # logger.WARN(f"at least one tif name generated from all_rutor was not found in the directory: {original_tif_dir}")
        print(f"at least one tif name generated from all_rutor was not found in the directory")
        items_not_in_dir = [item for item in list(uniques) if item not in only_tifs]
        print(f"items not in directory are: \n {items_not_in_dir}")

    intersection = list(set(uniques_noyear) & set(only_tifs_noyear))

    tifs_to_use = [filename for filename in only_tifs if filename[:-8] in intersection]

    return tifs_to_use

In [8]:
all_filenames = os.listdir('/home/nadjaflechner/Hojddata/all/')

In [9]:
len(all_filenames)

28676

In [10]:
shape_file = "/home/nadjaflechner/palsa_seg/Palsa_rutor/RUTNAT_100x100M_PALS_OR_PALSVATT_ALL.shp"
filtered_imgs = filter_imgs(shape_file, all_filenames)

at least one tif name generated from all_rutor was not found in the directory
items not in directory are: 
 ['761_75_0050_2018.tif', '761_75_0025_2018.tif', '761_75_2525_2018.tif', '761_75_2550_2018.tif', '761_75_2575_2018.tif', '761_75_2500_2018.tif', '761_70_5000_2018.tif', '761_75_5025_2018.tif', '761_75_5050_2018.tif', '761_75_5075_2018.tif', '761_75_5000_2018.tif', '761_75_7500_2018.tif', '761_75_7525_2018.tif', '761_75_7550_2018.tif', '761_75_7575_2018.tif', '762_75_5050_2018.tif', '763_75_0000_2018.tif', '763_75_0075_2018.tif', '763_71_2500_2018.tif', '763_75_2550_2018.tif', '763_75_2500_2018.tif', '763_75_2525_2018.tif', '763_75_2575_2018.tif', '763_75_5050_2018.tif', '763_71_5000_2018.tif', '763_75_5000_2018.tif', '763_75_5025_2018.tif', '763_75_5075_2018.tif', '763_71_7500_2018.tif', '763_75_7550_2018.tif', '763_75_7575_2018.tif', '763_75_7500_2018.tif', '763_75_7525_2018.tif', '764_71_0000_2018.tif', '764_75_0025_2018.tif', '764_75_0050_2018.tif', '764_75_0000_2018.tif', '76

In [11]:
len(filtered_imgs)

1176

In [2]:
import os 
import numpy as np 

hs_folder = '/home/nadjaflechner/Palsa_data/hillshade'
hs_names = os.listdir(hs_folder)
hs_only_tifs_noyear = [filename[:-8] for filename in hs_names]
hs_uniques = np.unique(np.array(hs_only_tifs_noyear))    

rgb_folder = '/home/nadjaflechner/Palsa_data/filtered_tifs'
rgb_names = os.listdir(rgb_folder)
rgb_only_tifs_noyear = [filename[:-8] for filename in rgb_names]
rgb_uniques = np.unique(np.array(rgb_only_tifs_noyear))

In [4]:
len(hs_uniques)

1176

In [5]:
len(rgb_uniques)

381

## Understanding mismatch in # samples RGB and hillshade

In [8]:
import zipfile

rgb_zip = zipfile.ZipFile('/home/circ/Data/SpatialEcology_Lab/Siewert/ortoRgb.zip')
print(len(rgb_zip.namelist()))

4091


In [9]:
import os 
import numpy as np 

hs_folder = '/home/nadjaflechner/Hojddata/all/'
hs_names = os.listdir(hs_folder)
hs_only_tifs_noyear = [filename[:-8] for filename in hs_names]
hs_uniques = np.unique(np.array(hs_only_tifs_noyear))    

rgb_names = rgb_zip.namelist()
rgb_only_tifs_noyear = [filename[:-8] for filename in rgb_names]
rgb_uniques = np.unique(np.array(rgb_only_tifs_noyear))

In [10]:
len(hs_uniques)

14563

In [11]:
len(rgb_uniques)

1950

the hs images have spatial extents of 2,5x2,5 kms. So for each rgb image there should be 4 hs images. So converted, there are 3640 hs images, while there are only 1950 unique rgb images. how can this be? Which RGB images are we missing?

In [12]:
14563/4

3640.75

In [11]:
from functions import get_RGB_match, Crop_tif_varsize, filter_imgs
import json
import os

config_path = os.path.join(os.getcwd(), 'configs.json')
with open(config_path, 'r') as config_file:
    configs = json.load(config_file)

# load paths from configs 
config_paths = configs.get('paths', {}) 
palsa_shapefile_path = config_paths.get('palsa_shapefile_path') # load shapefile path
save_crops_dir = config_paths.get('save_crops_dir') # load directory with all tifs
original_tif_dir = config_paths.get('original_tif_dir') # load directory with all tifs
hillshade_tif_dir = config_paths.get('hillshade_tif_dir') # load directory with all tifs

hillshade_filenames = filter_imgs(palsa_shapefile_path, hillshade_tif_dir) # TODO: could already filter 'only newest' here.. 

pair_dict = {}
for idx, hs_img_name in enumerate(hillshade_filenames):
    # grab corresponding RGB image (matching the hillshade)

    RGB_tif_name = get_RGB_match(hs_img_name, original_tif_dir) 
    pair_dict[hs_img_name] = RGB_tif_name

at least one tif name generated from all_rutor was not found in the directory
items not in directory are: 
 ['761_70_5000_', '763_71_2500_', '763_71_5000_', '763_71_7500_', '764_71_0000_', '764_75_2550_', '764_75_2575_', '764_71_2500_', '762_78_0075_', '762_77_5050_', '762_77_5075_', '762_77_7575_', '762_77_7550_', '762_77_7525_', '764_75_5050_', '764_75_5075_', '764_75_5025_', '765_71_0000_', '765_74_0075_', '765_74_2550_', '765_74_2575_', '759_70_2500_', '759_70_5000_', '759_70_7500_', '759_70_7525_', '760_81_7575_']


In [16]:
list(pair_dict.values())

['758_71_05_2016.tif',
 '765_73_55_0000',
 '760_73_50_2016.tif',
 '762_71_00_2016.tif',
 '752_65_55_2018.tif',
 '761_79_00_0000',
 '757_74_50_0000',
 '765_73_50_2016.tif',
 '752_68_00_2018.tif',
 '764_74_55_2016.tif',
 '752_69_00_2018.tif',
 '763_73_50_2016.tif',
 '762_74_55_2016.tif',
 '758_70_05_2016.tif',
 '750_66_55_2018.tif',
 '762_73_05_2016.tif',
 '762_72_00_2016.tif',
 '759_76_00_0000',
 '761_72_05_2016.tif',
 '747_65_55_0000',
 '754_67_55_2018.tif',
 '755_67_55_0000',
 '762_72_00_2016.tif',
 '760_73_05_2016.tif',
 '759_72_50_2016.tif',
 '763_75_05_2016.tif',
 '751_67_55_2018.tif',
 '765_71_55_2016.tif',
 '754_67_55_2018.tif',
 '760_72_00_2016.tif',
 '753_68_55_0000',
 '758_66_05_2018.tif',
 '761_70_05_2016.tif',
 '762_74_05_2016.tif',
 '747_64_05_2018.tif',
 '733_52_00_2015.tif',
 '734_52_05_2015.tif',
 '748_66_55_2018.tif',
 '761_74_50_2016.tif',
 '750_65_50_2018.tif',
 '752_65_50_0000',
 '747_58_00_2015.tif',
 '759_80_55_0000',
 '749_68_55_2018.tif',
 '749_68_00_2018.tif',
 

In [17]:
rgb_tifs_extract = [name.split('.')[0] for name in list(pair_dict.values())]

In [19]:
rgb_tifs_extract[:10]

['758_71_05_2016',
 '765_73_55_0000',
 '760_73_50_2016',
 '762_71_00_2016',
 '752_65_55_2018',
 '761_79_00_0000',
 '757_74_50_0000',
 '765_73_50_2016',
 '752_68_00_2018',
 '764_74_55_2016']

In [31]:
import zipfile
rgb_zip = zipfile.ZipFile('/home/circ/Data/SpatialEcology_Lab/Siewert/ortoRgb.zip')
rgb_names = rgb_zip.namelist()

only_tifs = [filename for filename in rgb_names if filename[-4:] == ".tif"]
only_tifs = [filename.split('/')[1] for filename in rgb_names if filename[-4:] == ".tif"]

# compare such only the part without the year. 
only_tifs_noyear = [filename[:-8] for filename in only_tifs]
rgb_extr_noyear = [filename[:-4] for filename in list(rgb_tifs_extract)]

intersection = list(set(rgb_extr_noyear) & set(only_tifs_noyear))

tifs_to_use = [filename for filename in only_tifs if filename[:-8] in intersection]

In [41]:
tifs_to_use

['758_70_00_2016.tif',
 '761_71_55_2009.tif',
 '764_74_00_2010.tif',
 '750_69_55_2018.tif',
 '746_65_05_2018.tif',
 '750_66_55_2018.tif',
 '762_72_05_2010.tif',
 '746_66_50_2010.tif',
 '750_65_00_2010.tif',
 '756_70_50_2016.tif',
 '760_73_00_2010.tif',
 '756_65_55_2018.tif',
 '754_66_00_2018.tif',
 '752_69_55_2010.tif',
 '746_64_00_2010.tif',
 '750_67_50_2010.tif',
 '752_66_55_2010.tif',
 '754_69_00_2018.tif',
 '748_62_00_2018.tif',
 '760_74_55_2010.tif',
 '756_67_05_2018.tif',
 '761_76_00_2009.tif',
 '758_68_55_2018.tif',
 '752_65_00_2018.tif',
 '764_73_55_2010.tif',
 '758_72_50_2016.tif',
 '761_79_00_2009.tif',
 '766_72_50_2010.tif',
 '752_70_05_2016.tif',
 '747_62_55_2008.tif',
 '766_71_05_2016.tif',
 '758_74_00_2010.tif',
 '753_65_05_2008.tif',
 '764_75_05_2016.tif',
 '760_77_00_2016.tif',
 '760_72_05_2016.tif',
 '759_76_55_2009.tif',
 '757_67_00_2008.tif',
 '762_76_05_2016.tif',
 '762_73_00_2016.tif',
 '759_79_55_2009.tif',
 '746_56_05_2015.tif',
 '759_71_00_2009.tif',
 '759_74_05

In [42]:
import pandas as pd
imgs_to_use = pd.DataFrame(tifs_to_use)
imgs_to_use.to_csv("NEW_filenames_to_use.csv", header = False, index = False)

In [22]:
names = pd.read_csv('/home/nadjaflechner/palsa_seg/data/data_prep/NEW_filenames_to_use.csv', header=None, names=['files'])
filenames_to_extract = names.files.tolist()

In [23]:
filenames_to_extract

['758_71_05_2016',
 '765_73_55_0000',
 '760_73_50_2016',
 '762_71_00_2016',
 '752_65_55_2018',
 '761_79_00_0000',
 '757_74_50_0000',
 '765_73_50_2016',
 '752_68_00_2018',
 '764_74_55_2016',
 '752_69_00_2018',
 '763_73_50_2016',
 '762_74_55_2016',
 '758_70_05_2016',
 '750_66_55_2018',
 '762_73_05_2016',
 '762_72_00_2016',
 '759_76_00_0000',
 '761_72_05_2016',
 '747_65_55_0000',
 '754_67_55_2018',
 '755_67_55_0000',
 '762_72_00_2016',
 '760_73_05_2016',
 '759_72_50_2016',
 '763_75_05_2016',
 '751_67_55_2018',
 '765_71_55_2016',
 '754_67_55_2018',
 '760_72_00_2016',
 '753_68_55_0000',
 '758_66_05_2018',
 '761_70_05_2016',
 '762_74_05_2016',
 '747_64_05_2018',
 '733_52_00_2015',
 '734_52_05_2015',
 '748_66_55_2018',
 '761_74_50_2016',
 '750_65_50_2018',
 '752_65_50_0000',
 '747_58_00_2015',
 '759_80_55_0000',
 '749_68_55_2018',
 '749_68_00_2018',
 '749_67_50_2018',
 '758_74_50_2016',
 '735_54_55_2015',
 '757_72_55_2016',
 '751_65_50_2018',
 '758_72_50_0000',
 '758_73_00_2016',
 '753_67_50_

In [15]:
rgb_tifs_extract

['758_71_05_2016', '765_73_55_0000']

In [9]:
len(hillshade_filenames)

1176

## REGENERATING RGB DATASET FROM ZIP 

In [2]:
#############
## imports ##
#############

# libraries 
import geopandas as gpd
import numpy as np 
import pandas as pd
import rasterio
import matplotlib.pyplot as plt
from rasterio.plot import show
from rasterio.mask import mask
import os
import json
from shapely.geometry import box, Polygon
import logging

##########
## code ##
##########

def rgb_tif_from_ruta(ruta_geometry):
    minx_ruta = ruta_geometry.bounds[0]
    miny_ruta = ruta_geometry.bounds[1]

    miny = str(miny_ruta)[0:3]
    minx = str(minx_ruta)[0:2]
    km_siffran_y = 0 if int(str(miny_ruta)[3]) < 5 else 5
    km_siffran_x = 0 if int(str(minx_ruta)[2]) < 5 else 5
    year = 2018

    filename = f"{miny}_{minx}_{km_siffran_y}{km_siffran_x}_{year}.tif"
    return filename


def rgb_filter_imgs(all_rutor_path, dir_files):
    all_rutor = gpd.read_file(all_rutor_path)
    all_rutor['in_tif'] = all_rutor['geometry'].map(rgb_tif_from_ruta)
    uniques = all_rutor.in_tif.unique()

    # dir_files = os.listdir(original_tif_dir)
    only_tifs = [filename for filename in dir_files if filename[-4:] == ".tif"]

    only_tifs_noyear = [filename[:-8] for filename in only_tifs]
    uniques_noyear = [filename[:-8] for filename in list(uniques)]

    # check that all uniques are in only tifs
    if not (set(list(uniques_noyear)).issubset(set(only_tifs_noyear))):
        # logger.WARN(f"at least one tif name generated from all_rutor was not found in the directory: {original_tif_dir}")
        print(f"at least one tif name generated from all_rutor was not found in the directory")
        items_not_in_dir = [item for item in uniques_noyear if item not in list(only_tifs_noyear)]
        print(f"items not in directory are: \n {items_not_in_dir}")

    intersection = list(set(uniques_noyear) & set(only_tifs_noyear))

    tifs_to_use = [filename for filename in only_tifs if filename[:-8] in intersection]

    return tifs_to_use

In [5]:
import zipfile
shape_file = "/home/nadjaflechner/Palsa_data/Palsa_rutor/RUTNAT_100x100M_PALS_OR_PALSVATT_ALL.shp"
rgb_zip = zipfile.ZipFile('/home/circ/Data/SpatialEcology_Lab/Siewert/ortoRgb.zip')
rgb_names = rgb_zip.namelist()
filtered_imgs = rgb_filter_imgs(shape_file, rgb_names)

at least one tif name generated from all_rutor was not found in the directory
items not in directory are: 
 ['761_71_00_', '761_71_05_', '761_72_00_', '761_72_05_', '761_73_00_', '761_73_05_', '761_74_00_', '761_70_05_', '761_74_05_', '761_75_00_', '761_75_05_', '761_70_55_', '761_71_50_', '761_72_50_', '761_72_55_', '761_73_50_', '761_74_55_', '761_71_55_', '761_74_50_', '761_75_55_', '761_73_55_', '761_75_50_', '762_72_05_', '762_74_05_', '762_73_00_', '762_74_00_', '762_73_05_', '762_72_00_', '762_71_05_', '762_71_00_', '762_71_50_', '762_72_55_', '762_73_55_', '762_74_50_', '762_72_50_', '762_74_55_', '762_73_50_', '762_75_55_', '762_71_55_', '763_71_05_', '763_74_05_', '763_74_00_', '763_72_00_', '763_72_05_', '763_73_05_', '763_75_05_', '763_73_00_', '763_71_55_', '763_72_50_', '763_73_50_', '763_75_55_', '763_72_55_', '763_74_50_', '763_71_50_', '763_73_55_', '763_75_50_', '763_74_55_', '764_71_05_', '764_72_00_', '764_73_00_', '764_75_05_', '764_75_00_', '764_74_00_', '764_71_0