### Preprocess Raw-data


In [1]:

import os
# Root directory of the 'deepSat' project. It is expected to contain:
#   raw-data/AOI/tile_selection.gpkg
#   raw-data/AOI/patch_size_features.gpkg
#   raw-data/LULC-Sweden/*/Data/*.gpkg
#   raw-data/sentinel-2/Tidsperiod-x/

root_path = '/media/oskar/ESSD/deepSat/'
timeperiod = 1  # time period (tp): 1 or 2

patch_size = 256  # choose between 64, 128 and 256
source = 'TCI'  # prefix of the save directory, e.g. TCI => TCI_256 if patch_size=256

# Names derived from the settings above.
data_source = '{}_{}'.format(source, patch_size)
tp = 'Tidsperiod-{}'.format(timeperiod)

# Save directory: <root_path>/processed-data/<data_source>/<tp>
rel_sdir = 'processed-data/{}/{}'.format(data_source, tp)
sdir = os.path.join(root_path, rel_sdir)

import glob
import h5netcdf
import xarray as xr
import numpy as np
import pandas as pd
import rioxarray as rio
from geopandas import read_file
from tqdm.notebook import tqdm
from preprocess.uaUtils import open_fua
from preprocess.s2Utils import search_product,open_tile,clip_tile,open_clip_tile, get_tile_info, get_prod_info
from preprocess.utils import rasterize,get_obj_within,str_to_oao, save_cube,class_dict,org_files

xr.set_options(keep_attrs=True)


# paths to Urban Atlas (UA) data
ua_paths = glob.glob(os.path.join(root_path,'raw-data/LULC-Sweden/*/Data/*.gpkg'))

# path to s2 products
s2_path = os.path.join(root_path,'raw-data/sentinel-2/{}/'.format(tp))
print('Looking for sentinel products in:\n',s2_path,'\n')

# The CRS of the first UA file is used as the common CRS for everything below,
# so fail with a clear message when no UA file exists (indexing an empty list
# would otherwise raise a bare IndexError).
if not ua_paths:
    raise FileNotFoundError(
        'No LULC data found in: {}'.format(os.path.join(root_path,'raw-data/LULC-Sweden/'))
    )
ua_crs = read_file(ua_paths[0]).crs

# s2 tiles to use, reprojected to the UA CRS
s2tiles = read_file(os.path.join(root_path,'raw-data/AOI/tile_selection.gpkg')).to_crs(ua_crs)

# scb patches of size*size (layer selected by patch_size), reprojected to the UA CRS
scb_grid = read_file(os.path.join(root_path,'raw-data/AOI/patch_size_features.gpkg'),layer = "patch_{}".format(patch_size)).to_crs(ua_crs)


# check that the desired s2tiles exist in s2_path
print('---- Found sentinel products: ----')
for tile in s2tiles.itertuples():
    # presumably two parallel sequences (product paths, sensing dates) -- verify against search_product
    prods,dates = search_product(tile.Name,s2_path)

    if prods:
        # separate loop name so the sequence being iterated is not rebound
        for i, prod_path in enumerate(prods):
            fn=os.path.basename(prod_path)
            print('Found:',get_prod_info(fn)['Tile'],pd.to_datetime(dates[i]),fn)
    else:
        print('No found sentinel products for tile:',tile.Name)

# list found FUAs (ua_paths is guaranteed to be non-empty at this point)
print('\n---- Found LULC data: ----')
for ua_path in ua_paths:
    print('Found:',os.path.basename(ua_path))


print('\n Save processed data to: \n',  sdir)

Looking for sentinel products in:
 /media/oskar/ESSD/deepSat/raw-data/sentinel-2/Tidsperiod-1/ 

---- Found sentinel products: ----
Found: T33UUB 2018-07-26 10:20:19 S2B_MSIL1C_20180726T102019_N0206_R065_T33UUB_20180726T142914.SAFE
Found: T33VUC 2018-07-04 10:30:21 S2A_MSIL1C_20180704T103021_N0206_R108_T33VUC_20180704T174024.SAFE
Found: T33VUD 2018-06-29 10:30:19 S2B_MSIL1C_20180629T103019_N0206_R108_T33VUD_20180629T123841.SAFE
Found: T33VUE 2018-07-04 10:30:21 S2A_MSIL1C_20180704T103021_N0206_R108_T33VUE_20180704T174024.SAFE
Found: T33VVD 2018-07-26 10:20:19 S2B_MSIL1C_20180726T102019_N0206_R065_T33VVD_20180726T142914.SAFE
Found: T33VVE 2018-06-01 10:20:21 S2A_MSIL1C_20180601T102021_N0206_R065_T33VVE_20180601T123308.SAFE
Found: T33VVF 2018-07-04 10:30:21 S2A_MSIL1C_20180704T103021_N0206_R108_T33VVF_20180704T174024.SAFE
Found: T33VWE 2018-06-01 10:20:21 S2A_MSIL1C_20180601T102021_N0206_R065_T33VWE_20180601T123308.SAFE
Found: T33VWF 2018-06-01 10:20:21 S2A_MSIL1C_20180601T102021_N0206_R

In [2]:
# Build and save one cube per grid patch: loop over every UA file (FUA), every
# s2 tile intersecting the FUA boundary and every grid patch inside both.
bad_patch=[]  # ids of patches whose labels could not be rasterized
for path in tqdm(ua_paths, desc='Total progress'):

    fua_labls,fua_bound = open_fua(path)             # open fua layers
    tiles_inters = s2tiles.sjoin(fua_bound, predicate='intersects')  # get intersecting tiles for fua
    fua_name = str_to_oao(fua_bound.fua_name[0])

    for tile in tqdm(tiles_inters.itertuples(),desc='Intersecting tiles',total=len(tiles_inters)): # for every intersecting tile

        # select patches (grid) within curr_tile and within the fua
        curr_tile = s2tiles[s2tiles.Name.isin([tile.Name])]
        patches_within = get_obj_within(get_obj_within(scb_grid,curr_tile),fua_bound)
        # save directory: <sdir>/<fua_name>/<tile name>/
        savedir = os.path.join(sdir,'{}/{}/'.format(fua_name,tile.Name))
        # open current tile
        s2_tile = open_tile(tile.Name,s2_path)

        for patch in tqdm(patches_within.itertuples(),total=len(patches_within), desc='FUA: {}, Tile: {}'.format(fua_name,tile.Name)):

            # get current patch
            curr_patch = scb_grid[(scb_grid.id.isin([patch.id]))]
            # clip tile and labels to patch extent
            s2_patch = clip_tile(s2_tile,curr_patch)
            patch_labls = fua_labls.clip(curr_patch)
            # rasterize patch labels
            try:
                cube = rasterize(patch_labls)
            except Exception:
                # Best effort: remember the failing patch and move on.
                # `Exception` (instead of a bare except) keeps KeyboardInterrupt
                # and SystemExit working during long runs.
                bad_patch.append(patch.id)
            else:
                # merge s2_patch (matched to the label grid) into the cube
                cube = cube.merge(s2_patch.rio.reproject_match(cube))
                # drop the last row/column when the cube is wider than patch_size
                # (only the x size is checked, as in the original logic)
                if cube.sizes['x'] > patch_size:
                    cube=cube.isel(x=slice(None, -1), y=slice(None, -1))

                # set attributes on the cube before saving
                cube.attrs["patch_id"] = patch.id
                cube.attrs['FUA'] = fua_name
                for key,value in get_tile_info(tile.Name,s2_path).items():
                    cube.attrs[key] = value

                # NOTE(review): the CF attribute is conventionally spelled
                # `_FillValue`; confirm which spelling save_cube / readers expect.
                cube.train_id.attrs['_fillValue']=255
                cube.class_code.attrs['_fillValue']=255
                save_cube(cube,savedir)

            # NOTE(review): this and the two `break`s below stop every loop after
            # its first iteration, so only a single patch is processed. It looks
            # like a leftover from a test run -- remove all three for a full run.
            break
        break
    break


Total progress:   0%|          | 0/12 [00:00<?, ?it/s]

Intersecting tiles:   0%|          | 0/2 [00:00<?, ?it/s]

FUA: GOTEBORG, Tile: 33VUD:   0%|          | 0/120 [00:00<?, ?it/s]

### split train/test/val

1. Split the dataset (if satisfied with the split, execute step 2)
2. Reorganize the files according to the split

In [3]:
#1 split dataset
# Randomly assign every processed .nc file to train, test or val.
# The fractions are kept in their own names so that `train`/`test` always
# refer to the resulting arrays of file paths.
SPLIT_SEED = 42    # fixed seed => the same split on every run
train_frac = 0.8   # share of all files assigned to train
test_frac = 0.5    # share of the remaining (non-train) files assigned to test

# sorted: glob order depends on the filesystem and would otherwise change the split
df = np.asarray(sorted(glob.glob(os.path.join(sdir,'*/*/*.nc'))))
if len(df) == 0:
    # the ratios below would otherwise fail with a ZeroDivisionError
    raise FileNotFoundError('No processed .nc files found in: {}'.format(sdir))

rng = np.random.default_rng(SPLIT_SEED)

# first draw: train vs. the rest
msk = rng.random(len(df)) < train_frac
train = df[msk]
testval = df[~msk]

# second draw: split the rest into test and val
msk = rng.random(len(testval)) < test_frac
test = testval[msk]
val = testval[~msk]
print('train:', len(train) ,round(len(train)/len(df),2),'test:',len(test),round(len(test)/len(df),2),'val:',len(val),round(len(val)/len(df),2))

train: 1 1.0 test: 0 0.0 val: 0 0.0


In [6]:
# Step 2: hand every non-empty subset to org_files together with its mode name.
for subset, mode_name in ((train, 'train'), (test, 'test'), (val, 'val')):
    if subset.size > 0:
        org_files(subset, mode=mode_name)


/media/oskar/ESSD/deepSat/processed-data/TCI_256_split/Tidsperiod-1/train


#### Compact Naming Convention

The compact naming convention is arranged as follows:

MMM_MSIXXX_YYYYMMDDHHMMSS_Nxxyy_ROOO_Txxxxx_<Product Discriminator>.SAFE

The products contain two dates.

The first date (YYYYMMDDHHMMSS) is the datatake sensing time.
The second date is the "Product Discriminator" field, which is 15 characters in length, and is used to distinguish between different end user products from the same datatake. Depending on the instance, the time in this field can be earlier or slightly later than the datatake sensing time.

The other components of the filename are:

* MMM: the mission ID (S2A/S2B)
* MSIXXX: MSIL1C denotes the Level-1C product level/ MSIL2A denotes the Level-2A product level
* YYYYMMDDHHMMSS: the datatake sensing start time
* Nxxyy: the PDGS Processing Baseline number (e.g. N0204)
* ROOO: Relative Orbit number (R001 - R143)
* Txxxxx: Tile Number field

SAFE: Product Format (Standard Archive Format for Europe)

Source: https://sentinel.esa.int/web/sentinel/user-guides/sentinel-2-msi/naming-convention