# Build a VRT of partially overlapping tifs
This builds one VRT of GLiHT data per year and per UTM zone, with the input files sorted by date (latest dates first).

Paul Montesano  
Sept 2023

In [1]:
import datetime
import glob
import os
from collections import defaultdict

import pandas as pd
import rasterio
from dateutil.parser import parse
from osgeo import gdal

## Find all files

In [2]:
# Root directory; the glob below looks in a per-year sub-directory beneath it
MAINDIR = '/explore/nobackup/people/pmontesa/userfs02/data/gliht/chm'
# Year sub-directory to search
YEAR = 2014
# Product suffix that ends each filename, just before '.tif'
TYPE = 'CHM'
# All matching tifs for this year and product type (glob returns them in arbitrary order)
f_list = glob.glob(f'{MAINDIR}/{YEAR}/*{TYPE}.tif')

In [3]:
# Spot-check two entries of the file list; the hard-coded indices assume f_list holds at least 2001 paths
print(f'{f_list[2]}\n{f_list[2000]}')

/explore/nobackup/people/pmontesa/userfs02/data/gliht/chm/2014/AK_10Jul2014_l17s40_CHM.tif
/explore/nobackup/people/pmontesa/userfs02/data/gliht/chm/2014/AK_20140809_l4s604_CHM.tif


## Separate files in list according to the UTM zone of their coordinate system

In [8]:
%%time

# EPSG codes (as strings) of the UTM zones used to group the files
list_utm_zones = ['32604','32605','32606','32607']

dict_utm_zones = defaultdict(list)

# Count of files whose CRS is missing or is not one of the zones listed above
n_skipped = 0

# Get lists of files for each UTM zone
for f in f_list:

    with rasterio.open(f) as ds:
        # A dataset may have no CRS at all, and to_epsg() returns None when the
        # CRS cannot be matched to an EPSG code; both cases are treated as "no zone"
        epsg = ds.crs.to_epsg() if ds.crs else None

    # A file has exactly one CRS, so a membership test replaces the loop over zones
    zone = str(epsg) if epsg is not None else None
    if zone in list_utm_zones:
        dict_utm_zones[zone].append(f)
    else:
        n_skipped += 1

if n_skipped:
    print(f'{n_skipped} file(s) skipped: CRS missing or not in {list_utm_zones}')

# Convert to a plain dict so later lookups of a missing zone raise instead of creating an empty entry
dict_utm_zones = dict(dict_utm_zones)

CPU times: user 2.35 s, sys: 2.55 s, total: 4.9 s
Wall time: 10.8 s


## Build a VRT with files of a specific UTM zone

#### functions

In [5]:
def clean_date(text):
    '''
    Parse a date string with dateutil and return the parsed result.

    The format is detected by the parser, so filenames carrying differently
    formatted dates can be handled by the same call. Any exception raised by
    the parser for text it cannot interpret propagates to the caller.
    '''
    return parse(text)

def filelist_to_date_sorted_df(f_list, date_string_position=1):
    
    '''
    Returns a df of file paths sorted by date (latest dates first)

    Parameters:
        f_list: iterable of file paths
        date_string_position: index of the '_'-separated token of the
            filename that holds the date string

    Returns:
        df with columns 'path', 'file' (basename of path) and 'date'
        (datetime), sorted by 'date' descending with a fresh 0..n-1 index.
        An empty f_list returns an empty df with these same columns.

    This requires:
        that the date be at an expected position in the filename 

    Raises:
        whatever clean_date raises when the token at date_string_position
        cannot be parsed as a date
    '''
    
    f_list = list(f_list)

    # With no rows the '_' split yields no columns, so indexing the date token would fail;
    # return an empty frame that has the same schema as the normal result instead
    if not f_list:
        return pd.DataFrame({
            'path': pd.Series(dtype='object'),
            'file': pd.Series(dtype='object'),
            'date': pd.Series(dtype='datetime64[ns]'),
        })

    df = pd.DataFrame(f_list, columns=['path'])
    df["file"] = df["path"].apply(os.path.basename)

    # Handle date - parsing each token individually works with multiple date formats
    df["date"] = df["file"].str.split('_', expand=True)[date_string_position]
    df['date'] = df['date'].apply(clean_date)
    df['date'] = pd.to_datetime(df['date']) 

    df = df.sort_values(by='date', ignore_index=True, ascending=False)
    
    return df

def build_vrt(f_list, out_vrt_fn):
    
    '''
    Build a VRT at out_vrt_fn from the files in f_list, using cubic resampling

    Parameters:
        f_list: list of raster file paths, in the order they are handed to GDAL
        out_vrt_fn: path of the VRT file to write

    Prints out_vrt_fn on success, or a failure message if GDAL did not
    return a dataset. Returns None in both cases.

    NOTE(review): the gdalbuildvrt documentation states that where sources
    overlap, content is fetched from the files listed last. Callers here
    pass a latest-date-first list - confirm that is the intended priority.
    '''
    
    vrt_options = gdal.BuildVRTOptions(resampleAlg='cubic')
    my_vrt = gdal.BuildVRT(out_vrt_fn, f_list, options=vrt_options)

    # Without gdal.UseExceptions(), a failed build returns None instead of raising;
    # do not report the output path as if it had been written
    if my_vrt is None:
        print(f'Failed to build VRT: {out_vrt_fn}')
        return

    # Dropping the reference closes the dataset, which flushes the VRT to disk
    my_vrt = None

    print(out_vrt_fn)

def dict_gliht_to_vrt(f_list, epsg_utm, date_string_position=1): 
    
    '''
    Sort one UTM zone's files by date and build that zone's VRT

    Parameters:
        f_list: file paths belonging to the zone
        epsg_utm: zone identifier; printed and used in the VRT filename
        date_string_position: passed through to filelist_to_date_sorted_df

    Returns:
        list of the exceptions caught while sorting or building
        (empty when everything succeeded). Exceptions are printed and
        collected rather than raised so the caller can continue with the next zone.
    '''
    
    print(epsg_utm)
    caught = []
    
    try:
        # Date-sorted paths for this zone
        sorted_paths = filelist_to_date_sorted_df(
            f_list, date_string_position=date_string_position
        )['path'].tolist()
        
        # The output name combines the module-level TYPE and YEAR with the zone
        vrt_name = f'gliht_{TYPE.lower()}_{YEAR}_{epsg_utm}.vrt'
        
        build_vrt(sorted_paths, os.path.join(MAINDIR, vrt_name))
        
    except Exception as err:
        print(err)
        caught.append(err)
        
    return caught

#### Run the gliht to VRT function by UTM zone and collect the errors

In [6]:
# Maps each UTM zone to a list holding the error list returned for that zone
dict_utm_zones_error = defaultdict(list)

# The loop variable is deliberately not named f_list: that would overwrite the
# full file list built by the glob, so re-running earlier cells would see one zone only
for epsg_utm, zone_f_list in dict_utm_zones.items():
    
    error_list = dict_gliht_to_vrt(zone_f_list, epsg_utm, date_string_position=1)
    
    # Errors get collected here by UTM zone
    dict_utm_zones_error[epsg_utm].append(error_list)

32607
/explore/nobackup/people/pmontesa/userfs02/data/gliht/chm/gliht_chm_2014_32607.vrt
32606
Unknown string format: Creek
32605
/explore/nobackup/people/pmontesa/userfs02/data/gliht/chm/gliht_chm_2014_32605.vrt


#### Handle the UTM zones whose f_list raised an error

##### Bonanza Creek data format is special: no full date string

In [7]:
# Convert to a plain dict so iterating below cannot create new empty entries
dict_utm_zones_error = dict(dict_utm_zones_error)

for epsg_utm, error_list_list in dict_utm_zones_error.items():
    
    print(epsg_utm)
    
    for error_list in error_list_list:
        for e in error_list:
            
            # Only the date-parsing error carries the offending string as its 2nd argument;
            # any other exception cannot be handled by re-splitting the file list
            if len(e.args) < 2 or not isinstance(e.args[1], str):
                print(f'Unhandled error, skipping: {e!r}')
                continue
            
            # Get the unexpected string returned from the position of the gliht filename that you thought held the datestring
            unexpected_string = e.args[1]
            print(unexpected_string)
            
            # Split this zone's files on whether the unexpected string is in the filename.
            # The test uses the basename because the string came from a filename token;
            # testing the full path would also match a directory name containing it.
            f_list_expected = [x for x in dict_utm_zones[epsg_utm] if unexpected_string not in os.path.basename(x)]
            f_list_unexpected = [x for x in dict_utm_zones[epsg_utm] if unexpected_string in os.path.basename(x)]
            
            # Either subset may be empty, so each df is built only when it has files
            frames = []
            
            if f_list_expected:
                # Get the sorted df from the 'expected' (normally formatted) filename list
                df_exp = filelist_to_date_sorted_df(f_list_expected, date_string_position=1)
                frames.append(df_exp)
            
            if f_list_unexpected:
                # Get the sorted df from the 'unexpected' filename list
                df_unexp = filelist_to_date_sorted_df(f_list_unexpected, date_string_position=3)
                
                # Now sort just according to filename
                # In this specific case this will sort Bonanza Creek flightlines according to kHz of acquisition (larger first)
                df_unexp.sort_values(by='file', ignore_index=True, ascending=False, inplace=True )
                frames.append(df_unexp)
            
            if not frames:
                print(f'No files to build a VRT for {epsg_utm}')
                continue
            
            # Combine the dfs: expected files first, then the unexpected ones
            df = pd.concat(frames)
            
            utm_zone_specific_f_list = df.path.to_list()
        
            out_vrt_fn = os.path.join(MAINDIR, f'gliht_{TYPE.lower()}_{YEAR}_{epsg_utm}.vrt')
        
            build_vrt(utm_zone_specific_f_list, out_vrt_fn)
            
            
    

32607
32606
Creek
/explore/nobackup/people/pmontesa/userfs02/data/gliht/chm/gliht_chm_2014_32606.vrt
32605
