Metadata collection workflow:

1. Collect metadata from logs. This includes date the image was pulled; glims_id; crs; utm_zone; landsat image collection;

2. Collect metadata from metadata files for each landsat

3. Collect metadata from the image files. This includes the date the image was taken; the file size; 

4. Collect metadata from the images. This includes the image shape.

5. Join on filename

In [1]:
import pandas as pd
import numpy as np
import os
import rasterio
import sys
import matplotlib.pyplot as plt
import json
from tqdm import tqdm
sys.path.insert(0, os.path.join(os.path.expanduser("~"),"Desktop","projects", "GlacierView", "src","segmentation","helpers"))
import read

In [2]:
data_label = "full_time_series_c02_t1_l2"
glacier_view_dir = os.path.join(os.path.expanduser('~'),"Desktop","projects","GlacierView")
glaciers_dir = os.path.join(glacier_view_dir,"src","earth_engine","data","ee_landing_zone",data_label, "landsat")
log_dir = os.path.join(glacier_view_dir,"src","earth_engine","data","ee_landing_zone",data_label, "logs")
output_dir = os.path.join(glacier_view_dir, "src", "earth_engine", "data", "processed_metadata", data_label)
log_path =  os.path.join(log_dir,"trient_only.log")
glims_ids = [f for f in os.listdir(glaciers_dir) if not f.startswith('.')]

### Step 1: Collect and clean log files from ee_pull_training

In [3]:
log_df = pd.read_csv(log_path, sep = ";", header = None)

In [4]:
log_df['ee_pull_date'] = log_df[0].str.split(",", expand = True)[0]
log_df['glims_id'] = log_df[0].str.split(",", expand = True)[1].str.split(":", expand = True)[3]
log_df['espg_crs'] = log_df[1].str.split(":", expand = True)[2]
log_df['cloud_cover'] = log_df[2].str.split(":", expand = True)[1]
log_df['cloud_cover_land'] = log_df[3].str.split(":", expand = True)[1]
log_df['scene_center_time'] = log_df[4].str.split(":", expand = True)[1]
log_df['image_quality'] = log_df[5].str.split(":", expand = True)[1]
log_df = log_df.drop([0,1,2,3,4,5,6], axis = 1)

In [5]:
log_df.to_csv(os.path.join(output_dir,"processed_logs.csv"), index = False)
log_df.head()

Unnamed: 0,ee_pull_date,glims_id,espg_crs,cloud_cover,cloud_cover_land,scene_center_time,image_quality
0,2023-11-26 23:00:17,G007026E45991N,32632,3,3,9,9
1,2023-11-26 23:00:20,G007026E45991N,32632,35,35,9,9
2,2023-11-26 23:00:23,G007026E45991N,32632,31,31,9,7
3,2023-11-26 23:00:27,G007026E45991N,32632,58,58,9,7
4,2023-11-26 23:00:30,G007026E45991N,32632,59,59,9,9


### Step 2: Collect file and image attributes (filename, file size in bytes, number of files)

In [6]:
file_attributes = []
image_attributes = []
for glims_id in tqdm(glims_ids):
    glacier_dir = os.path.join(glaciers_dir,glims_id)
    file_names = [f for f in os.listdir(glacier_dir) if "." in f and not f.startswith('.') and not f.endswith('.xml')]
    for file_name in tqdm(file_names):
        #getting file attributes
        base_name = file_name.split(".")[0]
        glacier_path = os.path.join(glacier_dir,file_name)
        base_name_split = base_name.split("_")
        src_date = base_name_split[1]
        landsat = base_name_split[2]
        size_in_bytes = os.path.getsize(glacier_path)
        file_attributes.append((file_name, src_date, landsat, size_in_bytes)) 
        
        #getting image attributes
        with rasterio.open(os.path.join(glacier_dir,file_name)) as src:
            # Descriptions: https://rasterio.readthedocs.io/en/stable/api/rasterio.io.html#rasterio.io.DatasetReader
            height_in_pixels = src.height
            width_in_pixels = src.width
            left_bound = src.bounds.left
            right_bound = src.bounds.right
            top_bound = src.bounds.top
            bottom_bound = src.bounds.bottom
            num_of_bands = src.count
            epsg_code = int(src.crs.to_dict()['init'].split(":")[1])
            no_data_val = src.nodata
            pixel_width_res = src.res[0]
            pixel_height_res = src.res[1]
            
            raster = src.read()
            zero_pixel_count = np.sum(raster == 0)
            no_data_pixel_count = np.sum(raster == no_data_val)
            negative_pixel_count = np.sum(raster < 0)
            image_attributes.append((file_name, height_in_pixels, width_in_pixels, left_bound,
                                     right_bound, top_bound, bottom_bound, num_of_bands, epsg_code,
                                    no_data_val, pixel_width_res, pixel_height_res, zero_pixel_count,
                                    no_data_pixel_count, negative_pixel_count))

  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/1748 [00:00<?, ?it/s][A
  0%|          | 5/1748 [00:00<00:35, 49.19it/s][A
  1%|          | 13/1748 [00:00<00:26, 64.35it/s][A
  1%|▏         | 22/1748 [00:00<00:23, 72.55it/s][A
  2%|▏         | 31/1748 [00:00<00:22, 76.99it/s][A
  2%|▏         | 40/1748 [00:00<00:21, 79.16it/s][A
  3%|▎         | 49/1748 [00:00<00:21, 80.09it/s][A
  3%|▎         | 58/1748 [00:00<00:21, 80.30it/s][A
  4%|▍         | 67/1748 [00:00<00:20, 81.22it/s][A
  4%|▍         | 76/1748 [00:00<00:20, 80.53it/s][A
  5%|▍         | 85/1748 [00:01<00:20, 80.91it/s][A
  5%|▌         | 94/1748 [00:01<00:20, 80.84it/s][A
  6%|▌         | 103/1748 [00:01<00:20, 80.71it/s][A
  6%|▋         | 112/1748 [00:01<00:20, 81.15it/s][A
  7%|▋         | 121/1748 [00:01<00:20, 80.32it/s][A
  7%|▋         | 130/1748 [00:01<00:19, 81.44it/s][A
  8%|▊         | 139/1748 [00:01<00:19, 80.78it/s][A
  8%|▊         | 148/1748 [00:01<00:19, 82.70it/s][A
  9%|▉      

 71%|███████   | 1239/1748 [00:15<00:06, 77.28it/s][A
 71%|███████▏  | 1248/1748 [00:16<00:06, 78.11it/s][A
 72%|███████▏  | 1257/1748 [00:16<00:06, 79.95it/s][A
 72%|███████▏  | 1266/1748 [00:16<00:05, 80.76it/s][A
 73%|███████▎  | 1275/1748 [00:16<00:05, 80.87it/s][A
 73%|███████▎  | 1284/1748 [00:16<00:05, 80.83it/s][A
 74%|███████▍  | 1293/1748 [00:16<00:05, 81.16it/s][A
 74%|███████▍  | 1302/1748 [00:16<00:05, 82.27it/s][A
 75%|███████▌  | 1311/1748 [00:16<00:05, 81.82it/s][A
 76%|███████▌  | 1320/1748 [00:16<00:05, 82.45it/s][A
 76%|███████▌  | 1329/1748 [00:17<00:05, 83.40it/s][A
 77%|███████▋  | 1338/1748 [00:17<00:04, 83.46it/s][A
 77%|███████▋  | 1347/1748 [00:17<00:04, 83.74it/s][A
 78%|███████▊  | 1356/1748 [00:17<00:04, 83.51it/s][A
 78%|███████▊  | 1365/1748 [00:17<00:04, 83.40it/s][A
 79%|███████▊  | 1374/1748 [00:17<00:04, 83.37it/s][A
 79%|███████▉  | 1383/1748 [00:17<00:04, 82.19it/s][A
 80%|███████▉  | 1392/1748 [00:17<00:04, 82.50it/s][A
 80%|█████

In [7]:
file_attributes_df = pd.DataFrame(file_attributes, columns = ["file_name", "src_date", "landsat", "file size in bytes"])
file_attributes_df.to_csv(os.path.join(output_dir,"file_attributes.csv"), index = False)
file_attributes_df.head()

Unnamed: 0,file_name,src_date,landsat,file size in bytes
0,G007026E45991N_2021-06-11_L8_C02_T1_L2_SR.tif,2021-06-11,L8,639637
1,G007026E45991N_2009-11-24_L5_C02_T1_L2_SR.tif,2009-11-24,L5,345000
2,G007026E45991N_1997-03-21_L5_C02_T1_L2_SR.tif,1997-03-21,L5,437373
3,G007026E45991N_2010-06-05_L7_C02_T1_L2_SR.tif,2010-06-05,L7,401740
4,G007026E45991N_1993-08-08_L5_C02_T1_L2_SR.tif,1993-08-08,L5,537548


In [8]:
image_attributes_df = pd.DataFrame(image_attributes,
                                   columns = ["file_name", "height_in_pixels", "width_in_pixels", "left_bound",
                                             "right_bound", "top_bound", "bottom_bound", "num_of_bands", "epsg_code",
                                             "no_data_val", "pixel_width_res", "pixel_height_res", "zero_pixel_count",
                                             "no_data_pixel_count", "negative_pixel_count"])
image_attributes_df.to_csv(os.path.join(output_dir,"image_attributes.csv"), index = False)
image_attributes_df.head()

Unnamed: 0,file_name,height_in_pixels,width_in_pixels,left_bound,right_bound,top_bound,bottom_bound,num_of_bands,epsg_code,no_data_val,pixel_width_res,pixel_height_res,zero_pixel_count,no_data_pixel_count,negative_pixel_count
0,G007026E45991N_2021-06-11_L8_C02_T1_L2_SR.tif,177,103,345735.0,348825.0,5098845.0,5093535.0,19,32632,-2147484000.0,30.0,30.0,28523,0,0
1,G007026E45991N_2009-11-24_L5_C02_T1_L2_SR.tif,179,108,810225.0,813465.0,5104875.0,5099505.0,19,32631,-2147484000.0,30.0,30.0,38699,0,0
2,G007026E45991N_1997-03-21_L5_C02_T1_L2_SR.tif,177,103,345735.0,348825.0,5098845.0,5093535.0,19,32632,-2147484000.0,30.0,30.0,8964,0,0
3,G007026E45991N_2010-06-05_L7_C02_T1_L2_SR.tif,177,103,345735.0,348825.0,5098845.0,5093535.0,19,32632,-2147484000.0,30.0,30.0,36548,0,24125
4,G007026E45991N_1993-08-08_L5_C02_T1_L2_SR.tif,179,108,810225.0,813465.0,5104875.0,5099505.0,19,32631,-2147484000.0,30.0,30.0,27843,0,0


### Step 3: Collect JSON metadata from EE

In [13]:
ee_meta_data_df = pd.DataFrame({
                                "file_name":[],
                                "src_date": [],
                                "ee_identifier":[], 
                                "glims_id":[], 
                                "algorithm_source_surface_reflectance": [],
                                "algorithm_source_surface_temperature": [],
                                "cloud_cover": [],
                                "cloud_cover_land": [],
                                "collection_category": [],
                                "data_source_air_temperature": [],
                                "data_source_elevation": [],
                                "data_source_ozone": [],
                                "data_source_pressure": [],
                                "data_source_reanalysis": [],
                                "data_source_water_vapor": [],
                                "date_product_generated": [],
                                "earth_sun_distance": [],
                                "geometric_rmse_model": [],
                                "geometric_rmse_model_x": [],
                                "geometric_rmse_model_y": [],
                                "ground_control_points_model": [],
                                "ground_control_points_version": [],
                                "l1_date_product_generated": [],
                                "l1_landsat_product_id": [],
                                "l1_processing_level": [],
                                "l1_processing_software_version": [],
                                "landsat_product_id": [],
                                "landsat_scene_id": [],
                                "processing_level": [],
                                "processing_software_version": [],
                                "scene_center_time": [],
                                "sensor_id": [],
                                "spacecraft_id": [],
                                "sun_azimuth": [],
                                "sun_elevation": [],
                                "wrs_path": [],
                                "wrs_row": [],
                                "image_quality": [],
                                "image_quality_oli": [],
                                "image_quality_tirs": [],
                                "temperature_maximum_band_st_b6": [],
                                "temperature_minimum_band_st_b6": [],
                                "ephemeris_type": [],
                                "data_source_tirs_stray_light_correction": [],
                                "geometric_rmse_verify": [],
                                "ground_control_points_verify": [],
                                "roll_angle": [],
                                "target_wrs_path": [],
                                "target_wrs_row": [],
                                "temperature_maximum_band_st_b10": [],
                                "temperature_minimum_band_st_b10": [],
                                "tirs_ssm_model": []
                               })

for glims_id in glims_ids:
    glacier_dir = os.path.join(glaciers_dir, glims_id)
    if "meta_data" not in os.listdir(glacier_dir):
        continue
    meta_data_dir = os.path.join(glacier_dir, "meta_data")
    meta_data_file_names = os.listdir(meta_data_dir)
    for meta_data_file_name in meta_data_file_names:
        with open(os.path.join(meta_data_dir,meta_data_file_name), mode = "r") as file:
            meta = file.read()
            for unformatted_meta in meta.split("\n")[1:-1]: #no data in first and last value
                truncated_meta = unformatted_meta.split(",\"")[1][:-1]
                meta_data = truncated_meta.replace("\'", "\"")
                meta_dict = json.loads(meta_data)
                meta_id_split = meta_dict['id'].split("_")

                landsat = "L" + str(int(meta_id_split[1][-2:]))
                date = meta_id_split[-1]
                year = date[:4]
                month = date[4:6]
                day = date[6:]
                file_name = f"{glims_id}_{year}-{month}-{day}_{landsat}_C02_T1_L2_SR.tif"
                ee_identifier = meta_dict['id']
                algorithm_source_surface_reflectance = meta_dict['properties']["ALGORITHM_SOURCE_SURFACE_REFLECTANCE"]
                try:
                    algorithm_source_surface_temperature = meta_dict['properties']["ALGORITHM_SOURCE_SURFACE_TEMPERATURE"]
                except KeyError:
                    algorithm_source_surface_temperature = ""
                cloud_cover = meta_dict['properties']["CLOUD_COVER"]
                cloud_cover_land = meta_dict['properties']["CLOUD_COVER_LAND"]
                collection_category = meta_dict['properties']["COLLECTION_CATEGORY"]
                data_source_air_temperature = meta_dict['properties']["DATA_SOURCE_AIR_TEMPERATURE"]
                data_source_elevation = meta_dict['properties']["DATA_SOURCE_ELEVATION"]
                data_source_ozone = meta_dict['properties']["DATA_SOURCE_OZONE"]
                data_source_pressure = meta_dict['properties']["DATA_SOURCE_PRESSURE"]
                try:
                    data_source_reanalysis = meta_dict['properties']["DATA_SOURCE_REANALYSIS"]
                except KeyError:
                    data_source_reanalysis = ""
                data_source_water_vapor = meta_dict['properties']["DATA_SOURCE_WATER_VAPOR"]
                date_product_generated = meta_dict['properties']["DATE_PRODUCT_GENERATED"]
                earth_sun_distance = meta_dict['properties']["EARTH_SUN_DISTANCE"]
                geometric_rmse_model = meta_dict['properties']["GEOMETRIC_RMSE_MODEL"]
                geometric_rmse_model_x = meta_dict['properties']["GEOMETRIC_RMSE_MODEL_X"]
                geometric_rmse_model_y = meta_dict['properties']["GEOMETRIC_RMSE_MODEL_Y"]
                ground_control_points_model = meta_dict['properties']["GROUND_CONTROL_POINTS_MODEL"]
                ground_control_points_version = meta_dict['properties']["GROUND_CONTROL_POINTS_VERSION"]
                l1_date_product_generated = meta_dict['properties']["L1_DATE_PRODUCT_GENERATED"]
                l1_landsat_product_id = meta_dict['properties']["L1_LANDSAT_PRODUCT_ID"]
                l1_processing_level = meta_dict['properties']["L1_PROCESSING_LEVEL"]
                l1_processing_software_version = meta_dict['properties']["L1_PROCESSING_SOFTWARE_VERSION"]
                landsat_product_id = meta_dict['properties']["LANDSAT_PRODUCT_ID"]
                landsat_scene_id = meta_dict['properties']["LANDSAT_SCENE_ID"]
                processing_level = meta_dict['properties']["PROCESSING_LEVEL"]
                processing_software_version = meta_dict['properties']["PROCESSING_SOFTWARE_VERSION"]
                scene_center_time = meta_dict['properties']["SCENE_CENTER_TIME"]
                sensor_id = meta_dict['properties']["SENSOR_ID"]
                spacecraft_id = meta_dict['properties']["SPACECRAFT_ID"]
                sun_azimuth = meta_dict['properties']["SUN_AZIMUTH"]
                sun_elevation = meta_dict['properties']["SUN_ELEVATION"]
                wrs_path = meta_dict['properties']["WRS_PATH"]
                wrs_row = meta_dict['properties']["WRS_ROW"]


                try: 
                    image_quality = meta_dict['properties']['IMAGE_QUALITY']
                except KeyError:
                    image_quality = ""
                try: 
                    image_quality_tirs = meta_dict['properties']['IMAGE_QUALITY_TIRS']
                except:
                    image_quality_tirs = ""
                    
                try: 
                    image_quality_oli = meta_dict['properties']['IMAGE_QUALITY_OLI']
                except:
                    image_quality_oli = ""
                try: 
                    temperature_maximum_band_st_b6 = meta_dict['properties']['TEMPERATURE_MAXIMUM_BAND_ST_B6']
                except KeyError:
                    temperature_maximum_band_st_b6 = ""
                try:                     
                    temperature_minimum_band_st_b6 = meta_dict['properties']['TEMPERATURE_MINIMUM_BAND_ST_B6']
                except KeyError:       
                    temperature_minimum_band_st_b6 = ""
                try:     
                    ephemeris_type = meta_dict['properties']['EPHEMERIS_TYPE']
                except KeyError:   
                    ephemeris_type = ""
                try:    
                    data_source_tirs_stray_light_correction = meta_dict['properties']['DATA_SOURCE_TIRS_STRAY_LIGHT_CORRECTION']
                except KeyError:
                    data_source_tirs_stray_light_correction = ""
                try:    
                    geometric_rmse_verify = meta_dict['properties']['GEOMETRIC_RMSE_VERIFY']
                except KeyError:  
                    geometric_rmse_verify = ""
                try:    
                    ground_control_points_verify = meta_dict['properties']['GROUND_CONTROL_POINTS_VERIFY']
                except KeyError:  
                    ground_control_points_verify = ""
                try:   
                    roll_angle = meta_dict['properties']['ROLL_ANGLE']
                except KeyError:     
                    roll_angle = ""
                try:   
                    target_wrs_path = meta_dict['properties']['TARGET_WRS_PATH']
                except KeyError:  
                    target_wrs_path = ""
                try:    
                    target_wrs_row = meta_dict['properties']['TARGET_WRS_ROW']
                except KeyError:   
                    target_wrs_row = ""
                try:   
                    temperature_maximum_band_st_b10 = meta_dict['properties']['TEMPERATURE_MAXIMUM_BAND_ST_B10']
                except KeyError:
                    temperature_maximum_band_st_b10 = ""
                try:  
                    temperature_minimum_band_st_b10 = meta_dict['properties']['TEMPERATURE_MINIMUM_BAND_ST_B10']
                except KeyError:
                    temperature_minimum_band_st_b10 = ""
                try:   
                    tirs_ssm_model = meta_dict['properties']['TIRS_SSM_MODEL']
                except KeyError:
                    tirs_ssm_model = ""


                ee_meta_data = {
                                "file_name": file_name,
                                "src_date": f"{year}-{month}-{day}",
                                "ee_identifier": ee_identifier, 
                                "glims_id": glims_id, 
                                "algorithm_source_surface_reflectance": algorithm_source_surface_reflectance,
                                "algorithm_source_surface_temperature": algorithm_source_surface_temperature,
                                "cloud_cover": cloud_cover,
                                "cloud_cover_land": cloud_cover_land,
                                "collection_category": collection_category,
                                "data_source_air_temperature": data_source_air_temperature,
                                "data_source_elevation": data_source_elevation,
                                "data_source_ozone": data_source_ozone,
                                "data_source_pressure": data_source_pressure,
                                "data_source_reanalysis": data_source_reanalysis,
                                "data_source_water_vapor": data_source_water_vapor,
                                "date_product_generated": date_product_generated,
                                "earth_sun_distance": earth_sun_distance,
                                "geometric_rmse_model": geometric_rmse_model,
                                "geometric_rmse_model_x": geometric_rmse_model_x,
                                "geometric_rmse_model_y": geometric_rmse_model_y,
                                "ground_control_points_model": ground_control_points_model,
                                "ground_control_points_version": ground_control_points_version,
                                "l1_date_product_generated": l1_date_product_generated,
                                "l1_landsat_product_id": l1_landsat_product_id,
                                "l1_processing_level": l1_processing_level,
                                "l1_processing_software_version": l1_processing_software_version,
                                "landsat_product_id": landsat_product_id,
                                "landsat_scene_id": landsat_scene_id,
                                "processing_level": processing_level,
                                "processing_software_version": processing_software_version,
                                "scene_center_time": scene_center_time,
                                "sensor_id": sensor_id,
                                "spacecraft_id": spacecraft_id,
                                "sun_azimuth": sun_azimuth,
                                "sun_elevation": sun_elevation,
                                "wrs_path": wrs_path,
                                "wrs_row": wrs_row,
                                "image_quality": image_quality,
                                "image_quality_oli": image_quality_oli ,
                                "image_quality_tirs": image_quality_tirs ,
                                "temperature_maximum_band_st_b6": temperature_maximum_band_st_b6 ,
                                "temperature_minimum_band_st_b6": temperature_minimum_band_st_b6 ,
                                "ephemeris_type": ephemeris_type ,
                                "data_source_tirs_stray_light_correction": data_source_tirs_stray_light_correction ,
                                "geometric_rmse_verify": geometric_rmse_verify ,
                                "ground_control_points_verify": ground_control_points_verify ,
                                "roll_angle": roll_angle ,
                                "target_wrs_path": target_wrs_path ,
                                "target_wrs_row": target_wrs_row ,
                                "temperature_maximum_band_st_b10": temperature_maximum_band_st_b10 ,
                                "temperature_minimum_band_st_b10": temperature_minimum_band_st_b10 ,
                                "tirs_ssm_model": tirs_ssm_model 
                              }
                ee_meta_data_df = ee_meta_data_df.append(ee_meta_data, ignore_index = True)                


G007026E45991N


In [14]:
ee_meta_data_df.to_csv(os.path.join(output_dir,"ee_metadata.csv"), index = False)