Metadata collection workflow:

1. Collect metadata from logs. This includes date the image was pulled; glims_id; crs; utm_zone; landsat image collection;

2. Collect metadata from metadata files for each landsat

3. Collect metadata from the image files. This includes the date the image was taken; the file size; 

4. Collect metadata from the images. This includes the image shape.

5. Join on filename

In [2]:
import pandas as pd
import numpy as np
import os
import rasterio
import sys
import matplotlib.pyplot as plt
import json
from tqdm import tqdm
sys.path.insert(0, os.path.join(os.path.expanduser("~"),"Desktop","projects", "GlacierView", "src","segmentation","helpers"))
import read

In [3]:
data_label = "localized_time_series_for_training_c02_t1_l2"
glacier_view_dir = os.path.join(os.path.expanduser('~'),"Desktop","projects","GlacierView")
glaciers_dir = os.path.join(glacier_view_dir,"src","earth_engine","data","ee_landing_zone",data_label, "landsat")
log_dir = os.path.join(glacier_view_dir,"src","earth_engine","data","ee_landing_zone",data_label, "logs")
output_dir = os.path.join(glacier_view_dir, "src", "earth_engine", "data", "processed_metadata", data_label)
log_path =  os.path.join(log_dir,"training_log_1.log")
glims_ids = [f for f in os.listdir(glaciers_dir) if not f.startswith('.')]

### Step 1: Collect and clean log files from ee_pull_training

In [101]:
log_df = pd.read_csv(log_path, sep = ";", header = None)

In [102]:
log_df['ee_pull_date'] = log_df[0].str.split(",", expand = True)[0]
log_df['glims_id'] = log_df[0].str.split(",", expand = True)[1].str.split(":", expand = True)[3]
log_df['espg_crs'] = log_df[1].str.split(":", expand = True)[2]
log_df['cloud_cover'] = log_df[2].str.split(":", expand = True)[1]
log_df['cloud_cover_land'] = log_df[3].str.split(":", expand = True)[1]
log_df['scene_center_time'] = log_df[4].str.split(":", expand = True)[1]
log_df['image_quality'] = log_df[5].str.split(":", expand = True)[1]
log_df = log_df.drop([0,1,2,3,4,5,6], axis = 1)

In [103]:
log_df.to_csv(os.path.join(output_dir,"processed_logs.csv"), index = False)
log_df.head()

Unnamed: 0,ee_pull_date,glims_id,espg_crs,cloud_cover,cloud_cover_land,scene_center_time,image_quality
0,2023-07-11 23:08:36,G087511E37931N,32645,30,30,4,7
1,2023-07-11 23:08:38,G087511E37931N,32645,70,70,4,7
2,2023-07-11 23:08:40,G087511E37931N,32645,43,43,4,9
3,2023-07-11 23:08:42,G087511E37931N,32645,24,24,4,9
4,2023-07-11 23:08:44,G087511E37931N,32645,4,4,4,9


### Step 2: Collect file and image attributes (filename, file size in bytes, number of files)

In [104]:
file_attributes = []
image_attributes = []
for glims_id in tqdm(glims_ids):
    glacier_dir = os.path.join(glaciers_dir,glims_id)
    file_names = [f for f in os.listdir(glacier_dir) if "." in f and not f.startswith('.') and not f.endswith('.xml')]
    for file_name in file_names:
        
        #getting file attributes
        base_name = file_name.split(".")[0]
        glacier_path = os.path.join(glacier_dir,file_name)
        base_name_split = base_name.split("_")
        src_date = base_name_split[1]
        landsat = base_name_split[2]
        size_in_bytes = os.path.getsize(glacier_path)
        file_attributes.append((file_name, src_date, landsat, size_in_bytes)) 
        
        #getting image attributes
        with rasterio.open(os.path.join(glacier_dir,file_name)) as src:
            # Descriptions: https://rasterio.readthedocs.io/en/stable/api/rasterio.io.html#rasterio.io.DatasetReader
            height_in_pixels = src.height
            width_in_pixels = src.width
            left_bound = src.bounds.left
            right_bound = src.bounds.right
            top_bound = src.bounds.top
            bottom_bound = src.bounds.bottom
            num_of_bands = src.count
            epsg_code = int(src.crs.to_dict()['init'].split(":")[1])
            no_data_val = src.nodata
            pixel_width_res = src.res[0]
            pixel_height_res = src.res[1]
            
            raster = src.read()
            zero_pixel_count = np.sum(raster == 0)
            no_data_pixel_count = np.sum(raster == no_data_val)
            negative_pixel_count = np.sum(raster < 0)
            image_attributes.append((file_name, height_in_pixels, width_in_pixels, left_bound,
                                     right_bound, top_bound, bottom_bound, num_of_bands, epsg_code,
                                    no_data_val, pixel_width_res, pixel_height_res, zero_pixel_count,
                                    no_data_pixel_count, negative_pixel_count))

  1%|          | 93/18093 [00:16<53:40,  5.59it/s]  


KeyboardInterrupt: 

In [96]:
file_attributes_df = pd.DataFrame(file_attributes, columns = ["file_name", "src_date", "landsat", "file size in bytes"])
file_attributes_df.to_csv(os.path.join(output_dir,"file_attributes.csv"), index = False)
file_attributes_df.head()

Unnamed: 0,file_name,src_date,landsat,file size in bytes
0,G076018E36317N_2010-10-02_L7_C02_T1_L2_SR.tif,2010-10-02,L7,350805
1,G076018E36317N_2010-08-31_L7_C02_T1_L2_SR.tif,2010-08-31,L7,393877
2,G076018E36317N_2010-09-23_L7_C02_T1_L2_SR.tif,2010-09-23,L7,379287
3,G076018E36317N_2010-10-09_L7_C02_T1_L2_SR.tif,2010-10-09,L7,369966
4,G076018E36317N_2010-09-08_L5_C02_T1_L2_SR.tif,2010-09-08,L5,478042


In [97]:
image_attributes_df = pd.DataFrame(image_attributes,
                                   columns = ["file_name", "height_in_pixels", "width_in_pixels", "left_bound",
                                             "right_bound", "top_bound", "bottom_bound", "num_of_bands", "epsg_code",
                                             "no_data_val", "pixel_width_res", "pixel_height_res", "zero_pixel_count",
                                             "no_data_pixel_count", "negative_pixel_count"])
image_attributes_df.to_csv(os.path.join(output_dir,"image_attributes.csv"), index = False)
image_attributes_df.head()

Unnamed: 0,file_name,height_in_pixels,width_in_pixels,left_bound,right_bound,top_bound,bottom_bound,num_of_bands,epsg_code,no_data_val,pixel_width_res,pixel_height_res,zero_pixel_count,no_data_pixel_count,negative_pixel_count
0,G076018E36317N_2010-10-02_L7_C02_T1_L2_SR.tif,124,179,588945.0,594315.0,4021425.0,4017705.0,19,32643,-2147484000.0,30.0,30.0,81883,0,69771
1,G076018E36317N_2010-08-31_L7_C02_T1_L2_SR.tif,124,179,588945.0,594315.0,4021425.0,4017705.0,19,32643,-2147484000.0,30.0,30.0,81534,0,68094
2,G076018E36317N_2010-09-23_L7_C02_T1_L2_SR.tif,124,179,588945.0,594315.0,4021425.0,4017705.0,19,32643,-2147484000.0,30.0,30.0,80194,0,60860
3,G076018E36317N_2010-10-09_L7_C02_T1_L2_SR.tif,124,179,588945.0,594315.0,4021425.0,4017705.0,19,32643,-2147484000.0,30.0,30.0,73175,0,57068
4,G076018E36317N_2010-09-08_L5_C02_T1_L2_SR.tif,124,179,588945.0,594315.0,4021425.0,4017705.0,19,32643,-2147484000.0,30.0,30.0,25514,0,0


### Step 3: Collect JSON metadata from EE

In [10]:
ee_meta_data_df = pd.read_csv(os.path.join(output_dir,".csv"))

In [12]:
completed_glims_ids = set(ee_meta_data_df.glims_id)
print(f"There are {len(completed_glims_ids)} completed glaciers")
remaining_glims_ids = [glims_id for glims_id in glims_ids if glims_id not in completed_glims_ids]
print(f"There are {len(remaining_glims_ids)} remaining glaciers")

There are 8769 completed glaciers
There are 9324 remaining glaciers


In [13]:
# ee_meta_data_df = pd.DataFrame({
#                                 "file_name":[],
#                                 "src_date": [],
#                                 "ee_identifier":[], 
#                                 "glims_id":[], 
#                                 "algorithm_source_surface_reflectance": [],
#                                 "algorithm_source_surface_temperature": [],
#                                 "cloud_cover": [],
#                                 "cloud_cover_land": [],
#                                 "collection_category": [],
#                                 "data_source_air_temperature": [],
#                                 "data_source_elevation": [],
#                                 "data_source_ozone": [],
#                                 "data_source_pressure": [],
#                                 "data_source_reanalysis": [],
#                                 "data_source_water_vapor": [],
#                                 "date_product_generated": [],
#                                 "earth_sun_distance": [],
#                                 "geometric_rmse_model": [],
#                                 "geometric_rmse_model_x": [],
#                                 "geometric_rmse_model_y": [],
#                                 "ground_control_points_model": [],
#                                 "ground_control_points_version": [],
#                                 "l1_date_product_generated": [],
#                                 "l1_landsat_product_id": [],
#                                 "l1_processing_level": [],
#                                 "l1_processing_software_version": [],
#                                 "landsat_product_id": [],
#                                 "landsat_scene_id": [],
#                                 "processing_level": [],
#                                 "processing_software_version": [],
#                                 "scene_center_time": [],
#                                 "sensor_id": [],
#                                 "spacecraft_id": [],
#                                 "sun_azimuth": [],
#                                 "sun_elevation": [],
#                                 "wrs_path": [],
#                                 "wrs_row": [],
#                                 "image_quality": [],
#                                 "image_quality_oli": [],
#                                 "image_quality_tirs": [],
#                                 "temperature_maximum_band_st_b6": [],
#                                 "temperature_minimum_band_st_b6": [],
#                                 "ephemeris_type": [],
#                                 "data_source_tirs_stray_light_correction": [],
#                                 "geometric_rmse_verify": [],
#                                 "ground_control_points_verify": [],
#                                 "roll_angle": [],
#                                 "target_wrs_path": [],
#                                 "target_wrs_row": [],
#                                 "temperature_maximum_band_st_b10": [],
#                                 "temperature_minimum_band_st_b10": [],
#                                 "tirs_ssm_model": []
#                                })


for glims_id in tqdm(remaining_glims_ids):
    glacier_dir = os.path.join(glaciers_dir, glims_id)
    if "meta_data" not in os.listdir(glacier_dir):
        continue
    meta_data_dir = os.path.join(glacier_dir, "meta_data")
    meta_data_file_names = os.listdir(meta_data_dir)
    for meta_data_file_name in meta_data_file_names:
        with open(os.path.join(meta_data_dir,meta_data_file_name), mode = "r") as file:
            meta = file.read()
            for unformatted_meta in meta.split("\n")[1:-1]: #no data in first and last value
                truncated_meta = unformatted_meta.split(",\"")[1][:-1]
                meta_data = truncated_meta.replace("\'", "\"")
                meta_dict = json.loads(meta_data)
                meta_id_split = meta_dict['id'].split("_")

                landsat = "L" + str(int(meta_id_split[1][-2:]))
                date = meta_id_split[-1]
                year = date[:4]
                month = date[4:6]
                day = date[6:]
                file_name = f"{glims_id}_{year}-{month}-{day}_{landsat}_C02_T1_L2_SR.tif"
                ee_identifier = meta_dict['id']
                algorithm_source_surface_reflectance = meta_dict['properties']["ALGORITHM_SOURCE_SURFACE_REFLECTANCE"]
                try:
                    algorithm_source_surface_temperature = meta_dict['properties']["ALGORITHM_SOURCE_SURFACE_TEMPERATURE"]
                except KeyError:
                    algorithm_source_surface_temperature = ""
                cloud_cover = meta_dict['properties']["CLOUD_COVER"]
                cloud_cover_land = meta_dict['properties']["CLOUD_COVER_LAND"]
                collection_category = meta_dict['properties']["COLLECTION_CATEGORY"]
                data_source_air_temperature = meta_dict['properties']["DATA_SOURCE_AIR_TEMPERATURE"]
                data_source_elevation = meta_dict['properties']["DATA_SOURCE_ELEVATION"]
                data_source_ozone = meta_dict['properties']["DATA_SOURCE_OZONE"]
                data_source_pressure = meta_dict['properties']["DATA_SOURCE_PRESSURE"]
                try:
                    data_source_reanalysis = meta_dict['properties']["DATA_SOURCE_REANALYSIS"]
                except KeyError:
                    data_source_reanalysis = ""
                data_source_water_vapor = meta_dict['properties']["DATA_SOURCE_WATER_VAPOR"]
                date_product_generated = meta_dict['properties']["DATE_PRODUCT_GENERATED"]
                earth_sun_distance = meta_dict['properties']["EARTH_SUN_DISTANCE"]
                geometric_rmse_model = meta_dict['properties']["GEOMETRIC_RMSE_MODEL"]
                geometric_rmse_model_x = meta_dict['properties']["GEOMETRIC_RMSE_MODEL_X"]
                geometric_rmse_model_y = meta_dict['properties']["GEOMETRIC_RMSE_MODEL_Y"]
                ground_control_points_model = meta_dict['properties']["GROUND_CONTROL_POINTS_MODEL"]
                ground_control_points_version = meta_dict['properties']["GROUND_CONTROL_POINTS_VERSION"]
                l1_date_product_generated = meta_dict['properties']["L1_DATE_PRODUCT_GENERATED"]
                l1_landsat_product_id = meta_dict['properties']["L1_LANDSAT_PRODUCT_ID"]
                l1_processing_level = meta_dict['properties']["L1_PROCESSING_LEVEL"]
                l1_processing_software_version = meta_dict['properties']["L1_PROCESSING_SOFTWARE_VERSION"]
                landsat_product_id = meta_dict['properties']["LANDSAT_PRODUCT_ID"]
                landsat_scene_id = meta_dict['properties']["LANDSAT_SCENE_ID"]
                processing_level = meta_dict['properties']["PROCESSING_LEVEL"]
                processing_software_version = meta_dict['properties']["PROCESSING_SOFTWARE_VERSION"]
                scene_center_time = meta_dict['properties']["SCENE_CENTER_TIME"]
                sensor_id = meta_dict['properties']["SENSOR_ID"]
                spacecraft_id = meta_dict['properties']["SPACECRAFT_ID"]
                sun_azimuth = meta_dict['properties']["SUN_AZIMUTH"]
                sun_elevation = meta_dict['properties']["SUN_ELEVATION"]
                wrs_path = meta_dict['properties']["WRS_PATH"]
                wrs_row = meta_dict['properties']["WRS_ROW"]


                try: 
                    image_quality = meta_dict['properties']['IMAGE_QUALITY']
                except KeyError:
                    image_quality = ""
                try: 
                    image_quality_tirs = meta_dict['properties']['IMAGE_QUALITY_TIRS']
                except:
                    image_quality_tirs = ""
                    
                try: 
                    image_quality_oli = meta_dict['properties']['IMAGE_QUALITY_OLI']
                except:
                    image_quality_oli = ""
                try: 
                    temperature_maximum_band_st_b6 = meta_dict['properties']['TEMPERATURE_MAXIMUM_BAND_ST_B6']
                except KeyError:
                    temperature_maximum_band_st_b6 = ""
                try:                     
                    temperature_minimum_band_st_b6 = meta_dict['properties']['TEMPERATURE_MINIMUM_BAND_ST_B6']
                except KeyError:       
                    temperature_minimum_band_st_b6 = ""
                try:     
                    ephemeris_type = meta_dict['properties']['EPHEMERIS_TYPE']
                except KeyError:   
                    ephemeris_type = ""
                try:    
                    data_source_tirs_stray_light_correction = meta_dict['properties']['DATA_SOURCE_TIRS_STRAY_LIGHT_CORRECTION']
                except KeyError:
                    data_source_tirs_stray_light_correction = ""
                try:    
                    geometric_rmse_verify = meta_dict['properties']['GEOMETRIC_RMSE_VERIFY']
                except KeyError:  
                    geometric_rmse_verify = ""
                try:    
                    ground_control_points_verify = meta_dict['properties']['GROUND_CONTROL_POINTS_VERIFY']
                except KeyError:  
                    ground_control_points_verify = ""
                try:   
                    roll_angle = meta_dict['properties']['ROLL_ANGLE']
                except KeyError:     
                    roll_angle = ""
                try:   
                    target_wrs_path = meta_dict['properties']['TARGET_WRS_PATH']
                except KeyError:  
                    target_wrs_path = ""
                try:    
                    target_wrs_row = meta_dict['properties']['TARGET_WRS_ROW']
                except KeyError:   
                    target_wrs_row = ""
                try:   
                    temperature_maximum_band_st_b10 = meta_dict['properties']['TEMPERATURE_MAXIMUM_BAND_ST_B10']
                except KeyError:
                    temperature_maximum_band_st_b10 = ""
                try:  
                    temperature_minimum_band_st_b10 = meta_dict['properties']['TEMPERATURE_MINIMUM_BAND_ST_B10']
                except KeyError:
                    temperature_minimum_band_st_b10 = ""
                try:   
                    tirs_ssm_model = meta_dict['properties']['TIRS_SSM_MODEL']
                except KeyError:
                    tirs_ssm_model = ""


                ee_meta_data = {
                                "file_name": file_name,
                                "src_date": f"{year}-{month}-{day}",
                                "ee_identifier": ee_identifier, 
                                "glims_id": glims_id, 
                                "algorithm_source_surface_reflectance": algorithm_source_surface_reflectance,
                                "algorithm_source_surface_temperature": algorithm_source_surface_temperature,
                                "cloud_cover": cloud_cover,
                                "cloud_cover_land": cloud_cover_land,
                                "collection_category": collection_category,
                                "data_source_air_temperature": data_source_air_temperature,
                                "data_source_elevation": data_source_elevation,
                                "data_source_ozone": data_source_ozone,
                                "data_source_pressure": data_source_pressure,
                                "data_source_reanalysis": data_source_reanalysis,
                                "data_source_water_vapor": data_source_water_vapor,
                                "date_product_generated": date_product_generated,
                                "earth_sun_distance": earth_sun_distance,
                                "geometric_rmse_model": geometric_rmse_model,
                                "geometric_rmse_model_x": geometric_rmse_model_x,
                                "geometric_rmse_model_y": geometric_rmse_model_y,
                                "ground_control_points_model": ground_control_points_model,
                                "ground_control_points_version": ground_control_points_version,
                                "l1_date_product_generated": l1_date_product_generated,
                                "l1_landsat_product_id": l1_landsat_product_id,
                                "l1_processing_level": l1_processing_level,
                                "l1_processing_software_version": l1_processing_software_version,
                                "landsat_product_id": landsat_product_id,
                                "landsat_scene_id": landsat_scene_id,
                                "processing_level": processing_level,
                                "processing_software_version": processing_software_version,
                                "scene_center_time": scene_center_time,
                                "sensor_id": sensor_id,
                                "spacecraft_id": spacecraft_id,
                                "sun_azimuth": sun_azimuth,
                                "sun_elevation": sun_elevation,
                                "wrs_path": wrs_path,
                                "wrs_row": wrs_row,
                                "image_quality": image_quality,
                                "image_quality_oli": image_quality_oli ,
                                "image_quality_tirs": image_quality_tirs ,
                                "temperature_maximum_band_st_b6": temperature_maximum_band_st_b6 ,
                                "temperature_minimum_band_st_b6": temperature_minimum_band_st_b6 ,
                                "ephemeris_type": ephemeris_type ,
                                "data_source_tirs_stray_light_correction": data_source_tirs_stray_light_correction ,
                                "geometric_rmse_verify": geometric_rmse_verify ,
                                "ground_control_points_verify": ground_control_points_verify ,
                                "roll_angle": roll_angle ,
                                "target_wrs_path": target_wrs_path ,
                                "target_wrs_row": target_wrs_row ,
                                "temperature_maximum_band_st_b10": temperature_maximum_band_st_b10 ,
                                "temperature_minimum_band_st_b10": temperature_minimum_band_st_b10 ,
                                "tirs_ssm_model": tirs_ssm_model 
                              }
                ee_meta_data_df = ee_meta_data_df.append(ee_meta_data, ignore_index = True)                


100%|██████████| 9324/9324 [2:54:21<00:00,  1.12s/it]  


In [18]:
ee_meta_data_df.to_csv(os.path.join(output_dir,".csv"), index = False)