In [33]:
import pandas as pd
import os
import rasterio
import sys
import json

In [34]:
# All locations are derived from the current user's home directory so the notebook
# is not tied to one machine's absolute path.
glacier_view_dir = os.path.join(os.path.expanduser("~"), "Desktop", "projects", "GlacierView")
# Shared parent of both the raw Earth Engine downloads and the processed outputs.
ee_data_dir = os.path.join(glacier_view_dir, "src", "earth_engine", "data")
# Where the CSVs produced by this notebook are written.
output_dir = os.path.join(ee_data_dir, "processed_metadata", "full_time_series")
# Where the time_series_log_batch_*.log files are read from.
logs_path = os.path.join(ee_data_dir, "ee_landing_zone", "full_time_series", "logs")
# One sub-folder per glacier is expected under this directory.
glaciers_dir = os.path.join(ee_data_dir, "ee_landing_zone", "full_time_series", "landsat")

### Step 1: collect and clean logs

In [19]:
# The three batch logs are comma-separated and have no header row, so pandas
# labels their columns with integers; names are assigned in a later cell.
_batch_log_names = (
    'time_series_log_batch_1.log',
    'time_series_log_batch_2.log',
    'time_series_log_batch_3.log',
)
batch_1_df, batch_2_df, batch_3_df = [
    pd.read_csv(os.path.join(logs_path, log_name), sep=",", header=None)
    for log_name in _batch_log_names
]

In [20]:
# Position of the piece to keep after each raw log field is split on its separator
# (glims_id and utm_zone are split on ": ", crs on ":").
GLIMS_ID_STR_INDEX, CRS_STR_INDEX, UTM_ZONE_STR_INDEX = 1, 2, 1

In [21]:
# Stack the three batches row-wise. ignore_index=True gives the combined frame a
# fresh 0..n-1 row index instead of repeating each batch's own row labels, which
# would otherwise make label-based lookups on log_df ambiguous.
log_df = pd.concat((batch_1_df, batch_2_df, batch_3_df), axis=0, ignore_index=True)

In [23]:
# Give the five positional log columns their names.
log_df.columns = ['ee_pull_date', 'glims_id','crs','utm_zone','landsat_satellite']

# For each of these columns, split the raw text on its separator and keep only
# the piece at the configured position.
for column_name, separator, piece_index in (
    ("glims_id", ": ", GLIMS_ID_STR_INDEX),
    ("crs", ":", CRS_STR_INDEX),
    ("utm_zone", ": ", UTM_ZONE_STR_INDEX),
):
    split_pieces = log_df[column_name].str.split(separator, expand=True)
    log_df[column_name] = split_pieces[piece_index]

In [25]:
# Write the cleaned log table to the output directory, without the row index.
processed_logs_path = os.path.join(output_dir, "processed_logs.csv")
log_df.to_csv(processed_logs_path, index=False)

### Step 2: collect image file sizes (filename, file size in bytes)

In [35]:
# Glacier ids found in the cleaned logs decide which folders are scanned.
# Missing ids (rows where the split produced no value) are dropped so they
# cannot reach os.path.join, which only accepts strings.
glims_ids = log_df.glims_id.dropna().unique()
# NOTE(review): this rebinds glaciers_dir to the training/UTM tree rather than the
# full_time_series landing zone used by the other cells — confirm this is intended.
glaciers_dir = os.path.join(glacier_view_dir,"data","ee_data","training","UTM")
# Accumulates (file base name, size in bytes) pairs.
glaciers_sizes = []
for glims_id in glims_ids:
    glacier_dir = os.path.join(glaciers_dir,glims_id)
    try:
        folder = os.listdir(glacier_dir)
    except (FileNotFoundError, NotADirectoryError):
        # No folder for this glacier in this tree; nothing to measure.
        continue
    for glacier in folder:
        # Skip the metadata folder and the macOS Finder artifact, which another
        # cell of this notebook also has to skip.
        if glacier in ("meta_data", ".DS_Store"):
            continue
        glacier_path = os.path.join(glacier_dir,glacier)
        # Only regular files have a meaningful image size; ignore sub-directories.
        if not os.path.isfile(glacier_path):
            continue
        # Everything before the first "." in the file name is used as the key.
        base_name = glacier.split(".")[0]
        size_in_bytes = os.path.getsize(glacier_path)
        glaciers_sizes.append((base_name,size_in_bytes))

In [36]:
# Turn the collected (base name, size) pairs into a two-column table.
size_df = pd.DataFrame(
    data=glaciers_sizes,
    columns=["glacier_pk", "file size in bytes"],
)

In [29]:
# Write the file-size table to the output directory, without the row index.
file_sizes_path = os.path.join(output_dir, "file_sizes.csv")
size_df.to_csv(file_sizes_path, index=False)

### Step 3: collect Earth Engine metadata from JSON

In [14]:
glacier_view_dir = os.path.join(os.path.expanduser("~"),"Desktop","projects","GlacierView")
glaciers_dir = os.path.join(glacier_view_dir,"src","earth_engine","data","ee_landing_zone/full_time_series/landsat")

# Column order of the resulting table.
# NOTE(review): "iamge_quality_oli" is misspelled, but it is the column name that
# ends up in the written CSV, so it is kept unchanged here; rename it only
# together with whatever reads that file.
EE_META_DATA_COLUMNS = [
    "glacier_pk",
    "cloud",
    "cloud_cover",
    "image_quality",
    "image_quality_tirs",
    "iamge_quality_oli",
    "cloud_cover_land",
    "system_index",
    "utm_zone",
]


def _parse_ee_meta_line(glacier: str, unformatted_meta: str) -> dict:
    """Convert one data line of a metadata file into a row for ``ee_meta_data_df``.

    The line is split on ``,"``; the second piece, minus its last character, is
    treated as a dict written with single quotes. The single quotes are swapped
    for double quotes and the result is parsed as JSON.
    NOTE(review): this quote swap fails if a value itself contains a quote
    character, or if the text holds literals JSON does not accept (for example
    ``True`` or ``None``) — confirm against the real files.

    Parameters
    ----------
    glacier : str
        Name of the glacier folder the file was found in; used as the prefix of
        ``glacier_pk``.
    unformatted_meta : str
        One raw line of the metadata file (not the header line).

    Returns
    -------
    dict
        One value per entry of ``EE_META_DATA_COLUMNS``. The three image-quality
        values default to ``""`` when the property is absent.

    Raises
    ------
    IndexError
        If the line does not contain ``,"``.
    json.JSONDecodeError
        If the extracted text is not valid JSON after the quote swap.
    KeyError
        If ``id``, ``properties`` or a required property is missing.
    """
    truncated_meta = unformatted_meta.split(",\"")[1][:-1]
    meta_dict = json.loads(truncated_meta.replace("\'", "\""))
    properties = meta_dict['properties']
    meta_id_split = meta_dict['id'].split("_")

    # The last two characters of the second "_"-separated piece of the id are read
    # as an integer, so a zero-padded value such as "08" becomes "L8".
    landsat = "L" + str(int(meta_id_split[1][-2:]))
    # The final "_"-separated piece is sliced as year (4 chars), month (2), day (rest).
    date = meta_id_split[-1]
    year, month, day = date[:4], date[4:6], date[6:]

    return {
        "glacier_pk": f"{glacier}_{year}-{month}-{day}_{landsat}_T1_TOA",
        "cloud": properties['cloud'],
        "cloud_cover": properties['CLOUD_COVER'],
        # These three are not present on every record; fall back to an empty string.
        "image_quality": properties.get('IMAGE_QUALITY', ""),
        "image_quality_tirs": properties.get('IMAGE_QUALITY_TIRS', ""),
        "iamge_quality_oli": properties.get('IMAGE_QUALITY_OLI', ""),
        "cloud_cover_land": properties['CLOUD_COVER_LAND'],
        "system_index": properties['system:index'],
        "utm_zone": properties['UTM_ZONE'],
    }


glaciers = os.listdir(glaciers_dir)
# Rows are gathered in a plain list and turned into a DataFrame once at the end:
# DataFrame.append copied the whole frame on every call and no longer exists in
# pandas 2.x.
ee_meta_data_rows = []
for glacier in glaciers:
    if glacier == ".DS_Store": continue
    meta_data_dir = os.path.join(glaciers_dir, glacier, "meta_data")
    # Skip glaciers without a meta_data folder (also covers stray non-directory entries).
    if not os.path.isdir(meta_data_dir): continue
    for meta_data_file_name in os.listdir(meta_data_dir):
        if meta_data_file_name == ".DS_Store": continue
        with open(os.path.join(meta_data_dir, meta_data_file_name), mode = "r") as file:
            meta = file.read()
        # Drop the first line and the last piece after the final newline. Slicing
        # (unlike two `del` statements) is safe on an empty or one-line file.
        for unformatted_meta in meta.split("\n")[1:-1]:
            ee_meta_data_rows.append(_parse_ee_meta_line(glacier, unformatted_meta))

# Passing the column list keeps the header and column order even when no rows were found.
ee_meta_data_df = pd.DataFrame(ee_meta_data_rows, columns = EE_META_DATA_COLUMNS)


In [17]:
# Write the collected Earth Engine metadata to the output directory, without the row index.
ee_metadata_path = os.path.join(output_dir, "ee_metadata.csv")
ee_meta_data_df.to_csv(ee_metadata_path, index=False)