In [139]:
# Sentinel Hub Config

from env_vars import sentinel_hub_instance_id
from sentinelhub import SHConfig

# Import Area of Interest List

import pandas as pd
import json
from scripts.mgrs import encode,LLtoUTM


# Sentinel Hub Tile Look Up / Download

from sentinelhub import WebFeatureService, BBox, CRS, DataSource, AwsTileRequest


# Cloud Masking

import rasterio as rio
import numpy as np
import earthpy.mask as em

# Generate Product Detail DataFrame

import os
from glob import glob
import xml.etree.ElementTree as ET


# Sort / Organize Tiles by Individual Folders

from shutil import copyfile

# Reproject Masked Files 

import gdal
from glob import glob

# Create Master Raster


# Extract Polygon crops from products

import pandas as pd
from shapely.geometry import Polygon
import geopandas as gpd
from geopandas import GeoDataFrame
import earthpy.spatial as es
import traceback

# TIF to JPG

from PIL import Image


In [140]:
gdal.UseExceptions()

In [49]:
def add_trailing_slash(path):
    if path[-1] != '/':
        path += '/'
    return path

In [50]:
def create_dir(output_dir):
    # If the output folder doesn't exist, create it
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

In [51]:
def shub_connect(sentinel_hub_instance_id):

    INSTANCE_ID = sentinel_hub_instance_id  

    if INSTANCE_ID:
        config = SHConfig()
        config.instance_id = INSTANCE_ID
    else:
        config = None
        
    return config

In [52]:
config = shub_connect(sentinel_hub_instance_id)

In [210]:
def import_aois(csv_loc):    

    df_labels = pd.read_csv(csv_loc)
    df_labels = df_labels[["center-lat","center-long","polygon","Labels combined"]]

    polygons = []
    for polygon in df_labels["polygon"]:
        polygons.append(json.loads(polygon)["coordinates"])


    tiles = []
    tiles_dic = {}
    polygon_id = 0 
    coordinates = []
    for items in polygons:
        polygon_id += 1 
        for item in items:
            for lon_lat in item:
                coordinates.append(lon_lat)
                tile = encode(LLtoUTM(lon_lat[1],lon_lat[0]),1)[:-2]
                

                if polygon_id in tiles_dic:

                    tiles_dic[polygon_id].append(tile)

                else:

                    tiles_dic[polygon_id] = [tile]

                tiles.append(tile)

            tiles_dic[polygon_id] = list(set(tiles_dic[polygon_id]))

    tiles = list(set(tiles))

    df_labels["tiles"] = tiles_dic.values()

    #bounding box

    min_lon = min([i[0] for i in coordinates])
    min_lat = min([i[1] for i in coordinates])
    max_lon = max([i[0] for i in coordinates])
    max_lat = max([i[1] for i in coordinates])

    bounding_box = min_lon,min_lat,max_lon,max_lat
    
    return df_labels,bounding_box,tiles

In [211]:
df,bounding_box,tile_list = import_aois("/Volumes/Lacie/zhenyadata/Project_Canopy_Data/PC_Data/Sentinel_Data/Labelled/Tiles_v3/Polygon_List/polygons_100720.csv")

  return array(a, dtype, copy=False, order=order)


In [212]:
bounding_box

(8.42823, -3.373256, 25.688438, 5.845887)

In [216]:
len(tile_list)

63

In [53]:
def shub_lookup_tiles(bounding_box,tile_list,search_time_interval = ('2019-01-01T00:00:00', '2020-12-31T23:59:59'),
                   product_type = DataSource.SENTINEL2_L2A):
    
    #Misha's Tiles of Interest
    search_bbox = BBox(bbox=bounding_box, crs=CRS.WGS84)

    search_time_interval = ('2019-01-01T00:00:00', '2020-12-31T23:59:59')
    wfs_iterator = WebFeatureService(
        search_bbox,
        search_time_interval,
        data_source=product_type,
        maxcc=.05,
        config=config
    )
    results = wfs_iterator.get_tiles()
    df = pd.DataFrame(results, columns=['Tilename','Date','AmazonID'])
    df_tiles_of_interest = df[df["Tilename"].isin(tile_list)]
    df2 = df_tiles_of_interest.groupby('Tilename').head(10)
    output2 = list(df2.itertuples(index=False,name=None))
    return output2

In [54]:
results_list = shub_lookup_tiles(bounding_box,tile_list,search_time_interval = ('2019-01-01T00:00:00', '2020-12-31T23:59:59'),
                   product_type = DataSource.SENTINEL2_L2A)

In [55]:
for tup_l in results_list:
    if tup_l[0] == "35NLB":
        print(tup_l)

('35NLB', '2020-6-2', 0)
('35NLB', '2020-5-28', 0)
('35NLB', '2020-2-8', 0)
('35NLB', '2020-2-3', 0)
('35NLB', '2020-1-14', 0)
('35NLB', '2019-12-30', 0)
('35NLB', '2019-12-5', 0)
('35NLB', '2019-2-18', 0)
('35NLB', '2019-2-13', 0)
('35NLB', '2019-1-9', 0)


In [73]:
output_dir = "/Volumes/Lacie/zhenyadata/Project_Canopy_Data/PC_Data/Sentinel_Data/Labelled/Tiles_v3/SAFE_true_raw"

# downloaded_tiles = glob(f'{output_dir}/*')

downloaded = []
for direc in glob(f'{output_dir}/*'):
    downloaded_tile = direc.split("/")[-1].split("_")[1][1:]
    downloaded.append(downloaded_tile)

not_downloaded = []    
for tup_l in results_list:
    if tup_l[0] not in downloaded:
        not_downloaded.append(tup_l)
        
print("downloaded:",len(downloaded), "\n","not downloaded:",len(not_downloaded) )

downloaded: 462 
 not downloaded: 26


In [75]:
not_downloaded

[('34MCE', '2020-7-8', 0),
 ('34MCE', '2020-6-18', 0),
 ('34MCE', '2020-6-13', 0),
 ('34MCE', '2020-6-3', 0),
 ('35NLB', '2020-6-2', 0),
 ('35NLB', '2020-5-28', 0),
 ('34MCE', '2020-5-4', 0),
 ('34MCE', '2020-4-29', 0),
 ('34MCE', '2020-4-24', 0),
 ('34MCE', '2020-4-24', 1),
 ('32NQH', '2020-4-18', 0),
 ('34MCE', '2020-4-14', 0),
 ('34MCE', '2020-2-29', 0),
 ('35NLB', '2020-2-8', 0),
 ('35NLB', '2020-2-3', 0),
 ('35NLB', '2020-1-14', 0),
 ('32NQH', '2020-1-12', 0),
 ('32NQH', '2020-1-4', 0),
 ('35NLB', '2019-12-30', 0),
 ('35NLB', '2019-12-5', 0),
 ('32NQH', '2019-4-12', 0),
 ('32NQH', '2019-4-9', 0),
 ('35NLB', '2019-2-18', 0),
 ('35NLB', '2019-2-13', 0),
 ('32NQH', '2019-1-19', 0),
 ('35NLB', '2019-1-9', 0)]

In [76]:
def shub_download_tiles(results_list,output_dir,bands=["R10m/TCI"],product_type = DataSource.SENTINEL2_L2A):
    
    #Additional Params
    bands = bands
    
    output_dir = add_trailing_slash(output_dir)
    create_dir(output_dir)
    
    
    for tile in results_list:
        tile_name, time, aws_index = tile

        #Download SAFE Files
        request = AwsTileRequest(
            tile=tile_name,
            time=time,
            bands = bands, 
            aws_index=aws_index,
            data_folder=output_dir,
            data_source=product_type,
            safe_format = True
        )

        request.save_data(redownload=True)
    

In [77]:
output_dir = "/Volumes/Lacie/zhenyadata/Project_Canopy_Data/PC_Data/Sentinel_Data/Labelled/Tiles_v3/SAFE_true_raw_v2"

shub_download_tiles(not_downloaded,output_dir,bands=["R10m/TCI"],product_type = DataSource.SENTINEL2_L2A)



In [188]:
def cloud_mask_tci(prod_dir):
    
    '''
    
    prod refers product directory 
    
    '''
    
    prod_dir = add_trailing_slash(prod_dir)
    
    msk_file_path = glob(prod_dir + "*/MSK_CLDPRB_20m.jp2")[0]
    tci_file_path = glob(prod_dir + "IMG_DATA/R10m/*.jp2")[0]
    tci_filename = tci_file_path.split("/")[-1]
    output_tci_file_path = prod_dir + "/IMG_DATA/R10m/" + "processed_2_" + tci_filename 

    nodatavalue = int(0)

    with rio.open(tci_file_path) as sen_TCI_src:
        sen_TCI = sen_TCI_src.read(masked=True)
        sen_TCI_meta = sen_TCI_src.meta

    with rio.open(msk_file_path) as sen_mask_src:
        sen_mask_pre = sen_mask_src.read(1)
        sen_mask = np.repeat(np.repeat(sen_mask_pre,2,axis=0),2,axis=1)

    # All pixels above 0 probability will be classified as True

    sen_mask_qa = sen_mask > 1


    # Apply mask to source TCI file
    if np.count_nonzero(sen_mask_qa) > 0:
        sen_TCI_cl_free_nan = em.mask_pixels(sen_TCI, sen_mask_qa)
        sen_TCI_cl_free_processed = np.ma.filled(sen_TCI_cl_free_nan, fill_value=nodatavalue)
        
    else:
        sen_TCI_cl_free_processed = sen_TCI
    
    # If file shape only has one band (dimension), generate correct shape for export
    
    if len(sen_TCI_cl_free_processed.shape) < 3:
        sen_TCI_cl_free_processed = np.array((sen_TCI_cl_free_processed,sen_TCI_cl_free_processed,sen_TCI_cl_free_processed))
        sen_TCI_cl_free_processed = sen_TCI_cl_free_processed.astype(np.uint8)

    # Export cloud-masked TCI file
    with rio.open(output_tci_file_path, 'w',**sen_TCI_meta) as outf:
        print(f"creating {output_tci_file_path}")
        outf.write(sen_TCI_cl_free_processed)

In [186]:
def apply_mask_tci_safe_list(products_dir):
    '''
    
    products_dir refers to parent directory containing multiple products
    
    
    '''
    
    products_dir = add_trailing_slash(products_dir)
    
    dir_list = glob(products_dir + "/*/" )
    
    
    for directory in dir_list:
        print(f'Processing {dir_list.index(directory) + 1} of {len(dir_list)}')
        cloud_mask_tci(directory)
        
    print(f"Applied masks to {len(dir_list)} products")

In [187]:
tci_folder_list = "/Volumes/Lacie/zhenyadata/Project_Canopy_Data/PC_Data/Sentinel_Data/Labelled/Tiles_v3/SAFE_true_raw"
apply_mask_tci_safe_list(tci_folder_list)

Processing 1 of 488
creating /Volumes/Lacie/zhenyadata/Project_Canopy_Data/PC_Data/Sentinel_Data/Labelled/Tiles_v3/SAFE_true_raw/L2A_T33NXB_A027603_20201004T091931//IMG_DATA/R10m/processed_2_T33NXB_20201004T085831_TCI_10m.jp2
Processing 2 of 488
creating /Volumes/Lacie/zhenyadata/Project_Canopy_Data/PC_Data/Sentinel_Data/Labelled/Tiles_v3/SAFE_true_raw/L2A_T33NYB_A027603_20201004T091931//IMG_DATA/R10m/processed_2_T33NYB_20201004T085831_TCI_10m.jp2
Processing 3 of 488
creating /Volumes/Lacie/zhenyadata/Project_Canopy_Data/PC_Data/Sentinel_Data/Labelled/Tiles_v3/SAFE_true_raw/L2A_T34NCH_A027603_20201004T091931//IMG_DATA/R10m/processed_2_T34NCH_20201004T085831_TCI_10m.jp2
Processing 4 of 488
creating /Volumes/Lacie/zhenyadata/Project_Canopy_Data/PC_Data/Sentinel_Data/Labelled/Tiles_v3/SAFE_true_raw/L2A_T33NZD_A027603_20201004T091931//IMG_DATA/R10m/processed_2_T33NZD_20201004T085831_TCI_10m.jp2
Processing 5 of 488
creating /Volumes/Lacie/zhenyadata/Project_Canopy_Data/PC_Data/Sentinel_Data

In [83]:
# Generate array of processed directories if error during masking code
processed = []
with open("./data/processed.txt","r") as f:
    for lin in f:
        processed.append(lin.split("\n")[0])

In [86]:
def generate_product_detail_df(input_dir):
    
    '''
    Generate product details dataframe used as input for ordering products by Cloudy Pixel Percentage, No Data Pixel Percentage, or Unclassified Percentage
    
    '''
    input_dir = add_trailing_slash(input_dir)
    
    dirs = os.listdir(input_dir)

    meta_data = []
    for folder in dirs:
        xml_loc = glob(input_dir + "/" + folder + "/*.xml")[0]
        tree = ET.parse(xml_loc)
        directory = [elem.text for elem in tree.iter() if "MASK_FILENAME" in elem.tag][0].split("/")[1]
        tile_id = directory.split("_")[1]
        filepath_partial = input_dir + "/" + directory + "/IMG_DATA" + "/R10m"
        filepath = glob(filepath_partial + "/processed*.jp2")[0]
        filename = filepath.split("/")[-1]
        cloud_cover,no_data,unclassified = [elem.text for elem in tree.iter() if "CLOUDY_PIXEL_PERCENTAGE" in elem.tag 
                 or "NODATA_PIXEL_PERCENTAGE" in elem.tag or "UNCLASSIFIED_PERCENTAGE" in elem.tag]
        meta_data.append([directory,tile_id,cloud_cover,no_data,unclassified,filename,filepath])
    df = pd.DataFrame(meta_data,columns=["Directory","Tile_Id","Cloud Cover","No Data Percentage","Unclassified Percentage","Filename","Filepath"])
    df2 = df.sort_values(by=["Tile_Id","Cloud Cover","Unclassified Percentage"],ignore_index=True)
    return df2

In [190]:
input_dir = "/Volumes/Lacie/zhenyadata/Project_Canopy_Data/PC_Data/Sentinel_Data/Labelled/Tiles_v3/SAFE_true_raw"

df = generate_product_detail_df(input_dir)


In [191]:
df

Unnamed: 0,Directory,Tile_Id,Cloud Cover,No Data Percentage,Unclassified Percentage,Filename,Filepath
0,L2A_T32NMK_A024343_20200219T095439,T32NMK,10.375307,0.000000,7.602649,processed_T32NMK_20200219T094031_TCI_10m.jp2,/Volumes/Lacie/zhenyadata/Project_Canopy_Data/...
1,L2A_T32NMK_A018480_20190105T095815,T32NMK,12.898219,0.000000,4.594089,processed_T32NMK_20190105T094401_TCI_10m.jp2,/Volumes/Lacie/zhenyadata/Project_Canopy_Data/...
2,L2A_T32NMK_A014505_20191216T095042,T32NMK,14.797034,0.000000,1.208974,processed_T32NMK_20191216T094319_TCI_10m.jp2,/Volumes/Lacie/zhenyadata/Project_Canopy_Data/...
3,L2A_T32NMK_A015077_20200125T095047,T32NMK,3.746630,0.000000,1.804589,processed_T32NMK_20200125T094159_TCI_10m.jp2,/Volumes/Lacie/zhenyadata/Project_Canopy_Data/...
4,L2A_T32NMK_A014791_20200105T095703,T32NMK,9.728458,0.000000,2.432975,processed_T32NMK_20200105T094309_TCI_10m.jp2,/Volumes/Lacie/zhenyadata/Project_Canopy_Data/...
...,...,...,...,...,...,...,...
483,L2A_T35NLB_A019037_20190213T084055,T35NLB,3.899672,0.000000,0.273543,processed_T35NLB_20190213T083041_TCI_10m.jp2,/Volumes/Lacie/zhenyadata/Project_Canopy_Data/...
484,L2A_T35NLB_A009628_20190109T083800,T35NLB,30.178366,36.006102,29.398388,processed_T35NLB_20190109T083329_TCI_10m.jp2,/Volumes/Lacie/zhenyadata/Project_Canopy_Data/...
485,L2A_T35NLB_A023613_20191230T084321,T35NLB,4.121645,0.000000,0.004230,processed_T35NLB_20191230T083341_TCI_10m.jp2,/Volumes/Lacie/zhenyadata/Project_Canopy_Data/...
486,L2A_T35NLB_A014347_20191205T083849,T35NLB,5.536621,0.000000,5.679570,processed_T35NLB_20191205T083229_TCI_10m.jp2,/Volumes/Lacie/zhenyadata/Project_Canopy_Data/...


In [88]:
def order_masked_tiles(df,output_dir):
    
    '''
    
    df input is the products detail pre-sorted dataframe to be used for sorting products 
    
    '''
    
    output_dir = add_trailing_slash(output_dir)
    create_dir(output_dir)
    

    layer = 1
    for index,row in df.iterrows(): 
        destination_dir = output_dir + str(layer)
        output_file = destination_dir + "/" + row["Filename"]

        # Check if directory exists
        if not os.path.isdir(destination_dir):
            os.mkdir(destination_dir)

        # Copy file to existing or new directory
        copyfile(row["Filepath"],output_file)

        # Check if Tile_Id already exists in the directory - only necessary up until the last tile
        if len(df) > index + 1:
            if df.loc[index,"Tile_Id"] == df.loc[index + 1,"Tile_Id"]:
                layer += 1
            else:
                layer = 1 

In [89]:
order_masked_tiles(df,"/Volumes/Lacie/zhenyadata/Project_Canopy_Data/PC_Data/Sentinel_Data/Labelled/Tiles_v3/SAFE_true_ordered_v2/")

In [91]:
def convert_rasters(src_dir, dest_dir, epsg_format='EPSG:4326', windows=False):
    """Converts the rasters in the src_dir into a different EPSG format,
    keeping the same folder structure and saving them in the dest_dir."""

    src_dir = add_trailing_slash(src_dir)
    dest_dir = add_trailing_slash(dest_dir)
    
    # If the output folder doesn't exist, create it
    create_dir(dest_dir)

    input_files = glob(src_dir + '*/*.jp2')
    # Keep track of how many files were converted
    n = 1
    total = len(input_files)
    
    for f in input_files:
        print(f'processing file {n} of {total}')
        n += 1
        
        # The way we've set it up, we save each product into a numbered folder,
        # depending on which layer it's in. To keep this structure, we need to
        # pull out the folder number from the file path.
        # How exactly to do this depends on if you're using Windows or not,
        # since the path conventions are different.
        if windows:
            folder_num = f.split('\\')[-2]
            filename = f.split('\\')[-1]
        else:
            folder_num = f.split('/')[-2]
            filename = f.split('/')[-1]
        output_folder = dest_dir + folder_num + '/'
        
        
        # If the respective grouping folders are not available 
        create_dir(output_folder)
        
        output_filepath = output_folder + filename
        
        print(output_filepath)
        print(f)

        # Finally, we convert
        converted = gdal.Warp(output_filepath, [f],format='GTiff',
                              dstSRS=epsg_format, resampleAlg='near')
        converted = None
        
    print('Finished')
    

In [92]:
src_dir = "/Volumes/Lacie/zhenyadata/Project_Canopy_Data/PC_Data/Sentinel_Data/Labelled/Tiles_v3/SAFE_true_ordered_v2"
dest_dir = "/Volumes/Lacie/zhenyadata/Project_Canopy_Data/PC_Data/Sentinel_Data/Labelled/Tiles_v3/SAFE_true_ordered_warped_v2"

convert_rasters(src_dir, dest_dir)

processing file 1 of 26
/Volumes/Lacie/zhenyadata/Project_Canopy_Data/PC_Data/Sentinel_Data/Labelled/Tiles_v3/SAFE_true_ordered_warped_v2/1/processed_T32NQH_20200112T093249_TCI_10m.jp2
/Volumes/Lacie/zhenyadata/Project_Canopy_Data/PC_Data/Sentinel_Data/Labelled/Tiles_v3/SAFE_true_ordered_v2/1/processed_T32NQH_20200112T093249_TCI_10m.jp2
processing file 2 of 26
/Volumes/Lacie/zhenyadata/Project_Canopy_Data/PC_Data/Sentinel_Data/Labelled/Tiles_v3/SAFE_true_ordered_warped_v2/1/processed_T34MCE_20200429T084559_TCI_10m.jp2
/Volumes/Lacie/zhenyadata/Project_Canopy_Data/PC_Data/Sentinel_Data/Labelled/Tiles_v3/SAFE_true_ordered_v2/1/processed_T34MCE_20200429T084559_TCI_10m.jp2
processing file 3 of 26
/Volumes/Lacie/zhenyadata/Project_Canopy_Data/PC_Data/Sentinel_Data/Labelled/Tiles_v3/SAFE_true_ordered_warped_v2/1/processed_T35NLB_20200528T082611_TCI_10m.jp2
/Volumes/Lacie/zhenyadata/Project_Canopy_Data/PC_Data/Sentinel_Data/Labelled/Tiles_v3/SAFE_true_ordered_v2/1/processed_T35NLB_20200528T08

In [106]:
def make_full_virtual_raster(src_dir, dest_dir, num_layers=10):
    """Combines the rasters in the src_dir into a single virtual raster
    with proper prioritization. This is saved into the dest_dir.
    Make sure the num_layers variable is the same as the number of tile layers
    in your src_dir."""
    
    src_dir = add_trailing_slash(src_dir)
    dest_dir = add_trailing_slash(dest_dir)
    
    # If the output folder doesn't exist, create it
    create_dir(dest_dir)
    
    
    
    for layer in range(1, num_layers+1):
        print('Making Layer', layer)
        
        # Get the filenames from the layer in question
        filenames = glob(src_dir + f'{layer}/*.jp2', recursive=True)
        
        output_file = dest_dir + f'Layer{layer}.vrt'
    
        vrt = gdal.BuildVRT(output_file, filenames, resolution='average', resampleAlg='nearest', srcNodata=0)
    
        vrt.FlushCache()
    
    print('Making full raster')

    # To make the full raster, we combine every layer. Do it in reverse order because (I believe)
    # the last items in the list are prioritized.

    input_files = [dest_dir + f'Layer{i}.vrt' for i in reversed(range(1, num_layers+1))]
    
    output_file = dest_dir + 'full_v2.vrt'

    vrt = gdal.BuildVRT(output_file, input_files, resolution='average', resampleAlg='nearest', srcNodata=0)

    vrt.FlushCache()

    print('Finished')

In [107]:
src_dir = "/Volumes/Lacie/zhenyadata/Project_Canopy_Data/PC_Data/Sentinel_Data/Labelled/Tiles_v3/SAFE_true_ordered_warped"
dest_dir = "/Volumes/Lacie/zhenyadata/Project_Canopy_Data/PC_Data/Sentinel_Data/Labelled/Tiles_v3/SAFE_true_master_raster_v2"

make_full_virtual_raster(src_dir, dest_dir)


Making Layer 1
Making Layer 2
Making Layer 3
Making Layer 4
Making Layer 5
Making Layer 6
Making Layer 7
Making Layer 8
Making Layer 9
Making Layer 10
Making full raster
Finished


In [108]:
def vrt_to_tif(output_file,src_file):

    translate = gdal.Translate(output_file, src_file,
                               format='GTiff')
    translate.FlushCache()

In [109]:
src_file = "/Volumes/Lacie/zhenyadata/Project_Canopy_Data/PC_Data/Sentinel_Data/Labelled/Tiles_v3/SAFE_true_master_raster_v2/full_v2.vrt"
output_file = "/Volumes/Lacie/zhenyadata/Project_Canopy_Data/PC_Data/Sentinel_Data/Labelled/Tiles_v3/SAFE_true_master_raster_v2/full_tif_v2.tif"

vrt_to_tif(output_file,src_file)

In [110]:
def csv_to_gdf(csv_loc):
    '''
    import manually created areas of interest csv
    
    output is an in-memory geo dataframe with one polygon AOI per row to be utilized for cropping master raster
    
    '''
    df = pd.read_csv(csv_loc)
    df_labels = df[["polygon","Labels combined"]]

    #create geometry column for polygons
    polygons = []
    for polygon in df_labels["polygon"]:
        polygon_temp = []
        for coordinates in json.loads(polygon)["coordinates"]:
            for coordinate in coordinates:
                polygon_temp.append(tuple(coordinate))
            polygons.append(Polygon(polygon_temp))

    gdf_series = gpd.GeoSeries(polygons)
    gdf = gpd.GeoDataFrame(gdf_series,geometry=0)
    gdf["geometry"] = gdf[0]
    gdf = gdf.drop(columns=[0])
    
    # add Labels column 
    gdf["Labels"] = [s.strip().split(", ") for s in df_labels["Labels combined"]]
    
    return gdf

In [111]:
csv_loc = "/Volumes/Lacie/zhenyadata/Project_Canopy_Data/PC_Data/Sentinel_Data/Labelled/Tiles_v3/Polygon_List/polygons_100820.csv"

gdf = csv_to_gdf(csv_loc)

In [114]:
gdf

Unnamed: 0,geometry,Labels
0,"POLYGON ((9.08878 5.75315, 9.17925 5.70311, 9....","[ISL, Rainforest]"
1,"POLYGON ((8.89189 5.80456, 8.93188 5.80302, 8....","[ISL, Rainforest]"
2,"POLYGON ((13.14840 5.59719, 13.19990 5.62265, ...","[ISL, Rainforest]"
3,"POLYGON ((14.34763 5.34181, 14.35965 5.35079, ...","[Rainforest, Mining, Savannah, Roads]"
4,"POLYGON ((13.02927 5.42701, 13.10274 5.42769, ...","[ISL, Rainforest]"
...,...,...
96,"POLYGON ((18.06616 -2.71119, 18.06624 -2.70772...","[Fire, Shifting cultivation]"
97,"POLYGON ((19.17355 -2.85123, 19.20582 -2.83863...","[Fire, Savannah]"
98,"POLYGON ((19.27242 -3.13435, 19.15277 -3.13572...","[ISL, Rainforest]"
99,"POLYGON ((19.24650 -3.12663, 19.24650 -3.08275...","[ISL, Rainforest]"


In [115]:
def export_aoi_polygon_rasters(gdf,master_raster_path,output_dir):
    
    output_parent_dir = add_trailing_slash(output_dir) 
    
    # create parent output directory if it doesn't exist
    create_dir(output_dir)

    src_raster_file = rio.open(master_raster_path)
    
    for index in range(gdf.shape[0]):
        
        crop_extent = gdf.loc[[index],"geometry"]
        

        try:
            raster_crop, raster_meta = es.crop_image(src_raster_file, crop_extent)
#             print(f"succesfully cropped image {index} ")
            
        except Exception:
            
            print(f"polygon on row {index} does not overlap with master raster, continuing")
            traceback.print_exc()
            
        

        # Update the metadata to have the new shape (x and y and affine information)
        raster_meta.update({"driver": "GTiff",
                         "height": raster_crop.shape[1],
                         "width": raster_crop.shape[2],
                         "transform": raster_meta["transform"]})

#         mask the nodata values
        raster_crop_ma = np.ma.masked_equal(raster_crop, 0) 
        
        
        for labels in gdf.loc[[index],"Labels"]:
            for label in labels:
                
                # output directory per label
                output_label_dir = output_parent_dir + label
                output_label_dir = add_trailing_slash(output_label_dir) 
                
                # create output directory if it doesn't exist
                create_dir(output_label_dir)
                

                # output file path
                outpath = output_label_dir + str(index+1) + '.tif'
                print(outpath)

                # Export cloud-masked TCI file
                print(f'Cropping Polygon {index + 1} for Label "{label}"')
                
                with rio.open(outpath, 'w', **raster_meta) as outf:
                    outf.write(raster_crop_ma)

In [116]:
master_raster_path = "/Volumes/Lacie/zhenyadata/Project_Canopy_Data/PC_Data/Sentinel_Data/Labelled/Tiles_v3/SAFE_true_master_raster_v2/full_tif_v2.tif"
output_dir = "/Volumes/Lacie/zhenyadata/Project_Canopy_Data/PC_Data/Sentinel_Data/Labelled/Tiles_v3/AOI_Crops/"

export_aoi_polygon_rasters(gdf,master_raster_path,output_dir)

/Volumes/Lacie/zhenyadata/Project_Canopy_Data/PC_Data/Sentinel_Data/Labelled/Tiles_v3/AOI_Crops/ISL/1.tif
Cropping Polygon 0 for Label ISL
/Volumes/Lacie/zhenyadata/Project_Canopy_Data/PC_Data/Sentinel_Data/Labelled/Tiles_v3/AOI_Crops/Rainforest/1.tif
Cropping Polygon 0 for Label Rainforest
/Volumes/Lacie/zhenyadata/Project_Canopy_Data/PC_Data/Sentinel_Data/Labelled/Tiles_v3/AOI_Crops/ISL/2.tif
Cropping Polygon 1 for Label ISL
/Volumes/Lacie/zhenyadata/Project_Canopy_Data/PC_Data/Sentinel_Data/Labelled/Tiles_v3/AOI_Crops/Rainforest/2.tif
Cropping Polygon 1 for Label Rainforest
/Volumes/Lacie/zhenyadata/Project_Canopy_Data/PC_Data/Sentinel_Data/Labelled/Tiles_v3/AOI_Crops/ISL/3.tif
Cropping Polygon 2 for Label ISL
/Volumes/Lacie/zhenyadata/Project_Canopy_Data/PC_Data/Sentinel_Data/Labelled/Tiles_v3/AOI_Crops/Rainforest/3.tif
Cropping Polygon 2 for Label Rainforest
/Volumes/Lacie/zhenyadata/Project_Canopy_Data/PC_Data/Sentinel_Data/Labelled/Tiles_v3/AOI_Crops/Rainforest/4.tif
Cropping Po

In [137]:
def tif_to_jpg(in_dir,out_dir):
    
    
    
    in_dir_base = add_trailing_slash(in_dir)
    
    out_dir_base = add_trailing_slash(out_dir)
    
    # If the output parent folder doesn't exist, create it
    
    create_dir(out_dir)
    
    # List containing respective label directories
    
    in_dir_list = glob(in_dir_base + "*/")
    
    for in_dir_child in in_dir_list:
        
        label = in_dir_child.split("/")[-2]
        
        # If output child folder doesn't exist, create
        
        out_dir_child = out_dir_base + label
        
        out_dir_child = add_trailing_slash(out_dir_child)
        
        create_dir(out_dir_child)
    

        # Export Polygons from TIF to  JPEG

        tif_list = glob(in_dir_child + "*.tif",recursive=True)

        for tif_path in tif_list:
            base_filename = tif_path.split("/")[-1].split(".")[0]
            im = Image.open(tif_path)
            im.thumbnail(im.size)
            im.save(out_dir_child + base_filename + ".jpg", "JPEG", quality=100)
    

In [138]:
in_dir = "/Volumes/Lacie/zhenyadata/Project_Canopy_Data/PC_Data/Sentinel_Data/Labelled/Tiles_v3/AOI_Crops/LabelBox/TIF"
out_dir = "/Volumes/Lacie/zhenyadata/Project_Canopy_Data/PC_Data/Sentinel_Data/Labelled/Tiles_v3/AOI_Crops/LabelBox/JPG"

tif_to_jpg(in_dir,out_dir)

# AWS S3 Sandbox

In [155]:
import boto3

s3 = boto3.client("s3")

In [167]:
s3.create_bucket(Bucket="canopy-staging-download")

{'ResponseMetadata': {'RequestId': 'B9983BB7A06108AD',
  'HostId': '093iFDxG/xMV+xL5hnGk6FmVqsBlEJ9Av/f+LyOqDtwRs7vup5/FzX3ajWvM7sAR6FTDFDlFy1Q=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': '093iFDxG/xMV+xL5hnGk6FmVqsBlEJ9Av/f+LyOqDtwRs7vup5/FzX3ajWvM7sAR6FTDFDlFy1Q=',
   'x-amz-request-id': 'B9983BB7A06108AD',
   'date': 'Tue, 13 Oct 2020 01:42:18 GMT',
   'location': '/canopy-staging-download',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'Location': '/canopy-staging-download'}

In [163]:
response = s3.list_buckets()
buckets = [bucket["Name"] for bucket in response["Buckets"]]

In [153]:
!aws s3


usage: aws [options] <command> <subcommand> [parameters]
aws: error: too few arguments


In [154]:
!aws get-current-user

usage: aws [options] <command> <subcommand> [<subcommand> ...] [parameters]
To see help text, you can run:

  aws help
  aws <command> help
  aws <command> <subcommand> help
aws: error: argument command: Invalid choice, valid choices are:

accessanalyzer                           | acm                                     
acm-pca                                  | alexaforbusiness                        
amplify                                  | apigateway                              
apigatewaymanagementapi                  | apigatewayv2                            
appconfig                                | application-autoscaling                 
application-insights                     | appmesh                                 
appstream                                | appsync                                 
athena                                   | autoscaling                             
autoscaling-plans                        | backup                                  
batc

# Testing Cloud Masking Code

In [177]:
def cloud_mask_tci(prod_dir):
    
    '''
    
    prod refers product directory 
    
    '''
    
    prod_dir = add_trailing_slash(prod_dir)
    
    msk_file_path = glob(prod_dir + "*/MSK_CLDPRB_20m.jp2")[0]
    print(msk_file_path)
    tci_file_path = glob(prod_dir + "IMG_DATA/R10m/*.jp2")[0]
    print(tci_file_path)
    tci_filename = tci_file_path.split("/")[-1]
    output_tci_file_path = prod_dir + "/IMG_DATA/R10m/" + "processed_2_" + tci_filename 

    nodatavalue = int(0)

    with rio.open(tci_file_path) as sen_TCI_src:
        sen_TCI = sen_TCI_src.read(masked=True)
        sen_TCI_meta = sen_TCI_src.meta

    with rio.open(msk_file_path) as sen_mask_src:
        sen_mask_pre = sen_mask_src.read(1)
        sen_mask = np.repeat(np.repeat(sen_mask_pre,2,axis=0),2,axis=1)

    # All pixels above 0 probability will be classified as True

    sen_mask_qa = sen_mask > 0


    # Apply mask to source TCI file
#     if np.count_nonzero(sen_mask_qa) > 0:
    sen_TCI_cl_free_nan = em.mask_pixels(sen_TCI, sen_mask_qa)
    sen_TCI_cl_free_processed = np.ma.filled(sen_TCI_cl_free_nan, fill_value=nodatavalue)
        
#     else:
#         sen_TCI_cl_free_processed = sen_mask_qa
    
#     # If file shape only has one band (dimension), generate correct shape for export
    
#     if len(sen_TCI_cl_free_processed.shape) < 3:
#         sen_TCI_cl_free_processed = np.array((sen_TCI_cl_free_processed,sen_TCI_cl_free_processed,sen_TCI_cl_free_processed))
#         sen_TCI_cl_free_processed = sen_TCI_cl_free_processed.astype(np.uint8)

    # Export cloud-masked TCI file
    with rio.open(output_tci_file_path, 'w',**sen_TCI_meta) as outf:
        outf.write(sen_TCI_cl_free_processed)

In [178]:
cloud_mask_tci("/Volumes/Lacie/zhenyadata/Project_Canopy_Data/PC_Data/Sentinel_Data/Labelled/Tiles_v3/SAFE_true_raw/L2A_T34MBC_A017722_20200728T085918")

/Volumes/Lacie/zhenyadata/Project_Canopy_Data/PC_Data/Sentinel_Data/Labelled/Tiles_v3/SAFE_true_raw/L2A_T34MBC_A017722_20200728T085918/QI_DATA/MSK_CLDPRB_20m.jp2
/Volumes/Lacie/zhenyadata/Project_Canopy_Data/PC_Data/Sentinel_Data/Labelled/Tiles_v3/SAFE_true_raw/L2A_T34MBC_A017722_20200728T085918/IMG_DATA/R10m/T34MBC_20200728T084559_TCI_10m.jp2


# Testing Metadata Dataframe Creation

In [207]:
def generate_product_detail_df(input_dir):
    
    '''
    Generate product details dataframe used as input for ordering products by Cloudy Pixel Percentage, No Data Pixel Percentage, or Unclassified Percentage
    
    '''
    input_dir = add_trailing_slash(input_dir)
    
    dirs = os.listdir(input_dir)

    meta_data = []
    for folder in dirs:
        xml_loc = glob(input_dir + "/" + folder + "/*.xml")[0]
        tree = ET.parse(xml_loc)
        directory = [elem.text for elem in tree.iter() if "MASK_FILENAME" in elem.tag][0].split("/")[1]
        tile_id = directory.split("_")[1]
        filepath_partial = input_dir + "/" + directory + "/IMG_DATA" + "/R10m"
        filepath = glob(filepath_partial + "/processed*.jp2")[0]
        filename = filepath.split("/")[-1]
        cloud_cover,no_data,unclassified = [float(elem.text) for elem in tree.iter() if "CLOUDY_PIXEL_PERCENTAGE" in elem.tag 
                 or "NODATA_PIXEL_PERCENTAGE" in elem.tag or "UNCLASSIFIED_PERCENTAGE" in elem.tag]
        meta_data.append([directory,tile_id,cloud_cover,no_data,unclassified,filename,filepath])
    df = pd.DataFrame(meta_data,columns=["Directory","Tile_Id","Cloud Cover","No Data Percentage","Unclassified Percentage","Filename","Filepath"])
    df2 = df.sort_values(by=["Tile_Id","Cloud Cover","Unclassified Percentage"],ignore_index=True)
    return df2

In [208]:
df = generate_product_detail_df(input_dir)

In [209]:
df

Unnamed: 0,Directory,Tile_Id,Cloud Cover,No Data Percentage,Unclassified Percentage,Filename,Filepath
0,L2A_T32NMK_A015077_20200125T095047,T32NMK,3.746630,0.000000,1.804589,processed_T32NMK_20200125T094159_TCI_10m.jp2,/Volumes/Lacie/zhenyadata/Project_Canopy_Data/...
1,L2A_T32NMK_A014791_20200105T095703,T32NMK,9.728458,0.000000,2.432975,processed_T32NMK_20200105T094309_TCI_10m.jp2,/Volumes/Lacie/zhenyadata/Project_Canopy_Data/...
2,L2A_T32NMK_A024343_20200219T095439,T32NMK,10.375307,0.000000,7.602649,processed_T32NMK_20200219T094031_TCI_10m.jp2,/Volumes/Lacie/zhenyadata/Project_Canopy_Data/...
3,L2A_T32NMK_A018480_20190105T095815,T32NMK,12.898219,0.000000,4.594089,processed_T32NMK_20190105T094401_TCI_10m.jp2,/Volumes/Lacie/zhenyadata/Project_Canopy_Data/...
4,L2A_T32NMK_A014505_20191216T095042,T32NMK,14.797034,0.000000,1.208974,processed_T32NMK_20191216T094319_TCI_10m.jp2,/Volumes/Lacie/zhenyadata/Project_Canopy_Data/...
...,...,...,...,...,...,...,...
483,L2A_T35NLB_A019037_20190213T084055,T35NLB,3.899672,0.000000,0.273543,processed_T35NLB_20190213T083041_TCI_10m.jp2,/Volumes/Lacie/zhenyadata/Project_Canopy_Data/...
484,L2A_T35NLB_A023613_20191230T084321,T35NLB,4.121645,0.000000,0.004230,processed_T35NLB_20191230T083341_TCI_10m.jp2,/Volumes/Lacie/zhenyadata/Project_Canopy_Data/...
485,L2A_T35NLB_A014347_20191205T083849,T35NLB,5.536621,0.000000,5.679570,processed_T35NLB_20191205T083229_TCI_10m.jp2,/Volumes/Lacie/zhenyadata/Project_Canopy_Data/...
486,L2A_T35NLB_A015205_20200203T084711,T35NLB,6.972405,0.000000,3.153487,processed_T35NLB_20200203T083049_TCI_10m.jp2,/Volumes/Lacie/zhenyadata/Project_Canopy_Data/...


In [214]:
def import_aois(csv_loc):    

    df_labels = pd.read_csv(csv_loc)
    df_labels = df_labels[["center-lat","center-long","polygon","Labels combined"]]

    polygons = []
    for polygon in df_labels["polygon"]:
        polygons.append(json.loads(polygon)["coordinates"])


    tiles = []
    tiles_dic = {}
    polygon_id = 0 
    coordinates = []
    for items in polygons:
        polygon_id += 1 
        for item in items:
            for lon_lat in item:
                coordinates.append(lon_lat)
                tile = encode(LLtoUTM(lon_lat[1],lon_lat[0]),1)[:-2]
                

                if polygon_id in tiles_dic:

                    tiles_dic[polygon_id].append(tile)

                else:

                    tiles_dic[polygon_id] = [tile]

                tiles.append(tile)

            tiles_dic[polygon_id] = list(set(tiles_dic[polygon_id]))

    tiles = list(set(tiles))

    df_labels["tiles"] = tiles_dic.values()

    #bounding box

    min_lon = min([i[0] for i in coordinates])
    min_lat = min([i[1] for i in coordinates])
    max_lon = max([i[0] for i in coordinates])
    max_lat = max([i[1] for i in coordinates])

    bounding_box = min_lon,min_lat,max_lon,max_lat
    
    return df_labels,bounding_box,tiles

In [215]:
import_aois(csv_loc)

(     center-lat  center-long  \
 0       5.77459      9.16861   
 1       5.75499      8.92339   
 2       5.59555     13.18814   
 3       5.35518     14.33673   
 4       5.33908     13.10411   
 ..          ...          ...   
 96     -2.70602     18.07841   
 97     -2.85049     19.19509   
 98     -3.08219     19.18118   
 99     -3.08245     19.16616   
 100    -3.27737     18.82696   
 
                                                polygon  \
 0    {"type":"Polygon","coordinates":[[[9.088783,5....   
 1    {"type":"Polygon","coordinates":[[[8.891888,5....   
 2    {"type":"Polygon","coordinates":[[[13.148403,5...   
 3    {"type":"Polygon","coordinates":[[[14.34763,5....   
 4    {"type":"Polygon","coordinates":[[[13.02927,5....   
 ..                                                 ...   
 96   {"type":"Polygon","coordinates":[[[18.066158,-...   
 97   {"type":"Polygon","coordinates":[[[19.173546,-...   
 98   {"type":"Polygon","coordinates":[[[19.272423,-...   
 99   {"type