In [1]:
import geopandas as gpd
import os

In [6]:
root = 'D:/canopy_data/reprojected-rasters/2015/'
tci_files = os.listdir(root)
tci_files[0]

'T32MME_20151023T094042_TCI.jp2'

In [7]:
file1_uri = root + tci_files[0]
file1_uri

'D:/canopy_data/reprojected-rasters/2015/T32MME_20151023T094042_TCI.jp2'

In [2]:
import rasterio as rio

In [11]:
with rio.open(file1_uri) as file1:
    raster1 = file1.read()

In [12]:
raster1

array([[[  0,  70,  78, ...,   0,   0,   0],
        [  0,  75,  74, ...,   0,   0,   0],
        [  0,  76,  75, ...,   0,   0,   0],
        ...,
        [255, 255, 255, ...,   0,   0,   0],
        [255, 255, 255, ...,   0,   0,   0],
        [  0,   0,   0, ...,   0,   0,   0]],

       [[  0,  80,  84, ...,   0,   0,   0],
        [  0,  81,  81, ...,   0,   0,   0],
        [  0,  82,  83, ...,   0,   0,   0],
        ...,
        [255, 255, 255, ...,   0,   0,   0],
        [255, 255, 255, ...,   0,   0,   0],
        [  0,   0,   0, ...,   0,   0,   0]],

       [[  0, 105, 104, ...,   0,   0,   0],
        [  0, 104, 107, ...,   0,   0,   0],
        [  0, 105, 108, ...,   0,   0,   0],
        ...,
        [255, 255, 255, ...,   0,   0,   0],
        [255, 255, 255, ...,   0,   0,   0],
        [  0,   0,   0, ...,   0,   0,   0]]], dtype=uint8)

In [13]:
raster1.shape

(3, 11017, 10944)

In [14]:
raster1[0]

array([[  0,  70,  78, ...,   0,   0,   0],
       [  0,  75,  74, ...,   0,   0,   0],
       [  0,  76,  75, ...,   0,   0,   0],
       ...,
       [255, 255, 255, ...,   0,   0,   0],
       [255, 255, 255, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   0,   0]], dtype=uint8)

In [21]:
# https://note.nkmk.me/en/python-numpy-count/

import numpy as np

raster1_combined = raster1[0] + raster1[1] + raster1[2]
raster1_combined

array([[  0, 255,  10, ...,   0,   0,   0],
       [  0,   4,   6, ...,   0,   0,   0],
       [  0,   7,  10, ...,   0,   0,   0],
       ...,
       [253, 253, 253, ...,   0,   0,   0],
       [253, 253, 253, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   0,   0]], dtype=uint8)

In [22]:
raster1_combined.shape

(11017, 10944)

In [25]:
raster1_combined <= 3

array([[ True, False, False, ...,  True,  True,  True],
       [ True, False, False, ...,  True,  True,  True],
       [ True, False, False, ...,  True,  True,  True],
       ...,
       [False, False, False, ...,  True,  True,  True],
       [False, False, False, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]])

In [24]:
rows = raster1_combined.shape[0]
columns = raster1_combined.shape[1]

np.count_nonzero(raster1_combined <= 3) / (rows * columns)

0.5584822276922374

In [8]:
import numpy as np
import rasterio as rio

def detect_missing_pixels(filename, na_value=3):
    with rio.open(filename) as f:
        raster = f.read()
        
    raster_combined = raster[0] + raster[1] + raster[2]
    rows = raster_combined.shape[0]
    columns = raster_combined.shape[1]
    
    missing = np.count_nonzero(raster_combined <= na_value)
    total_pixels = (rows * columns)
    
    return (missing, total_pixels)

In [28]:
detect_missing_pixels(file1_uri)

0.5584822276922374

In [6]:
from sentinelsat import SentinelAPI, read_geojson, geojson_to_wkt
from datetime import date
from env_vars import sentinel_username,sentinel_password
import glob
import pandas as pd
import subprocess


def get_api():
    
    return SentinelAPI(sentinel_username, sentinel_password, "https://scihub.copernicus.eu/apihub/")


def get_products_df(api, footprint, date_start, date_end,
                 area='IsWithin', raw='1C',
                 platform='Sentinel-2', cloudcover=(1,5)):
    
    products = api.query(footprint,
                         date=(date_start, date_end),
                         area_relation=area,
                         raw=raw,
                         platformname=platform,
                         cloudcoverpercentage=cloudcover)
    
    return api.to_dataframe(products)


def get_products_df_for_year(api, footprint, year, cloudcover):
    months_dict = {
        1: 31,
        2: 28,
        3: 31,
        4: 30,
        5: 31,
        6: 30,
        7: 31,
        8: 31,
        9: 30,
        10: 31,
        11: 30,
        12: 31
    }
    
    month_start = 1
    month_end = 13
    if year < 2015 or year > 2020:
        return None
    elif year == 2015:
        month_start = 7
    elif year == 2020:
        months_dict[2] = 29
        month_end = 8

    products_df = pd.DataFrame()
    for month in range(month_start, month_end):
        print('getting month', month)
        date_start = date(year, month, 1)
        date_end = date(year, month, months_dict[month])
        products_df_2 = get_products_df(api, footprint, date_start, date_end, cloudcover=cloudcover)
        products_df = pd.concat([products_df, products_df_2])
        products_df_3 = get_products_df(api, footprint, date_start, date_end, area='Intersects', cloudcover=cloudcover)
        products_df = pd.concat([products_df, products_df_2])
        print('Products so far:', len(products_df))
        
    return products_df


def remove_tile_id_duplicates(products_df):
    products_df = products_df.reset_index()
    products_df = products_df.rename(columns={'index': 'sentinel_id'})
    products_df = products_df.drop_duplicates(subset=['tileid'])
    
    return products_df


def find_ee_index_matches(api, products_df, ee_index):
    
    if len(products_df) == 0:
        print('No search results')
        return []

    ee_index_2 = ee_index.reset_index()
    merged = products_df.merge(ee_index_2, left_on='title', right_on='PRODUCT_ID')
    
    print(len(merged), 'total rows')
            
    return merged


def generate_tci_uri(ee_index, row):
    uri = ee_index.loc[row, 'BASE_URL']
    uri += '/GRANULE/'
    granule_id = ee_index.loc[row, 'GRANULE_ID']
    uri += granule_id
    uri += '/IMG_DATA/'
    tile_id = granule_id.split('_')[1]
    date = ee_index.loc[row, 'DATATAKE_IDENTIFIER'].split('_')[1]
    uri += f'{tile_id}_{date}_TCI.jp2'
    
    return uri


def download_tcis(ee_index, rows, dest_folder):
    
    cloud_env = r"C:\Users\David\AppData\Local\Google\Cloud SDK\cloud_env.bat"
    
    for i, row in enumerate(rows):
        # https://stackoverflow.com/questions/5419389/how-to-overwrite-the-previous-print-to-stdout-in-python
        print(f'Downloading file {i} of {len(rows)}\r', end='')
        uri = generate_tci_uri(ee_index, row)
        subprocess.run([cloud_env, '&&', 'gsutil', 'cp', uri, dest_folder])
        
        
def get_tcis_for_year(year, footprint, ee_index, dest_folder, raw='2A', cloudcover=(1,5), remove_dupes=False):
    
    api = get_api()
    
    print('getting products')
    if year < 2015:
        print('ERROR: Year must be 2014 or above')
        return None
    else:
        products_df = get_products_df_for_year(api, footprint, year, cloudcover)
    
    if remove_dupes:
        products_df = remove_tile_id_duplicates(products_df)

    merged = find_ee_index_matches(api, products_df, ee_index)
    rows = merged['index'].tolist()
    print('downloading tcis')
    download_tcis(ee_index, rows, dest_folder)
    
    return merged

In [12]:
import os


os.listdir('D:/canopy_data/2015/')

['T32MME_20151023T094042_TCI.jp2',
 'T32MPD_20151226T092412_TCI.jp2',
 'T32MPE_20151226T092412_TCI.jp2',
 'T32MQD_20151219T093412_TCI.jp2',
 'T32MRA_20151223T091412_TCI.jp2',
 'T32MRD_20151226T092412_TCI.jp2',
 'T32MRE_20151226T092412_TCI.jp2',
 'T32NMH_20151122T094312_TCI.jp2',
 'T32NML_20151222T094412_TCI.jp2',
 'T32NNG_20150913T094016_TCI.jp2',
 'T32NNH_20150913T094016_TCI.jp2',
 'T32NNJ_20151222T094412_TCI.jp2',
 'T32NNK_20151209T093402_TCI.jp2',
 'T32NNL_20151222T094412_TCI.jp2',
 'T32NPH_20151229T093412_TCI.jp2',
 'T32NPJ_20151229T093412_TCI.jp2',
 'T32NQH_20151226T092412_TCI.jp2',
 'T32NQJ_20151216T092412_TCI.jp2',
 'T32NRG_20151226T092412_TCI.jp2',
 'T32NRH_20151219T093412_TCI.jp2',
 'T32NRJ_20151216T092412_TCI.jp2',
 'T32NRK_20151206T092352_TCI.jp2',
 'T33MTU_20151223T091412_TCI.jp2',
 'T33MTV_20151226T092412_TCI.jp2',
 'T33MUT_20151126T092322_TCI.jp2',
 'T33MUU_20151226T092412_TCI.jp2',
 'T33MUV_20151226T092412_TCI.jp2',
 'T33MWV_20150924T090726_TCI.jp2',
 'T33MXU_20150805T09

In [15]:
def make_missing_pixels_df(year, footprint, ee_index, dest_folder, cloudcover=(1,5), remove_dupes=False):
    
    merged_df = get_tcis_for_year(year, footprint, ee_index, dest_folder, cloudcover, remove_dupes)
    
    print('detecting mixing pixels')
    
    filenames = [dest_folder + f for f in os.listdir(dest_folder)]
    
    missings = []
    totals = []
    missing_pixels_percents = []
    for filename in filenames:
        missing, total = detect_missing_pixels(filename)
        missings.append(missing)
        totals.append(total)
        missing_pixels_percents.append(missing / total)
        
    merged_df['total_pixels'] = totals
    merged_df['missing_pixels'] = missings
    merged_df['percent_pixels_missing'] = missing_pixels_percents
    
    return merged_df

In [11]:
ee_index = pd.read_csv('earth-engine-index.csv')

footprint = geojson_to_wkt(read_geojson('./data/Geometry/republic_of_the_congo_boundary_bounding_box.geojson'))

In [16]:
df = make_missing_pixels_df(2020, footprint, ee_index, 'D:/canopy_data/republic-of-the-congo/2020/')

getting products
getting month 1
Products so far: 56
getting month 2
Products so far: 114
getting month 3
Products so far: 144
getting month 4
Products so far: 198
getting month 5
Products so far: 260
getting month 6
Products so far: 296
getting month 7
Products so far: 346
332 total rows
downloading tcis
detecting mixing pixels


NameError: name 'missing_pixels_percent' is not defined

In [17]:
dest_folder = 'D:/canopy_data/republic-of-the-congo/2020/'

filenames = [dest_folder + f for f in os.listdir(dest_folder)]
    
missings = []
totals = []
missing_pixels_percents = []
for filename in filenames:
    missing, total = detect_missing_pixels(filename)
    missings.append(missing)
    totals.append(total)
    missing_pixels_percents.append(missing / total)
        
df = pd.DataFrame(data={'filename': filenames, 'missing_pixels': missings,
                        'total_pixels': totals, 'missing_pixel_percentage': missing_pixels_percents})

df.head()

Unnamed: 0,filename,missing_pixels,total_pixels,missing_pixel_percentage
0,D:/canopy_data/republic-of-the-congo/2020/T32N...,117949527,120560400,0.978344
1,D:/canopy_data/republic-of-the-congo/2020/T32N...,2934540,120560400,0.024341
2,D:/canopy_data/republic-of-the-congo/2020/T32N...,99133863,120560400,0.822275
3,D:/canopy_data/republic-of-the-congo/2020/T33M...,63034147,120560400,0.522843
4,D:/canopy_data/republic-of-the-congo/2020/T33M...,62729400,120560400,0.520315


In [20]:
df.describe()

Unnamed: 0,missing_pixels,total_pixels,missing_pixel_percentage
count,166.0,166.0,166.0
mean,36856200.0,120560400.0,0.305707
std,42821240.0,0.0,0.355185
min,6099.0,120560400.0,5.1e-05
25%,606892.5,120560400.0,0.005034
50%,9227142.0,120560400.0,0.076535
75%,74726370.0,120560400.0,0.619825
max,120322900.0,120560400.0,0.99803


In [21]:
ee_index[ee_index['PRODUCT_ID'].str.contains('L2A')]

Unnamed: 0,GRANULE_ID,PRODUCT_ID,DATATAKE_IDENTIFIER,MGRS_TILE,SENSING_TIME,TOTAL_SIZE,CLOUD_COVER,GEOMETRIC_QUALITY_FLAG,GENERATION_TIME,NORTH_LAT,SOUTH_LAT,WEST_LON,EAST_LON,BASE_URL


In [22]:
ee_index.head()

Unnamed: 0,GRANULE_ID,PRODUCT_ID,DATATAKE_IDENTIFIER,MGRS_TILE,SENSING_TIME,TOTAL_SIZE,CLOUD_COVER,GEOMETRIC_QUALITY_FLAG,GENERATION_TIME,NORTH_LAT,SOUTH_LAT,WEST_LON,EAST_LON,BASE_URL
0,L1C_T51HWC_A021621_20190813T014402,S2A_MSIL1C_20190813T013321_N0208_R031_T51HWC_2...,GS2A_20190813T013321_021621_N02.08,51HWC,2019-08-13T01:47:02.634000Z,472312038.0,0.0,,2019-08-13T05:44:52.000000Z,-33.433323,-34.429078,123.192969,124.194586,gs://gcp-public-data-sentinel-2/tiles/51/H/WC/...
1,L1C_T21HYT_A011547_20190523T133233,S2B_MSIL1C_20190523T133239_N0207_R081_T21HYT_2...,GS2B_20190523T133239_011547_N02.07,21HYT,2019-05-23T13:43:06.000000Z,93794242.0,0.0,,2019-05-23T15:10:06.000000Z,-37.894755,-38.160337,-54.58066,-53.464817,gs://gcp-public-data-sentinel-2/tiles/21/H/YT/...
2,L1C_T11SLA_A016512_20180820T184735,S2A_MSIL1C_20180820T183921_N0206_R070_T11SLA_2...,GS2A_20180820T183921_016512_N02.06,11SLA,2018-08-20T18:47:35.340000Z,852706489.0,4.608,,2018-08-20T23:54:18.000000Z,37.042336,36.036258,-119.248493,-118.007274,gs://gcp-public-data-sentinel-2/tiles/11/S/LA/...
3,L1C_T02KMG_A003029_20171004T213912,S2B_MSIL1C_20171004T213909_N0205_R143_T02KMG_2...,GS2B_20171004T213909_003029_N02.05,02KMG,2017-10-04T21:39:12.460000Z,502814591.0,9.4476,PASSED,2017-10-04T21:39:12.000000Z,-16.280273,-17.273285,-171.686702,-170.908268,gs://gcp-public-data-sentinel-2/tiles/02/K/MG/...
4,L1C_T49NHB_A001931_20170720T024456,S2B_MSIL1C_20170720T022549_N0205_R046_T49NHB_2...,GS2B_20170720T022549_001931_N02.05,49NHB,2017-07-20T02:44:56.730000Z,176714634.0,12.6707,PASSED,2017-07-20T02:44:56.000000Z,1.806308,0.814825,114.385102,114.681769,gs://gcp-public-data-sentinel-2/tiles/49/N/HB/...
