In [1]:
import boto3
import pandas as pd
import os
import subprocess
from datetime import datetime
from hurry.filesize import size, si, verbose

pd.options.display.max_colwidth=500

Boto3 is a Python library which allows you to access Amazon web services, such as Amazon's cloud storage platform, s3: Simple Storage Service.

Two tools Boto3 offers are an s3 client and an s3 resource.

The client is more low-level - it has access to all the info, i.e. when files were last updated and the file size.

The resource is higher-level, and better suited to run summary operations.



In [2]:
s3_resource = boto3.resource("s3")
s3_client = s3_resource.meta.client

<b>S3 Client</b>

In [3]:
s3_client = boto3.client("s3")
bucket_list = s3_client.list_buckets()
buckets = [bucket["Name"] for bucket in bucket_list["Buckets"]]
print("Bucket List:", buckets)

Bucket List: ['Development_Alert', 'WHRC-carbon', 'alerts-test-data', 'aqueduct-projection-backup01', 'aws-athena-query-results-838255262149-us-east-1', 'aws-logs-838255262149-us-east-1', 'blueraster-wri', 'bmf_s3', 'communityland', 'digital-globe-imagery', 'fire-alerts', 'fires', 'forest-watcher-files', 'gfw-clusters-test', 'gfw-data', 'gfw-files', 'gfw-gee-glad-export', 'gfw-glad-clusters-v1-dev-serverlessdeploymentbuck-93xjmdvla1vg', 'gfw-glad-clusters-v1-dev-serverlessdeploymentbuck-zim9ksavf2sx', 'gfw-maps', 'gfw-notifications', 'gfw-pro', 'gfw-user-fires', 'gfw-user-fires-staging', 'gfw.blog', 'gfw2-data', 'gfw2-data.s3.amazonaws.com', 'gfw2-test', 'gfw2_download', 'gfw2stories', 'gfw_odp', 'grump-tiles', 'ignfi_s3', 'landmarkmap', 'landscape-application', 'lpfn', 'osinfor', 'palm-risk-poc', 'rw-nrt-scripts', 's3hub-2f9587736414f4c637f06a6d0a895309295931e8773a0856d95e337c9', 'suitability-mapper', 'terra-i_grids', 'user-contributed-data', 'whrc2', 'wri', 'wri-api-backups', 'wri-as

Read this article to learn how the Prefix and Delimiter options work when calling list_objects from the s3 client.

Also, look over for the reference to "hooking into the events" to register a change in the client config (in this case, allow to send messages through boto3 without signing what you send with an SSL certificate).

Listing the top level contents of a s3 bucket with Prefix and Delimiter: https://github.com/boto/boto3/issues/134

Another exploration of Prefix and Delimiter: https://realguess.net/2014/05/24/amazon-s3-delimiter-and-prefix/

In [5]:
objects = s3_client.list_objects_v2(Bucket="wri-public-data",
                                Prefix="resourcewatch/raster/",
                                 # Use Delimiter to only see the folders in resourcewatch/raster/
                                 # To see the actual items, need to remove the Delimiter keyword
                                #Delimiter="/"
                                )

objects["Contents"]
all_raster_objects = [obj for obj in objects["Contents"] if obj["Key"][-1] != "/" and 'foo_048' in obj["Key"] and 'Australia' in obj['Key']]

In [4]:
#https://alexwlchan.net/2017/07/listing-s3-keys/

def get_matching_s3_keys(bucket, prefix='', suffix=''):
    """
    Generate the keys in an S3 bucket.

    :param bucket: Name of the S3 bucket.
    :param prefix: Only fetch keys that start with this prefix (optional).
    :param suffix: Only fetch keys that end with this suffix (optional).
    """
    s3 = boto3.client('s3')
    kwargs = {'Bucket': bucket}

    # If the prefix is a single string (not a tuple of strings), we can
    # do the filtering directly in the S3 API.
    if isinstance(prefix, str):
        kwargs['Prefix'] = prefix

    while True:

        # The S3 API response is a large blob of metadata.
        # 'Contents' contains information about the listed objects.
        resp = s3.list_objects_v2(**kwargs)
        for obj in resp['Contents']:
            key = obj['Key']
            size = obj['Size']
            if key.startswith(prefix) and key.endswith(suffix):
                yield key, size

        # The S3 API is paginated, returning up to 1000 keys at a time.
        # Pass the continuation token into the next response, until we
        # reach the final page (when this field is missing).
        try:
            kwargs['ContinuationToken'] = resp['NextContinuationToken']
        except KeyError:
            break

In [48]:
for _key, _size in get_matching_s3_keys(bucket='wri-public-data', prefix='resourcewatch/raster/', suffix='.tif'):
    all_raster_objects.append([_key,_size])
    
raster_summary = pd.DataFrame(all_raster_objects)#[['Key','Size']]
raster_summary.columns = ['Key','Size']
raster_summary = raster_summary.sort_values(by='Size', axis=0, ascending=False)

raster_summary['Size'] = raster_summary.apply(lambda row: size(row['Size'], system=verbose), axis=1)
raster_summary.drop_duplicates().to_csv('/Users/nathansuberi/Desktop/RW_Data/rw_raster_sizes.csv')

In [49]:
raster_summary.drop_duplicates()

Unnamed: 0,Key,Size
109,resourcewatch/raster/ene_023_2012_and_2016_nightlights_composite/ene_023_2012_nightlights_composite.tif,14 gigabytes
110,resourcewatch/raster/ene_023_2012_and_2016_nightlights_composite/ene_023_2016_nightlights_composite.tif,14 gigabytes
3388,resourcewatch/raster/soc_032_High_Resolution_Settlement_Layers/soc_032_high_res_population_256.tif,6 gigabytes
3411,resourcewatch/raster/soc_072_population_grid/soc_072_population_grid.tif,4 gigabytes
2050,resourcewatch/raster/test_docker/soc_031_population_grid_new.tif,4 gigabytes
716,resourcewatch/raster/bio_012_1_amphibian_species_under_threat/bio_012_1_amphibian_species_under_threat.tif,3 gigabytes
2099,resourcewatch/raster/bio_012_2_mammalian_species_under_threat/bio_012_2_mammalian_species_under_threat.tif,3 gigabytes
1498,resourcewatch/raster/foo_048_crop_extent/Europe_Central_Asia.tif,3 gigabytes
1735,resourcewatch/raster/foo_048_crop_extent/SNE_Asia.tif,2 gigabytes
1642,resourcewatch/raster/foo_048_crop_extent/N_America.tif,1 gigabyte


In [12]:
# Don't have the rights to create a bucket without location constraint: s3.create_bucket(Bucket="nds_bucket")
# Location constraint incorrect: s3.create_bucket(Bucket='nds_bucket', CreateBucketConfiguration={
#    'LocationConstraint': 'us-east-1c'})
# No CORS setup: s3.get_bucket_cors(Bucket="wri-public-data")
s3_client.get_bucket_acl(Bucket="wri-public-data")

{'Grants': [{'Grantee': {'DisplayName': 'worldresourcesinst',
    'ID': '2f9587736414f4c637f06a6d0a895309295931e8773a0856d95e337c9fffc42d',
    'Type': 'CanonicalUser'},
   'Permission': 'READ'},
  {'Grantee': {'DisplayName': 'worldresourcesinst',
    'ID': '2f9587736414f4c637f06a6d0a895309295931e8773a0856d95e337c9fffc42d',
    'Type': 'CanonicalUser'},
   'Permission': 'WRITE'},
  {'Grantee': {'DisplayName': 'worldresourcesinst',
    'ID': '2f9587736414f4c637f06a6d0a895309295931e8773a0856d95e337c9fffc42d',
    'Type': 'CanonicalUser'},
   'Permission': 'READ_ACP'},
  {'Grantee': {'DisplayName': 'worldresourcesinst',
    'ID': '2f9587736414f4c637f06a6d0a895309295931e8773a0856d95e337c9fffc42d',
    'Type': 'CanonicalUser'},
   'Permission': 'WRITE_ACP'},
  {'Grantee': {'DisplayName': 'worldresourcesinst',
    'ID': '2f9587736414f4c637f06a6d0a895309295931e8773a0856d95e337c9fffc42d',
    'Type': 'CanonicalUser'},
   'Permission': 'FULL_CONTROL'},
  {'Grantee': {'Type': 'Group',
    'URI':

In [5]:
# objects = s3_client.list_objects(Bucket="wri-public-data",
#                                 Prefix="resourcewatch/",
#                                  # Use Delimiter to only see the folders in resourcewatch/raster/
#                                  # To see the actual items, need to remove the Delimiter keyword
#                                 #Delimiter="/"
#                                 )
all_vector_objects = []

def keep_vector(key):
    if 'ARCHIVE' in key:
        return False
    if key[-1] == "/":
        return False
    if '.tif' in key:
        return False
    if len(key.split('/')) > 2:
        return False
    return True

for _key, _size in get_matching_s3_keys(bucket='wri-public-data', prefix='resourcewatch/', suffix='.zip'):
    if keep_vector(_key):
        all_vector_objects.append([_key,_size])

In [18]:
vector_summary = pd.DataFrame(all_vector_objects)#[['Key','Size']]
vector_summary.columns = ['Key','Size']
vector_summary = vector_summary.sort_values(by='Size', axis=0, ascending=False)

vector_summary['Size'] = vector_summary.apply(lambda row: size(row['Size'], system=verbose), axis=1)
vector_summary.to_csv('/Users/nathansuberi/Desktop/RW_Data/rw_vector_sizes.csv')

In [20]:
wri_ids = ['.'.join(key.split('/')[1].split('_')[:2]) for key in vector_summary['Key']]
keep_keys = list(map(lambda item: len(item)==7, wri_ids))

vector_summary['wri_ids'] = wri_ids
vector_summary['keep'] = keep_keys
vector_summary['s3_link'] = 'https://wri-public-data.s3.amazonaws.com/' + vector_summary['Key']

s3_links = vector_summary[['wri_ids', 's3_link', 'keep']].sort_values(by=['wri_ids', 's3_link'])

s3_links['use'] = ~s3_links['wri_ids'].duplicated(keep='first')
join_to_mdata = s3_links.loc[s3_links['use'], ['wri_ids', 's3_link', 'keep']]
join_to_mdata.set_index('wri_ids', inplace=True)
print(join_to_mdata.loc['soc.020'])
join_to_mdata = join_to_mdata.loc[join_to_mdata['keep']]


s3_link    https://wri-public-data.s3.amazonaws.com/resourcewatch/soc_020_gini_edit.zip
keep                                                                               True
Name: soc.020, dtype: object


In [None]:
bio.006
bio.007
bio.022
bio.030
cit.001
cli.006
cli.018
cli.047
com.004
com.009
com.011
com.022
com.024
com.025
com.026
ene.001
ene.013
ene.029
foo.013
foo.045
foo.046
foo.047
for.005
for.016
soc.012
soc.014
soc.043
soc.045
soc.069
soc.074
soc.076
soc.077
soc.079
soc.081
soc.082
wat.014
wat.037


In [10]:
#### Download Google Spreadsheets ####
# Launch Metadata
!curl "https://docs.google.com/spreadsheets/d/1laymLZAbNsto9Pj4iAHCdyaqZo2OYedKuyXaG48ZuLU/export?format=tsv" > current_metadata.tsv
current_mdata = pd.read_csv(open("current_metadata.tsv", "r"), sep="\t", index_col=[0])
os.remove("current_metadata.tsv")

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  956k    0  956k    0     0  2592k      0 --:--:-- --:--:-- --:--:-- 2592k


In [12]:
joined = current_mdata.join(join_to_mdata)
joined[['s3_link', 'Download link']].to_csv('links on s3.csv')

s3_link    https://wri-public-data.s3.amazonaws.com/resourcewatch/soc_020_gini_edit.zip
keep                                                                               True
Name: soc.020, dtype: object

<b>S3 Resource</b>

Note below the use of "filter" instead of "list_objects". This is a higher level function that returns summary info about s3 assets, such as their bucket and full key name. It does not provide information on the size of the file or the last update.

In [27]:
s3_resource = boto3.resource("s3")
s3_resource.meta.client.head_bucket(Bucket="wri-public-data")
#for bucket in s3_resource.buckets.all():
#    print(bucket.name)
# The line above and below are the same
#print(s3_client.head_bucket(Bucket="wri-public-data"))

all_raster_objects_summary = []

wri_public_data = s3_resource.Bucket('wri-public-data')
for key in wri_public_data.objects.filter(Prefix="resourcewatch/raster/"):
#for key in bucket.objects.filter(Prefix="resourcewatch/raster/",
#                                Delimiter="/"):
    if(not key.key.endswith("/")):
        all_raster_objects_summary.append(key)

# This has less information than what comes with the s3.resource call
all_raster_objects_summary

[s3.ObjectSummary(bucket_name='wri-public-data', key='resourcewatch/raster/Annual_discharge_anomalies/Dis_an01.tif'),
 s3.ObjectSummary(bucket_name='wri-public-data', key='resourcewatch/raster/Annual_discharge_anomalies/Dis_an02.tif'),
 s3.ObjectSummary(bucket_name='wri-public-data', key='resourcewatch/raster/Annual_discharge_anomalies/Dis_an03.tif'),
 s3.ObjectSummary(bucket_name='wri-public-data', key='resourcewatch/raster/Annual_discharge_anomalies/Dis_an04.tif'),
 s3.ObjectSummary(bucket_name='wri-public-data', key='resourcewatch/raster/Annual_discharge_anomalies/Dis_an05.tif'),
 s3.ObjectSummary(bucket_name='wri-public-data', key='resourcewatch/raster/Annual_discharge_anomalies/Dis_an06.tif'),
 s3.ObjectSummary(bucket_name='wri-public-data', key='resourcewatch/raster/Annual_discharge_anomalies/Dis_an07.tif'),
 s3.ObjectSummary(bucket_name='wri-public-data', key='resourcewatch/raster/Annual_discharge_anomalies/Dis_an08.tif'),
 s3.ObjectSummary(bucket_name='wri-public-data', key='re

<b>Calculate raster statistics with all rasters in s3 w/ Bucket=wri-public-data, Prefix=resourcewatch/raster/</b>

In [4]:
# This all_raster_objects comes from the client, so has file size and last update included
s3_files = pd.DataFrame(all_raster_objects)

# Big files take a long time, so this selects only files less than a million bytes (~1 mb)
s3_files_small = s3_files[s3_files["Size"] < 1000000]
print(s3_files.shape)
print(s3_files_small.shape)

(615, 6)
(266, 6)


Can run GDAL using subprocess.check_output() to use command line tools (instead of using the ! symbol, with the added benefit of returning the output to the Jupyter notebook directly)

In [7]:
# In case this takes a long time, you can store the results in this gdalinfo list
# gdalinfo = []

def run_gdalinfo_remotely(start_time, num_files, key, count):
    try:
        start = datetime.now()

        s3_loc = "/vsicurl/https://wri-public-data.s3.amazonaws.com/" + key
        res = subprocess.check_output(["gdalinfo", s3_loc, "-stats"])

        # Show how far in we are...
        print("Finished", count, "out of", num_files)
        print("Progress:", count/float(num_files))
        # This would help you to investigate the outputs on some entries, 
        # even if you have to keyboard interrupt before the entire apply function finishes
        # gdalinfo.append(res)

        end = datetime.now()
        print("Time for current file:", end-start)
        print("Total time:", end-start_time, "\n")
        return(res)
    except:
        return("Not a gdalinfo compatible file")

# Initializers
# print("Total number of files to try:", s3_files_small.shape[0])
# num_files = s3_files_small.shape[0]
# s3_files_small["count"] = list(range(1, num_files+1))

print("Total number of files to try:", s3_files.shape[0])
num_files = s3_files.shape[0]
s3_files["count"] = list(range(1, num_files+1))
start_time = datetime.now()

s3_files["gdalinfo"] = s3_files.apply(lambda row: run_gdalinfo_remotely(start_time, 
                                                                        num_files, 
                                                                        row["Key"],
                                                                        row["count"],), axis=1)

Total number of files to try: 615
Finished 1 out of 615
Progress: 0.0016260162601626016
Time for current file: 0:00:04.804231
Total time: 0:00:04.807482 

Finished 2 out of 615
Progress: 0.0032520325203252032
Time for current file: 0:00:04.187172
Total time: 0:00:08.996067 

Finished 3 out of 615
Progress: 0.004878048780487805
Time for current file: 0:00:04.963413
Total time: 0:00:13.960682 

Finished 4 out of 615
Progress: 0.0065040650406504065
Time for current file: 0:00:04.601072
Total time: 0:00:18.563683 

Finished 5 out of 615
Progress: 0.008130081300813009
Time for current file: 0:00:04.182318
Total time: 0:00:22.747569 

Finished 6 out of 615
Progress: 0.00975609756097561
Time for current file: 0:00:04.470400
Total time: 0:00:27.219419 

Finished 7 out of 615
Progress: 0.011382113821138212
Time for current file: 0:00:05.575513
Total time: 0:00:32.795942 

Finished 8 out of 615
Progress: 0.013008130081300813
Time for current file: 0:00:04.621763
Total time: 0:00:37.418762 

Fini

In [13]:
s3_files.loc[71]["gdalinfo"]

b'Driver: GTiff/GeoTIFF\nFiles: /vsicurl/https://wri-public-data.s3.amazonaws.com/resourcewatch/raster/cit_014/GHS_1975_1k.tif\nSize is 35497, 15236\nCoordinate System is:\nPROJCS["World_Mollweide",\n    GEOGCS["GCS_WGS_1984",\n        DATUM["D_WGS_1984",\n            SPHEROID["WGS_1984",6378137.0,298.257223563]],\n        PRIMEM["Greenwich",0.0],\n        UNIT["Degree",0.017453292519943295]],\n    PROJECTION["Mollweide"],\n    PARAMETER["False_Easting",0.0],\n    PARAMETER["False_Northing",0.0],\n    PARAMETER["Central_Meridian",0.0],\n    UNIT["Meter",1.0]]\nOrigin = (-17619594.547443531453609,8751029.461868489161134)\nPixel Size = (1000.000000000000000,-1000.000000000000000)\nMetadata:\n  AREA_OR_POINT=Area\n  DataType=Generic\nImage Structure Metadata:\n  COMPRESSION=LZW\n  INTERLEAVE=BAND\nCorner Coordinates:\nUpper Left  (-17619594.547, 8751029.462) \nLower Left  (-17619594.547,-6484970.538) \nUpper Right (17877405.453, 8751029.462) \nLower Right (17877405.453,-6484970.538) \nCen

In [87]:
# Join the gdalinfo results from the smaller dataset back to your dataframe of all files on s3
s3_files = s3_files.join(s3_files_small["gdalinfo"])

Unnamed: 0,ETag,Key,LastModified,Owner,Size,StorageClass,gdalinfo
38,"""a4a4284cca59054d28c2b45f0ae43829""",resourcewatch/raster/Annual_discharge_anomalie...,2017-11-01 18:50:11+00:00,"{'DisplayName': 'worldresourcesinst', 'ID': '2...",349,STANDARD,Not a gdalinfo compatible file
39,"""6198e88c1cb3d16f512dd53ef1009ca7""",resourcewatch/raster/bio_008_cumulative_climat...,2017-10-06 21:08:16+00:00,"{'DisplayName': 'worldresourcesinst', 'ID': '2...",768168,STANDARD,"b""Driver: AAIGrid/Arc/Info ASCII Grid\nFiles: ..."
40,"""7d5040da88038078cf97664d74b4a98a""",resourcewatch/raster/bio_008_cumulative_climat...,2017-10-06 21:08:17+00:00,"{'DisplayName': 'worldresourcesinst', 'ID': '2...",148360,STANDARD,b'Driver: GTiff/GeoTIFF\nFiles: /vsicurl/https...
42,"""64804a5fe798c782e746eccb74a212e8""",resourcewatch/raster/bio_008_cumulative_climat...,2017-10-06 21:08:17+00:00,"{'DisplayName': 'worldresourcesinst', 'ID': '2...",308228,STANDARD,b'Driver: GTiff/GeoTIFF\nFiles: /vsicurl/https...
65,"""7589bacfbaa4861ca650f2181f5f32a1""",resourcewatch/raster/bio_035_coral_reef_future...,2017-10-04 17:25:37+00:00,"{'DisplayName': 'worldresourcesinst', 'ID': '2...",46377,STANDARD,b'Driver: GTiff/GeoTIFF\nFiles: /vsicurl/https...
66,"""a2fc744a156d93ea0a494f9a2057689b""",resourcewatch/raster/bio_035_coral_reef_future...,2017-10-04 17:55:35+00:00,"{'DisplayName': 'worldresourcesinst', 'ID': '2...",280339,STANDARD,b'Driver: GTiff/GeoTIFF\nFiles: /vsicurl/https...
67,"""c923585547f9e6b631460adde0cf2299""",resourcewatch/raster/bio_035_coral_reef_future...,2017-10-04 17:26:02+00:00,"{'DisplayName': 'worldresourcesinst', 'ID': '2...",45724,STANDARD,b'Driver: GTiff/GeoTIFF\nFiles: /vsicurl/https...
68,"""0fd7a83fb3228f65ec21e05ed6d1c485""",resourcewatch/raster/bio_035_coral_reef_future...,2017-10-04 17:55:35+00:00,"{'DisplayName': 'worldresourcesinst', 'ID': '2...",285482,STANDARD,b'Driver: GTiff/GeoTIFF\nFiles: /vsicurl/https...
69,"""db71d5876f19c90e65c6bd7af72ba487""",resourcewatch/raster/bio_035_coral_reef_future...,2017-10-04 18:58:05+00:00,"{'DisplayName': 'worldresourcesinst', 'ID': '2...",565455,STANDARD,b'Driver: GTiff/GeoTIFF\nFiles: /vsicurl/https...
70,"""a4c32553f06f722778ad3610807fd187""",resourcewatch/raster/cit_014/GHSL_data_access_...,2017-11-07 23:56:13+00:00,"{'DisplayName': 'worldresourcesinst', 'ID': '2...",418497,STANDARD,Not a gdalinfo compatible file


In [None]:
# Extract information from the gdalinfo command
# How long does this take on local computer? How long on a cloud instance?

In [14]:
sample_gdalinfo = s3_files["gdalinfo"].loc[570]
#print(sample_gdalinfo)

def extract_raster_statistics(gdalinfo):
    try:
        statistics = str(gdalinfo).split("\\n")[-5:-1]
        stats = [stat.strip() for stat in statistics]
        stats = [stat.split("=") for stat in stats]
        stats_dict = dict(stats)
        #stat_df = pd.DataFrame.from_dict(orient="index", data=stats_dict)
        #stat_df.columns = ["Value"]
    except:
        stats_dict={"Not gdalinfo compatible"}
    
    return(stats_dict)
    #return(stat_df)
    
# This creates empty dictionaries for each unfilled row - why? Is the datatype default.dict?
s3_files["stats"] = s3_files["gdalinfo"].apply(lambda info: extract_raster_statistics(info))
# This doesn't work - why?
# s3_files["stats"] = s3_files.apply(lambda row: extract_raster_statistics(row["gdalinfo"]), axis=1)

In [15]:
s3_files["stats"]

0      {'STATISTICS_MAXIMUM': '50', 'STATISTICS_MEAN'...
1      {'STATISTICS_MAXIMUM': '50', 'STATISTICS_MEAN'...
2      {'STATISTICS_MAXIMUM': '50', 'STATISTICS_MEAN'...
3      {'STATISTICS_MAXIMUM': '50', 'STATISTICS_MEAN'...
4      {'STATISTICS_MAXIMUM': '50', 'STATISTICS_MEAN'...
5      {'STATISTICS_MAXIMUM': '50', 'STATISTICS_MEAN'...
6      {'STATISTICS_MAXIMUM': '50', 'STATISTICS_MEAN'...
7      {'STATISTICS_MAXIMUM': '50', 'STATISTICS_MEAN'...
8      {'STATISTICS_MAXIMUM': '50', 'STATISTICS_MEAN'...
9      {'STATISTICS_MAXIMUM': '50', 'STATISTICS_MEAN'...
10     {'STATISTICS_MAXIMUM': '50', 'STATISTICS_MEAN'...
11     {'STATISTICS_MAXIMUM': '50', 'STATISTICS_MEAN'...
12     {'STATISTICS_MAXIMUM': '50', 'STATISTICS_MEAN'...
13     {'STATISTICS_MAXIMUM': '50', 'STATISTICS_MEAN'...
14     {'STATISTICS_MAXIMUM': '50', 'STATISTICS_MEAN'...
15     {'STATISTICS_MAXIMUM': '50', 'STATISTICS_MEAN'...
16     {'STATISTICS_MAXIMUM': '50', 'STATISTICS_MEAN'...
17     {'STATISTICS_MAXIMUM': '

In [16]:
s3_files.to_csv("/Users/nathansuberi/Desktop/RW_Data/gdalinfo_on_s3.csv")

Can also use rasterio to calculate stats

https://mapbox.s3.amazonaws.com/playground/perrygeo/rasterio-docs/cookbook.html

In [6]:
from pprint import pprint
import rasterio
import numpy as np

def rasterio_stats(start_time, num_files, key, count, file_size):
    #try:
    start = datetime.now()

    s3_loc = "/vsicurl/https://wri-public-data.s3.amazonaws.com/" + key

    # Could do all this with src.block_windows... just track min and max and keep updating them
    # Could take a long time... maybe only do for those datasets we have to, i.e. can't fit in memory
    # Judge by the size
    CUTOFF_FOR_USING_BLOCK_WINDOWS = 1000000

    if file_size > CUTOFF_FOR_USING_BLOCK_WINDOWS:
        stats = []
        with rasterio.open(s3_loc, "r") as src:                
            num_bands = src.profile["count"]
            for band in range(1, num_bands+1): 

                new_stats = {'min':np.inf,
                 'mean':None,
                 'median':None,
                 'max':-np.inf}

                windows = src.block_windows()
                for ix, window in windows:
                    array = src.read(indexes=band, window=window)
                    array_min = array.min()
                    array_max = array.max()
                    if array_min < new_stats["min"]:
                        new_stats["min"] = array_min
                    if array_max > new_stats["max"]:
                        new_stats["max"] = array_max
            stats.append(new_stats)
            pass

        return(stats)

    with rasterio.open(s3_loc, "r") as src:
        array = src.read()

    stats = []
    for band in array:
        stats.append({
            'min': band.min(),
            'mean': band.mean(),
            'median': np.median(band),
            'max': band.max()})

    # Show how far in we are...
    print("Finished", count, "out of", num_files)
    print("Progress:", count/float(num_files))
    # This would help you to investigate the outputs on some entries, 
    # even if you have to keyboard interrupt before the entire apply function finishes
    # gdalinfo.append(res)

    end = datetime.now()
    print("Time for current file:", end-start)
    print("Total time:", end-start_time, "\n")

    return(stats)
"""    except:
        # Show how far in we are...
        print("Interrupted")
        print("Finished", count, "out of", num_files)
        print("Progress:", count/float(num_files))
        # This would help you to investigate the outputs on some entries, 
        # even if you have to keyboard interrupt before the entire apply function finishes
        # gdalinfo.append(res)

        end = datetime.now()
        print("Time for current file:", end-start)
        print("Total time:", end-start_time, "\n")
        return("Not a gdalinfo compatible file")
"""
# Initializers
#num_files = s3_files_small.shape[0]
#s3_files_small["count"] = list(range(1, num_files+1))
num_files = s3_files.shape[0]
s3_files["count"] = list(range(1, num_files+1))
start_time = datetime.now()

s3_files["rasterio_stats"] = s3_files.apply(lambda row: rasterio_stats(start_time, 
                                                                        num_files, 
                                                                        row["Key"],
                                                                        row["count"],
                                                                        row["Size"]), axis=1)


RasterioIOError: ("'/vsicurl/https://wri-public-data.s3.amazonaws.com/resourcewatch/raster/Annual_discharge_anomalies/optimize.bat' not recognized as a supported file format.", 'occurred at index 38')

In [10]:
# Join the gdalinfo results from the smaller dataset back to your dataframe of all files on s3
s3_files = s3_files.join(s3_files_small["rasterio_stats"])

In [11]:
s3_files

Unnamed: 0,ETag,Key,LastModified,Owner,Size,StorageClass,rasterio_stats
0,"""b19b8950cb1f4b50843642511ad59c44""",resourcewatch/raster/Annual_discharge_anomalie...,2017-11-01 18:50:10+00:00,"{'DisplayName': 'worldresourcesinst', 'ID': '2...",1034300,STANDARD,
1,"""47b6977ea93e1ccd1cd5a4e3f551abff""",resourcewatch/raster/Annual_discharge_anomalie...,2017-11-01 18:50:10+00:00,"{'DisplayName': 'worldresourcesinst', 'ID': '2...",1034300,STANDARD,
2,"""317a22a0425c823f4ab36a34edb0c5b4""",resourcewatch/raster/Annual_discharge_anomalie...,2017-11-01 18:50:10+00:00,"{'DisplayName': 'worldresourcesinst', 'ID': '2...",1034300,STANDARD,
3,"""2744364ab86d0a22904f938bbe1b0654""",resourcewatch/raster/Annual_discharge_anomalie...,2017-11-01 18:50:10+00:00,"{'DisplayName': 'worldresourcesinst', 'ID': '2...",1034300,STANDARD,
4,"""29ecefb7dde5b640d422d9629301ac23""",resourcewatch/raster/Annual_discharge_anomalie...,2017-11-01 18:50:10+00:00,"{'DisplayName': 'worldresourcesinst', 'ID': '2...",1034300,STANDARD,
5,"""fa3a1858d2b4c423d99f53ea3dc6a60b""",resourcewatch/raster/Annual_discharge_anomalie...,2017-11-01 18:50:10+00:00,"{'DisplayName': 'worldresourcesinst', 'ID': '2...",1034300,STANDARD,
6,"""7003db296e5c9704e9aa1c6104bbc390""",resourcewatch/raster/Annual_discharge_anomalie...,2017-11-01 18:50:10+00:00,"{'DisplayName': 'worldresourcesinst', 'ID': '2...",1034300,STANDARD,
7,"""af7632d6641fa3a88af9d04ff98f81c7""",resourcewatch/raster/Annual_discharge_anomalie...,2017-11-01 18:50:10+00:00,"{'DisplayName': 'worldresourcesinst', 'ID': '2...",1034300,STANDARD,
8,"""9aeecfd08caf463667442f976df6129c""",resourcewatch/raster/Annual_discharge_anomalie...,2017-11-01 18:50:10+00:00,"{'DisplayName': 'worldresourcesinst', 'ID': '2...",1034300,STANDARD,
9,"""01db6772a98b32c9174b46549e9b8cac""",resourcewatch/raster/Annual_discharge_anomalie...,2017-11-01 18:50:10+00:00,"{'DisplayName': 'worldresourcesinst', 'ID': '2...",1034300,STANDARD,


Use the python email library (smtplib) and basic outline below to send a message that reports on the statistics calculated by gdalinfo, above. 

In [17]:
import smtplib

gmail_user = 'nsuberi@gmail.com'  
gmail_password = '@'

sent_from = gmail_user  
to = ['nathan.suberi@wri.org' ]
subject = 'Resource Watch status update'  
body = "sample"

email_text = """\  
From: %s  
To: %s  
Subject: %s

%s
""" % (sent_from, to, subject, body)


server = smtplib.SMTP_SSL('smtp.gmail.com', 465)
server.ehlo()
server.login(gmail_user, gmail_password)
server.sendmail(sent_from, to, email_text)
server.close()

print('Email sent!')
#except:  
#    print('Something went wrong...')


## Works to "send" from my account... but hasn't ended up in the other account!

Email sent!
