In [2]:
import logging

import boto3
import botocore
import pandas as pd
from IPython.display import display, Markdown

In [3]:
# Shared S3 handles used by every helper below.
# The resource is used for bucket/object access; the client for
# bucket creation, bucket listing and presigned URLs.
s3_resource = boto3.resource('s3')
s3 = boto3.client('s3')

In [4]:
def create_bucket(bucket):
    """Create an S3 bucket and report the outcome as a message.

    Args:
        bucket: Name of the bucket to create.

    Returns:
        A human-readable status string. A ``ClientError`` raised by the
        create call is logged and reported as a failure message instead
        of propagating to the caller.
    """
    try:
        s3.create_bucket(Bucket=bucket)
    except botocore.exceptions.ClientError as e:
        # Log the underlying AWS error so the cause is not lost; the
        # caller only receives the summary string.
        logging.error(e)
        return f'Bucket {bucket} could not be created.'
    return f'Created or already exists {bucket} bucket.'

In [5]:
# Make sure the working bucket for this notebook is available.
create_bucket(bucket='open-data-analytics-noaa')

'Created or already exists open-data-analytics-noaa bucket.'

In [6]:
def list_buckets(match=''):
    """Print the names of the buckets returned by ``s3.list_buckets()``.

    Args:
        match: Optional substring filter. When empty, every bucket is
            printed; otherwise only buckets whose name contains
            ``match`` are printed.

    Returns:
        None. The bucket names are written to stdout, one per line,
        under a heading that describes the filter in effect.
    """
    response = s3.list_buckets()
    if match:
        print(f'Existing buckets containing "{match}" string:')
    else:
        print('All existing buckets:')
    for bucket in response['Buckets']:
        name = bucket['Name']
        # An empty filter means "list everything"; previously nothing was
        # printed in that case even though the heading announced all buckets.
        if not match or match in name:
            print(f'  {name}')

In [7]:
# Show only the buckets whose name mentions NOAA.
list_buckets('noaa')

Existing buckets containing "noaa" string:
  open-data-analytics-noaa


In [8]:
def list_bucket_contents(bucket, match='', size_mb=0):
    """Print the objects in a bucket, optionally filtered, with size totals.

    Args:
        bucket: Name of the S3 bucket to scan.
        match: Optional substring; only keys containing it are listed.
            Empty means no name filter.
        size_mb: Optional upper size limit in MB; only objects at or below
            it are listed. ``0`` means no size filter.

    Returns:
        None. One line is printed per listed object, followed by a summary
        of the matched objects (when any filter is active) and a summary of
        the whole bucket.
    """
    bucket_resource = s3_resource.Bucket(bucket)
    # Sizes are accumulated in MB and converted to GB only when printed.
    total_size_mb = 0
    total_files = 0
    match_size_mb = 0
    match_files = 0
    for obj in bucket_resource.objects.all():
        obj_size_mb = obj.size / 1024 / 1024
        total_size_mb += obj_size_mb
        total_files += 1

        # Each filter is skipped when left at its falsy default.
        name_ok = not match or match in obj.key
        size_ok = not size_mb or obj_size_mb <= size_mb
        if name_ok and size_ok:
            match_files += 1
            match_size_mb += obj_size_mb
            print(f'{obj.key} ({obj_size_mb:3.0f}MB)')

    # Report the matched subset whenever a filter narrowed the listing,
    # including the case where only the size limit was given.
    if match or size_mb:
        print(f'Matched file size is {match_size_mb/1024:3.1f}GB with {match_files} files')

    print(f'Bucket {bucket} total size is {total_size_mb/1024:3.1f}GB with {total_files} files')

In [9]:
# List the CSV objects of the NOAA GHCN bucket that are at most 1000 MB.
list_bucket_contents(
    bucket='noaa-ghcn-pds',
    match='.csv',
    size_mb=1000,
)

csv.gz/1763.csv.gz (  0MB)
csv.gz/1764.csv.gz (  0MB)
csv.gz/1765.csv.gz (  0MB)
csv.gz/1766.csv.gz (  0MB)
csv.gz/1767.csv.gz (  0MB)
csv.gz/1768.csv.gz (  0MB)
csv.gz/1769.csv.gz (  0MB)
csv.gz/1770.csv.gz (  0MB)
csv.gz/1771.csv.gz (  0MB)
csv.gz/1772.csv.gz (  0MB)
csv.gz/1773.csv.gz (  0MB)
csv.gz/1774.csv.gz (  0MB)
csv.gz/1775.csv.gz (  0MB)
csv.gz/1776.csv.gz (  0MB)
csv.gz/1777.csv.gz (  0MB)
csv.gz/1778.csv.gz (  0MB)
csv.gz/1779.csv.gz (  0MB)
csv.gz/1780.csv.gz (  0MB)
csv.gz/1781.csv.gz (  0MB)
csv.gz/1782.csv.gz (  0MB)
csv.gz/1783.csv.gz (  0MB)
csv.gz/1784.csv.gz (  0MB)
csv.gz/1785.csv.gz (  0MB)
csv.gz/1786.csv.gz (  0MB)
csv.gz/1787.csv.gz (  0MB)
csv.gz/1788.csv.gz (  0MB)
csv.gz/1789.csv.gz (  0MB)
csv.gz/1790.csv.gz (  0MB)
csv.gz/1791.csv.gz (  0MB)
csv.gz/1792.csv.gz (  0MB)
csv.gz/1793.csv.gz (  0MB)
csv.gz/1794.csv.gz (  0MB)
csv.gz/1795.csv.gz (  0MB)
csv.gz/1796.csv.gz (  0MB)
csv.gz/1797.csv.gz (  0MB)
csv.gz/1798.csv.gz (  0MB)
csv.gz/1799.csv.gz (  0MB)
c

In [19]:
def preview_csv_dataset(bucket, key, rows=10):
    """Read the first rows of a CSV object stored in S3 into a DataFrame.

    Args:
        bucket: Name of the bucket holding the object.
        key: Key of the CSV object inside the bucket.
        rows: Number of rows to read (passed to pandas as ``nrows``).

    Returns:
        A DataFrame with at most ``rows`` rows. The file is read with
        ``header=None``, so the columns are numbered rather than named.
    """
    # A presigned GET URL lets pandas read the object by URL.
    presigned_url = s3.generate_presigned_url(
        ClientMethod='get_object',
        Params={'Bucket': bucket, 'Key': key},
    )
    return pd.read_csv(presigned_url, nrows=rows, header=None)

In [20]:
# Pull a 1000-row sample of the 2019 observations for a first look.
df = preview_csv_dataset(
    bucket='noaa-ghcn-pds',
    key='csv/2019.csv',
    rows=1000,
)

In [None]:
# Show only the first rows instead of dumping the whole sample.
# TODO: rename the columns first, then explore the data with group-by.
# NOTE: keep the earlier cells in place so the notebook still tells the story.
df.head()

In [None]:
def key_exists(bucket, key):
    """Tell whether an object is present in an S3 bucket.

    Args:
        bucket: Name of the bucket to look in.
        key: Key of the object to check.

    Returns:
        True when loading the object succeeds, False when the service
        answers with error code "404".

    Raises:
        botocore.exceptions.ClientError: For any error code other than
            "404"; the original exception is re-raised unchanged.
    """
    try:
        s3_resource.Object(bucket, key).load()
    except botocore.exceptions.ClientError as err:
        # Only "404" means "no such key"; everything else is a real failure.
        if err.response['Error']['Code'] != "404":
            raise
        return False
    return True

def copy_among_buckets(from_bucket, from_key, to_bucket, to_key):
    """Copy an object between buckets unless the destination key exists.

    Args:
        from_bucket: Bucket holding the source object.
        from_key: Key of the source object.
        to_bucket: Bucket that receives the copy.
        to_key: Key to store the copy under.

    Returns:
        None. A message stating whether the file was saved or was already
        present is printed.
    """
    # Guard clause: never overwrite an object that is already there.
    if key_exists(to_bucket, to_key):
        print(f'File {to_key} already exists in S3 bucket {to_bucket}')
        return

    copy_source = {'Bucket': from_bucket, 'Key': from_key}
    s3_resource.meta.client.copy(copy_source, to_bucket, to_key)
    print(f'File {to_key} saved to S3 bucket {to_bucket}')

In [None]:
# Copy the ICRAF wet-chemistry README from the afsis bucket into the
# working bucket (skipped by the helper if the target key already exists).
copy_among_buckets(
    from_bucket='afsis',
    from_key='2009-2013/Wet_Chemistry/ICRAF/README.md',
    to_bucket='open-data-analytics-afsis',
    to_key='ICRAF.README.md',
)