In [1]:
import boto3
import botocore
import pandas as pd
from IPython.display import display, Markdown

In [2]:
# Low-level S3 client: used by create_bucket, list_buckets and preview_csv_dataset.
s3 = boto3.client('s3')
# Resource-style S3 handle: used by list_bucket_contents to iterate over a bucket's objects.
s3_resource = boto3.resource('s3')

In [3]:
def create_bucket(bucket):
    """Ask S3 to create ``bucket`` and return a human-readable status string.

    A ``botocore`` ``ClientError`` raised by the request is logged through the
    root logger and reported in the returned message instead of propagating.

    Args:
        bucket: Name of the bucket to create.

    Returns:
        A status sentence describing whether the request succeeded.
    """
    import logging

    try:
        s3.create_bucket(Bucket=bucket)
    except botocore.exceptions.ClientError as err:
        logging.error(err)
        outcome = 'Bucket ' + bucket + ' could not be created.'
    else:
        # Reached only when the create call returned without raising.
        outcome = 'Created or already exists ' + bucket + ' bucket.'
    return outcome

In [None]:
# Request creation of the 'afsis-oli' bucket; the returned status string is the cell output.
create_bucket('afsis-oli')

In [4]:
def list_buckets(match=''):
    """Print the names of the S3 buckets returned by the module-level client.

    Args:
        match: Optional substring filter. When empty (the default) every
            bucket name is printed; otherwise only names containing
            ``match`` are printed.

    Returns:
        None. The listing is written to standard output, one indented
        bucket name per line beneath a header line.
    """
    response = s3.list_buckets()
    if match:
        print(f'Existing buckets containing "{match}" string:')
    else:
        print('All existing buckets:')
    for bucket in response['Buckets']:
        name = bucket['Name']
        # An empty filter means "no filter": every bucket must be listed,
        # otherwise the "All existing buckets:" header is followed by nothing.
        if not match or match in name:
            print(f'  {name}')

In [5]:
# Show only the buckets whose name contains "afsis".
list_buckets(match='afsis')

Existing buckets containing "afsis" string:
  afsis-oli


In [6]:
def list_bucket_contents(bucket, match='', size_mb=0):
    """Print the objects in an S3 bucket, optionally filtered, with size totals.

    Every object in the bucket counts towards the bucket totals; only objects
    passing the active filters are listed and counted as "matched".

    Args:
        bucket: Name of the bucket to enumerate.
        match: Optional substring; when non-empty, only keys containing it
            are listed.
        size_mb: Optional upper size limit, in units of bytes / 1024 / 1024;
            when non-zero, only objects no larger than this are listed.
            ``0`` (the default) disables the size filter.

    Returns:
        None. One line per listed object is written to standard output,
        followed by a "Matched" summary (whenever at least one filter is
        active) and a bucket-wide summary.
    """
    bucket_resource = s3_resource.Bucket(bucket)
    # Accumulators are kept in MB and converted to GB only when printed.
    total_size_mb = 0
    total_files = 0
    match_size_mb = 0
    match_files = 0
    for obj in bucket_resource.objects.all():
        obj_size_mb = obj.size / 1024 / 1024
        total_size_mb += obj_size_mb
        total_files += 1

        # A falsy filter value means that filter is switched off.
        name_ok = not match or match in obj.key
        size_ok = not size_mb or obj_size_mb <= size_mb
        if name_ok and size_ok:
            match_files += 1
            match_size_mb += obj_size_mb
            print(f'{obj.key} ({obj_size_mb:3.0f}MB)')

    # Report the filtered subtotal for any active filter, not just `match`,
    # so a size-only query also gets its summary line.
    if match or size_mb:
        print(f'Matched file size is {match_size_mb/1024:3.1f}GB with {match_files} files')

    print(f'Bucket {bucket} total size is {total_size_mb/1024:3.1f}GB with {total_files} files')

In [None]:
# List objects in the 'afsis' bucket whose key contains '.csv' and whose size is at most 250 MB.
list_bucket_contents(bucket='afsis', match='.csv', size_mb=250)

In [7]:
def preview_csv_dataset(bucket, key, rows=10):
    """Load the first ``rows`` rows of a CSV object stored in S3.

    A presigned ``get_object`` URL for the object is generated with the
    module-level client and handed to ``pandas.read_csv``.

    Args:
        bucket: Name of the bucket holding the object.
        key: Key of the CSV object inside the bucket.
        rows: Maximum number of data rows to read (default 10).

    Returns:
        The DataFrame produced by ``pandas.read_csv`` for those rows.
    """
    presigned_url = s3.generate_presigned_url(
        ClientMethod='get_object',
        Params={'Bucket': bucket, 'Key': key},
    )
    return pd.read_csv(presigned_url, nrows=rows)

In [8]:
# Load the first 10 rows of the TXRF CSV from the 'afsis' bucket as a quick preview.
df = preview_csv_dataset(bucket='afsis', key='2009-2013/Dry_Chemistry/ICRAF/Bruker_TXRF/TXRF.csv', rows=10)

In [9]:
# NOTE(review): same bucket, key and row count as the `df` load, so `df1` looks redundant — confirm before removing.
df1 = preview_csv_dataset(bucket='afsis', key='2009-2013/Dry_Chemistry/ICRAF/Bruker_TXRF/TXRF.csv', rows=10)

In [10]:
df.head()

Unnamed: 0,SSN,Public,Na,Mg,Al,P,S,Cl,K,Ca,...,Pr,Nd,Sm,Hf,Ta,W,Hg,Pb,Bi,Th
0,icr005965,True,16023.3,4433.5,37618.6,84.4,45.7,268.1,12412.2,30705.6,...,0.9,14.7,14.0,0.9,2.5,0.2,4.9,3.9,0.1,13.4
1,icr005966,True,20524.6,5832.2,40248.2,72.1,45.7,229.6,12892.2,23234.5,...,1.1,15.8,18.2,0.5,3.2,0.2,4.2,3.3,0.1,19.9
2,icr005985,True,19350.4,5085.8,36766.3,50.6,45.7,157.3,16839.7,16746.2,...,1.2,19.2,14.1,0.8,2.0,1.2,2.6,12.0,0.1,17.9
3,icr005986,True,17410.2,5271.2,37912.2,50.6,45.7,285.2,16818.0,31939.6,...,1.1,16.7,12.6,0.3,1.2,0.5,6.3,10.2,0.1,16.5
4,icr005998,True,19092.5,9169.8,37359.8,50.6,45.7,251.4,17577.9,25298.2,...,1.1,16.7,17.2,0.5,3.2,0.4,4.2,5.6,0.1,18.4


In [11]:
df1.head()

Unnamed: 0,SSN,Public,Na,Mg,Al,P,S,Cl,K,Ca,...,Pr,Nd,Sm,Hf,Ta,W,Hg,Pb,Bi,Th
0,icr005965,True,16023.3,4433.5,37618.6,84.4,45.7,268.1,12412.2,30705.6,...,0.9,14.7,14.0,0.9,2.5,0.2,4.9,3.9,0.1,13.4
1,icr005966,True,20524.6,5832.2,40248.2,72.1,45.7,229.6,12892.2,23234.5,...,1.1,15.8,18.2,0.5,3.2,0.2,4.2,3.3,0.1,19.9
2,icr005985,True,19350.4,5085.8,36766.3,50.6,45.7,157.3,16839.7,16746.2,...,1.2,19.2,14.1,0.8,2.0,1.2,2.6,12.0,0.1,17.9
3,icr005986,True,17410.2,5271.2,37912.2,50.6,45.7,285.2,16818.0,31939.6,...,1.1,16.7,12.6,0.3,1.2,0.5,6.3,10.2,0.1,16.5
4,icr005998,True,19092.5,9169.8,37359.8,50.6,45.7,251.4,17577.9,25298.2,...,1.1,16.7,17.2,0.5,3.2,0.4,4.2,5.6,0.1,18.4


In [12]:
df.shape

(10, 42)