In [31]:
import boto3
from urllib.parse import urlparse
from io import StringIO, BytesIO
import gzip
from awscli.customizations.s3.utils import split_s3_bucket_key

def get_io_from_s3url(url):
    """Fetch a gzip-compressed S3 object and return its text as an in-memory file.

    Parameters
    ----------
    url : str
        Location of the object; handed unchanged to ``split_s3_bucket_key``
        to obtain the bucket name and the key.

    Returns
    -------
    io.StringIO
        The decompressed payload decoded as UTF-8, positioned at offset 0 so
        it can be iterated line by line straight away.

    Notes
    -----
    The object body is assumed to be gzip-encoded. The whole body is read
    and decompressed in memory.
    """
    s3 = boto3.client('s3')
    bucket, key = split_s3_bucket_key(url)
    raw_bytes = s3.get_object(Bucket=bucket, Key=key)['Body'].read()

    # The body is a blob of compressed bytes; wrap it in a BytesIO so that
    # GzipFile can treat it like a file and decompress it in memory.
    gz_reader = gzip.GzipFile(fileobj=BytesIO(raw_bytes))
    text = gz_reader.read().decode('utf-8')

    # A StringIO created from an initial value starts at position 0.
    return StringIO(text)

def get_gzDecoded_IO(url):
    """Open a gzip-compressed resource and return a text-mode file-like object.

    Parameters
    ----------
    url : str
        Either an ``s3://bucket/key`` URL or a ``file://`` URL.

    Returns
    -------
    file-like object or None
        An object that yields decoded ``str`` lines: the result of
        ``get_io_from_s3url`` for the ``s3`` scheme, or a UTF-8 text-mode
        gzip reader for the ``file`` scheme. ``None`` is returned for any
        other scheme.

    Notes
    -----
    For ``file`` URLs only the parsed ``path`` component is opened, so
    anything that ``urlparse`` places in ``netloc`` (e.g. the ``c:`` of
    ``file://c:/temp/x.gz``) is ignored.
    """
    parsedUrl = urlparse(url)
    # Examples of what urlparse produces:
    #   'file://c:/temp/t1/01.jsonl.gz'
    #       ParseResult(scheme='file', netloc='c:', path='/temp/t1/01.jsonl.gz', params='', query='', fragment='')
    #   's3://net.energyhub.assets/public/dev-exercises/audit-data/2016/01/01.jsonl.gz'
    #       ParseResult(scheme='s3', netloc='net.energyhub.assets', path='/public/dev-exercises/audit-data/2016/01/01.jsonl.gz', params='', query='', fragment='')
    scheme = parsedUrl.scheme
    if scheme == 's3':
        return get_io_from_s3url(url)
    if scheme == 'file':
        # gzip.open's 'r' mode is binary and would yield bytes lines; use
        # text mode so local files yield str lines like the s3 branch does.
        return gzip.open(parsedUrl.path, 'rt', encoding='utf-8')
    # Unsupported scheme: keep returning None as before.
    return None
    
# Pick the audit log to read; alternative local sources are kept for reference.
audit_url = 's3://net.energyhub.assets/public/dev-exercises/audit-data/2016/01/01.jsonl.gz'
#audit_url = 'file:///temp/t1/01.jsonl.gz'
#audit_url = 'file:///repos/Mgy/Pyhton/TnL/EnergyHubAudit/data/2016/01/05.jsonl.gz'

# The context manager closes the stream even if printing a line fails.
with get_gzDecoded_IO(audit_url) as fio:
    for line in fio:
        print(line.replace('\n',''))


{"changeTime": "2016-01-01T00:30:00.001059", "after": {"ambientTemp": 79.0}, "before": {"ambientTemp": 77.0}}
{"changeTime": "2016-01-01T00:43:00.001064", "after": {"ambientTemp": 80.0}, "before": {"ambientTemp": 79.0}}
{"changeTime": "2016-01-01T01:32:00.009816", "after": {"ambientTemp": 81.0}, "before": {"ambientTemp": 80.0}}
{"changeTime": "2016-01-01T01:38:00.001038", "after": {"ambientTemp": 82.0}, "before": {"ambientTemp": 81.0}}
{"changeTime": "2016-01-01T01:44:00.001145", "after": {"ambientTemp": 81.0}, "before": {"ambientTemp": 82.0}}
{"changeTime": "2016-01-01T02:08:30.010956", "after": {"ambientTemp": 79.0}, "before": {"ambientTemp": 81.0}}
{"changeTime": "2016-01-01T02:47:30.002413", "after": {"ambientTemp": 77.0}, "before": {"ambientTemp": 79.0}}
{"changeTime": "2016-01-01T03:02:30.001424", "after": {"ambientTemp": 78.0}, "before": {"ambientTemp": 77.0}}
{"changeTime": "2016-01-01T03:08:00.007712", "after": {"ambientTemp": 80.0}, "before": {"ambientTemp": 78.0}}
{"changeTi

In [18]:
#aws s3 cp s3://s3-mgy.pub.01/test1.txt .
#aws s3 cp s3://net.energyhub.assets/public/dev-exercises/audit-data/2016/01/01.jsonl.gz .

import boto3

# Fetch the raw (still gzip-compressed) audit object and print its bytes.
s3_client = boto3.client('s3')
# Objects tried earlier; left disabled so no request is made whose response
# would be discarded immediately.
#obj =s3_client.get_object(Bucket='s3-mgy.pub.01', Key='test1.txt')
#obj =s3_client.get_object(Bucket='s3-mgy-pub-01', Key='test1.txt')
obj = s3_client.get_object(Bucket='net.energyhub.assets', Key='public/dev-exercises/audit-data/2016/01/01.jsonl.gz')
print(obj['Body'].read())

b'\x1f\x8b\x08\x08\x9a\x13\xe8X\x00\x0301.jsonl\x00\xb5\x97\xcbn\xea0\x10\x86\xf7\xe7)\xaa\xac\x0f\xc83\xbe\x8d\xbd\xeb;\xf0\x02)5\x07$.\x15\xa4+\xd4w?&qP\xeb\x16O\x90\xa9\xc4&\x8a\xcd7\x9e\xf9\xff\xf1\xe4\xdc,\xd7\xed\xfe_Xlv\xa1\xf1O\r\n03\x01\xf1\xb7\x10\xc2K\xe1\x85\x98\x0b\x01B\xbb\xe6\xefS\xd3\xae\xbap\x8c\xcb\xceM\xbb{\xd9\x84}\xb7\x08\xbb\xb7\xf8l\xdd\\|\xc4\xf7/au8\x86\x9f\x16\xd8\xb8\xe0\xe3\xcf\xb9HSr\xa4\x19U\xa2\x91\xe0h\x8e\xa5\x81\x978\xd0\x1c\x81)\xd2\x80\xa1\xf5\xe1\xb04\x1a\xcf&\xa9HC\x8e\x06\x13hJ%\x1a(]w6di1\x8d\x14\x852\x8fGs\xba\x98IV%\x13\xce\x86^\xd9\x9e&P\x81,\xd2l\xbdJ\xa2 q\xa0\x81\xc2\xa2&-\xd5;@^2\xd9\xd7\xcdZ\xc0:\x07\xd0\x04\x1a\xa4\xb3\x91\x93\x95u\xe3\x1d\x10i\x942\xe9\xb4\xf8J;-\xd7\xe1\xf5}{\xd9\xd3\x1d\xdfC\x86\xfa\xf4v\xd5nO\x81\x05\xa1J \xa0\x1c\x14\xba\xb7\xc3f\xdf\xf5\x0f\xeb\xd0\x8e\xf1\x9b\xbe6\x19\xf6\xd6\xda^5\\\x0c*\xc5\x80\xd6\xc0o\x8b4\xa2L/\x1b)\x812\xbbo\xdbS\xf7\xbc\r\xc7nqb\xb6eQ|\xdf\xa8g\x803\x197\x1a/a\x90\xa9\xb6\x02\x1b&:\xe3q\xb8

In [37]:
#aws s3 cp s3://s3-mgy.pub.01/test1.txt .
#aws s3 cp s3://net.energyhub.assets/public/dev-exercises/audit-data/2016/01/01.jsonl.gz .

import boto3
import gzip
import codecs
# Imported here so this cell does not depend on another cell having run first.
from io import StringIO, BytesIO

s3_client = boto3.client('s3')

# Object tried earlier; left disabled so no request is made whose response
# would be discarded immediately.
#obj =s3_client.get_object(Bucket='s3-mgy-pub-01', Key='test1.txt')
obj = s3_client.get_object(Bucket='net.energyhub.assets', Key='public/dev-exercises/audit-data/2016/01/01.jsonl.gz')

# The body is a blob of gzip-compressed bytes.
content = obj['Body'].read()

# Wrap the compressed bytes in a BytesIO so GzipFile can read them as a file.
compressedFile = BytesIO(content)

# Decompress in memory.
decompressedFile = gzip.GzipFile(fileobj=compressedFile)

# Store the decoded text in a StringIO and rewind it so it can be iterated.
outFile = StringIO()
outFile.write(decompressedFile.read().decode('utf-8'))
outFile.seek(0)

for line in outFile:
    print(line.replace('\n',''))
# Close the stream created in this cell (not a leftover object from another cell).
outFile.close()

{"changeTime": "2016-01-01T00:30:00.001059", "after": {"ambientTemp": 79.0}, "before": {"ambientTemp": 77.0}}
{"changeTime": "2016-01-01T00:43:00.001064", "after": {"ambientTemp": 80.0}, "before": {"ambientTemp": 79.0}}
{"changeTime": "2016-01-01T01:32:00.009816", "after": {"ambientTemp": 81.0}, "before": {"ambientTemp": 80.0}}
{"changeTime": "2016-01-01T01:38:00.001038", "after": {"ambientTemp": 82.0}, "before": {"ambientTemp": 81.0}}
{"changeTime": "2016-01-01T01:44:00.001145", "after": {"ambientTemp": 81.0}, "before": {"ambientTemp": 82.0}}
{"changeTime": "2016-01-01T02:08:30.010956", "after": {"ambientTemp": 79.0}, "before": {"ambientTemp": 81.0}}
{"changeTime": "2016-01-01T02:47:30.002413", "after": {"ambientTemp": 77.0}, "before": {"ambientTemp": 79.0}}
{"changeTime": "2016-01-01T03:02:30.001424", "after": {"ambientTemp": 78.0}, "before": {"ambientTemp": 77.0}}
{"changeTime": "2016-01-01T03:08:00.007712", "after": {"ambientTemp": 80.0}, "before": {"ambientTemp": 78.0}}
{"changeTi

In [22]:
import boto
from boto.s3.connection import S3Connection
from boto.s3.key import Key
from io import StringIO, BytesIO
import gzip


# SECURITY: credentials must never be stored in the notebook. They are read
# from the environment instead; when the variables are unset, None is passed
# and boto falls back to its own credential lookup (environment / boto config).
# NOTE(review): the key pair that used to be hard-coded here should be
# considered leaked and rotated.
import os

aws_access_key_id = os.environ.get('AWS_ACCESS_KEY_ID')
aws_secret_access_key = os.environ.get('AWS_SECRET_ACCESS_KEY')
conn = S3Connection(aws_access_key_id, aws_secret_access_key)

# Alternative object (previously assigned and then overwritten):
#bocketId='net.energyhub.assets'
#keyId='public/dev-exercises/audit-data/2016/01/01.jsonl.gz'
bocketId='s3-mgy-pub-01'
keyId='01.jsonl.gz'

b = conn.get_bucket(bocketId)
k = b.get_key(keyId)
if k is None:
    # get_key returns None for a missing key; fail with a clear message
    # instead of an AttributeError on the next line.
    raise KeyError("Key %r not found in bucket %r" % (keyId, bocketId))

# Download the compressed object into memory.
f = BytesIO()
k.get_file(f)
f.seek(0) #This is crucial: rewind so GzipFile reads from the start

# Iterate over the decompressed content; lines are bytes in this mode.
with gzip.GzipFile(fileobj=f) as gzf:
    for line in gzf:
        print(line)



b'{"changeTime": "2016-01-01T00:30:00.001059", "after": {"ambientTemp": 79.0}, "before": {"ambientTemp": 77.0}}\n'
b'{"changeTime": "2016-01-01T00:43:00.001064", "after": {"ambientTemp": 80.0}, "before": {"ambientTemp": 79.0}}\n'
b'{"changeTime": "2016-01-01T01:32:00.009816", "after": {"ambientTemp": 81.0}, "before": {"ambientTemp": 80.0}}\n'
b'{"changeTime": "2016-01-01T01:38:00.001038", "after": {"ambientTemp": 82.0}, "before": {"ambientTemp": 81.0}}\n'
b'{"changeTime": "2016-01-01T01:44:00.001145", "after": {"ambientTemp": 81.0}, "before": {"ambientTemp": 82.0}}\n'
b'{"changeTime": "2016-01-01T02:08:30.010956", "after": {"ambientTemp": 79.0}, "before": {"ambientTemp": 81.0}}\n'
b'{"changeTime": "2016-01-01T02:47:30.002413", "after": {"ambientTemp": 77.0}, "before": {"ambientTemp": 79.0}}\n'
b'{"changeTime": "2016-01-01T03:02:30.001424", "after": {"ambientTemp": 78.0}, "before": {"ambientTemp": 77.0}}\n'
b'{"changeTime": "2016-01-01T03:08:00.007712", "after": {"ambientTemp": 80.0}, "

In [24]:
"""

This code snippet comes from Stack Overflow:
https://stackoverflow.com/questions/4993439/how-can-i-access-s3-files-in-python-using-urls
https://github.com/aws/aws-cli/blob/e2295b022db35eea9fec7e6c5540d06dbd6e588b/awscli/customizations/s3/utils.py#L192-L216

The objective is to get the bucket and key/path for a given 's3://...' link.
"""
from awscli.customizations.s3.utils import split_s3_bucket_key
import boto3

# The except clause below needs this name; without the import any download
# error would surface as a NameError instead of being handled.
from botocore.exceptions import ClientError

# Alternative object (previously assigned and then overwritten):
#myS3path='s3://s3-mgy.pub.01/test1.txt'
myS3path='s3://net.energyhub.assets/public/dev-exercises/audit-data/2016/01/01.jsonl.gz'

# Split the s3:// link into bucket and key.
client = boto3.client('s3')
bucket_name, key_name = split_s3_bucket_key(myS3path)
print(bucket_name, key_name)

# Read the object directly; the body is still gzip-compressed bytes.
response = client.get_object(Bucket=bucket_name, Key=key_name)
print(response['Body'].read())

# Download the same object to a local file through the resource API.
s3 = boto3.resource('s3')
try:
    s3.Bucket(bucket_name).download_file(key_name, 'c:/temp/audit_test.gz')
except ClientError as e:
    if e.response['Error']['Code'] == "404":
        print("The object does not exist.")
    else:
        raise

net.energyhub.assets public/dev-exercises/audit-data/2016/01/01.jsonl.gz
b'\x1f\x8b\x08\x08\x9a\x13\xe8X\x00\x0301.jsonl\x00\xb5\x97\xcbn\xea0\x10\x86\xf7\xe7)\xaa\xac\x0f\xc83\xbe\x8d\xbd\xeb;\xf0\x02)5\x07$.\x15\xa4+\xd4w?&qP\xeb\x16O\x90\xa9\xc4&\x8a\xcd7\x9e\xf9\xff\xf1\xe4\xdc,\xd7\xed\xfe_Xlv\xa1\xf1O\r\n03\x01\xf1\xb7\x10\xc2K\xe1\x85\x98\x0b\x01B\xbb\xe6\xefS\xd3\xae\xbap\x8c\xcb\xceM\xbb{\xd9\x84}\xb7\x08\xbb\xb7\xf8l\xdd\\|\xc4\xf7/au8\x86\x9f\x16\xd8\xb8\xe0\xe3\xcf\xb9HSr\xa4\x19U\xa2\x91\xe0h\x8e\xa5\x81\x978\xd0\x1c\x81)\xd2\x80\xa1\xf5\xe1\xb04\x1a\xcf&\xa9HC\x8e\x06\x13hJ%\x1a(]w6di1\x8d\x14\x852\x8fGs\xba\x98IV%\x13\xce\x86^\xd9\x9e&P\x81,\xd2l\xbdJ\xa2 q\xa0\x81\xc2\xa2&-\xd5;@^2\xd9\xd7\xcdZ\xc0:\x07\xd0\x04\x1a\xa4\xb3\x91\x93\x95u\xe3\x1d\x10i\x942\xe9\xb4\xf8J;-\xd7\xe1\xf5}{\xd9\xd3\x1d\xdfC\x86\xfa\xf4v\xd5nO\x81\x05\xa1J \xa0\x1c\x14\xba\xb7\xc3f\xdf\xf5\x0f\xeb\xd0\x8e\xf1\x9b\xbe6\x19\xf6\xd6\xda^5\\\x0c*\xc5\x80\xd6\xc0o\x8b4\xa2L/\x1b)\x812\xbbo\xdbS\xf7\xb