# Imports

In [1]:
import os
import pandas as pd
import boto3

### Configuring credentials

In [2]:
import configparser

# The AWS credentials are kept outside the notebook, in an INI-style file
# with an [AWS] section that holds the KEY and SECRET options.
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open, so a missing file
# would only surface below as a confusing NoSectionError. Opening the file
# explicitly and using read_file() fails right away with the real cause.
with open('/home/jovyan/.aws/dbuser_config.cfg') as config_file:
    config.read_file(config_file)
# Publish the credentials as environment variables for the rest of the session.
os.environ['AWS_KEY_ID'] = config.get("AWS", "KEY")
os.environ['AWS_SECRET'] = config.get("AWS", 'SECRET')

### Connecting to s3
- Creating an S3 resource and an S3 client
- See https://stackoverflow.com/q/39272744/7177305
- Sometimes you need the resource, sometimes the client. Check which one an answer uses before copy-pasting it from Stack Overflow

In [3]:
# The resource and the client are built from the same region and the same
# credentials (read back from the environment), so the settings are spelled
# out once and reused for both handles.
_s3_settings = dict(
    region_name='us-west-2',
    aws_access_key_id=os.environ.get('AWS_KEY_ID'),
    aws_secret_access_key=os.environ.get('AWS_SECRET'),
)
s3r = boto3.resource("s3", **_s3_settings)
s3c = boto3.client("s3", **_s3_settings)

#### Listing buckets

In [4]:
# Ask the client for the account's buckets and display only the 'Buckets'
# entry of the response.
bucket_listing = s3c.list_buckets()
bucket_listing['Buckets']

[{'Name': 'aws-emr-resources-075227836161-us-west-2',
  'CreationDate': datetime.datetime(2020, 10, 19, 12, 0, 41, tzinfo=tzlocal())},
 {'Name': 'dendpaulogieruswest2',
  'CreationDate': datetime.datetime(2020, 11, 5, 9, 30, 54, tzinfo=tzlocal())}]

In [8]:
# Bucket and key prefix used by the listing and reading cells below.
mybucket, myprefix = 'dendpaulogieruswest2', 'sampledata'

#### Listing Objects
https://stackoverflow.com/q/30249069/7177305

In [106]:
# A single list_objects call returns at most 1000 keys, and its response has
# no 'Contents' entry at all when nothing matches the prefix. Paginating over
# list_objects_v2 and defaulting to an empty list covers both cases: every
# matching key is printed, and an empty prefix simply prints nothing.
paginator = s3c.get_paginator('list_objects_v2')
for page in paginator.paginate(Bucket=mybucket, Prefix=myprefix):
    for obj_summary in page.get('Contents', []):
        print(obj_summary['Key'])

sampledata/
sampledata/connectionssample.json
sampledata/connectionssample2.json
sampledata/scores.csv
sampledata/titanic-data.csv


## Read using pandas

### Csv
https://stackoverflow.com/questions/30818341/how-to-read-a-csv-file-from-an-s3-bucket-using-pandas-in-python

In [109]:
from io import StringIO

# Fetch the object, decode its payload as UTF-8 text, then let pandas parse
# that text through an in-memory file.
titanic_key = 'sampledata/titanic-data.csv'
csv_obj = s3c.get_object(Bucket=mybucket, Key=titanic_key)
csv_string = csv_obj['Body'].read().decode('utf-8')
df = pd.read_csv(StringIO(csv_string))
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Multiple files
https://stackoverflow.com/questions/52855221/reading-multiple-csv-files-from-s3-bucket-with-boto3

In [114]:
myprefix2 = 'sampledata/connectionssample'
bucket = s3r.Bucket(mybucket)
prefix_objs = bucket.objects.filter(Prefix=myprefix2)

# The collection is lazy: every iteration over it lists the bucket again.
# Materialise it once so the objects printed are exactly the objects loaded.
matched_objs = list(prefix_objs)
if not matched_objs:
    # pd.concat([]) would also raise ValueError, but without naming the cause.
    raise ValueError(f"No objects found under s3://{mybucket}/{myprefix2}")

for c in matched_objs:
    print(c)

# Load each matching object as JSON into its own frame, then stack the frames.
prefix_df = []
for obj in matched_objs:
    body = obj.get()['Body']
    fstring = body.read().decode('utf-8')
    temp = pd.read_json(StringIO(fstring))
    prefix_df.append(temp)
df = pd.concat(prefix_df)

s3.ObjectSummary(bucket_name='dendpaulogieruswest2', key='sampledata/connectionssample.json')
s3.ObjectSummary(bucket_name='dendpaulogieruswest2', key='sampledata/connectionssample2.json')
