In [None]:
# Export the AWS settings into this kernel's environment so that the boto3
# calls in the following cells can authenticate with them.
# AWS_PROFILE is a placeholder: replace it with the name of a profile that
# exists in the config/credentials files referenced below before running.
%set_env AWS_PROFILE=YOUR-CURRENT-AWS-PROFILE-NAME
# NOTE(review): /home/jovyan looks like the home directory of a Jupyter
# container image -- adjust both paths if the notebook runs elsewhere.
%set_env AWS_CONFIG_FILE=/home/jovyan/.aws/config
%set_env AWS_SHARED_CREDENTIALS_FILE=/home/jovyan/.aws/credentials

In [None]:
import codecs

import boto3

# Create the session explicitly and build the S3 client *from that session*.
# Calling boto3.client('s3') instead would ignore `session` and lazily create
# a separate default session, so the explicit session would have no effect.
session = boto3.session.Session()
s3 = session.client('s3')
response = s3.list_buckets()

# Check the existence of the given bucket among the buckets owned by the
# authenticated account.
mybucket = 'pcdas'
print(f'Checking the existence of my bucket {mybucket}:')

# First bucket entry whose name matches, or None when there is no match.
matching_bucket = next(
    (bucket for bucket in response.get('Buckets', []) if bucket['Name'] == mybucket),
    None,
)
found = matching_bucket is not None

if found:
    # Owner.DisplayName is only returned in some regions; fall back to the
    # owner ID (or a fixed label) instead of raising KeyError.
    owner = response.get('Owner', {})
    owner_label = owner.get('DisplayName') or owner.get('ID', 'unknown owner')
    print(f'  {matching_bucket["Name"]} {matching_bucket["CreationDate"]} {owner_label}')
else:
    print(f'  {mybucket} is not found.')

### Source of the population dataset
 * [United Nations – Dept. of Economic and Social Affairs, population dynamics](https://population.un.org/wpp/Download/Standard/CSV/)
 
 ```
 $ curl "https://population.un.org/wpp/Download/Files/1_Indicators%20(Standard)/CSV_FILES/WPP2019_TotalPopulationBySex.csv" > TotalPopulation.csv
 $ head -1 TotalPopulation.csv
 LocID,Location,VarID,Variant,Time,MidPeriod,PopMale,PopFemale,PopTotal,PopDensity
 $ gzip TotalPopulation.csv
 $ aws s3 cp TotalPopulation.csv.gz s3://<bucket>
 ```
 
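### Filtering an S3 CSV file at the storage site using a SELECT statement
 * Note the filter condition in the `WHERE` clause and the `LIMIT` clause: both are evaluated at the storage site, so only the matching rows are sent back to the notebook.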
 

In [18]:
# Run the SQL filter at the storage site (S3 Select): the gzip-compressed CSV
# is read with its header row as column names, and only the rows that satisfy
# the WHERE clause (capped by LIMIT) are returned, serialized as CSV.
r = s3.select_object_content(
    Bucket=mybucket,
    Key='TotalPopulation.csv.gz',
    ExpressionType='SQL',
    Expression="select * from s3object s where (s.Location like '%United States of America%') limit 20",
    InputSerialization={
        'CSV': {
            "FileHeaderInfo": "USE",
        },
        'CompressionType': 'GZIP',
    },
    OutputSerialization={'CSV': {}},
)

# The payload is an event stream. A 'Records' event carries an arbitrary slice
# of the result bytes, so a slice can end in the middle of a record or even in
# the middle of a multi-byte UTF-8 character. Decode incrementally so that a
# split character is carried over to the next event instead of raising
# UnicodeDecodeError.
decoder = codecs.getincrementaldecoder('utf-8')()

for event in r['Payload']:
    if 'Records' in event:
        records = decoder.decode(event['Records']['Payload'])
        # end='' keeps the stream contiguous: the records already contain
        # their own line terminators, and an extra newline here would break a
        # record that spans two events.
        print(records, end='')
    elif 'Stats' in event:
        statsDetails = event['Stats']['Details']
        print("Stats details bytesScanned: ")
        print(statsDetails['BytesScanned'])
        print("Stats details bytesProcessed: ")
        print(statsDetails['BytesProcessed'])

# Flush the decoder; raises UnicodeDecodeError if the stream ended in the
# middle of a character.
print(decoder.decode(b'', final=True), end='')


840,United States of America,2,Medium,1950,1950.5,79233.218,79571.179,158804.397,17.361
840,United States of America,2,Medium,1951,1951.5,80165.741,80706.523,160872.264,17.587
840,United States of America,2,Medium,1952,1952.5,81281.509,81984.517,163266.026,17.848
840,United States of America,2,Medium,1953,1953.5,82534.378,83375.618,165909.996,18.137
840,United States of America,2,Medium,1954,1954.5,83884.153,84852.237,168736.39,18.446
840,United States of America,2,Medium,1955,1955.5,85296.618,86388.719,171685.337,18.769
840,United States of America,2,Medium,1956,1956.5,86743.656,87961.35,174705.006,19.099
840,United States of America,2,Medium,1957,1957.5,88203.12,89548.357,177751.477,19.432
840,United States of America,2,Medium,1958,1958.5,89658.37,91130.017,180788.387,19.764
840,United States of America,2,Medium,1959,1959.5,91097.416,92688.833,183786.249,20.092
840,United States of America,2,Medium,1960,1960.5,92510.602,94209.968,186720.57,20.412
840,United States of Ameri