In [7]:
# Move one level up so that the working directory is the repository root;
# later cells build the catalogue paths from os.getcwd().
# NOTE(review): `%cd ..` is relative, so re-running this cell moves up yet
# another level - run it exactly once per kernel session.
%pwd
%cd ..

/home/batman/ocn/mantaray


In [8]:
"""
After confirming the connection to S3, use this script to manage datasets.

A data catalogue (.csv) is maintained in the `catalog/` directory of the repository.

The catalogue is loaded into a pandas DataFrame object.


"""

import osmosis_aws_driver.data_S3_plugin as ocean_s3
# General imports
import sys
import os
#import glob
import pandas as pd
import hashlib

In [9]:
# Name of the S3 bucket that holds the uploaded catalogue files
S3_BUCKET_NAME = "data-catalogue-r00"



In [10]:
import logging

# Registry of all named loggers known to the logging module (kept for inspection)
loggers_dict = logging.Logger.manager.loggerDict

# Message layout: timestamp, numeric level, originating module, message.
# Alternative layout that also shows the function name:
# FORMAT = "%(asctime)s - %(levelno)s - %(module)-15s - %(funcName)-15s - %(message)s"
FORMAT = "%(asctime)s L%(levelno)s: %(module)-15s %(message)s"
DATE_FMT = "%Y-%m-%d %H:%M:%S"

# Configure the root logger to emit everything from DEBUG upwards
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# One stream handler writing to stderr with the layout defined above
formatter = logging.Formatter(fmt=FORMAT, datefmt=DATE_FMT)
handler = logging.StreamHandler(sys.stderr)
handler.setFormatter(formatter)

# Replace (rather than extend) the handler list so that re-running this cell
# does not produce duplicated log lines
logger.handlers = [handler]
logger.critical("Logging started")


2018-10-09 16:30:46 L50: <ipython-input-10-6c35145f83e4> Logging started


In [11]:
# The working directory is the repo root
# The notebook expects to run from the repo root
logging.debug("Current working directory: {}".format(os.getcwd()))

# File names, relative to the catalog directory:
# the source catalog ...
FNAME_SOURCE_CATALOG = "Original/OceanDataSets_master catalog clean.csv"
# ... and the current catalog, which stores the updated state
FNAME_CURRENT_CATALOG = "Master catalog current.csv"

# Absolute paths, both rooted at <cwd>/catalog
_catalog_dir = os.path.join(os.getcwd(), 'catalog')
PATH_SOURCE_CATALOGUE = os.path.join(_catalog_dir, FNAME_SOURCE_CATALOG)
PATH_CURRENT_CATALOGUE = os.path.join(_catalog_dir, FNAME_CURRENT_CATALOG)

# Stop here, reporting the offending path, if either catalogue file is missing
for _catalogue_path in (PATH_SOURCE_CATALOGUE, PATH_CURRENT_CATALOGUE):
    assert os.path.exists(_catalogue_path), "{}".format(_catalogue_path)

2018-10-09 16:30:46 L10: <ipython-input-11-3d0a2122a6e2> Current working directory: /home/batman/ocn/mantaray


In [12]:
# Load the current catalogue into a DataFrame
df = pd.read_csv(PATH_CURRENT_CATALOGUE)

# Series.sum() skips missing sizes, whereas the builtin sum() would turn the
# whole total into NaN as soon as one SizeGB value is missing.
total_GB = df['SizeGB'].sum()
logging.debug("Loaded data catalogue with {} records representing {:0.0f} GB".format(len(df),total_GB))
logging.debug("{} files have been flagged as already uploaded to S3.".format(df['uploaded'].sum()))

# Number of records per error type, leaving out the 'No error' placeholder
errors = df.loc[df['error'] != 'No error', 'error'].value_counts()
logging.debug("{} files have been flagged with an upload error.".format(errors.sum()))

print("Error summary:")
# Series.iteritems() was removed in pandas 2.0; Series.items() yields the same
# (error name, count) pairs and also exists in older pandas releases.
for err in errors.items():
    print('\t',*err)

res = df.head()
# NOTE(review): from here on `df` only holds the first five records, so every
# later cell works on that subset - presumably a dry-run limit; confirm before
# processing the full catalogue.
df = df[0:5]


2018-10-09 16:30:47 L10: <ipython-input-12-951140238dab> Loaded data catalogue with 887 records representing 2717 GB
2018-10-09 16:30:47 L10: <ipython-input-12-951140238dab> 240 files have been flagged as already uploaded to S3.
2018-10-09 16:30:47 L10: <ipython-input-12-951140238dab> 239 files have been flagged with an upload error.


Error summary:
	 NameError 134
	 InvalidSchema 97
	 MissingSchema 7
	 ConnectionError 1


In [13]:

# The `osmosis-aws-driver` package, imported here as `ocean_s3`, is a wrapper for Boto3.
# Example of an explicit configuration (currently disabled):
# config = dict()
# config['region'] = 'eu-central-1'

# The plugin is created without a configuration; the log line emitted by this
# cell reports the region the plugin object was created in.
config = None # No configuration needed
ocn_s3 = ocean_s3.S3_Plugin(config)

2018-10-09 16:30:50 L10: data_S3_plugin  Created a new S3 plugin object in region: eu-central-1


In [14]:
# Print a running index and the 'Name' entry of every bucket the plugin returns
for i,b in enumerate(ocn_s3.list_buckets()):
    print(i,b['Name'])

0 data-catalogue-r00
1 ein-bucket1
2 ocean-test-osmosis-data-plugin-1537445137
3 ocean-test-osmosis-data-plugin-1537521079
4 ocean-test-osmosis-data-plugin-1537521088
5 ocean-test-osmosis-data-plugin-1537521138


In [15]:
# Disabled on purpose: uncomment both lines to delete a single bucket by name.
# bucketname="ocean-test-osmosis-data-plugin-1537444458"
# ocn_s3.delete_bucket(bucketname)

In [16]:
# Handle to the catalogue bucket, obtained through the plugin's `s3_resource`
# attribute (presumably a Boto3 S3 resource - confirm in the driver)
bucket = ocn_s3.s3_resource.Bucket(S3_BUCKET_NAME)

In [17]:
# Index every object listed in the bucket by its key
s3files = {s3_object.key: s3_object for s3_object in bucket.objects.all()}

# Total stored size in decimal gigabytes (bytes / 1000 / 1000 / 1000).
# NOTE: this reuses the name `total_GB`, replacing the catalogue total computed earlier.
total_GB = sum(s3_object.size for s3_object in s3files.values())/1000/1000/1000

logging.debug("{} files on {}, {:0.2f} GB".format(len(s3files),S3_BUCKET_NAME,total_GB))

2018-10-09 16:31:00 L10: <ipython-input-17-6c3030e86c95> 240 files on data-catalogue-r00, 76.64 GB


In [18]:
# Select a subset of files
these_keys = list(s3files.keys())[:2]
for f in these_keys:
    meta_data = s3files[f].Object().metadata
    print(f, meta_data)

006b7dcaf188ebe4c2e6dd2ec4be24c9712141b12864d6b65684003ef116e456 {'hash': '006b7dcaf188ebe4c2e6dd2ec4be24c9712141b12864d6b65684003ef116e456', 'created_time': '2018-08-07T14%3A51%3A51.099497', 'recordname': '2015%20Montreal%20MOXI%20Bike%20Share', 'version': 'v1', 'note': 'Montreal', 'category2': 'nan', 'category3': 'nan', 'category4': 'nan', 'type': 'text', 'p-id': 'p3065', 'classification': 'public', 'industry': 'Travel%20and%20Transportation', 'format': 'zip', 'keywords': 'nan', 'license': 'nan', 'download_link': 'https%3A%2F%2Fmontreal.bixi.com%2Fc%2Fbixi%2Ffile_db%2Fdata_all.file%2FBixiMontrealRentals2015.zip', 'sizegb': '0.173', 'updatefrequency': 'static', 'lifecyclestage': 'nan', 'source_code_license': 'nan', 'category1': 'biking', 'description': 'This%20dataset%20includes%20monthly%20trip%20information%20for%20MOXI%20riders.%20Timestamps%20and%20GPS%20information%20can%20be%20useful%20for%20analysis%20or%20further%20development.%20'}
02223c6f5386ae49a60f2f875a28adf73f072a4098da

In [19]:
# Dump the rows currently held in `df`; iterrows() yields (index, row) pairs,
# so each printed item is a tuple of the row label and the row's Series.
for row in df.iterrows():
    print(row)

(0, RecordName                             Mapillar Mapillary Vistas Dataset
Download Link          https://s3-eu-west-1.amazonaws.com/static.mapi...
SizeGB                                                             0.015
Version                                                               v1
Format                                                               zip
License                                                      proprietary
Classification                                                    public
UpdateFrequency                                                   static
LifecycleStage                                                   initial
Description            A diverse street-level imagery dataset with pi...
Note                                                                 NaN
industry                                                             NaN
keywords                                                             NaN
Type                                           

In [20]:
# Show the upload flag of each record currently held in `df`
df['uploaded']

0     True
1     True
2     True
3     True
4    False
Name: uploaded, dtype: bool