In [37]:
!pip install boto3

Collecting boto3
  Downloading boto3-1.17.78-py2.py3-none-any.whl (131 kB)
[K     |████████████████████████████████| 131 kB 6.1 MB/s eta 0:00:01
[?25hCollecting botocore<1.21.0,>=1.20.78
  Downloading botocore-1.20.78-py2.py3-none-any.whl (7.5 MB)
[K     |████████████████████████████████| 7.5 MB 12.5 MB/s eta 0:00:01
[?25hCollecting jmespath<1.0.0,>=0.7.1
  Using cached jmespath-0.10.0-py2.py3-none-any.whl (24 kB)
Collecting s3transfer<0.5.0,>=0.4.0
  Using cached s3transfer-0.4.2-py2.py3-none-any.whl (79 kB)
Installing collected packages: jmespath, botocore, s3transfer, boto3
Successfully installed boto3-1.17.78 botocore-1.20.78 jmespath-0.10.0 s3transfer-0.4.2


In [1]:
import os
import h5py
import time
import argparse
import copy

import numpy as np
from tqdm import tqdm
import logging
import boto3
from botocore.exceptions import ClientError
import matplotlib.pyplot as plt
from PIL import Image
import csv
from tqdm import tqdm

In [2]:
creds = {}
with open('../data/rootkey.csv', newline='') as rootkey:
    reader = csv.reader(rootkey, delimiter=' ')
    for row in reader:
        key, value = row[0].split("=")
        creds[key] = value
print(creds.keys())

BUCKET_NAME = 'kiln-labeling'

dict_keys(['AWSAccessKeyId', 'AWSSecretKey'])


In [4]:
local_mode = True

In [8]:
DATA_PATH = './mturk_test_hdfs' if local_mode else '/atlas/u/mhelabd/data/kiln-scaling/labelled_bangladesh_2019_2020/negative_sample/'
directory = os.fsencode(DATA_PATH)
all_files = os.listdir(directory)

print("directory: {}".format(directory))
print("all files ({}): {}".format(len(all_files), all_files))

directory: b'./mturk_test_hdfs'
all files (6): [b'.DS_Store', b'examples_1.hdf5', b'examples_2', b'.ipynb_checkpoints', b'examples_2.hdf5', b'examples_1']


In [6]:
fig_index = 0

def visualize_tile(image, indices=[3, 2, 1]):
    global fig_index
    plt.figure(fig_index)
    fig_index += 1
    row_idx = np.array(indices)
    X = np.transpose(image[row_idx], (1, 2, 0))
    X *= 1 / np.max(X)
    plt.imshow(X)
    
def save_tile_as_image(data, img_filename, indices=[3, 2, 1]):
    X = np.transpose(data, (1, 2, 0))
    X *= 1 / np.max(X) # * 255
    img = Image.fromarray((X * 255).astype(np.uint8))
    img.save(img_filename, "JPEG")

In [7]:
# Retrieve the list of existing buckets
s3 = boto3.client(
    's3',
    aws_access_key_id=creds['AWSAccessKeyId'],
    aws_secret_access_key=creds['AWSSecretKey'],
)
response = s3.list_buckets()

# Output the bucket names
print('Existing buckets:')
for bucket in response['Buckets']:
    print(f'  {bucket["Name"]}')
    
def upload_file(file_path, bucket_name, obj_name):
    global s3
    try:
        response = s3.upload_file(
            file_path, 
            bucket_name, 
            obj_name,
            ExtraArgs={'ACL': 'public-read', 'ContentType': 'image/jpeg'}
        )
    except ClientError as e:
        logging.error(e)
        return False
    return True

Existing buckets:
  kiln-labeling


In [23]:
img_index = 0
templates = ['image_coord{}_x1', 'image_coord{}_x2', 'image_coord{}_y1', 'image_coord{}_y2', 'image_url{}']
base_url = 'https://kiln-labeling.s3.us-east-2.amazonaws.com/'

with open('mturk_data.csv', mode='w') as data_file:
    file_writer = csv.writer(data_file, delimiter=',')
    row = [t.format(i) for i in range(20) for t in templates]
    print(row)
    
    for file_index, file in enumerate(all_files):
        FILENAME = os.fsdecode(file)
        if FILENAME[-5:] == '.hdf5':
            print("Starting file: " + FILENAME)
            with h5py.File(f"{DATA_PATH}/{FILENAME}", 'r') as f:
                folder_name = FILENAME[:-5]
                if not os.path.exists(DATA_PATH + "/" + folder_name):
                    os.makedirs(DATA_PATH + "/" + folder_name)

                images = np.array(f['images'])
                bounds = np.array(f['bounds'])
                for index in tqdm(range(images.shape[0])):
                    # visualize_tile(images[index])
                    rgb_image = images[index][[3, 2, 1], :, :]
                    # check images are not nan
                    if not np.isnan(rgb_image).any():
                        local_filename = DATA_PATH + "/" + folder_name + "/" + str(index) + ".jpeg"
                        
                        # save local file if it doesn't exist already
                        if not os.path.exists(local_filename):
                            save_tile_as_image(rgb_image, local_filename)
                        
                        # upload to s3
                        s3_filename = folder_name + "/" + str(index) + ".jpeg"
                        upload_file(local_filename, BUCKET_NAME, s3_filename)
                        
                        # build csv
                        if img_index % 20 == 0:
                            print("Writing row:", row)
                            file_writer.writerow(row)
                            row = []
                        row += [bounds[index][1], bounds[index][3], bounds[index][0], bounds[index][2], base_url + s3_filename]
                        img_index += 1

['image_coord0_x1', 'image_coord0_x2', 'image_coord0_y1', 'image_coord0_y2', 'image_url0', 'image_coord1_x1', 'image_coord1_x2', 'image_coord1_y1', 'image_coord1_y2', 'image_url1', 'image_coord2_x1', 'image_coord2_x2', 'image_coord2_y1', 'image_coord2_y2', 'image_url2', 'image_coord3_x1', 'image_coord3_x2', 'image_coord3_y1', 'image_coord3_y2', 'image_url3', 'image_coord4_x1', 'image_coord4_x2', 'image_coord4_y1', 'image_coord4_y2', 'image_url4', 'image_coord5_x1', 'image_coord5_x2', 'image_coord5_y1', 'image_coord5_y2', 'image_url5', 'image_coord6_x1', 'image_coord6_x2', 'image_coord6_y1', 'image_coord6_y2', 'image_url6', 'image_coord7_x1', 'image_coord7_x2', 'image_coord7_y1', 'image_coord7_y2', 'image_url7', 'image_coord8_x1', 'image_coord8_x2', 'image_coord8_y1', 'image_coord8_y2', 'image_url8', 'image_coord9_x1', 'image_coord9_x2', 'image_coord9_y1', 'image_coord9_y2', 'image_url9', 'image_coord10_x1', 'image_coord10_x2', 'image_coord10_y1', 'image_coord10_y2', 'image_url10', 'ima

  0%|          | 2/999 [00:00<05:24,  3.08it/s]

Writing row: ['image_coord0_x1', 'image_coord0_x2', 'image_coord0_y1', 'image_coord0_y2', 'image_url0', 'image_coord1_x1', 'image_coord1_x2', 'image_coord1_y1', 'image_coord1_y2', 'image_url1', 'image_coord2_x1', 'image_coord2_x2', 'image_coord2_y1', 'image_coord2_y2', 'image_url2', 'image_coord3_x1', 'image_coord3_x2', 'image_coord3_y1', 'image_coord3_y2', 'image_url3', 'image_coord4_x1', 'image_coord4_x2', 'image_coord4_y1', 'image_coord4_y2', 'image_url4', 'image_coord5_x1', 'image_coord5_x2', 'image_coord5_y1', 'image_coord5_y2', 'image_url5', 'image_coord6_x1', 'image_coord6_x2', 'image_coord6_y1', 'image_coord6_y2', 'image_url6', 'image_coord7_x1', 'image_coord7_x2', 'image_coord7_y1', 'image_coord7_y2', 'image_url7', 'image_coord8_x1', 'image_coord8_x2', 'image_coord8_y1', 'image_coord8_y2', 'image_url8', 'image_coord9_x1', 'image_coord9_x2', 'image_coord9_y1', 'image_coord9_y2', 'image_url9', 'image_coord10_x1', 'image_coord10_x2', 'image_coord10_y1', 'image_coord10_y2', 'image

 25%|██▍       | 249/999 [00:01<02:07,  5.87it/s]

Writing row: [91.7530248046838, 23.875423621328643, 91.75877402250217, 23.88117283914701, 'https://kiln-labeling.s3.us-east-2.amazonaws.com/examples_1/0.jpeg', 91.8680091610511, 23.875423621328643, 91.87375837886947, 23.88117283914701, 'https://kiln-labeling.s3.us-east-2.amazonaws.com/examples_1/1.jpeg', 92.1152255272408, 23.875423621328643, 92.12097474505916, 23.88117283914701, 'https://kiln-labeling.s3.us-east-2.amazonaws.com/examples_1/2.jpeg', 92.1957145766979, 23.875423621328643, 92.20146379451627, 23.88117283914701, 'https://kiln-labeling.s3.us-east-2.amazonaws.com/examples_1/3.jpeg', 89.63156342970714, 22.84631363184132, 89.6373126475255, 22.852062849659685, 'https://kiln-labeling.s3.us-east-2.amazonaws.com/examples_1/248.jpeg', 89.72930013261934, 22.84631363184132, 89.7350493504377, 22.852062849659685, 'https://kiln-labeling.s3.us-east-2.amazonaws.com/examples_1/249.jpeg', 89.424591588246, 22.840564414022953, 89.43034080606436, 22.84631363184132, 'https://kiln-labeling.s3.us-ea

 30%|██▉       | 297/999 [00:08<00:20, 34.05it/s]


KeyboardInterrupt: 