In [1]:
!pip install boto3

You should consider upgrading via the '/sailhome/mliu356/ml-env/bin/python3 -m pip install --upgrade pip' command.[0m


In [1]:
import os
import h5py
import time
import argparse
import copy

import numpy as np
from tqdm import tqdm
import logging
import boto3
from botocore.exceptions import ClientError
import matplotlib.pyplot as plt
from PIL import Image
import csv
from tqdm import tqdm

In [2]:
# Load the AWS key pair from a local KEY=VALUE file so the secrets never appear in the notebook.
# NOTE(review): the file name suggests these are account root keys -- consider a scoped IAM user instead.
creds = {}
with open('../data/rootkey.csv', newline='') as rootkey:
    reader = csv.reader(rootkey, delimiter=' ')
    for row in reader:
        # Skip blank lines and anything that is not a KEY=VALUE entry.
        if not row or '=' not in row[0]:
            continue
        # Split on the first '=' only: the secret itself may contain '=' characters.
        key, value = row[0].split("=", 1)
        creds[key] = value

# Fail here with a clear message rather than with a KeyError when the S3 client is built.
missing_keys = {'AWSAccessKeyId', 'AWSSecretKey'} - creds.keys()
if missing_keys:
    raise KeyError("rootkey.csv is missing: {}".format(sorted(missing_keys)))

# Only the key names are printed, never the values.
print(creds.keys())

BUCKET_NAME = 'kiln-labeling'

dict_keys(['AWSAccessKeyId', 'AWSSecretKey'])


In [3]:
# When True, DATA_PATH points at the local './mturk_test_hdfs' folder instead of the shared /atlas dataset.
local_mode = False

In [8]:
# Choose the folder holding the HDF5 example files: a local test folder or the shared dataset.
if local_mode:
    DATA_PATH = './mturk_test_hdfs'
else:
    DATA_PATH = '/atlas/u/mhelabd/data/kiln-scaling/labelled_bangladesh_2019_2020/negative_sample/'

# The path is passed to listdir as bytes, so every returned entry is a bytes name as well.
directory = os.fsencode(DATA_PATH)
all_files = sorted(os.listdir(directory))

print("directory: {}".format(directory))
print("all files ({}): {}".format(len(all_files), all_files))

directory: b'/atlas/u/mhelabd/data/kiln-scaling/labelled_bangladesh_2019_2020/negative_sample/'
all files (22): [b'examples_1.hdf5', b'examples_10.hdf5', b'examples_11.hdf5', b'examples_12.hdf5', b'examples_13.hdf5', b'examples_14.hdf5', b'examples_15.hdf5', b'examples_16.hdf5', b'examples_17.hdf5', b'examples_18.hdf5', b'examples_19.hdf5', b'examples_2', b'examples_2.hdf5', b'examples_21.hdf5', b'examples_3.hdf5', b'examples_4.hdf5', b'examples_5.hdf5', b'examples_6', b'examples_6.hdf5', b'examples_7.hdf5', b'examples_8.hdf5', b'examples_9.hdf5']


In [5]:
# Running counter so that every call to visualize_tile draws into a new matplotlib figure.
fig_index = 0

def visualize_tile(image, indices=[3, 2, 1]):
    """Display selected bands of a tile as an image in a new matplotlib figure.

    Args:
        image: array indexed as (band, row, col); the bands listed in
            ``indices`` are picked out and moved to the last axis for display.
        indices: band indices to show, in display-channel order
            (presumably R, G, B -- confirm against the dataset's band layout).

    The selected bands are scaled by their maximum before plotting. The input
    array is not modified (fancy indexing already yields a copy).
    """
    global fig_index
    plt.figure(fig_index)
    fig_index += 1
    row_idx = np.array(indices)
    X = np.transpose(image[row_idx], (1, 2, 0))
    # In-place float scaling fails on integer arrays, so promote those first.
    if not np.issubdtype(X.dtype, np.floating):
        X = X.astype(np.float64)
    peak = np.max(X)
    # An all-zero tile would otherwise divide by zero and turn into NaNs.
    if peak != 0:
        X *= 1 / peak
    plt.imshow(X)
    
def save_tile_as_image(data, img_filename, indices=[3, 2, 1]):
    """Scale a (channel, row, col) tile to 8-bit and write it to disk as a JPEG.

    Args:
        data: array indexed as (channel, row, col). It is not modified.
        img_filename: destination path of the JPEG file.
        indices: unused; kept so existing callers keep working. The bands are
            expected to have been selected by the caller already.

    The tile is divided by its maximum, multiplied by 255 and clipped to
    [0, 255] before the uint8 conversion. NaN values are not handled here.
    """
    X = np.transpose(data, (1, 2, 0))
    # np.transpose returns a view: work on a float copy so the in-place scaling
    # below never rescales the caller's array (and never fails on integer data).
    if np.issubdtype(X.dtype, np.floating):
        X = X.copy()
    else:
        X = X.astype(np.float64)
    peak = np.max(X)
    # Leave an all-zero tile as it is instead of dividing by zero.
    if peak != 0:
        X *= 1 / peak
    # Clip so negative values become black instead of wrapping around in uint8.
    pixels = np.clip(X * 255, 0, 255).astype(np.uint8)
    img = Image.fromarray(pixels)
    img.save(img_filename, "JPEG")

In [6]:
# Create the S3 client with the key pair loaded into `creds`.
s3 = boto3.client(
    service_name='s3',
    aws_access_key_id=creds['AWSAccessKeyId'],
    aws_secret_access_key=creds['AWSSecretKey'],
)

# Ask S3 for the buckets visible to these credentials.
response = s3.list_buckets()
bucket_names = [bucket["Name"] for bucket in response['Buckets']]

# Print one bucket name per line under a heading.
print('Existing buckets:')
for name in bucket_names:
    print(f'  {name}')
    
def upload_file(file_path, bucket_name, obj_name):
    """Upload a local JPEG to S3 as a publicly readable object.

    Args:
        file_path: path of the local file to upload.
        bucket_name: name of the destination bucket.
        obj_name: object key to store the file under.

    Returns:
        True if the upload succeeded; False if S3 rejected it or the transfer
        failed (the error is logged).
    """
    # Imported here so the function does not depend on an extra import in the top cell.
    from boto3.exceptions import S3UploadFailedError

    try:
        # Uses the module-level `s3` client; the call returns nothing on success.
        s3.upload_file(
            file_path,
            bucket_name,
            obj_name,
            ExtraArgs={'ACL': 'public-read', 'ContentType': 'image/jpeg'}
        )
    except (ClientError, S3UploadFailedError) as e:
        # boto3's managed transfer re-raises ClientError as S3UploadFailedError,
        # so both must be caught for failures to actually return False.
        logging.error(e)
        return False
    return True

Existing buckets:
  kiln-labeling


In [9]:
# Export every NaN-free tile as a JPEG, upload it to S3 and record its bounds and
# public URL in mturk_data.csv, with IMAGES_PER_ROW images per CSV row.
img_index = 0
IMAGES_PER_ROW = 20
templates = ['image_coord{}_x1', 'image_coord{}_x2', 'image_coord{}_y1', 'image_coord{}_y2', 'image_url{}']
base_url = 'https://kiln-labeling.s3.us-east-2.amazonaws.com/'

# newline='' is what the csv module requires so it controls line endings itself.
with open('mturk_data.csv', mode='w', newline='') as data_file:
    file_writer = csv.writer(data_file, delimiter=',')

    # Header: one group of template columns per image slot in a row.
    header = [t.format(i) for i in range(IMAGES_PER_ROW) for t in templates]
    file_writer.writerow(header)
    row = []

    for file in all_files:
        FILENAME = os.fsdecode(file)
        # The listing also contains non-HDF5 entries; skip them.
        if not FILENAME.endswith('.hdf5'):
            continue
        print("Starting file: " + FILENAME)

        # JPEGs for this file go into a sibling folder named after the file.
        folder_name = FILENAME[:-5]
        folder_path = os.path.join(DATA_PATH, folder_name)
        os.makedirs(folder_path, exist_ok=True)

        # Both datasets are copied into memory, so the file can be closed right away.
        with h5py.File(os.path.join(DATA_PATH, FILENAME), 'r') as f:
            images = np.array(f['images'])
            bounds = np.array(f['bounds'])

        for index in tqdm(range(images.shape[0])):
            # Bands 3, 2, 1 are the ones exported (presumably R, G, B -- confirm).
            rgb_image = images[index][[3, 2, 1], :, :]
            # Tiles containing NaNs are skipped entirely.
            if np.isnan(rgb_image).any():
                continue

            local_filename = os.path.join(folder_path, str(index) + ".jpeg")
            s3_filename = folder_name + "/" + str(index) + ".jpeg"

            # Save the local file only if it doesn't exist already.
            if not os.path.exists(local_filename):
                save_tile_as_image(rgb_image, local_filename)

            # Never reference an image in the CSV whose upload did not succeed.
            if not upload_file(local_filename, BUCKET_NAME, s3_filename):
                logging.warning("Skipping %s: upload failed", s3_filename)
                continue

            # Column order matches `templates`: x1, x2, y1, y2, url.
            row += [bounds[index][1], bounds[index][3], bounds[index][0], bounds[index][2], base_url + s3_filename]
            img_index += 1

            # Write the row as soon as it holds a full set of images.
            if img_index % IMAGES_PER_ROW == 0:
                file_writer.writerow(row)
                row = []

    # Write the final, partially filled row too (padded to the header width);
    # previously these trailing images were silently dropped.
    if row:
        row += [''] * (len(header) - len(row))
        file_writer.writerow(row)

Starting file: examples_1.hdf5


100%|██████████| 999/999 [01:52<00:00,  8.90it/s] 


Starting file: examples_10.hdf5


100%|██████████| 999/999 [02:50<00:00,  5.84it/s]


Starting file: examples_11.hdf5


100%|██████████| 999/999 [02:29<00:00,  6.70it/s]


Starting file: examples_12.hdf5


100%|██████████| 999/999 [02:41<00:00,  6.18it/s] 


Starting file: examples_13.hdf5


100%|██████████| 999/999 [02:35<00:00,  6.44it/s]


Starting file: examples_14.hdf5


100%|██████████| 999/999 [02:04<00:00,  8.01it/s] 


Starting file: examples_15.hdf5


100%|██████████| 999/999 [02:45<00:00,  6.04it/s]


Starting file: examples_16.hdf5


100%|██████████| 999/999 [02:36<00:00,  6.39it/s]


Starting file: examples_17.hdf5


100%|██████████| 999/999 [02:33<00:00,  6.53it/s] 


Starting file: examples_18.hdf5


100%|██████████| 999/999 [01:55<00:00,  8.61it/s]


Starting file: examples_19.hdf5


100%|██████████| 999/999 [02:12<00:00,  7.55it/s] 


Starting file: examples_2.hdf5


100%|██████████| 999/999 [02:42<00:00,  6.15it/s]


Starting file: examples_21.hdf5


100%|██████████| 497/497 [01:01<00:00,  8.05it/s] 


Starting file: examples_3.hdf5


100%|██████████| 999/999 [02:44<00:00,  6.06it/s]


Starting file: examples_4.hdf5


100%|██████████| 999/999 [02:36<00:00,  6.38it/s]


Starting file: examples_5.hdf5


100%|██████████| 999/999 [01:50<00:00,  9.04it/s] 


Starting file: examples_6.hdf5


100%|██████████| 999/999 [02:14<00:00,  7.42it/s] 


Starting file: examples_7.hdf5


100%|██████████| 999/999 [02:01<00:00,  8.20it/s] 


Starting file: examples_8.hdf5


100%|██████████| 999/999 [02:13<00:00,  7.47it/s]


Starting file: examples_9.hdf5


100%|██████████| 999/999 [02:33<00:00,  6.53it/s] 
