# Overview
The following notebook imports data from AWS S3, looks for images (PNG and JPEG), then attempts to label the images for analysis.
The notebook requires access to:
* AWS S3
* AWS Rekognition


## Imports

In [6]:
import base64
import csv
import io
import json
import os
import pathlib
from io import BytesIO
import cv2

import boto3
import numpy as np
import pandas as pd
import requests
from IPython.display import HTML, display

## Main Function

In [7]:
def main(bucket):
    """Label every PNG/JPEG object in an S3 bucket and record the results in ``main.csv``.

    Each object in ``bucket`` is listed via the ``list_objects_v2`` paginator.
    For every key whose extension is ``.png``, ``.jpeg`` or ``.jpg``
    (case-insensitive) the image is fetched with ``getbase64`` and labelled
    with ``detect_labels``. One CSV row is written per successfully processed
    image: ``key, ETag, name1, confidence1, ...`` for at most the first five
    labels returned.

    Images that raise during processing are reported on stdout and skipped;
    no row is written for them.

    Args:
        bucket: Name of the S3 bucket to scan.
    """
    image_suffixes = (".jpeg", ".jpg", ".png")
    max_labels_per_row = 5

    s3 = boto3.client("s3")
    paginator = s3.get_paginator("list_objects_v2")
    pages = paginator.paginate(Bucket=bucket, Prefix="")

    with open("main.csv", "w", newline="") as myfile:
        writer = csv.writer(myfile)

        # Loop through all pages
        for page in pages:
            # A page of an empty listing carries no "Contents" key at all.
            for obj in page.get("Contents", []):
                key = obj["Key"]
                if pathlib.Path(key).suffix.casefold() not in image_suffixes:
                    continue

                try:
                    # The return value is not used here; the call is kept
                    # because a download/decode failure is what causes an
                    # unreadable image to be skipped before labelling.
                    getbase64(key, bucket)

                    row = [key, obj["ETag"]]
                    response = detect_labels(key, bucket)
                    for label in response["Labels"][:max_labels_per_row]:
                        row.append(label["Name"])
                        row.append(label["Confidence"])
                except Exception as err:
                    # Best-effort scan: report the failure and move on, but do
                    # not swallow KeyboardInterrupt/SystemExit.
                    print(f"File Skipped due to error: {key} ({err!r})")
                    continue

                # Only reached for fully processed images, so a failed file
                # can never emit a stale or partial row.
                print(row)
                writer.writerow(row)

## Create Function to Tag Images

In [8]:
def detect_labels(photo, bucket):
    """Run Rekognition label detection on an S3 object and print a report.

    Args:
        photo: Key of the image object inside ``bucket``.
        bucket: Name of the S3 bucket holding the image.

    Returns:
        The response returned by the Rekognition ``detect_labels`` call
        (requested with ``MaxLabels=10``). Its ``"Labels"`` entries are also
        printed to stdout: name, confidence, instance bounding boxes, parents.
    """
    rekognition = boto3.client("rekognition")
    image_ref = {"S3Object": {"Bucket": bucket, "Name": photo}}
    response = rekognition.detect_labels(Image=image_ref, MaxLabels=10)

    # (printed prefix, bounding-box key) pairs, in the order they are reported.
    box_fields = (
        ("    Top: ", "Top"),
        ("    Left: ", "Left"),
        ("    Width: ", "Width"),
        ("    Height: ", "Height"),
    )

    print("Detected labels for " + photo)
    print()
    for label in response["Labels"]:
        print("Label: " + label["Name"])
        print("Confidence: " + str(label["Confidence"]))

        print("Instances:")
        for instance in label["Instances"]:
            print("  Bounding box")
            box = instance["BoundingBox"]
            for prefix, field in box_fields:
                print(prefix + str(box[field]))
            print("  Confidence: " + str(instance["Confidence"]))
            print()

        print("Parents:")
        for parent in label["Parents"]:
            print("   " + parent["Name"])
        print("----------")
        print()
    return response

## Function to create Base64 strings from images

In [9]:
# bucket = 'agd-000001-grpdrv-poc'
# obj = "AA_Enduring Power of Attorney/Working Files/Attorney+General's+Department.png"



def getbase64(obj, bucket, scale_percent=10):
    """Download an image from S3, shrink it, and return it as a base64 JPEG string.

    Args:
        obj: Key of the image object inside ``bucket``.
        bucket: Name of the S3 bucket holding the image.
        scale_percent: Size of the output relative to the original, in
            percent. Defaults to 10, the value previously hard-coded.

    Returns:
        ASCII ``str`` holding the base64 encoding of the resized image
        re-encoded as JPEG.

    Raises:
        ValueError: If the object is empty, cannot be decoded as an image,
            or cannot be re-encoded as JPEG.
    """
    # Fetch the raw object bytes.
    s3 = boto3.resource("s3")
    data = s3.Object(bucket, obj).get()["Body"].read()
    if not data:
        raise ValueError(f"S3 object s3://{bucket}/{obj} is empty")

    # Decode; imdecode signals failure by returning None rather than raising.
    img = cv2.imdecode(np.frombuffer(data, dtype=np.uint8), cv2.IMREAD_COLOR)
    if img is None:
        raise ValueError(f"Could not decode s3://{bucket}/{obj} as an image")

    # Resize, clamping to 1px so small images never yield a 0-sized target,
    # which cv2.resize rejects.
    width = max(1, int(img.shape[1] * scale_percent / 100))
    height = max(1, int(img.shape[0] * scale_percent / 100))
    img = cv2.resize(img, (width, height), interpolation=cv2.INTER_AREA)

    # Re-encode as JPEG, then to base64 text.
    encoded_ok, buffer_img = cv2.imencode(".jpg", img)
    if not encoded_ok:
        raise ValueError(f"Could not encode s3://{bucket}/{obj} as JPEG")
    return base64.b64encode(buffer_img).decode("ascii")



# getbase64(obj, bucket)

## Run

In [None]:
# Name of the S3 bucket handed to main() for scanning.
bucket = 'agd-000001-grpdrv-poc'

# Scan the bucket; main() writes its results to main.csv in the working directory.
main(bucket)

Detected labels for AA_Enduring Power of Attorney/Working Files/Attorney+General's+Department.png

Label: Kangaroo
Confidence: 86.71527099609375
Instances:
  Bounding box
    Top: 0.10600493103265762
    Left: 0.392913281917572
    Width: 0.08329878002405167
    Height: 0.22506479918956757
  Confidence: 86.71527099609375

Parents:
   Animal
   Mammal
----------

Label: Text
Confidence: 84.46595764160156
Instances:
Parents:
----------

Label: Symbol
Confidence: 82.48308563232422
Instances:
Parents:
----------

Label: Bird
Confidence: 71.71637725830078
Instances:
  Bounding box
    Top: 0.12198956310749054
    Left: 0.5336669683456421
    Width: 0.0806104838848114
    Height: 0.1843235194683075
  Confidence: 71.71637725830078

Parents:
   Animal
----------

Label: Logo
Confidence: 69.73622131347656
Instances:
Parents:
----------

Label: Outdoors
Confidence: 69.021240234375
Instances:
Parents:
----------

Label: QR Code
Confidence: 60.74440383911133
Instances:
  Bounding box
    Top: 0.17



Detected labels for AR Mandatory Online Training CM  E Learning Master Files/01_introduction_to_cm_v9-1/course/html/assets/img/hpa_logo.png

Label: Green
Confidence: 99.96696472167969
Instances:
Parents:
----------

Label: Outdoors
Confidence: 92.84886932373047
Instances:
Parents:
----------

Label: Nature
Confidence: 92.44857788085938
Instances:
Parents:
   Outdoors
----------

Label: Night
Confidence: 90.11473846435547
Instances:
Parents:
   Nature
   Outdoors
----------

Label: Sky
Confidence: 83.8837890625
Instances:
Parents:
   Nature
   Outdoors
----------

Label: Blackboard
Confidence: 73.5577392578125
Instances:
  Bounding box
    Top: 0.07498355209827423
    Left: 0.01431734673678875
    Width: 0.9577580094337463
    Height: 0.844245970249176
  Confidence: 73.5577392578125

Parents:
----------

Label: Tennis Ball
Confidence: 67.59435272216797
Instances:
Parents:
   Ball
   Sport
   Tennis
----------

Label: Text
Confidence: 57.85098648071289
Instances:
Parents:
----------

Lab



Detected labels for AR Mandatory Online Training CM  E Learning Master Files/01_introduction_to_cm_v9-1/course/html/assets/img/hpe_pri_grn_pos_rgb.png

Label: Green
Confidence: 99.96696472167969
Instances:
Parents:
----------

Label: Outdoors
Confidence: 92.84886932373047
Instances:
Parents:
----------

Label: Nature
Confidence: 92.44857788085938
Instances:
Parents:
   Outdoors
----------

Label: Night
Confidence: 90.11473846435547
Instances:
Parents:
   Nature
   Outdoors
----------

Label: Sky
Confidence: 83.8837890625
Instances:
Parents:
   Nature
   Outdoors
----------

Label: Blackboard
Confidence: 73.5577392578125
Instances:
  Bounding box
    Top: 0.07498355209827423
    Left: 0.01431734673678875
    Width: 0.9577580094337463
    Height: 0.844245970249176
  Confidence: 73.5577392578125

Parents:
----------

Label: Tennis Ball
Confidence: 67.59435272216797
Instances:
Parents:
   Ball
   Sport
   Tennis
----------

Label: Text
Confidence: 57.85098648071289
Instances:
Parents:
----



Detected labels for AR Mandatory Online Training CM  E Learning Master Files/01_introduction_to_cm_v9-1/course/html/assets/img/hpesm_pri_grn_rev_rgb2.png

Label: Logo
Confidence: 86.09744262695312
Instances:
Parents:
----------

Label: Outdoors
Confidence: 69.5733642578125
Instances:
Parents:
----------

Label: Nature
Confidence: 63.27147674560547
Instances:
Parents:
   Outdoors
----------

Label: Green
Confidence: 57.18977355957031
Instances:
Parents:
----------

Label: Text
Confidence: 56.785465240478516
Instances:
Parents:
----------

Label: Leaf
Confidence: 56.5432243347168
Instances:
Parents:
   Plant
----------

Label: Plant
Confidence: 56.5432243347168
Instances:
Parents:
----------

Label: Water
Confidence: 56.243343353271484
Instances:
Parents:
----------

Label: Sea
Confidence: 56.208412170410156
Instances:
Parents:
   Nature
   Outdoors
   Water
----------

Label: Tennis Ball
Confidence: 55.01400375366211
Instances:
Parents:
   Ball
   Sport
   Tennis
----------

['AR Mandat