## Notebook tasks for our "link_thumbnail" pictures:
### - Save images to an AWS S3 bucket
### - Use AWS Rekognition to tag each image (labels, celebrities, and detected text)
### - Save the tags in both long and wide data frames
Brought to you by Natalie Olivo
<a href="https://www.linkedin.com/in/natalie-olivo-82548951/">LinkedIn</a>
<a href="https://nmolivo.github.io/NMOstatic/">Website</a>
<a href="https://medium.com/@NatalieOlivo">Blog</a>
<a href="https://github.com/nmolivo">GitHub</a>

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the tweets CSV (presumably a 10% sample, judging by the file name) into a DataFrame.
df = pd.read_csv("../gitignore/newtweets_10percent.csv")

In [None]:
# List the available columns; "link_thumbnail" is the one used below.
df.columns

In [None]:
# Series of thumbnail URLs, one entry per row of df (missing values stay as NaN).
image_list = df["link_thumbnail"]

In [None]:
# Spot-check a single entry (index label 34) to see what a thumbnail URL looks like.
test_1 = image_list[34]
test_1

In [None]:
import boto
import boto3
# NOTE(review): `conn` (a legacy boto S3 connection) is not referenced by any
# other cell visible in this notebook; the boto3 resource created below does
# the S3 work. Confirm nothing else needs it before removing.
conn = boto.connect_s3()
import requests

In [None]:
# Uses the creds in ~/.aws/credentials
s3 = boto3.resource('s3')
# Name of the S3 bucket that receives the downloaded thumbnails.
bucket_name_to_upload_image_to = '880220067832'

In [None]:
# Do this as a quick and easy check to make sure your S3 access is OK.
# The flag starts as False so it always exists: previously a missing bucket
# left `good_to_go` undefined and the check below raised NameError (or reused
# a stale True left over from an earlier run of this cell).
good_to_go = False
for bucket in s3.buckets.all():
    if bucket.name == bucket_name_to_upload_image_to:
        print('Good to go. Found the bucket to upload the image into.')
        good_to_go = True
        break  # found it; no need to keep listing buckets

if not good_to_go:
    print('Not seeing your s3 bucket, might want to double check permissions in IAM')

In [None]:
# Download each thumbnail and upload it to S3 under a generated key
# ("img_00000", "img_00001", ...), without persisting the image to disk.
#
# mapping_dict   -- S3 key -> original URL, for every row (including rows
#                   with a missing URL, so keys stay aligned with row order).
# failed_uploads -- S3 key -> error message for URLs that could not be fetched.
mapping_dict = {}
failed_uploads = {}

# Uses the creds in ~/.aws/credentials. Build the bucket handle and the HTTP
# session once instead of once per image.
upload_bucket = s3.Bucket(bucket_name_to_upload_image_to)
http_session = requests.Session()

for i, img_url in enumerate(image_list[0:10000]):
    img_name = "img_%05d" % (i,)
    mapping_dict[img_name] = img_url

    # NaN never compares equal to anything (itself included), so
    # `img_url == np.nan` is always False; pd.isna is the reliable test.
    # The string comparison still covers a literal "nan" in the CSV.
    if pd.isna(img_url) or str(img_url) == "nan":
        continue

    try:
        # A timeout keeps one dead host from hanging the whole run, and
        # raise_for_status stops 4xx/5xx error pages being stored as images.
        response = http_session.get(img_url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as exc:
        # Best effort: record the failure and carry on with the next image.
        failed_uploads[img_name] = str(exc)
        print('Skipping %s (%s): %s' % (img_name, img_url, exc))
        continue

    # Do the actual upload to s3. `response.content` is the decoded body,
    # unlike the raw socket stream, which may still be gzip/deflate-encoded.
    upload_bucket.put_object(Key=img_name, Body=response.content)

if failed_uploads:
    print('%d image(s) could not be downloaded; see failed_uploads' % len(failed_uploads))

In [None]:
# One row is enough: every value in mapping_dict is a scalar, so passing
# index=range(len(mapping_dict)) repeated each URL on every row and built an
# n x n frame (10,000 x 10,000 for a full run) only for row 0 to be read.
# The columns are the S3 keys; row 0 holds the matching URLs.
md_01 = pd.DataFrame(mapping_dict, index=[0])

In [None]:
# Flip md_01 so the S3 keys become the index, then keep only column 0 (the
# URLs) as a one-column DataFrame.
md_02 = md_01.T[[0]]

In [None]:
# Display the key -> URL mapping table.
md_02

In [None]:
# Persist the S3-key -> URL mapping. `index` is a boolean flag in to_csv; the
# range passed before only worked because a non-empty range is truthy.
# index=True writes the S3 keys (the index) as the first column. md_02 is
# already a DataFrame, so it does not need re-wrapping.
md_02.to_csv('../assets/mapping_dict_thumbnail.csv', index=True)

## Big for-loop that creates both wide and long data frames with our image tags from Rekognition:
12/1 Notes: This takes a long time to run. You may want to test on only ~20 images at first.

In [None]:
# Run three Rekognition detectors (labels, celebrities, text) on every object
# in the bucket and collect the results in two shapes:
#   results_long -- one dict per detection: img, type, label, confidence
#   results_wide -- one dict per image: img, label_0, label_0_confidence, ...,
#                   celeb_0, ..., word_0, ...
bucket_name = '880220067832'
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)
# Every object key in the bucket is treated as an image to analyse.
images = [img.key for img in bucket.objects.all()]
client = boto3.client('rekognition')

results_wide = []
results_long = []

for img in images:
    img_dict_wide = {'img': img}
    s3_image = {'S3Object': {'Bucket': bucket_name, 'Name': img}}
    # Used below to tell whether any detector produced a row for this image.
    rows_before = len(results_long)

    # Each detector gets its own small try block. A failure in one no longer
    # drops the whole image: the old bare `except: continue` skipped the wide
    # row while leaving already-appended long rows behind, and it swallowed
    # KeyboardInterrupt too. On failure the response is reset to {} so stale
    # data from the previous image can never be reused.
    try:
        labels = client.detect_labels(Image=s3_image, MinConfidence=75)
    except Exception as exc:
        print('detect_labels failed for %s: %s' % (img, exc))
        labels = {}
    for l, label in enumerate(labels.get('Labels', [])):
        results_long.append({'img': img, 'type': 'Label', 'label': label['Name'],
                             'confidence': label['Confidence']})
        col = 'label_' + str(l)
        img_dict_wide[col] = label['Name']
        img_dict_wide[col + '_confidence'] = label['Confidence']

    try:
        celebrities = client.recognize_celebrities(Image=s3_image)
    except Exception as exc:
        print('recognize_celebrities failed for %s: %s' % (img, exc))
        celebrities = {}
    for f, face in enumerate(celebrities.get('CelebrityFaces', [])):
        results_long.append({'img': img, 'type': 'Celebrity', 'label': face['Name'],
                             'confidence': face['Face']['Confidence']})
        col = 'celeb_' + str(f)
        img_dict_wide[col] = face['Name']
        img_dict_wide[col + '_confidence'] = face['Face']['Confidence']

    try:
        text_in_image = client.detect_text(Image=s3_image)
    except Exception as exc:
        print('detect_text failed for %s: %s' % (img, exc))
        text_in_image = {}
    for w, word in enumerate(text_in_image.get("TextDetections", [])):
        results_long.append({'img': img, 'type': "Text", 'label': word["DetectedText"],
                             'confidence': word["Confidence"]})
        col = 'word_' + str(w)
        img_dict_wide[col] = word["DetectedText"]
        img_dict_wide[col + '_confidence'] = word["Confidence"]

    # Keep one placeholder row in the long table for images with no
    # detections, so every image appears in both outputs. The earlier check
    # looked for missing response keys, so it did not fire for a response
    # that merely had empty result lists.
    if len(results_long) == rows_before:
        results_long.append({'img': img, 'type': None, 'label': None, 'confidence': None})

    results_wide.append(img_dict_wide)

# Assemble the DataFrames. In the wide table 'img' goes first and the other
# columns follow in sorted order. reindex also copes with an empty bucket,
# where there is no 'img' column to remove.
img_df_long = pd.DataFrame(results_long, columns=['img', 'type', 'label', 'confidence'])
img_df_wide = pd.DataFrame(results_wide)
cols = sorted(c for c in img_df_wide.columns if c != 'img')
img_df_wide = img_df_wide.reindex(columns=['img'] + cols)

In [None]:
# Preview the first 20 rows of the wide table (one row per image).
img_df_wide.head(20)

In [None]:
# Preview the first 20 rows of the long table (one row per detection).
img_df_long.head(20)

In [None]:
# For our topic modelers who are focused only on image data!
# Long format: columns img, type, label, confidence.
img_df_long.to_csv("../assets/thumbnail_link_long.csv")

In [None]:
# For mapping to the dataframe provided to us by TM.
# Wide format: one row per image, with 'img' as the first column.
img_df_wide.to_csv("../assets/thumbnail_link_wide.csv")

In [None]:
# And remember, we have a mapping dictionary that stores each S3 image key and its corresponding image link:
# '../assets/mapping_dict_thumbnail.csv'

## Developer notes: Below is code from the big for-loop that creates both wide and long dataframes with image information gathered from AWS Rekognition, in case you were curious.

### detect_labels

In [None]:
# Single-image walkthrough of detect_labels: ask Rekognition for labels with
# at least 75% confidence and show them as {label name: confidence string}.
fileName = 'img_00066'
bucket = '880220067832'

client = boto3.client('rekognition')

s3_object = {'Bucket': bucket, 'Name': fileName}
response = client.detect_labels(Image={'S3Object': s3_object}, MinConfidence=75)

label_dict = {found['Name']: str(found['Confidence']) for found in response['Labels']}
label_dict

### recognize_celebrities

In [None]:
# Single-image walkthrough of recognize_celebrities: print each recognised
# celebrity's name with the confidence reported for the matched face.
fileName = 'img_00002'
bucket = '880220067832'

client = boto3.client('rekognition')

response = client.recognize_celebrities(Image={'S3Object': {'Bucket': bucket, 'Name': fileName}})

# The header used to say "labels", copied from the detect_labels example;
# this cell prints celebrities.
print('Detected celebrities for ' + fileName)
for face in response['CelebrityFaces']:
    print(face['Name'] + ' : ' + str(face['Face']['Confidence']))

In [None]:
# Show the full response (the recognize_celebrities result when cells are run in order).
response

In [None]:
# Top-level keys of the response dict.
response.keys()

In [None]:
# Name of the first recognised celebrity; raises IndexError if the list is empty.
response['CelebrityFaces'][0]['Name']

### detect_text

In [None]:
# Single-image walkthrough of detect_text; the result is kept in
# `text_in_image` (not `response`) for the cells below.
fileName= 'img_00006'
bucket='880220067832'

client=boto3.client('rekognition')

text_in_image = client.detect_text(Image={'S3Object':{'Bucket':bucket,'Name':fileName}})

In [None]:
# All text detections returned for the image.
text_in_image["TextDetections"]

In [None]:
# Confidence of the first text detection. This reads `text_in_image` (the
# detect_text result); `response` still holds the recognize_celebrities
# output from the earlier cell, so indexing it here failed.
text_in_image["TextDetections"][0]["Confidence"]

In [None]:
# Text of the first detection. This reads `text_in_image` (the detect_text
# result); `response` still holds the recognize_celebrities output from the
# earlier cell, so indexing it here failed.
text_in_image["TextDetections"][0]["DetectedText"]

### Resources

https://stackoverflow.com/questions/14346065/upload-image-available-at-public-url-to-s3-using-boto<br>
http://docs.aws.amazon.com/rekognition<br>
http://docs.aws.amazon.com/rekognition/latest/dg/get-started-exercise.html