## Notebook tasks:
### - Save images to an AWS S3 bucket
### - Use AWS Rekognition to reverse image search and return tags for each image
### - Save the results in both long and wide data frames
Brought to you by Natalie Olivo<br>
<a href = https://www.linkedin.com/in/natalie-olivo-82548951/>LinkedIn</a><br>
<a href = https://nmolivo.github.io/NMOstatic/>Website</a><br>
<a href = https://medium.com/@NatalieOlivo>Blog</a><br>
<a href = https://github.com/nmolivo>GitHub</a><br>

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the tweets CSV (the file name suggests a 10% sample -- confirm).
# The 'link_thumbnail' column of this frame is read in the next cell.
df = pd.read_csv("newtweets_10percent.csv")

In [None]:
# Series of thumbnail links, one per row of df; these are the images uploaded to S3 below.
image_list = df['link_thumbnail']

In [None]:
# Make sure it looks good: pull out a single entry and display it.
# image_list[3] looks the value up by index label 3 on the Series.
test_1 = image_list[3]
test_1

In [None]:
# boto is the legacy AWS SDK, boto3 the current one; requests downloads the images.
import boto
import boto3
# NOTE(review): `conn` is not referenced by any cell visible in this notebook --
# all later S3 access goes through boto3. Confirm whether this connection is still needed.
conn = boto.connect_s3()
import requests

In [None]:
# Uses the creds in ~/.aws/credentials
# `s3` is the boto3 resource handle reused by the bucket check and the upload loop below.
s3 = boto3.resource('s3')
# Placeholder value: replace it before running the cells that follow.
bucket_name_to_upload_image_to = '#########' #insert the name of your bucket here.

In [None]:
# Do this as a quick and easy check to make sure your S3 access is OK.
# The flag is computed in a single expression so it is always defined (and never
# left over from an earlier run of this cell) when the bucket is not visible.
good_to_go = any(
    candidate.name == bucket_name_to_upload_image_to
    for candidate in s3.buckets.all()
)

if good_to_go:
    print('Good to go. Found the bucket to upload the image into.')
else:
    print('Not seeing your s3 bucket, might want to double check permissions in IAM')

In [None]:
### We received all our images in the form of links. 
### We need our images saved to an S3 bucket, preferably without saving them to our hard drive.

### Don't have a bucket? What is a bucket? http://docs.aws.amazon.com/AmazonS3/latest/user-guide/create-bucket.html

In [None]:
# Copy each thumbnail from its URL straight into S3, without writing it to local disk.
#
# mapping_dict   : generated S3 key ("img_00000", ...) -> source URL, for every row
#                  (including rows without a URL), so tags can be joined back later.
# failed_uploads : S3 key -> error text for URLs that could not be downloaded,
#                  so they can be retried instead of aborting the whole batch.
mapping_dict = {}
failed_uploads = {}

# Look the bucket up once rather than on every iteration.
# Uses the creds in ~/.aws/credentials
upload_bucket = s3.Bucket(bucket_name_to_upload_image_to)

for i, img_url in enumerate(image_list[0:10000]):
    img_name = "img_%05d" % (i,)
    mapping_dict[img_name] = img_url

    # Rows without a thumbnail come back from pandas as NaN. `x == np.nan` is
    # always False, so test with pd.isna; the string check covers a literal "nan".
    if pd.isna(img_url) or str(img_url) == "nan":
        continue

    try:
        # A timeout keeps one dead host from hanging the whole run, and
        # raise_for_status() stops HTTP error pages being uploaded as images.
        image_response = requests.get(img_url, timeout=30)
        image_response.raise_for_status()
    except requests.RequestException as err:
        failed_uploads[img_name] = str(err)
        continue

    # .content holds the fully downloaded, transfer-decoded body (unlike .raw)
    # and the underlying connection is released once it has been read.
    upload_bucket.put_object(Key=img_name, Body=image_response.content)

if failed_uploads:
    print('%d image(s) could not be downloaded; see failed_uploads' % len(failed_uploads))

In [None]:
# Save down your mapping dict so that you can eventually re-map your image tags to your full dataframe.
# Result: one row per S3 key (the index) with the source URL in a single column
# labelled 0. Building it from a Series avoids the n-by-n intermediate frame and
# the reference to the undefined name `md_01`.
mapping_dict = pd.DataFrame(pd.Series(mapping_dict, name=0))
mapping_dict.to_csv('mappingdict.csv')

## Developer notes: Below is code for component parts of a big for-loop that creates both wide and long dataframes with image information gathered from AWS Rekognition.

In [None]:
#So first off you probably want to look at the output of each tool in Rekognition. (It's different for each one!)
#I'm going to focus on:
#DetectLabels (detect_labels)
#RecognizeCelebrities (recognize_celebrities)
#DetectText (detect_text)

### detect_labels

In [None]:
# Example: label detection on a single image that is already stored in S3.
# Reference: http://docs.aws.amazon.com/rekognition/latest/dg/get-started-exercise.html
fileName = 'img_######'
bucket = '##########'

# The Rekognition client is created once here; the detect_text and
# recognize_celebrities example cells further down reuse it.
client = boto3.client('rekognition')

# Notes on detecting people:
#   min confidence 75  returns labels = ["person", "people", "human"]
#   min confidence ~50 returns labels = ["blonde", "woman"]
s3_image = {'S3Object': {'Bucket': bucket, 'Name': fileName}}
response = client.detect_labels(Image=s3_image, MinConfidence=75)

In [None]:
# Take a look at the output: display the raw detect_labels response from the cell above.
response

In [None]:
# Top-level keys of the detect_labels response ('Labels' is the one read in the big loop below).
response.keys()

### detect_text

In [None]:
# Example: text detection on one S3 image, reusing the Rekognition `client`
# created in the detect_labels example.
fileName = 'img_######'
bucket = '##########'

s3_object = {'Bucket': bucket, 'Name': fileName}
text_in_image = client.detect_text(Image={'S3Object': s3_object})

In [None]:
# Top-level keys of the detect_text response.
text_in_image.keys()

In [None]:
# The detections themselves; the big loop below reads "DetectedText" and "Confidence" from each entry.
text_in_image["TextDetections"]

### recognize_celebrities

In [None]:
# Example: celebrity recognition on one S3 image, reusing the Rekognition
# `client` created in the detect_labels example.
fileName = 'img_######'
bucket = '##########'

celebrity_image = dict(S3Object=dict(Bucket=bucket, Name=fileName))
celeb_detect = client.recognize_celebrities(Image=celebrity_image)

In [None]:
# Top-level keys of the recognize_celebrities response.
celeb_detect.keys()

In [None]:
# Name of the first recognised celebrity, or None when no celebrity was found.
# This must read celeb_detect (the recognize_celebrities output); `response`
# holds the detect_labels output, which has no 'CelebrityFaces' entry.
celebrity_faces = celeb_detect['CelebrityFaces']
celebrity_faces[0]['Name'] if celebrity_faces else None

## Big for-loop that creates both wide and long df's with our image tags from Rekognition:

12/1 Notes: Sometimes, due to network conditions, a specific record will throw an error. When the loop is run again, that same record will not throw an error.
12/1 Notes: This takes a long time to run. You may want to test on only ~20 images at first.

In [None]:
# Run three Rekognition analyses (labels, celebrities, text) on every object in
# the bucket and collect the results in two shapes:
#   img_df_long : one row per (image, detection) with columns img/type/label/confidence
#   img_df_wide : one row per image with numbered columns such as label_0,
#                 label_0_confidence, celeb_0, word_0, ...
bucket_name = '##########'
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)
# Every object key in the bucket is treated as an image to analyse.
images = [img.key for img in bucket.objects.all()]
client = boto3.client('rekognition')

results_wide = []
results_long = []
# (image key, row type, error text) for every API call that raised, so that
# those images can be re-run later instead of silently disappearing.
failed_calls = []

# One entry per analysis:
# (row type, wide-column prefix, client method, extra call arguments,
#  response key holding the detections, label getter, confidence getter)
analyses = [
    ('Label', 'label_', client.detect_labels, {'MinConfidence': 75}, 'Labels',
     lambda item: item['Name'], lambda item: item['Confidence']),
    # NOTE(review): item['Face']['Confidence'] is kept from the original code; the
    # Rekognition docs describe it as the face-detection confidence and also list
    # a separate 'MatchConfidence' for the identity match -- confirm which is wanted.
    ('Celebrity', 'celeb_', client.recognize_celebrities, {}, 'CelebrityFaces',
     lambda item: item['Name'], lambda item: item['Face']['Confidence']),
    ('Text', 'word_', client.detect_text, {}, 'TextDetections',
     lambda item: item['DetectedText'], lambda item: item['Confidence']),
]

for img in images:
    img_dict_wide = {'img': img}
    s3_image = {'S3Object': {'Bucket': bucket_name, 'Name': img}}
    detections_found = 0

    for row_type, prefix, api_call, extra_args, result_key, get_label, get_confidence in analyses:
        try:
            api_response = api_call(Image=s3_image, **extra_args)
        except Exception as err:
            # Network hiccups and service errors are transient per record, so note
            # the failure and carry on with the next analysis. Catching Exception
            # (not a bare except) still lets Ctrl-C interrupt a long run.
            failed_calls.append((img, row_type, str(err)))
            print('%s failed for %s: %s' % (row_type, img, err))
            continue

        for n, item in enumerate(api_response.get(result_key, [])):
            label = get_label(item)
            confidence = get_confidence(item)
            results_long.append({'img': img, 'type': row_type, 'label': label,
                                 'confidence': confidence})
            col = prefix + str(n)
            img_dict_wide[col] = label
            img_dict_wide[col + '_confidence'] = confidence
            detections_found += 1

    # The response keys are present even when nothing was detected, so test for
    # "no detections" by count. The placeholder row keeps every image in the long frame.
    if not detections_found:
        results_long.append({'img': img, 'type': None, 'label': None, 'confidence': None})

    # Always keep the image in the wide frame, even with partial or no results.
    results_wide.append(img_dict_wide)
####
####
img_df_long = pd.DataFrame(results_long, columns=['img', 'type', 'label', 'confidence'])
img_df_wide = pd.DataFrame(results_wide)
# Put 'img' first and the remaining columns in sorted order. reindex (rather than
# remove + select) also works when the bucket was empty and there is no 'img' column yet.
cols = sorted(c for c in img_df_wide.columns if c != 'img')
img_df_wide = img_df_wide.reindex(columns=['img'] + cols)

if failed_calls:
    print('%d Rekognition call(s) failed; see failed_calls' % len(failed_calls))

In [None]:
# Save down both data frames as CSV files.
#   long frame: for our topic modelers who are only focused on the image data
#   wide frame: for mapping to the dataframe provided to us by TM
for frame, csv_path in (
    (img_df_long, "twitter_img_text_long.csv"),
    (img_df_wide, "twitter_img_text_wide.csv"),
):
    frame.to_csv(csv_path)