## Notebook tasks:
### - Save images to an AWS S3 bucket
### - Use AWS Rekognition to reverse image search and return tags for each image
### - Save data in both long and wide data frames
Brought to you by Natalie Olivo<br>
<a href="https://www.linkedin.com/in/natalie-olivo-82548951/">LinkedIn</a><br>
<a href="https://nmolivo.github.io/NMOstatic/">Website</a><br>
<a href="https://medium.com/@NatalieOlivo">Blog</a><br>
<a href="https://github.com/nmolivo">GitHub</a><br>

In [1]:
import pandas as pd
import numpy as np

In [4]:
# Load the tweet sample; the path is relative to the notebook's working directory.
df = pd.read_csv(
    "../../GA-materials/project-5/Project_TrackMaven/gitignore/newtweets_10percent.csv"
)

In [5]:
# Column of thumbnail image URLs, one entry per row of `df`.
image_list = df['link_thumbnail']

In [6]:
# Make sure it looks good: pull a single entry from the thumbnail column and display it.
test_1 = image_list[3]
test_1

'https://media.vanityfair.com/photos/5a0cb19d70fe0563a1ebbf44/16:9/w_1200,h_630,c_limit/t-Lisa-Lucas-Interview-a.jpg'

In [7]:
import boto
import boto3
# NOTE(review): `conn` is not referenced anywhere else in the visible notebook; every
# S3 / Rekognition call below goes through boto3 -- confirm before removing.
conn = boto.connect_s3()
import requests

In [8]:
# Name of the destination bucket -- replace the placeholder with your own bucket.
bucket_name_to_upload_image_to = '#########'

# boto3 picks up the credentials stored in ~/.aws/credentials.
s3 = boto3.resource('s3')

In [None]:
# Do this as a quick and easy check to make sure your S3 access is OK.
# `any` yields False when the bucket is not listed, so `good_to_go` is always
# defined (previously a missing bucket raised NameError instead of printing
# the warning below).
good_to_go = any(
    candidate.name == bucket_name_to_upload_image_to
    for candidate in s3.buckets.all()
)

if good_to_go:
    print('Good to go. Found the bucket to upload the image into.')
else:
    print('Not seeing your s3 bucket, might want to double check permissions in IAM')
# The URL-to-S3 upload technique used in this notebook comes from the answers to
# this Stack Overflow question:
# https://stackoverflow.com/questions/14346065/upload-image-available-at-public-url-to-s3-using-boto

In [None]:
### We received all our images in the form of links. 
### We need our images saved to an S3 bucket, preferably without saving them to our hard drive.

### Don't have a bucket? What is a bucket? http://docs.aws.amazon.com/AmazonS3/latest/user-guide/create-bucket.html

In [None]:
# Upload each thumbnail to S3 under a sequential key (img_00000, img_00001, ...)
# and remember which key came from which URL so the tags can be joined back
# to the original dataframe later.
mapping_dict = {}
for i, img_url in enumerate(image_list[0:10000]):
    img_name = "img_%05d" % (i,)
    # Record every row -- including rows without a link -- so keys line up with positions.
    mapping_dict[img_name] = img_url

    # NaN never compares equal to anything (itself included), so `== np.nan`
    # cannot detect a missing link; pd.isna can. The string check also skips
    # links that were already stringified to "nan".
    if pd.isna(img_url) or str(img_url) == "nan":
        continue

    # Given an Internet-accessible URL, download the image and upload it to S3,
    # without needing to persist the image to disk locally.
    try:
        # A timeout stops one unresponsive host from hanging the whole run.
        image_response = requests.get(img_url, timeout=30)
        # Do not upload a 404/500 error page as if it were the image.
        image_response.raise_for_status()
    except requests.RequestException as exc:
        # One bad link should not abort the remaining uploads.
        print('Skipping %s (%s): %s' % (img_name, img_url, exc))
        continue

    # `.content` is the fully downloaded, content-decoded body (unlike `.raw`).
    s3.Bucket(bucket_name_to_upload_image_to).put_object(
        Key=img_name, Body=image_response.content
    )

In [None]:
# Save down your mapping dict so that you can eventually re-map your image tags to your full dataframe.
# Result: one row per S3 key (the index) with its source URL in a single column
# named 0. Going through a Series avoids the undefined `md_01` reference and the
# len(mapping_dict)-row intermediate frame that the transpose approach built.
# `mapping_dict` itself stays a plain dict so this cell can be re-run safely.
mapping_df = pd.Series(mapping_dict, name=0).to_frame()
mapping_df.to_csv('mappingdict.csv')

## Developer notes: Below is code for component parts of a big for-loop that creates both wide and long dataframes with image information gathered from AWS Rekognition.

In [None]:
# First, take a look at the output of each Rekognition operation you plan to use. (The response shape is different for each one!)
# This notebook focuses on:
# detect_labels (DetectLabels)
# recognize_celebrities (RecognizeCelebrities)
# detect_text (DetectText)

### detect_labels

In [21]:
# http://docs.aws.amazon.com/rekognition/latest/dg/get-started-exercise.html
# The Rekognition client only has to be created once for all of the examples
# that follow; the bucket / file name are repeated in each example so you can
# point them at a different picture.
client = boto3.client('rekognition')

bucket = '880220067832'
fileName = 'img_00007'

# MinConfidence=75 is passed as the confidence threshold for returned labels.
response = client.detect_labels(
    Image={'S3Object': {'Bucket': bucket, 'Name': fileName}},
    MinConfidence=75,
)

In [22]:
# Take a look at the full detect_labels response (labels plus response metadata).
response

{'Labels': [{'Confidence': 99.28775787353516, 'Name': 'Human'},
  {'Confidence': 99.2877426147461, 'Name': 'People'},
  {'Confidence': 99.28775787353516, 'Name': 'Person'},
  {'Confidence': 91.67272186279297, 'Name': 'Audience'},
  {'Confidence': 91.67272186279297, 'Name': 'Crowd'},
  {'Confidence': 91.67272186279297, 'Name': 'Speech'},
  {'Confidence': 78.27274322509766, 'Name': 'Clothing'},
  {'Confidence': 78.27274322509766, 'Name': 'Coat'},
  {'Confidence': 78.27274322509766, 'Name': 'Overcoat'},
  {'Confidence': 78.27274322509766, 'Name': 'Suit'}],
 'OrientationCorrection': 'ROTATE_0',
 'ResponseMetadata': {'HTTPHeaders': {'connection': 'keep-alive',
   'content-length': '536',
   'content-type': 'application/x-amz-json-1.1',
   'date': 'Mon, 04 Dec 2017 03:13:26 GMT',
   'x-amzn-requestid': '18599e6b-d8a1-11e7-8234-c9f1716fbb2e'},
  'HTTPStatusCode': 200,
  'RequestId': '18599e6b-d8a1-11e7-8234-c9f1716fbb2e',
  'RetryAttempts': 0}}

In [28]:
# Top-level keys of the detect_labels response.
response.keys()

dict_keys(['OrientationCorrection', 'Labels', 'ResponseMetadata'])

### detect_text

In [25]:
# Text detection on a different image in the same bucket; reuses the
# Rekognition `client` created in the detect_labels example.
bucket = '880220067832'
fileName = 'img_00006'

text_in_image = client.detect_text(
    Image={'S3Object': {'Bucket': bucket, 'Name': fileName}}
)
text_in_image

{'ResponseMetadata': {'HTTPHeaders': {'connection': 'keep-alive',
   'content-length': '6048',
   'content-type': 'application/x-amz-json-1.1',
   'date': 'Mon, 04 Dec 2017 03:21:04 GMT',
   'x-amzn-requestid': '2932678b-d8a2-11e7-9290-1792ccb88bc9'},
  'HTTPStatusCode': 200,
  'RequestId': '2932678b-d8a2-11e7-9290-1792ccb88bc9',
  'RetryAttempts': 0},
 'TextDetections': [{'Confidence': 87.87747955322266,
   'DetectedText': 'Protects',
   'Geometry': {'BoundingBox': {'Height': 0.0289202518761158,
     'Left': 0.8098856210708618,
     'Top': 0.3966602385044098,
     'Width': 0.05391734838485718},
    'Polygon': [{'X': 0.8098856210708618, 'Y': 0.3966602385044098},
     {'X': 0.863802969455719, 'Y': 0.3976689577102661},
     {'X': 0.8636319041252136, 'Y': 0.4265892207622528},
     {'X': 0.8097145557403564, 'Y': 0.4255805015563965}]},
   'Id': 0,
   'Type': 'LINE'},
  {'Confidence': 81.16915130615234,
   'DetectedText': 'Minorcuts',
   'Geometry': {'BoundingBox': {'Height': 0.0252319965511

In [26]:
# Top-level keys of the detect_text response.
text_in_image.keys()

dict_keys(['TextDetections', 'ResponseMetadata'])

In [27]:
# The list of detections: each entry carries the detected text, its confidence and geometry.
text_in_image["TextDetections"]

[{'Confidence': 87.87747955322266,
  'DetectedText': 'Protects',
  'Geometry': {'BoundingBox': {'Height': 0.0289202518761158,
    'Left': 0.8098856210708618,
    'Top': 0.3966602385044098,
    'Width': 0.05391734838485718},
   'Polygon': [{'X': 0.8098856210708618, 'Y': 0.3966602385044098},
    {'X': 0.863802969455719, 'Y': 0.3976689577102661},
    {'X': 0.8636319041252136, 'Y': 0.4265892207622528},
    {'X': 0.8097145557403564, 'Y': 0.4255805015563965}]},
  'Id': 0,
  'Type': 'LINE'},
 {'Confidence': 81.16915130615234,
  'DetectedText': 'Minorcuts',
  'Geometry': {'BoundingBox': {'Height': 0.025231996551156044,
    'Left': 0.8168795108795166,
    'Top': 0.4240514636039734,
    'Width': 0.06063205003738403},
   'Polygon': [{'X': 0.8168795108795166, 'Y': 0.4240514636039734},
    {'X': 0.8775115609169006, 'Y': 0.4253864288330078},
    {'X': 0.8773359060287476, 'Y': 0.4506184160709381},
    {'X': 0.8167038559913635, 'Y': 0.4492834508419037}]},
  'Id': 1,
  'Type': 'LINE'},
 {'Confidence': 

### recognize_celebrities

In [44]:
# Celebrity recognition on another image in the same bucket; reuses the
# Rekognition `client` created in the detect_labels example.
bucket = '880220067832'
fileName = 'img_00012'

celeb_detect = client.recognize_celebrities(
    Image={'S3Object': {'Bucket': bucket, 'Name': fileName}}
)

In [45]:
# Top-level keys of the recognize_celebrities response.
celeb_detect.keys()

dict_keys(['OrientationCorrection', 'CelebrityFaces', 'UnrecognizedFaces', 'ResponseMetadata'])

In [46]:
# One entry per recognized celebrity face.
celeb_detect['CelebrityFaces']

[{'Face': {'BoundingBox': {'Height': 0.10687500238418579,
    'Left': 0.4807872474193573,
    'Top': 0.15562500059604645,
    'Width': 0.16026242077350616},
   'Confidence': 99.9999771118164,
   'Landmarks': [{'Type': 'eyeLeft',
     'X': 0.5403168201446533,
     'Y': 0.19756773114204407},
    {'Type': 'eyeRight', 'X': 0.5921167731285095, 'Y': 0.20492416620254517},
    {'Type': 'nose', 'X': 0.5595902800559998, 'Y': 0.22208547592163086},
    {'Type': 'mouthLeft', 'X': 0.5338063836097717, 'Y': 0.23306140303611755},
    {'Type': 'mouthRight', 'X': 0.5765158534049988, 'Y': 0.23889882862567902}],
   'Pose': {'Pitch': -3.3401520252227783,
    'Roll': 11.797859191894531,
    'Yaw': -0.263323575258255},
   'Quality': {'Brightness': 23.701353073120117,
    'Sharpness': 99.99090576171875}},
  'Id': '1ax3nr0o',
  'MatchConfidence': 94.0,
  'Name': 'Kim Kardashian',
  'Urls': ['www.imdb.com/name/nm2578007']}]

In [42]:
# Name of the first recognized celebrity (raises IndexError if none were recognized).
celeb_detect['CelebrityFaces'][0]['Name']

'Kim Kardashian'

## Big for-loop that creates both wide and long df's with our image tags from Rekognition:

12/1 Notes: Sometimes, due to network conditions, a call will throw an error on a specific record. When run again, that same record will not throw an error.
12/1 Notes: This takes a long time to run. You may want to test on only ~20 images at first.

In [None]:
# Tag every image in the bucket with Rekognition labels, celebrity matches and
# detected text, collecting the results in two shapes:
#   results_long -- one row per (image, tag)
#   results_wide -- one row per image, with numbered <prefix>_<n> / <prefix>_<n>_confidence columns
bucket_name = '##########'
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)
images = [img.key for img in bucket.objects.all()]
client = boto3.client('rekognition')


def _detect_labels(client, image):
    """Return (name, confidence) pairs for labels, requesting MinConfidence=75."""
    found = client.detect_labels(Image=image, MinConfidence=75)
    return [(label['Name'], label['Confidence']) for label in found.get('Labels', [])]


def _detect_celebrities(client, image):
    """Return (name, face confidence) pairs for recognized celebrity faces."""
    found = client.recognize_celebrities(Image=image)
    return [(face['Name'], face['Face']['Confidence'])
            for face in found.get('CelebrityFaces', [])]


def _detect_text(client, image):
    """Return (text, confidence) pairs for every text detection in the image."""
    found = client.detect_text(Image=image)
    return [(word["DetectedText"], word["Confidence"])
            for word in found.get("TextDetections", [])]


# (value for the long frame's 'type' column, wide-column prefix, detector function)
_DETECTORS = (
    ('Label', 'label_', _detect_labels),
    ('Celebrity', 'celeb_', _detect_celebrities),
    ("Text", 'word_', _detect_text),
)

results_wide = []
results_long = []
# (image key, detector type, exception) for every call that failed, so that just
# those records can be re-run -- failures are often transient network errors.
failed_calls = []

for img in images:
    image_ref = {'S3Object': {'Bucket': bucket_name, 'Name': img}}
    img_dict_wide = {'img': img}
    found_any = False

    for tag_type, col_prefix, detect in _DETECTORS:
        try:
            detections = detect(client, image_ref)
        except Exception as exc:
            # A failure in one detector must not discard the other detectors'
            # results or drop the image from the wide frame; report it and move on.
            print('%s detection failed for %s: %r' % (tag_type, img, exc))
            failed_calls.append((img, tag_type, exc))
            continue

        for n, (name, confidence) in enumerate(detections):
            results_long.append({'img': img, 'type': tag_type, 'label': name,
                                 'confidence': confidence})
            col = col_prefix + str(n)
            img_dict_wide[col] = name
            img_dict_wide[col + '_confidence'] = confidence
            found_any = True

    # Keep a placeholder row so images without any tag still appear in the long
    # frame. (Checking for the response keys is not enough: the keys are present
    # even when the corresponding list is empty.)
    if not found_any:
        results_long.append({'img': img, 'type': None, 'label': None, 'confidence': None})

    results_wide.append(img_dict_wide)

img_df_long = pd.DataFrame(results_long, columns=['img', 'type', 'label', 'confidence'])
img_df_wide = pd.DataFrame(results_wide)
# 'img' first, remaining columns in sorted order; reindex (rather than
# remove + select) also copes with an empty bucket, where no 'img' column exists.
cols = sorted(col for col in img_df_wide.columns if col != 'img')
img_df_wide = img_df_wide.reindex(columns=['img'] + cols)

In [None]:
# Save down your dfs.
#   long -> for our topic modelers only focused on images data!
#   wide -> for mapping to the dataframe provided to us.
for frame, csv_name in (
    (img_df_long, "twitter_img_text_long.csv"),
    (img_df_wide, "twitter_img_text_wide.csv"),
):
    frame.to_csv(csv_name)