In [None]:
import sys, os
import boto3
import nltk
from collections import Counter
import requests
import time
import pprint  #print json 


# Shared AWS service clients used by the cells below. No region or credentials
# are passed explicitly, so whatever boto3 is configured with by default applies.
rekognition = boto3.client('rekognition')  # used by identify_image
polly = boto3.client('polly')  # handed to speak() in the last cell


In [None]:
# API calls to Rekognition 
def identify_image(url):
    """Download an image and run three Rekognition analyses on it.

    Fetches ``url`` over HTTP, then sends the raw bytes to the module-level
    ``rekognition`` client for label, face and text detection. Each full API
    response is pretty-printed as it arrives.

    NOTE: a later cell defines another ``identify_image``; when the notebook
    is run top to bottom, that later definition replaces this one.

    Args:
        url: Address of the image to download.

    Returns:
        Tuple ``(labels, face_details_list, text_detection)`` taken from the
        ``'Labels'``, ``'FaceDetails'`` and ``'TextDetections'`` entries of
        the respective responses.

    Raises:
        requests.HTTPError: If the download answers with a 4xx/5xx status.
    """
    with requests.get(url) as http_response:
        # Fail fast on HTTP errors instead of sending an error page to Rekognition.
        http_response.raise_for_status()
        image = http_response.content

    print('\n\n\nDetecting labels...')
    label_response = rekognition.detect_labels(Image={'Bytes': image}, MaxLabels=3, MinConfidence=65)
    labels = label_response['Labels']
    pprint.pprint(label_response)

    print('\n\n\n\nDetecting faces...')
    face_response = rekognition.detect_faces(Attributes=['ALL'], Image={'Bytes': image})
    face_details_list = face_response['FaceDetails']
    pprint.pprint(face_response)

    print('\n\n\n\nDetecting text...')
    text_response = rekognition.detect_text(Image={'Bytes': image})
    text_detection = text_response['TextDetections']
    pprint.pprint(text_response)

    return labels, face_details_list, text_detection

In [None]:
# API calls to Polly
def speak(polly, text, format='mp3', voice='Joanna', output_path='polly_description.mp3'):
    """Synthesize ``text`` to speech and save the audio to a file.

    Args:
        polly: Client object exposing ``synthesize_speech(OutputFormat=...,
            Text=..., VoiceId=...)`` whose result has an ``'AudioStream'``
            entry with ``read()`` and ``close()``.
        text: The text to be spoken.
        format: Value forwarded as ``OutputFormat``.
        voice: Value forwarded as ``VoiceId``.
        output_path: Where the audio bytes are written. The default keeps the
            file name the notebook's player cell points at; note that it is
            not derived from ``format``.

    Returns:
        None. The audio is written to ``output_path``.
    """
    from contextlib import closing

    resp = polly.synthesize_speech(OutputFormat=format, Text=text, VoiceId=voice)

    # Read the whole stream first and always release it, even if read() fails.
    with closing(resp['AudioStream']) as audio_stream:
        sound_bytes = audio_stream.read()

    # The context manager guarantees the file is closed if write() raises.
    with open(output_path, 'wb') as sound_file:
        sound_file.write(sound_bytes)

In [None]:
# Auxiliary code to build a sentence from the JSON that Rekognition returns

def join_with_and(words):
    """Join words as an English list: ``'a, b and c'``.

    Args:
        words: Sequence of strings. It is not modified.

    Returns:
        ``''`` for an empty sequence, the sole element for a one-element
        sequence, otherwise all but the last element joined by ``', '``
        followed by ``' and '`` and the last element.
    """
    # Work on a copy so the caller's list is left untouched (the previous
    # implementation popped the last element off the argument itself).
    items = list(words)
    if not items:
        return ''
    if len(items) == 1:
        return items[0]
    return ', '.join(items[:-1]) + ' and ' + items[-1]

def identify_image(url):
    """Download an image and run three Rekognition analyses on it.

    This redefines the ``identify_image`` declared in an earlier cell; the
    difference is that the interesting part of each response is printed with
    ``str()`` rather than pretty-printing the whole response.

    Fetches ``url`` over HTTP, then sends the raw bytes to the module-level
    ``rekognition`` client for label, face and text detection.

    Args:
        url: Address of the image to download.

    Returns:
        Tuple ``(labels, face_details_list, text_detection)`` taken from the
        ``'Labels'``, ``'FaceDetails'`` and ``'TextDetections'`` entries of
        the respective responses.

    Raises:
        requests.HTTPError: If the download answers with a 4xx/5xx status.
    """
    with requests.get(url) as http_response:
        # Fail fast on HTTP errors instead of sending an error page to Rekognition.
        http_response.raise_for_status()
        image = http_response.content

    print('\n\n\nDetecting labels...')
    label_response = rekognition.detect_labels(Image={'Bytes': image}, MaxLabels=3, MinConfidence=65)
    labels = label_response['Labels']
    print(str(labels))

    print('\n\n\n\nDetecting faces...')
    face_response = rekognition.detect_faces(Attributes=['ALL'], Image={'Bytes': image})
    face_details_list = face_response['FaceDetails']
    print(str(face_details_list))

    print('\n\n\n\nDetecting text...')
    text_response = rekognition.detect_text(Image={'Bytes': image})
    text_detection = text_response['TextDetections']
    print(str(text_detection))

    return labels, face_details_list, text_detection


def find_article(words, corpus_bigrams_counter, corpus_trigrams_counter):
    """Pick the article that most often precedes ``words`` in the corpus counts.

    The candidates 'a', 'an' and 'some' are tried in that order. For a single
    word the bigram counter is consulted, for two words the trigram counter;
    any other length scores zero for every candidate. Progress is printed for
    each candidate.

    Args:
        words: Sequence of one or two words forming the noun phrase.
        corpus_bigrams_counter: Mapping from ``(article, word)`` to a count.
        corpus_trigrams_counter: Mapping from ``(article, word1, word2)`` to a
            count.

    Returns:
        The winning article followed by a space, or ``''`` when no candidate
        has a positive count. On a tie the earliest candidate wins.
    """
    print(words)

    # Choose the lookup table once; None means "phrase length not supported".
    phrase_length = len(words)
    if phrase_length == 1:
        table = corpus_bigrams_counter
    elif phrase_length == 2:
        table = corpus_trigrams_counter
    else:
        table = None

    top_count = 0
    chosen = ''
    for candidate in ('a', 'an', 'some'):
        occurrences = table[(candidate, *words)] if table is not None else 0
        print(candidate + ' ' + str(occurrences))
        # Strict comparison: a later candidate must beat, not equal, the leader.
        if occurrences > top_count:
            print("max!")
            top_count = occurrences
            chosen = candidate + ' '
    return chosen

def init_ngrams():
    """Count every bigram and trigram of the NLTK Brown corpus.

    Probes the corpus first and downloads it via ``nltk.download('brown')``
    when the probe fails because the data is missing. Progress messages are
    printed along the way.

    Returns:
        Tuple ``(corpus_bigrams_counter, corpus_trigrams_counter)`` of
        ``collections.Counter`` objects keyed by word tuples of length 2 and 3.
    """
    corpus = nltk.corpus.brown
    try:
        # Accessing the words doubles as a check that the corpus is available.
        print(len(corpus.words()))
    except LookupError:
        # NLTK signals a missing resource with LookupError. Catching only that
        # lets real problems (and Ctrl-C) propagate instead of being masked.
        print('Downloading corpus')
        nltk.download('brown')

    print('Computing bigrams...')
    corpus_bigrams = nltk.ngrams(corpus.words(), 2)
    corpus_trigrams = nltk.ngrams(corpus.words(), 3)

    print('Computing counter...')
    corpus_bigrams_counter = Counter(corpus_bigrams)
    corpus_trigrams_counter = Counter(corpus_trigrams)
    return corpus_bigrams_counter, corpus_trigrams_counter


# Build the n-gram counters once when this cell runs; format_text below reads
# these two module-level names when choosing an article for each label.
corpus_bigrams_counter, corpus_trigrams_counter = init_ngrams()
    
def _emotion_phrases(emotions, strong=80, moderate=50, slight=25):
    """Grade each emotion entry by confidence into 'very X' / 'X' / 'a little X'.

    NOTE(review): the previous code compared all three tiers against 80, which
    made the two weaker tiers unreachable. The 50 and 25 cut-offs are the
    reviewer's choice - tune them as needed.
    """
    phrases = []
    for emotion in emotions:
        confidence = emotion['Confidence']
        emotion_name = emotion['Type'].lower()
        if confidence > strong:
            phrases.append('very ' + emotion_name)
        elif confidence > moderate:
            phrases.append(emotion_name)
        elif confidence > slight:
            phrases.append('a little ' + emotion_name)
    return phrases


def _describe_face(faceDetail):
    """Build the description of a single entry of the face details list."""
    gender = faceDetail['Gender']['Value'].lower()
    # Pronouns used by the clauses below; anything other than male/female is 'it'.
    who, whose = {'male': ('he', 'his'), 'female': ('she', 'her')}.get(gender, ('it', 'its'))

    age_range = str(faceDetail['AgeRange']['Low']) + '-' + str(faceDetail['AgeRange']['High'])
    details = ['a ' + gender + ' face', 'aged ' + age_range]

    if faceDetail['Mustache']['Value']:
        details.append('with mustache')
    if faceDetail['Beard']['Value']:
        details.append('with a beard')
    # Sunglasses take precedence over eyeglasses; at most one is mentioned.
    if faceDetail['Sunglasses']['Value']:
        details.append('wearing sunglasses')
    elif faceDetail['Eyeglasses']['Value']:
        details.append('wearing eyeglasses')

    details.append(whose + (' eyes are open' if faceDetail['EyesOpen']['Value'] else ' eyes are closed'))
    details.append(whose + (' mouth is open' if faceDetail['MouthOpen']['Value'] else ' mouth is closed'))
    if faceDetail['Smile']['Value']:
        details.append(who + ' is smiling')

    emotions = _emotion_phrases(faceDetail['Emotions'])
    if emotions:
        details.append(who + ' looks ' + join_with_and(emotions))
    return join_with_and(details)


def format_text(labels,face_details_list,text_detection):
    """Turn detection results into one spoken-style English sentence.

    The sentence starts with 'I see ', lists every label (prefixed with the
    article chosen by ``find_article`` using the module-level n-gram counters)
    and every face description separated by ', ', and ends with '. '. When no
    labels and no faces are given it reads 'I see nothing. '. If any detected
    text entry has ``Type == 'LINE'``, 'I can also read: ' plus those lines
    joined by ', ' is appended. The sentence is printed before it is returned.

    Args:
        labels: Sequence of dicts with a ``'Name'`` key.
        face_details_list: Sequence of dicts with the keys read by the face
            description (``'Gender'``, ``'AgeRange'``, ``'Mustache'``,
            ``'Beard'``, ``'Sunglasses'``, ``'Eyeglasses'``, ``'EyesOpen'``,
            ``'MouthOpen'``, ``'Smile'``, ``'Emotions'``).
        text_detection: Sequence of dicts with ``'Type'`` and
            ``'DetectedText'`` keys; may be empty or ``None``.

    Returns:
        The assembled sentence.
    """
    sentence = 'I see '
    if len(labels) + len(face_details_list) == 0:
        sentence += 'nothing'

    label_phrases = []
    for label in labels:
        name = label['Name'].lower()
        article = find_article(name.split(' '), corpus_bigrams_counter, corpus_trigrams_counter)
        label_phrases.append(article + name)

    face_phrases = [_describe_face(face_detail) for face_detail in face_details_list]

    # Labels and faces form one comma-separated list.
    sentence += ', '.join(label_phrases + face_phrases)
    sentence += '. '

    # Only whole lines are read out; the other entries are skipped.
    lines = [
        text_like['DetectedText']
        for text_like in (text_detection or [])
        if text_like['Type'] == 'LINE'
    ]
    if lines:
        # join() avoids the stray ', ' that used to follow the colon, and the
        # prefix is omitted entirely when there is nothing to read.
        sentence += 'I can also read: ' + ', '.join(lines)

    print(sentence)
    return sentence


In [None]:
# please try with different images
# DON'T FORGET TO PLAY AUDIO!!!! 
    
from html import escape

from IPython.display import HTML

# Strip once so the API call and the HTML below use exactly the same URL.
image_url = "https://i.pinimg.com/originals/b7/cc/68/b7cc68a555e48b90004735c21c107b8e.jpg".strip()
labels, face_details_list, text_detection = identify_image(image_url)

sentence = format_text(labels, face_details_list, text_detection)
speak(polly, sentence)  # writes polly_description.mp3, referenced by the first iframe

# Last expression of the cell, so the notebook renders it: the audio file next
# to the image. The URL is escaped and quoted so that characters such as
# spaces, quotes or '>' in a user-supplied URL cannot break the markup.
HTML(
    '<iframe src=\'polly_description.mp3\' width=600 height=250></iframe> '
    '<iframe src="' + escape(image_url, quote=True) + '" width=900 height=500></iframe> '
)
