In [1]:
'''
This script reads the COCO caption annotations and splits each caption
sentence into a subject phrase, a verb phrase and a location phrase.
It then saves the result to a new annotation JSON file:
coco_splitted_captions_train2014.json

Install spacy:
pip install spacy
python -m spacy download en_core_web_sm
'''
import json
import numpy as np
import string
from tqdm import tqdm
import spacy
from spacy import displacy
# Load the small English spaCy pipeline once; split_caption() calls it on every
# caption to obtain the part-of-speech tags used for splitting.
nlp = spacy.load('en_core_web_sm')

In [2]:
# Root directory of the COCO dataset, relative to this notebook.
COCO_path = '../../../../datasets/coco'

# Read the train2014 caption annotations. The `with` block closes the file
# handle as soon as parsing is done instead of leaving it to the garbage
# collector.
with open(COCO_path + '/annotations/captions_train2014.json', 'rt') as annotation_file:
    coco_captions_train2014 = json.load(annotation_file)

# id2image:    image id -> image record
# id2captions: image id -> list of caption annotation records for that image
id2image = {}
id2captions = {}
for img in coco_captions_train2014['images']:
    id2image[img['id']] = img
    id2captions[img['id']] = []
for caption in coco_captions_train2014['annotations']:
    id2captions[caption['image_id']].append(caption)

In [3]:
def normalize_text(text):
    """Return `text` lower-cased, ASCII-only, without punctuation.

    Steps, in order:
      1. drop every non-ASCII character,
      2. lower-case the text,
      3. delete all characters listed in ``string.punctuation``,
      4. collapse any run of whitespace (including leading/trailing
         whitespace and newlines) into a single space.

    Args:
        text: the raw caption text.

    Returns:
        The normalized caption as a single space-separated string; an empty
        string if nothing survives normalization.
    """
    text = text.encode('ascii', 'ignore').decode('ascii')
    # Filter punctuation character by character: the two-argument
    # str.translate(None, deletechars) form only exists for Python 2 byte
    # strings and raises TypeError on Python 3.
    cleaned = ''.join(ch for ch in str(text).lower() if ch not in string.punctuation)
    return ' '.join(cleaned.split())

def split_caption(text):
    """Split a caption into subject, verb and location phrases.

    The caption is normalized with normalize_text(), tagged with the spaCy
    pipeline `nlp`, and cut at up to two positions:
      * the first token tagged VERB whose fine-grained tag is not VBN starts
        the verb phrase;
      * the last token tagged ADP (other than 'of') starts the location
        phrase, but only if it comes after the verb when a verb was found.

    Args:
        text: the raw caption text.

    Returns:
        A tuple ``(all_phrases, subject_phrase, verb_phrase, location_phrase)``.
        ``all_phrases`` is the list of consecutive word spans between the cut
        positions; ``subject_phrase`` is its first element. ``verb_phrase``
        and ``location_phrase`` are None when no matching token was found.
        If the caption is empty after normalization, or spaCy's token count
        differs from the whitespace word count, ``([], None, None, None)`` is
        returned.
    """
    text = normalize_text(text)
    words = text.split()
    if not words:
        # Nothing left to split (empty / punctuation-only caption). Without
        # this guard all_phrases would be empty and all_phrases[0] below
        # would raise IndexError.
        return [], None, None, None

    # spaCy needs text, not bytes. On Python 2 a plain `str` is a byte string,
    # so decode it; on Python 3 `str` is already text and is passed through.
    if isinstance(text, bytes):
        text = text.decode('ascii')
    doc = nlp(text)

    # The word list and the token list are indexed in parallel below, so they
    # must line up one-to-one.
    if len(words) != len(doc):
        return [], None, None, None

    split_indices = [0]
    verb_phrase = None
    location_phrase = None

    # First non-participle verb marks the start of the verb phrase.
    for i, token in enumerate(doc):
        if token.pos_ == u'VERB' and token.tag_ != u'VBN':
            split_indices.append(i)
            verb_phrase = ' '.join(words[i:])
            break

    # Scan from the end for the last adposition; 'of' is skipped.
    for i, token in reversed(list(enumerate(doc))):
        if token.pos_ == u'ADP' and token.text not in ['of']:
            # With a verb recorded (two indices so far), ignore adpositions
            # at or before the verb position.
            if len(split_indices) == 2 and i <= split_indices[1]:
                continue
            split_indices.append(i)
            location_phrase = ' '.join(words[i:])
            break

    split_indices.append(len(words))
    # Sorted, de-duplicated cut positions (a verb or adposition at index 0
    # coincides with the initial 0).
    split_indices = sorted(set(split_indices))

    all_phrases = [
        ' '.join(words[start:end])
        for start, end in zip(split_indices[:-1], split_indices[1:])
    ]
    subject_phrase = all_phrases[0]
    if len(all_phrases) == 3:
        # Both cuts present: trim the verb phrase so it stops where the
        # location phrase begins.
        subject_phrase, verb_phrase, location_phrase = all_phrases
    return all_phrases, subject_phrase, verb_phrase, location_phrase

# Worked example: run the splitter on one hand-written caption.
text = 'a red man in red shirt crying in the wood'
# Single-argument print(...) calls produce the same output on Python 2 and
# Python 3; the bare print statement is a syntax error on Python 3.
print('Let perform split on this example: ' + text)
print(split_caption(text))

Let perform split on this example: a red man in red shirt crying in the wood
(['a red man in red shirt', 'crying', 'in the wood'], 'a red man in red shirt', 'crying', 'in the wood')


In [4]:
# Demo: split every caption of one sample image and list the phrases found.
# All output goes through single-argument print(...) calls so the cell prints
# the same text on Python 2 and Python 3.

# The original print statements emitted a 10-space string followed by the
# single space that the print statement's comma inserts between items.
pad = '          ' + ' '

print('Lets try on these sentences-----------------')
subjects = []
verbs = []
locations = []
# list(dict) gives the keys as an indexable list on both Python 2 and 3;
# dict.keys() cannot be indexed on Python 3.
sample_image_id = list(id2captions)[113]
for caption in id2captions[sample_image_id]:
    print(pad + caption['caption'])
    phrases, subject, verb, location = split_caption(caption['caption'])
    subjects.append(subject)
    verbs.append(verb)
    locations.append(location)
# split_caption() reports a missing phrase as None; drop those entries.
subjects = [i for i in subjects if i is not None]
verbs = [i for i in verbs if i is not None]
locations = [i for i in locations if i is not None]
print('subjects -----------------------------------')
print(pad + str(subjects))
print('verbs -----------------------------------')
print(pad + str(verbs))
print('locations -----------------------------------')
print(pad + str(locations))

Lets try on these sentences-----------------
           A  small black sheep in a field with other larger sheep
           A small, black sheep looks away in the grass.
           A baby sheep stands with a group of adult sheep in a grassy meadow.
           A herd of sheep grazing on a filed with tall green grass.
           A black lamb stands with its family in a lush green field.
subjects -----------------------------------
           ['a small black sheep in a field', 'a small black sheep', 'a baby sheep', 'a herd of sheep grazing on a filed', 'a black lamb']
verbs -----------------------------------
           ['looks away', 'stands with a group of adult sheep', 'stands with its family']
locations -----------------------------------
           ['with other larger sheep', 'in the grass', 'in a grassy meadow', 'with tall green grass', 'in a lush green field']


In [5]:
# Annotate every caption record in place with its split result. The keys are
# listed in the same order as the values returned by split_caption().
phrase_keys = ('phrases', 'subject_phrase', 'verb_phrase', 'location_phrase')
for caption in tqdm(coco_captions_train2014['annotations']):
    caption.update(zip(phrase_keys, split_caption(caption['caption'])))

100%|██████████| 414113/414113 [1:01:17<00:00, 112.62it/s]


In [7]:
# Write the annotated captions to disk. The `with` block guarantees the file
# is flushed and closed once the dump finishes, rather than whenever the
# anonymous file object happens to be garbage-collected.
with open('coco_splitted_captions_train2014.json', 'w') as output_file:
    json.dump(coco_captions_train2014, output_file)

In [12]:
# Sanity check: reload the file just written and show one annotation record.
# The file handle is closed by the `with` block; print(...) with a single
# argument behaves the same on Python 2 and Python 3.
with open('coco_splitted_captions_train2014.json', 'rt') as saved_file:
    x = json.load(saved_file)
print(x['annotations'][999])

{u'image_id': 511577, u'phrases': [u'a narrow bathroom', u'with a sink shower and toilet'], u'caption': u'A narrow bathroom with a sink, shower and toilet.\n', u'verb_phrase': None, u'subject_phrase': u'a narrow bathroom', u'location_phrase': u'with a sink shower and toilet', u'id': 27037}
