In [1]:
import numpy as np
import spacy
from collections import Counter, defaultdict
from sklearn.preprocessing import normalize
import settings
import json
from nltk.corpus import wordnet as wn
from nltk import Tree

In [2]:
def to_nltk_tree(node):
    if node.n_lefts + node.n_rights > 0:
        return Tree(node.orth_, [to_nltk_tree(child) for child in node.children])
    else:
        return node.orth_

In [3]:
data = json.load(open('data/coco_noun.tags'))
nlp = spacy.load(settings.SPACY_MODEL)

In [68]:
#id_ = '384029'
id_ = "352211"
caption_list = data['train2014'][id_]['captions']
captions = [nlp(c) for c in caption_list]
caption_list

['A woman on waterskis is towed across a lake. ',
 'A young lady rides water skis on a lake',
 'A woman in a blue vest is water skiing',
 'A young woman in a bikini and life jacket waterskiing.',
 'A woman in a bikini riding water skis while being towed by a boat.']

In [69]:
for c in captions:
    for sent in c.sents:
        print(to_nltk_tree(sent.root).pretty_print())

        towed                       
  ________|_____________________     
 |   |        woman           across
 |   |     _____|_______        |    
 |   |    |             on     lake 
 |   |    |             |       |    
 is  .    A         waterskis   a   

None
         rides           
      _____|__________    
     |           |    on 
     |           |    |   
    lady        skis lake
  ___|_____      |    |   
 A       young water  a  

None
           is             
       ____|__________     
    woman             |   
  ____|____           |    
 |         in         |   
 |         |          |    
 |        vest      skiing
 |     ____|____      |    
 A    a        blue water 

None
          woman                            
  __________|____________                   
 |    |     |            in                
 |    |     |            |                  
 |    |     |       waterskiing            
 |    |     |     _______|________          
 |    |     |    |   

In [6]:
doc = captions[0]
for token in doc:
    print(token.text, token.dep_, token.head.text, token.head.pos_,
          [child for child in token.children])

A det man NOUN []
man ROOT man NOUN [A, preparing, .]
preparing acl man NOUN [desserts, in]
desserts dobj preparing VERB []
in prep preparing VERB [kitchen]
a det kitchen NOUN []
kitchen pobj in ADP [a, covered]
covered acl kitchen NOUN [in]
in prep covered VERB [frosting]
frosting pobj in ADP []
. punct man NOUN []


## Person Normalization

In [6]:
print(wn.synsets('person'))
ps = wn.synsets('person')
s = wn.synsets('chef')[0]
hyper = s.hypernym_paths()
for path in hyper:
    for syn in path:
        if syn in ps:
            print(syn, ps)

[Synset('person.n.01'), Synset('person.n.02'), Synset('person.n.03')]
Synset('person.n.01') [Synset('person.n.01'), Synset('person.n.02'), Synset('person.n.03')]
Synset('person.n.01') [Synset('person.n.01'), Synset('person.n.02'), Synset('person.n.03')]


In [70]:
print(caption_list)
cl = caption_list
ps = wn.synsets('person')
for i, cap in enumerate(captions): # each caption
    for j, token in enumerate(cap): # each token
        if len(wn.synsets(token.text)) > 0: # be sure that token has synset
            syn_token = wn.synsets(token.text)[0] # get the first synset
            path = syn_token.hypernym_paths() #get all the path from tte synset token
            for p in path: # iterate each path
                for syn in p: # each synset in path
                    if syn in ps:
                        print(token.text)
                        cl[i] = cl[i].replace(token.text, "person")
                        break
                break
print(cl)

['A woman on waterskis is towed across a lake. ', 'A young lady rides water skis on a lake', 'A woman in a blue vest is water skiing', 'A young woman in a bikini and life jacket waterskiing.', 'A woman in a bikini riding water skis while being towed by a boat.']
woman
lady
woman
woman
woman
['A person on waterskis is towed across a lake. ', 'A young person rides water skis on a lake', 'A person in a blue vest is water skiing', 'A young person in a bikini and life jacket waterskiing.', 'A person in a bikini riding water skis while being towed by a boat.']


In [71]:
captions2 = [nlp(c) for c in cl]

In [72]:
for c in captions2:
    for sent in c.sents:
        print(to_nltk_tree(sent.root).pretty_print())

        towed                        
  ________|______________________     
 |   |        person           across
 |   |     _____|________        |    
 |   |    |              on     lake 
 |   |    |              |       |    
 is  .    A          waterskis   a   

None
           rides           
       ______|__________    
      |            |    on 
      |            |    |   
    person        skis lake
  ____|______      |    |   
 A         young water  a  

None
            is             
       _____|__________     
    person             |   
  ____|_____           |    
 |          in         |   
 |          |          |    
 |         vest      skiing
 |     _____|____      |    
 A    a         blue water 

None
          person                            
  __________|_____________                   
 |    |     |             in                
 |    |     |             |                  
 |    |     |        waterskiing            
 |    |     |      _______|____

In [73]:
for c in captions:
    for sent in c.sents:
        print(to_nltk_tree(sent.root).pretty_print())

        towed                       
  ________|_____________________     
 |   |        woman           across
 |   |     _____|_______        |    
 |   |    |             on     lake 
 |   |    |             |       |    
 is  .    A         waterskis   a   

None
         rides           
      _____|__________    
     |           |    on 
     |           |    |   
    lady        skis lake
  ___|_____      |    |   
 A       young water  a  

None
           is             
       ____|__________     
    woman             |   
  ____|____           |    
 |         in         |   
 |         |          |    
 |        vest      skiing
 |     ____|____      |    
 A    a        blue water 

None
          woman                            
  __________|____________                   
 |    |     |            in                
 |    |     |            |                  
 |    |     |       waterskiing            
 |    |     |     _______|________          
 |    |     |    |   

In [67]:
for k in list(data['train2014'].keys())[:10]:
    print(k)
    print(data['train2014'][k]["tags"])
    print(data['train2014'][k]["captions"])
    print("")

352211
['woman', 'water', 'bikini', 'ski', 'lake', 'lady', 'waterski', 'vest', 'life', 'jacket', 'waterskiing', 'boat', 'skiing']
['A woman on waterskis is towed across a lake. ', 'A young lady rides water skis on a lake', 'A woman in a blue vest is water skiing', 'A young woman in a bikini and life jacket waterskiing.', 'A woman in a bikini riding water skis while being towed by a boat.']

123117
['man', 'tie', 'box', 'person', 'dress', 'shirt', 'neck', 'head']
['a man goofing around with a tie in a box', 'A person holding a tie in a box on his neck.', 'A man in a dress shirt holding a tie.', 'A man posing with a box he is holding up by his head', 'A man with a tie in a box.']

37358
['statue', 'horse', 'building', 'silhouette', 'sculpture', 'traffic', 'signal', 'man', 'middle', 'intersection', 'row', 'city', 'horseback', 'background']
['A sculpture of a man on a horse.', 'The statue is in the middle of the city. ', 'The silhouette of a horse is elevated near a row of tall buildings.'