This notebook demonstrates usage of different nltk Tree methods

In [1]:
from nltk.parse import corenlp

In [2]:
from nltk.tree import Tree

In [3]:
t = Tree(1, [2, Tree(3, [4]), 5])
print(t)

(1 2 (3 4) 5)


In [4]:
s = Tree.fromstring("(S (NP I) (VP (V saw) (NP him)))")
print(s)

(S (NP I) (VP (V saw) (NP him)))


In [5]:
s.pretty_print()

     S         
  ___|___       
 |       VP    
 |    ___|___   
 NP  V       NP
 |   |       |  
 I  saw     him



In [6]:
output = "(ROOT (S(NP (PRP You))(VP(VP (VBP grasp)(NP (NNS concepts))(ADVP (RB easily)))(CC and)(VP (MD may)(VP (VB become)(ADJP (JJ impatient))(PP (IN with)(NP(NP (DT those))(SBAR(WHNP (WP who))(S(VP (VBP do) (RB n't)(VP (VB learn)(ADVP (RB as) (RB quickly)))))))))))(. .)))"

In [7]:
parsetree = Tree.fromstring(output)
print(parsetree)

(ROOT
  (S
    (NP (PRP You))
    (VP
      (VP (VBP grasp) (NP (NNS concepts)) (ADVP (RB easily)))
      (CC and)
      (VP
        (MD may)
        (VP
          (VB become)
          (ADJP (JJ impatient))
          (PP
            (IN with)
            (NP
              (NP (DT those))
              (SBAR
                (WHNP (WP who))
                (S
                  (VP
                    (VBP do)
                    (RB n't)
                    (VP (VB learn) (ADVP (RB as) (RB quickly)))))))))))
    (. .)))


In [8]:
parsetree.pretty_print()

                                                   ROOT                                               
                                                    |                                                  
                                                    S                                                 
  __________________________________________________|_______________________________________________   
 |                         VP                                                                       | 
 |            _____________|________                                                                |  
 |           |             |        VP                                                              | 
 |           |             |    ____|________                                                       |  
 |           |             |   |             VP                                                     | 
 |           |             |   |     ________|____________           

In [9]:
# access children (which are also trees)
parsetree[0][1][2][1][2].pretty_print()

       PP                                      
  _____|____                                    
 |          NP                                 
 |      ____|________                           
 |     |            SBAR                       
 |     |     ________|_____                     
 |     |    |              S                   
 |     |    |              |                    
 |     |    |              VP                  
 |     |    |     _________|____                
 |     |    |    |   |          VP             
 |     |    |    |   |      ____|___            
 |     NP  WHNP  |   |     |       ADVP        
 |     |    |    |   |     |     ___|______     
 IN    DT   WP  VBP  RB    VB   RB         RB  
 |     |    |    |   |     |    |          |    
with those who   do n't  learn  as      quickly



In [10]:
parsetree[0][1][2][1][2].label()

'PP'

In [11]:
parsetree.productions()

[ROOT -> S,
 S -> NP VP .,
 NP -> PRP,
 PRP -> 'You',
 VP -> VP CC VP,
 VP -> VBP NP ADVP,
 VBP -> 'grasp',
 NP -> NNS,
 NNS -> 'concepts',
 ADVP -> RB,
 RB -> 'easily',
 CC -> 'and',
 VP -> MD VP,
 MD -> 'may',
 VP -> VB ADJP PP,
 VB -> 'become',
 ADJP -> JJ,
 JJ -> 'impatient',
 PP -> IN NP,
 IN -> 'with',
 NP -> NP SBAR,
 NP -> DT,
 DT -> 'those',
 SBAR -> WHNP S,
 WHNP -> WP,
 WP -> 'who',
 S -> VP,
 VP -> VBP RB VP,
 VBP -> 'do',
 RB -> "n't",
 VP -> VB ADVP,
 VB -> 'learn',
 ADVP -> RB RB,
 RB -> 'as',
 RB -> 'quickly',
 . -> '.']

In [12]:
def get_child_labels(t):
    """
    :param t: an nltk.tree.Tree
    :returns: list of labels of the children.
    """
    labels = []
    for child in t:
        labels.append(child.label())
        
    return labels

In [13]:
def simple_find_parallel_sentence(t):
    """
    From a sentence, finds two sub-sentences describing the same subject in parallel.

        e.g. "You grasp concepts easily and may become impatient with those who don't learn as quickly."
        can be broken down into "You grasp concepts easily" and "You may become impatient with those who don't learn as quickly."

    :param t: an nltk.tree.Tree
    :returns: 2-ple of nltk.tree.Tree
    """
    subsentences = []
    labels = get_child_labels(t)
    
    if 'NP' in labels:
        np_label = labels.index('NP')
        print('NP found.')
        if 'VP' in labels:
            index = labels.index('VP')
            labels2 = get_child_labels(t[index])
            if labels2.count('VP') >= 2:
                print('found parallel VP subssentences.')
                for child in t[index]:
                    if child.label() == 'VP':
                        subsentences.append(child.leaves())
        elif 'ADJP' in labels:
            index = labels.index('ADJP')
            labels2 = get_child_labels(t[index])
            if labels2.count('ADJP') >= 2:
                print('found parallel ADJP sub-sentences.')
                for child in t[index]:
                    if child.label() == 'ADJP':
                        subsentences.append(child.leaves())
        print('new sentences are :')
        for sent in subsentences:
            print(t[np_label].leaves(), sent)
            
        # todo: return n new trees representing the n new sentences or return n new string sentences.
        
    else:
        print('Sentence may be invalid.')
    

In [14]:
simple_find_parallel_sentence(parsetree[0])

NP found.
found parallel VP subssentences.
new sentences are :
['You'] ['grasp', 'concepts', 'easily']
['You'] ['may', 'become', 'impatient', 'with', 'those', 'who', 'do', "n't", 'learn', 'as', 'quickly']


In [15]:
parser = corenlp.CoreNLPParser(url='http://localhost:9000')

In [16]:
parsed = parser.raw_parse('The quick brown fox jumps over the lazy dog.')

In [17]:
type(parsed)

list_iterator

In [18]:
collected = []
for i in parsed:
    collected.append(i)

In [19]:
print(collected)

[Tree('ROOT', [Tree('S', [Tree('NP', [Tree('DT', ['The']), Tree('JJ', ['quick']), Tree('JJ', ['brown']), Tree('NN', ['fox'])]), Tree('VP', [Tree('VBZ', ['jumps']), Tree('PP', [Tree('IN', ['over']), Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])]), Tree('.', ['.'])])])]


In [20]:
collected[0].pretty_print()

                     ROOT                          
                      |                             
                      S                            
       _______________|__________________________   
      |                         VP               | 
      |                _________|___             |  
      |               |             PP           | 
      |               |     ________|___         |  
      NP              |    |            NP       | 
  ____|__________     |    |     _______|____    |  
 DT   JJ    JJ   NN  VBZ   IN   DT      JJ   NN  . 
 |    |     |    |    |    |    |       |    |   |  
The quick brown fox jumps over the     lazy dog  . 



In [21]:
parsed2 = parser.raw_parse('You are hardworking and intelligent.')
collected = []
for i in parsed2:
    collected.append(i)
parsetree2 = collected[0]
parsetree2.pretty_print()

            ROOT                            
             |                               
             S                              
  ___________|____________________________   
 |                   VP                   | 
 |    _______________|___                 |  
 |   |                  ADJP              | 
 |   |        ___________|________        |  
 NP  |      ADJP         |       ADJP     | 
 |   |       |           |        |       |  
PRP VBP     VBG          CC       JJ      . 
 |   |       |           |        |       |  
You are hardworking     and  intelligent  . 



In [22]:
simple_find_parallel_sentence(parsetree2[0])

NP found.
new sentences are :


so 'You are hardworking and intelligent.' is a different syntactic structure from 'you grasp concepts easily and may become impatient with those who don't learn as quickly.'. This means i need to rewite my simple_find_parallel_sentence() function.

In [23]:
def simple_find_parallel_sentence2(t):
    """
    From a sentence, finds two sub-sentences describing the same subject in parallel.

        e.g. "You grasp concepts easily and may become impatient with those who don't learn as quickly."
        can be broken down into "You grasp concepts easily" and "You may become impatient with those who don't learn as quickly."

    :param t: an nltk.tree.Tree
    :returns: 2-ple of nltk.tree.Tree
    """
    subsentences = []
    labels = get_child_labels(t)
    
    if 'NP' in labels:
        np_label = labels.index('NP')
        print('NP found.')
        if 'VP' in labels:
            print('NP and VP found on same level.')
            vp_index = labels.index('VP')
            labels2 = get_child_labels(t[vp_index])
            if labels2.count('VP') >= 2:
                '''
                e.g.
                
                '''
                print('found parallel VP.')
                for child in t[vp_index]:
                    if child.label() == 'VP':
                        subsentences.append(child.leaves())
            elif labels2.count('ADJP') == 1: 
                print('found ADJP under VP.')
                adjp_index = labels2.index('ADJP')
                labels3 = get_child_labels(t[vp_index][adjp_index])
                if labels3.count('ADJP') >= 2:
                    '''
                    e.g.
                                ROOT                            
                                 |                               
                                 S                              
                      ___________|____________________________   
                     |                   VP                   | 
                     |    _______________|___                 |  
                     |   |                  ADJP              | 
                     |   |        ___________|________        |  
                     NP  |      ADJP         |       ADJP     | 
                     |   |       |           |        |       |  
                    PRP VBP     VBG          CC       JJ      . 
                     |   |       |           |        |       |  
                    You are hardworking     and  intelligent  . 
                    '''
                    print('found parallel ADJP')
                    for child in t[vp_index][adjp_index]:
                        if child.label() == 'ADJP':
                            subsentences.append(child.leaves())

        elif 'ADJP' in labels:
            index = labels.index('ADJP')
            labels2 = get_child_labels(t[index])
            if labels2.count('ADJP') >= 2:
                print('found parallel ADJP.')
                for child in t[index]:
                    if child.label() == 'ADJP':
                        subsentences.append(child.leaves())
        print('new sentences are :')
        for sent in subsentences:
            print(t[np_label].leaves(), sent)
            
        # todo: return n new trees representing the n new sentences or return n new string sentences.
        # todo: Rewrite code to fit more general cases. Right now, the logic is very 'hard-coded' and 'brute-force'.
        
    else:
        print('Sentence may be invalid.')
    

In [24]:
simple_find_parallel_sentence2(parsetree2[0])

NP found.
NP and VP found on same level.
found ADJP under VP.
found parallel ADJP
new sentences are :
['You'] ['hardworking']
['You'] ['intelligent']


try another potentially erroneous parse tree...

In [25]:
parsed3 = parser.raw_parse('You are not hardworking and intelligent.')
collected = []
for i in parsed3:
    collected.append(i)
parsetree3 = collected[0]
parsetree3.pretty_print()

                     ROOT                       
                      |                          
                      S                         
  ____________________|_______________________   
 |       VP                                   | 
 |    ___|____________                        |  
 |   |               ADJP                     | 
 |   |        ________|_______________        |  
 NP  |      ADJP              |      ADJP     | 
 |   |    ___|________        |       |       |  
PRP VBP  RB          VBG      CC      JJ      . 
 |   |   |            |       |       |       |  
You are not      hardworking and intelligent  . 



In [26]:
simple_find_parallel_sentence2(parsetree3[0])

NP found.
NP and VP found on same level.
found ADJP under VP.
found parallel ADJP
new sentences are :
['You'] ['not', 'hardworking']
['You'] ['intelligent']


The above sentence illustrates simple sentence ambiguity. The intended meaning of the sentence is 'you are not hardworking' and 'you are not inteligent'. This is a limitation of the parser. My sentence splitting code depends on the correctness of the parser. For now though, simple non-ambiguous cases should work with my sentence splitter. 

In [27]:
def get_parsetree(parser, sent):
    """
    Return parse tree from sentence.

    :param parser: an nltk.parse.corenlp.CoreNLPParser
    :returns: nltk.tree.Tree
    """
    parsed = parser.raw_parse(sent)
    collected = []
    for i in parsed:
        collected.append(i)
    parsetree = collected[0]
    return parsetree

In [28]:
parsetree4 = get_parsetree(parser, 'Bobby is happy and carefree.')
parsetree4.pretty_print()

           ROOT                      
            |                         
            S                        
   _________|______________________   
  |              VP                | 
  |     _________|___              |  
  NP   |            ADJP           | 
  |    |     ________|______       |  
 NNP  VBZ   JJ       CC     JJ     . 
  |    |    |        |      |      |  
Bobby  is happy     and  carefree  . 



In [29]:
simple_find_parallel_sentence2(parsetree4[0])

NP found.
NP and VP found on same level.
found ADJP under VP.
new sentences are :


JJ pairs are not accounted for. Need to modify simple_find_parallel_sentence2().

In [30]:
def simple_find_parallel_sentence3(t):
    """
    From a sentence, finds two sub-sentences describing the same subject in parallel.

        e.g. "You grasp concepts easily and may become impatient with those who don't learn as quickly."
        can be broken down into "You grasp concepts easily" and "You may become impatient with those who don't learn as quickly."

    :param t: an nltk.tree.Tree
    :returns: 2-ple of nltk.tree.Tree
    """
    subsentences = []
    labels = get_child_labels(t)
    
    if 'NP' in labels:
        np_label = labels.index('NP')
        print('NP found.')
        if 'VP' in labels:
            print('NP and VP found on same level.')
            vp_index = labels.index('VP')
            labels2 = get_child_labels(t[vp_index])
            if labels2.count('VP') >= 2:
                '''
                e.g.
                
                '''
                print('found parallel VP.')
                for child in t[vp_index]:
                    if child.label() == 'VP':
                        subsentences.append(child.leaves())
            elif labels2.count('ADJP') == 1: 
                print('found ADJP under VP.')
                adjp_index = labels2.index('ADJP')
                labels3 = get_child_labels(t[vp_index][adjp_index])
                if labels3.count('ADJP') + labels3.count('JJ') >= 2:
                    '''
                    e.g.
                                ROOT                            
                                 |                               
                                 S                              
                      ___________|____________________________   
                     |                   VP                   | 
                     |    _______________|___                 |  
                     |   |                  ADJP              | 
                     |   |        ___________|________        |  
                     NP  |      ADJP         |       ADJP     | 
                     |   |       |           |        |       |  
                    PRP VBP     VBG          CC       JJ      . 
                     |   |       |           |        |       |  
                    You are hardworking     and  intelligent  . 
                    '''
                    print('found parallel ADJP')
                    for child in t[vp_index][adjp_index]:
                        if child.label() == 'ADJP' or child.label() == 'JJ':
                            subsentences.append(child.leaves())
                            

        elif 'ADJP' in labels:
            index = labels.index('ADJP')
            labels2 = get_child_labels(t[index])
            if labels2.count('ADJP') >= 2:
                print('found parallel ADJP.')
                for child in t[index]:
                    if child.label() == 'ADJP':
                        subsentences.append(child.leaves())
        print('new sentences are :')
        for sent in subsentences:
            print(t[np_label].leaves(), sent)
            
        # todo: return n new trees representing the n new sentences or return n new string sentences.
        # todo: Rewrite code to fit more general cases. Right now, the logic is very 'hard-coded' and 'brute-force'.
        
    else:
        print('Sentence may be invalid.')
    

In [31]:
simple_find_parallel_sentence3(parsetree4[0])

NP found.
NP and VP found on same level.
found ADJP under VP.
found parallel ADJP
new sentences are :
['Bobby'] ['happy']
['Bobby'] ['carefree']


parsetree4 works now.

In [32]:
parsetree5 = get_parsetree(parser, 'Bobby is unusually happy and carefree.')
parsetree5.pretty_print()
simple_find_parallel_sentence3(parsetree5[0])

                     ROOT                 
                      |                    
                      S                   
   ___________________|_________________   
  |           VP                        | 
  |     ______|_______                  |  
  NP   |             ADJP               | 
  |    |       _______|__________       |  
 NNP  VBZ     RB      JJ   CC    JJ     . 
  |    |      |       |    |     |      |  
Bobby  is unusually happy and carefree  . 

NP found.
NP and VP found on same level.
found ADJP under VP.
found parallel ADJP
new sentences are :
['Bobby'] ['happy']
['Bobby'] ['carefree']


In [33]:
parsetree6 = get_parsetree(parser, "Bobby is carefree and one of the most enigmatic persons I've known.")
parsetree6.pretty_print()
simple_find_parallel_sentence3(parsetree6[0])

                                      ROOT                                                  
                                       |                                                     
                                       S                                                    
        _______________________________|__________________________________________________   
       |                |                                S                                | 
       |                |         _______________________|______________________          |  
       |                |      NP-TMP                                   |       |         | 
       |                |    ____|_____                                 |       |         |  
       S                |   |          PP                               |       |         | 
   ____|___             |   |     _____|____                            |       |         |  
  |        VP           |   |    |          NP                   

The parser already detects 2 sentence - let's modify simple_find_parallel_sentences3() to cover this case.

In [34]:
def simple_find_parallel_sentence4(t):
    """
    From a sentence, finds two sub-sentences describing the same subject in parallel.

        e.g. "You grasp concepts easily and may become impatient with those who don't learn as quickly."
        can be broken down into "You grasp concepts easily" and "You may become impatient with those who don't learn as quickly."

    :param t: an nltk.tree.Tree
    :returns: 2-ple of nltk.tree.Tree
    """
    subsentences = []
    labels = get_child_labels(t)
    
    if labels.count('S') >= 2:
        print(">=2 'S' found on same level.")
        for child in t:
            if child.label() == 'S':
                subsentences.append(child.leaves())
        print('new sentences are :')
        for sent in subsentences:
            print(sent)
    elif 'NP' in labels:
        np_label = labels.index('NP')
        print('NP found.')
        if 'VP' in labels:
            print('NP and VP found on same level.')
            vp_index = labels.index('VP')
            labels2 = get_child_labels(t[vp_index])
            if labels2.count('VP') >= 2:
                print('found parallel VP.')
                for child in t[vp_index]:
                    if child.label() == 'VP':
                        subsentences.append(child.leaves())
            elif labels2.count('ADJP') == 1: 
                print('found ADJP under VP.')
                adjp_index = labels2.index('ADJP')
                labels3 = get_child_labels(t[vp_index][adjp_index])
                if labels3.count('ADJP') + labels3.count('JJ') >= 2:
                    '''
                    e.g.
                                ROOT                            
                                 |                               
                                 S                              
                      ___________|____________________________   
                     |                   VP                   | 
                     |    _______________|___                 |  
                     |   |                  ADJP              | 
                     |   |        ___________|________        |  
                     NP  |      ADJP         |       ADJP     | 
                     |   |       |           |        |       |  
                    PRP VBP     VBG          CC       JJ      . 
                     |   |       |           |        |       |  
                    You are hardworking     and  intelligent  . 
                    '''
                    print('found parallel ADJP')
                    for child in t[vp_index][adjp_index]:
                        if child.label() == 'ADJP' or child.label() == 'JJ':
                            subsentences.append(child.leaves())
                            

        elif 'ADJP' in labels:
            index = labels.index('ADJP')
            labels2 = get_child_labels(t[index])
            if labels2.count('ADJP') >= 2:
                print('found parallel ADJP.')
                for child in t[index]:
                    if child.label() == 'ADJP':
                        subsentences.append(child.leaves())
        print('new sentences are :')
        for sent in subsentences:
            print(t[np_label].leaves(), sent)
            
        # todo: return n new trees representing the n new sentences or return n new string sentences.
        # todo: Rewrite code to fit more general cases. Right now, the logic is very 'hard-coded' and 'brute-force'.
        
    else:
        print('Sentence may be invalid.')

In [35]:
simple_find_parallel_sentence4(parsetree6[0])

>=2 'S' found on same level.
new sentences are :
['Bobby', 'is', 'carefree']
['one', 'of', 'the', 'most', 'enigmatic', 'persons', 'I', "'ve", 'known']


In [36]:
parsetree7 = get_parsetree(parser, "Bobby is carefree and he is one of the most enigmatic persons I've known.")
parsetree7.pretty_print()
simple_find_parallel_sentence4(parsetree7[0])

                                       ROOT                                                  
                                        |                                                     
                                        S                                                    
        ________________________________|__________________________________________________   
       |                |       S                                                          | 
       |                |    ___|___                                                       |  
       |                |   |       VP                                                     | 
       |                |   |    ___|________                                              |  
       |                |   |   |            NP                                            | 
       |                |   |   |    ________|___________                                  |  
       |                |   |   |   |                  

The limitation of this particular case is that the parse tree does not identify the common subject. Although upon reading it, the human reader will be able to identify the common subject.

Now that we have modified the code to cover a few common cases, let's test the function on a set of new sentences.

In [37]:
parsetreex = get_parsetree(parser, "Bobby is an amazing father and close friend.")
parsetreex.pretty_print()
simple_find_parallel_sentence4(parsetreex[0])

parsetreex = get_parsetree(parser, "If you have aries in rising, you tend to be loud and superficial.")
parsetreex.pretty_print()
simple_find_parallel_sentence4(parsetreex[0])

                       ROOT                          
                        |                             
                        S                            
   _____________________|__________________________   
  |        VP                                      | 
  |     ___|____________                           |  
  |    |                NP                         | 
  |    |          ______|_______________           |  
  NP   |         NP           |         NP         | 
  |    |    _____|______      |     ____|____      |  
 NNP  VBZ  DT    JJ     NN    CC   JJ        NN    . 
  |    |   |     |      |     |    |         |     |  
Bobby  is  an amazing father and close     friend  . 

NP found.
NP and VP found on same level.
new sentences are :
                                          ROOT                                           
                                           |                                              
                                           S     

Clearly, there are a lot more structures that the simple sentence splitter does not cover. Syntax is complex indeed. 

In [38]:
parsetreex = get_parsetree(parser, "You are quick to grasp new concepts and equally quick to lose interest in an idea or project once your curiosity has been satisfied.")
parsetreex._repr_png_()
simple_find_parallel_sentence4(parsetreex[0])

The Ghostscript executable isn't found.
See http://web.mit.edu/ghostscript/www/Install.htm
If you're using a Mac, you can try installing
https://docs.brew.sh/Installation then `brew install ghostscript`


LookupError: 

In [None]:
text = "If you have Mercury in Gemini, you have an extremely adaptable, energetic, active, alert, curious, and versatile mind."
parsetreex = get_parsetree(parser, text)
parsetreex._repr_png_()
simple_find_parallel_sentence4(parsetreex[0])