In [1]:
from nltk.parse import corenlp
from nltk.tree import Tree

In [2]:
# Remember to start Stanford CoreNLP java server separately.
parser = corenlp.CoreNLPParser(url='http://localhost:9000')

### Key utility functions

In [3]:
def get_parsetree(parser, sent):
    """
    Return parse tree from sentence.

    :param parser: an nltk.parse.corenlp.CoreNLPParser
    :returns: nltk.tree.Tree
    """
    parsed = parser.raw_parse(sent)
    collected = []
    for i in parsed:
        collected.append(i)
    parsetree = collected[0]
    return parsetree

In [4]:
def get_child_labels(t):
    """
    :param t: an nltk.tree.Tree
    :returns: list of labels of the children.
    """
    labels = []
    for child in t:
        labels.append(child.label())
        
    return labels

In [5]:
def test(parser, func, text, pprint=False, expected=None):
    print('input:'), 
    print(text, '\n')
    parsetreex = get_parsetree(parser, text)
    if pprint == True:
        parsetreex.pretty_print()
    
    result = func(parsetreex[0])
    print('output:') 
    print(result)
    print()
    if expected:
        print('expected:')
        print(expected, '\n')
        if result == expected:
            print('result: PASSING\n')
        else:
            print('result: FAILING\n')
            parsetreex.pretty_print()
    print('\n')

### Functions for `simple_find_parallel_sentence8`

In [35]:
def find_sub(t):
    """
    :param t: an nltk.tree.Tree
    :returns: list. a list representing parallel subcomponents.
    """
    # todo: take ordering into account. 
    
    commons_lh = []
    commons_rh = []
    subcomponents = []
    
    child_labels = get_child_labels(t)
    if t.height() > 2 and 'CC' in child_labels:
        # when height == 2, we only have leaves (string type) left in the tree.
        cc_index = child_labels.index('CC')
        for child in t[0:cc_index]:
            if child.label() in ['ADJP', 'JJ', 'VP', 'VB', 'NP', 'NN']:
                subcomponents.append(child.leaves())
            else:
                commons_lh.append(child.leaves())
        for child in t[cc_index:len(t)]:
            if child.label() in ['ADJP', 'JJ', 'VP', 'VB', 'NP', 'NN']:
                subcomponents.append(child.leaves())
            else:
                commons_rh.append(child.leaves())
                
    else:
        for child in t:
            if type(child) == Tree:
                results = find_sub(child)
                commons_lh.extend(results[0])
                subcomponents.extend(results[1])
                commons_rh.extend(results[2])
                #print(subcomponents)
            else:
                # a leaf (a string type)
                commons_lh.append(child)
                
    return commons_lh, subcomponents, commons_rh 

In [36]:
def simple_find_parallel_sentence8(t):
    """
    From a sentence, finds, generates and prints parallel sub-sentences describing the same subject, if they exist.

        e.g. "You grasp concepts easily and may become impatient with those who don't learn as quickly."
        can be broken down into "You grasp concepts easily" and "You may become impatient with those who don't learn as quickly."
    
    Looks for parallel noun phrases (NP, NN), parallel verbs (VP, V) and parallel adjectives (ADJP, JJ).
    Parallel phrases means these phrases are children of the same node.
    Also takes parallel sentences (S) into account.
    Also takes conditions ("if" or SBAR) into account.
    
    This iteration attempts to use a recursive approach to find the parallel parts once it finds the first 
    VP co-occuring (parallel) with an NP.
    
    :param t: an nltk.tree.Tree
    :returns: list of simpler sentences.
    """
    preconditions = []  # conditions. e.g. "if you are x".
    qualifiers = []  # phrases that qualify the parallel VPs.
    subsentences = []
    final_sents = []
    
    labels = get_child_labels(t)
    
    for child in t:
        if child.label() == 'SBAR':
            #print('precondition found.')
            preconditions.extend(child.leaves())
    
    if labels.count('S') >= 2:
        #print(">=2 'S' found on same level.")
        for child in t:
            if child.label() == 'S':
                subsentences.append(child.leaves())
        #print('new sentences are :')
        for sent in subsentences:
            final_sent_arr = []
            final_sent_arr.extend(preconditions)
            #final_sent_arr.extend(t[np_label].leaves()) # skipped
            final_sent_arr.extend(qualifiers)
            final_sent_arr.extend(sent)
            final_sent = " ".join(final_sent_arr) + '.'
            final_sents.append(final_sent)
        return final_sents
    
    elif 'NP' in labels:
        np_label = labels.index('NP')
        #print('NP found.')
        
        if 'VP' in labels:
            #print('NP and VP found on same level (under same node).')
            vp_index = labels.index('VP')
            labels2 = get_child_labels(t[vp_index])
            
            # if a child of the first VP is not VP, NP or ADJP, it is likely some qualifier (e.g. a VBP)?  
            #for child in t[vp_index]:
            #    if child.label() not in ['VP', 'NP', 'ADJP', 'S']:
            #        qualifiers.append(child.leaves())
            
            # check if 'CC' exists; todo: refactor?.
            leaves_pos = t[vp_index].pos()
            cc_exists = False
            for pair in leaves_pos:
                if pair[1] in ['CC']:
                    cc_exists = True
                    break
                else: 
                    cc_exists == False
            
            # recurse until 'CC' child is found or leaf is reached, then return and accumulate any parallel parts found.  
            if cc_exists:
                results = find_sub(t[vp_index]) # recursive step.
                qualifiers.extend(results[0])
                subsentences.extend(results[1])

                #print('new sentences are :')
                for sent in subsentences:
                    final_sent_arr = []
                    final_sent_arr.extend(preconditions)
                    final_sent_arr.extend(t[np_label].leaves())
                    final_sent_arr.extend(qualifiers)
                    final_sent_arr.extend(sent)
                    final_sent = " ".join(final_sent_arr) + '.'
                    final_sents.append(final_sent)
                    #print(final_sent)
                #print()
                return final_sents
            else:
                Exception('Sentence not decomposable.')
        
    else:
        raise Exception('Sentence structure not covered by function.')

### Test `simple_find_parallel_sentence_8` with standard test cases.

In [37]:
'''
TEST CASES - simple
'''

# parallel NN
text = "Bobby is a good father and great friend."
expected = [
    'Bobby is a good father.',
    'Bobby is a great friend.'
]
test(parser, simple_find_parallel_sentence8, text, expected=expected)

# parallel "negative" NN 
text = "Bobby is not a good father and great friend."
expected = [
    'Bobby is not a good father.',
    'Bobby is not a great friend.'
]
test(parser, simple_find_parallel_sentence8, text, expected=expected)

# parallel ADJP -> PASS
text = "Bobby is very handsome and super smart"
expected = [
    'Bobby is very handsome.',
    'Bobby is super smart.'
]
test(parser, simple_find_parallel_sentence8, text, expected=expected)

# parallel ADJP AND JJ 
text = "Bobby is very handsome and smart"
expected = [
    'Bobby is very handsome.',
    'Bobby is smart.'
]
test(parser, simple_find_parallel_sentence8, text, expected=expected)

# parallel VP 
text = "You are quick to grasp new concepts and equally quick to lose interest in an idea or project once your curiosity has been satisfied."
expected = ['You are quick to grasp new concepts.',
            'You are equally quick to lose interest in an idea or project once your curiosity has been satisfied.'
]
test(parser, simple_find_parallel_sentence8, text, expected=expected)


'''
MORE TEST CASES - with conditional statements.
'''

# conditional and 2 adjectives
text = "If you have aries in rising, you are loud and superficial."
expected = [
    'If you have aries in rising, you are loud.',
    'If you have aries in rising, you are superficial.'
]
test(parser, simple_find_parallel_sentence8, text, expected=expected)

# conditional and 2 "nested" adjectives
text = "If you have aries in rising, you tend to be loud and superficial"
expected = [
    'If you have aries in rising, you tend to be loud.',
    'If you have aries in rising, you tend to be superficial.'
]
test(parser, simple_find_parallel_sentence8, text, expected=expected)

input:
Bobby is a good father and great friend. 

output:
['Bobby is a good father.', 'Bobby is great friend.']

expected:
['Bobby is a good father.', 'Bobby is a great friend.'] 

result: FAILING

                    ROOT                          
                     |                             
                     S                            
   __________________|__________________________   
  |        VP                                   | 
  |     ___|_________                           |  
  |    |             NP                         | 
  |    |        _____|_______________           |  
  NP   |       NP          |         NP         | 
  |    |    ___|_____      |     ____|____      |  
 NNP  VBZ  DT  JJ    NN    CC   JJ        NN    . 
  |    |   |   |     |     |    |         |     |  
Bobby  is  a  good father and great     friend  . 



input:
Bobby is not a good father and great friend. 

output:
['Bobby is not a good father.', 'Bobby is not great friend.']

expec

In [38]:
'''
MORE TEST CASES (2)
'''

text = "You strive for perfection and can be quite the person to live with or to be around because your standards for yourself and others are so high."
expected = [
    'You strive for perfection.',
    'You can be quite the person to live with or to be around because your standards for yourself and others are so high.'
]
test(parser, simple_find_parallel_sentence8, text, expected=expected)

# conditional and 2 "nested" adjectives
text = "Your senses and emotions are quite strong and it would be wise for you to listen to your intuitive side as you can be quite psychic."
expected = [
    'Your senses and emotions are quite strong.',
    'it would be wise for you to listen to your intuitive side as you can be quite psychic.'
]
test(parser, simple_find_parallel_sentence8, text, expected=expected)

input:
You strive for perfection and can be quite the person to live with or to be around because your standards for yourself and others are so high. 

output:
['You strive for perfection.', 'You can be quite the person to live with or to be around because your standards for yourself and others are so high.']

expected:
['You strive for perfection.', 'You can be quite the person to live with or to be around because your standards for yourself and others are so high.'] 

result: PASSING



input:
Your senses and emotions are quite strong and it would be wise for you to listen to your intuitive side as you can be quite psychic. 

output:
['Your senses and emotions are quite strong.', 'it would be wise for you to listen to your intuitive side as you can be quite psychic.']

expected:
['Your senses and emotions are quite strong.', 'it would be wise for you to listen to your intuitive side as you can be quite psychic.'] 

result: PASSING





In [39]:
'''
MORE TEST CASES (3)
'''

# condition appearing after adjective phrase.
text = "You are graceful and poised because you are calm and confident."
expected = [
    'You are graceful because you are calm and confident.',
    'You are poised because you are calm and confident.'
]
test(parser, simple_find_parallel_sentence8, text, expected=expected)


input:
You are graceful and poised because you are calm and confident. 

output:
['You are because you are graceful.', 'You are because you are poised.', 'You are because you are calm.', 'You are because you are confident.']

expected:
['You are graceful because you are calm and confident.', 'You are poised because you are calm and confident.'] 

result: FAILING

                               ROOT                                      
                                |                                         
                                S                                        
  ______________________________|______________________________________   
 |                              VP                                     | 
 |    __________________________|_________                             |  
 |   |            |                      SBAR                          | 
 |   |            |              _________|____                        |  
 |   |            |             |     

In [40]:
'''
MORE TEST CASES (4) - from the example astrology dataset.
'''

# 
text = "Comfort is important to you and you hate getting your hands dirty."
test(parser, simple_find_parallel_sentence8, text)

text = "Debate and argument appeal to you."
test(parser, simple_find_parallel_sentence8, text)

text = "Guard against possessiveness, jealousy, and taking the easy way out in your relationships."
test(parser, simple_find_parallel_sentence8, text)

text = "People with Virgo rising tend to be practical, analytical, discriminating, fastidious, careful, exacting, attentive to details, methodical, quiet, unassuming, shy, critical, thoughtful, and somewhat self-centered."
test(parser, simple_find_parallel_sentence8, text)

text = "You aim to please in practical and earthy ways."
test(parser, simple_find_parallel_sentence8, text)


input:
Comfort is important to you and you hate getting your hands dirty. 

output:
['Comfort is important to you.', 'you hate getting your hands dirty.']



input:
Debate and argument appeal to you. 

output:
None



input:
Guard against possessiveness, jealousy, and taking the easy way out in your relationships. 

output:
None



input:
People with Virgo rising tend to be practical, analytical, discriminating, fastidious, careful, exacting, attentive to details, methodical, quiet, unassuming, shy, critical, thoughtful, and somewhat self-centered. 

output:
['People with Virgo rising tend to be self-centered practical.', 'People with Virgo rising tend to be self-centered ,.', 'People with Virgo rising tend to be self-centered analytical.', 'People with Virgo rising tend to be self-centered ,.', 'People with Virgo rising tend to be self-centered discriminating.', 'People with Virgo rising tend to be self-centered ,.', 'People with Virgo rising tend to be self-centered fastidious.', 'Pe

### Compare `simple_find_parallel_sentence7` with `simple_find_parallel_sentence8`

In [41]:
def simple_find_parallel_sentence7(t):
    """
    From a sentence, finds the earliest parallel sub-sentences describing the same subject, if it exists.

        e.g. "You grasp concepts easily and may become impatient with those who don't learn as quickly."
        can be broken down into "You grasp concepts easily" and "You may become impatient with those who don't learn as quickly."
    
    Takes into account parallel noun phrases (NP, NN), parallel verbs (VP, V) and parallel adjectives (ADJP, JJ).
    Also takes parallel sentences (S) into account .
    Also takes conditions ("if" or SBAR) into account.
    
    :param t: an nltk.tree.Tree
    :returns: 2-ple of nltk.tree.Tree
    """
    preconditions = []  # conditions. e.g. "if you are x".
    qualifiers = []  # phrases that qualify the subsentences.
    subsentences = []
    
    labels = get_child_labels(t)
    
    final_sents = []
    for child in t:
        if child.label() == 'SBAR':
            #print('precondition found.')
            preconditions.extend(child.leaves())
    
    if labels.count('S') >= 2:
        #print(">=2 'S' found on same level.")
        for child in t:
            if child.label() == 'S':
                subsentences.append(child.leaves())
        #print('new sentences are :')
        for sent in subsentences:
            #print(preconditions, sent)
            final_sent_arr = []
            final_sent_arr.extend(preconditions)
            #final_sent_arr.extend(t[np_label].leaves())  # skipped.
            final_sent_arr.extend(qualifiers)
            final_sent_arr.extend(sent)
            final_sent = " ".join(final_sent_arr) + '.'
            final_sents.append(final_sent)
        return final_sents
    
    elif 'NP' in labels:
        np_label = labels.index('NP')
        #print('NP found.')
        if 'VP' in labels:
            #print('NP and VP found on same level.')
            vp_index = labels.index('VP')
            labels2 = get_child_labels(t[vp_index])
            
            # if a child of the first VP is not VP, NP or ADJP, it is likely some qualifier?  
            for child in t[vp_index]:
                if child.label() not in ['VP', 'NP', 'ADJP']:
                    qualifiers.extend(child.leaves())
            
            if labels2.count('VP') >= 2:
                #print('found parallel VP.')
                for child in t[vp_index]:
                    if child.label() == 'VP':
                        subsentences.append(child.leaves())
                    
            elif labels2.count('ADJP') == 1: 
                #print('found ADJP under VP.')
                adjp_index = labels2.index('ADJP')
                labels3 = get_child_labels(t[vp_index][adjp_index])
                if labels3.count('ADJP') + labels3.count('JJ') >= 2:
                    #print('found parallel ADJP')
                    for child in t[vp_index][adjp_index]:
                        if child.label() == 'ADJP' or child.label() == 'JJ':
                            subsentences.append(child.leaves())

            elif labels2.count('NP') == 1:
                #print('found NP under VP.')
                np_index = labels2.index('NP')
                labels3 = get_child_labels(t[vp_index][np_index])
                if labels3.count('NP') + labels3.count('NN') >= 2:
                    #print('found parallel NP')
                    for child in t[vp_index][np_index]:
                        if child.label() == 'NP' or child.label() == 'NN':
                            subsentences.append(child.leaves())

        elif 'ADJP' in labels:
            index = labels.index('ADJP')
            labels2 = get_child_labels(t[index])
            if labels2.count('ADJP') >= 2:
                #print('found parallel ADJP.')
                for child in t[index]:
                    if child.label() == 'ADJP':
                        subsentences.append(child.leaves())
        
        #print('new sentences are :')
        for sent in subsentences:
            #print(preconditions, t[np_label].leaves(), qualifiers, sent)
            final_sent_arr = []
            final_sent_arr.extend(preconditions)
            final_sent_arr.extend(t[np_label].leaves())
            final_sent_arr.extend(qualifiers)
            final_sent_arr.extend(sent)
            final_sent = " ".join(final_sent_arr) + '.'
            final_sents.append(final_sent)
        return final_sents
        #print()
            
        # todo: return n new trees representing the n new sentences or return n new string sentences.
        # todo: Rewrite code to fit more general cases. Right now, the logic is very 'hard-coded'.
        # todo: consider the possibility of a recursive algo.
        
    else:
        raise exception('Sentence structure not covered by function.')

### Standard test cases applied to `simple_find_parallel_sentence7`

In [42]:
'''
TEST CASES - simple
'''

# parallel NN
text = "Bobby is a good father and great friend."
expected = [
    'Bobby is a good father.',
    'Bobby is a great friend.'
]
test(parser, simple_find_parallel_sentence7, text, expected=expected)

# parallel "negative" NN 
text = "Bobby is not a good father and great friend."
expected = [
    'Bobby is not a good father.',
    'Bobby is not a great friend.'
]
test(parser, simple_find_parallel_sentence7, text, expected=expected)

# parallel ADJP -> PASS
text = "Bobby is very handsome and super smart"
expected = [
    'Bobby is very handsome.',
    'Bobby is super smart.'
]
test(parser, simple_find_parallel_sentence7, text, expected=expected)

# parallel ADJP AND JJ 
text = "Bobby is very handsome and smart"
expected = [
    'Bobby is very handsome.',
    'Bobby is smart.'
]
test(parser, simple_find_parallel_sentence7, text, expected=expected)

# parallel VP 
text = "You are quick to grasp new concepts and equally quick to lose interest in an idea or project once your curiosity has been satisfied."
expected = ['You are quick to grasp new concepts.',
            'You are equally quick to lose interest in an idea or project once your curiosity has been satisfied.'
]
test(parser, simple_find_parallel_sentence7, text, expected=expected)


'''
MORE TEST CASES - with conditional statements.
'''

# conditional and 2 adjectives
text = "If you have aries in rising, you are loud and superficial."
expected = [
    'If you have aries in rising, you are loud.',
    'If you have aries in rising, you are superficial.'
]
test(parser, simple_find_parallel_sentence7, text, expected=expected)

# conditional and 2 "nested" adjectives
text = "If you have aries in rising, you tend to be loud and superficial"
expected = [
    'If you have aries in rising, you tend to be loud.',
    'If you have aries in rising, you tend to be superficial.'
]
test(parser, simple_find_parallel_sentence7, text, expected=expected)

input:
Bobby is a good father and great friend. 

output:
['Bobby is a good father.', 'Bobby is great friend.']

expected:
['Bobby is a good father.', 'Bobby is a great friend.'] 

result: FAILING

                    ROOT                          
                     |                             
                     S                            
   __________________|__________________________   
  |        VP                                   | 
  |     ___|_________                           |  
  |    |             NP                         | 
  |    |        _____|_______________           |  
  NP   |       NP          |         NP         | 
  |    |    ___|_____      |     ____|____      |  
 NNP  VBZ  DT  JJ    NN    CC   JJ        NN    . 
  |    |   |   |     |     |    |         |     |  
Bobby  is  a  good father and great     friend  . 



input:
Bobby is not a good father and great friend. 

output:
['Bobby is not a good father.', 'Bobby is not great friend.']

expec

In [43]:
'''
MORE TEST CASES (3)
'''

# condition appearing after adjective phrase.
text = "You are graceful and poised because you are calm and confident."
expected = [
    'You are graceful because you are calm and confident.',
    'You are poised because you are calm and confident.'
]
test(parser, simple_find_parallel_sentence7, text, expected=expected)

'''
MORE TEST CASES (4) - from the example astrology dataset.
'''

text = "Comfort is important to you and you hate getting your hands dirty."
test(parser, simple_find_parallel_sentence7, text)

text = "Debate and argument appeal to you."
test(parser, simple_find_parallel_sentence7, text)

text = "Guard against possessiveness, jealousy, and taking the easy way out in your relationships."
test(parser, simple_find_parallel_sentence7, text)

text = "People with Virgo rising tend to be practical, analytical, discriminating, fastidious, careful, exacting, attentive to details, methodical, quiet, unassuming, shy, critical, thoughtful, and somewhat self-centered."
test(parser, simple_find_parallel_sentence7, text)

text = "You aim to please in practical and earthy ways."
test(parser, simple_find_parallel_sentence7, text)


input:
You are graceful and poised because you are calm and confident. 

output:
[]

expected:
['You are graceful because you are calm and confident.', 'You are poised because you are calm and confident.'] 

result: FAILING

                               ROOT                                      
                                |                                         
                                S                                        
  ______________________________|______________________________________   
 |                              VP                                     | 
 |    __________________________|_________                             |  
 |   |            |                      SBAR                          | 
 |   |            |              _________|____                        |  
 |   |            |             |              S                       | 
 |   |            |             |      ________|____                   |  
 |   |            |           

In [6]:
def find_sub(t):
    """
    :param t: an nltk.tree.Tree
    :returns: list. a list representing parallel subcomponents.
    """
    # todo: take ordering into account. 
    
    commons_lh = []
    commons_rh = []
    subcomponents = []
    
    if t.height() > 2:
        child_labels = get_child_labels(t)
        if 'CC' in child_labels:
            # when height == 2, we only have leaves (string type) left in the tree.
            cc_index = child_labels.index('CC')
            for child in t[0:cc_index]:
                if child.label() in ['ADJP', 'JJ', 'VP', 'VB', 'NP', 'NN']:
                    subcomponents.append(child.leaves())
                elif child.label() not in [',']:
                    commons_lh.extend(child.leaves())
            for child in t[cc_index+1:len(t)]:
                if child.label() in ['ADJP', 'JJ', 'VP', 'VB', 'NP', 'NN']:
                    subcomponents.append(child.leaves())
                elif child.label() not in [',']:
                    commons_rh.extend(child.leaves())
                
        else:
            for child in t:
                if type(child) == Tree:
                    results = find_sub(child)
                    commons_lh.extend(results[0])
                    subcomponents.extend(results[1])
                    commons_rh.extend(results[2])
                    #print(subcomponents)
                else:
                    # a leaf (a string type)
                    commons_lh.append(child)
    else:
        for child in t:
            # a leaf (a string type)
            commons_lh.append(child)
                
    return commons_lh, subcomponents, commons_rh

In [7]:
def simple_find_parallel_sentence9(t):
    """
    From a sentence, finds, generates and prints parallel sub-sentences describing the same subject, if they exist.

        e.g. "You grasp concepts easily and may become impatient with those who don't learn as quickly."
        can be broken down into "You grasp concepts easily" and "You may become impatient with those who don't learn as quickly."
    
    Looks for parallel noun phrases (NP, NN), parallel verbs (VP, V) and parallel adjectives (ADJP, JJ).
    Parallel phrases means these phrases are children of the same node.
    Also takes parallel sentences (S) into account.
    Also takes conditions ("if" or SBAR) into account.
    
    This iteration attempts to use a recursive approach to find the parallel parts once it finds the first 
    VP co-occuring (parallel) with an NP.
    
    :param t: an nltk.tree.Tree
    :returns: list of simpler sentences.
    """
    preconditions = []  # conditions. e.g. "if you are x".
    commons_lh = []  # phrases that co-occur at the left of component
    components = [] 
    commons_rh = []  # phrases that co-occur at the right of component
    final_sents = []
    
    labels = get_child_labels(t)
    
    for child in t:
        if child.label() == 'SBAR':
            #print('precondition found.')
            preconditions.extend(child.leaves())
    
    if labels.count('S') >= 2:
        #print(">=2 'S' found on same level.")
        for child in t:
            if child.label() == 'S':
                components.append(child.leaves())
        #print('new sentences are :')
        for sent in components:
            final_sent_arr = []
            final_sent_arr.extend(preconditions)
            #final_sent_arr.extend(t[np_label].leaves()) # skipped
            final_sent_arr.extend(commons_lh)
            final_sent_arr.extend(sent)
            final_sent = " ".join(final_sent_arr) + '.'
            final_sents.append(final_sent)
        return final_sents
    
    elif 'NP' in labels:
        np_label = labels.index('NP')
        #print('NP found.')
        
        if 'VP' in labels:
            #print('NP and VP found on same level (under same node).')
            vp_index = labels.index('VP')
            labels2 = get_child_labels(t[vp_index])
            
            # if a child of the first VP is not VP, NP or ADJP, it is likely some qualifier (e.g. a VBP)?  
            #for child in t[vp_index]:
            #    if child.label() not in ['VP', 'NP', 'ADJP', 'S']:
            #        commons_lh.append(child.leaves())
            
            # check if 'CC' exists; todo: refactor?.
            leaves_pos = t[vp_index].pos()
            cc_exists = False
            for pair in leaves_pos:
                if pair[1] in ['CC']:
                    cc_exists = True
                    break
                else: 
                    cc_exists == False
            
            # recurse until 'CC' child is found or leaf is reached, then return and accumulate any parallel parts found.  
            if cc_exists:
                results = find_sub(t[vp_index]) # recursive step.
                commons_lh.extend(results[0])
                components.extend(results[1])
                commons_rh.extend(results[2])

                #print('new sentences are :')
                for sent in components:
                    final_sent_arr = []
                    final_sent_arr.extend(preconditions)
                    final_sent_arr.extend(t[np_label].leaves())
                    final_sent_arr.extend(commons_lh)
                    final_sent_arr.extend(sent)
                    final_sent_arr.extend(commons_rh)
                    
                    final_sent = " ".join(final_sent_arr) + '.'
                    final_sents.append(final_sent)
                    #print(final_sent)
                #print()
                return final_sents
            else:
                Exception('Sentence not decomposable.')
        
    else:
        raise Exception('Sentence structure not covered by function.')

In [88]:
'''
MORE TEST CASES (3)
'''

# condition appearing after adjective phrase.
text = "You are graceful and poised because you are calm and confident."
expected = [
    'You are graceful because you are calm and confident.',
    'You are poised because you are calm and confident.'
]
test(parser, simple_find_parallel_sentence9, text, expected=expected)

'''
MORE TEST CASES (4) - from the example astrology dataset.
'''

text = "Comfort is important to you and you hate getting your hands dirty."
test(parser, simple_find_parallel_sentence9, text)

text = "Debate and argument appeal to you."
test(parser, simple_find_parallel_sentence9, text)

text = "Guard against possessiveness, jealousy, and taking the easy way out in your relationships."
test(parser, simple_find_parallel_sentence9, text)

text = "People with Virgo rising tend to be practical, analytical, discriminating, fastidious, careful, exacting, attentive to details, methodical, quiet, unassuming, shy, critical, thoughtful, and somewhat self-centered."
test(parser, simple_find_parallel_sentence9, text)

text = "You aim to please in practical and earthy ways."
test(parser, simple_find_parallel_sentence9, text)


input:
You are graceful and poised because you are calm and confident. 

output:
['You are because you are graceful poised.', 'You are because you are calm poised.', 'You are because you are confident poised.']

expected:
['You are graceful because you are calm and confident.', 'You are poised because you are calm and confident.'] 

result: FAILING

                               ROOT                                      
                                |                                         
                                S                                        
  ______________________________|______________________________________   
 |                              VP                                     | 
 |    __________________________|_________                             |  
 |   |            |                      SBAR                          | 
 |   |            |              _________|____                        |  
 |   |            |             |              S    

In [89]:
text = "Bob is very handsome and smart."
test(parser, simple_find_parallel_sentence9, text)

input:
Bob is very handsome and smart. 

output:
['Bob is very handsome.', 'Bob is very smart.']





In [90]:
text = "Bob is handsome and super smart."
test(parser, simple_find_parallel_sentence9, text)

input:
Bob is handsome and super smart. 

output:
['Bob is handsome.', 'Bob is super smart.']





- "You aim to please in practical and earthy ways." - solved in v9 from v8
- "Bob is very handsome and smart." - solved in v9 from v8

In [94]:
text = "People with Virgo rising tend to be practical, exacting, thoughtful, nice, hateful, good-looking, crazy, and somewhat self-centered."
test(parser, simple_find_parallel_sentence9, text, pprint=True)

input:
People with Virgo rising tend to be practical, exacting, thoughtful, nice, hateful, good-looking, crazy, and somewhat self-centered. 

                                                                                    ROOT                                                                                    
                                                                                     |                                                                                       
                                                                                     S                                                                                      
              _______________________________________________________________________|____________________________________________________________________________________   
             |                             VP                                                                                                                       

In [None]:
# This case fails because 'pretty' gets parsed as an adverb instead of a co-occuring adjective.
text = "People with Virgo rising tend to be practical, exacting, thoughtful, nice, hateful, pretty, crazy, and somewhat self-centered."

# This shows that the splitter may fail if the parse also fails to give the best parse.

In [81]:
text = "Bob is incredibly talented and he cares lovingly for the plants he takes care of."
test(parser, simple_find_parallel_sentence9, text)

input:
Bob is incredibly talented and he cares lovingly for the plants he takes care of. 

output:
['Bob is incredibly talented.', 'he cares lovingly for the plants he takes care of.']





In [82]:
text = "People with virgo rising tend to be hot-headed and passionate."
test(parser, simple_find_parallel_sentence9, text)

input:
People with virgo rising tend to be hot-headed and passionate. 

output:
['People with virgo rising tend to be hot-headed.', 'People with virgo rising tend to be passionate.']





In [9]:
'''
TEST CASES - simple
'''

# parallel NN
text = "Bobby is a good father and great friend."
expected = [
    'Bobby is a good father.',
    'Bobby is a great friend.'
]
test(parser, simple_find_parallel_sentence9, text, expected=expected)

# parallel "negative" NN 
text = "Bobby is not a good father and great friend."
expected = [
    'Bobby is not a good father.',
    'Bobby is not a great friend.'
]
test(parser, simple_find_parallel_sentence9, text, expected=expected)

# parallel ADJP -> PASS
text = "Bobby is very handsome and super smart"
expected = [
    'Bobby is very handsome.',
    'Bobby is super smart.'
]
test(parser, simple_find_parallel_sentence9, text, expected=expected)

# parallel ADJP AND JJ 
text = "Bobby is very handsome and smart"
expected = [
    'Bobby is very handsome.',
    'Bobby is very smart.'
]
test(parser, simple_find_parallel_sentence9, text, expected=expected)

# parallel VP 
text = "You are quick to grasp new concepts and equally quick to lose interest in an idea or project once your curiosity has been satisfied."
expected = ['You are quick to grasp new concepts.',
            'You are equally quick to lose interest in an idea or project once your curiosity has been satisfied.'
]
test(parser, simple_find_parallel_sentence9, text, expected=expected)


'''
MORE TEST CASES - with conditional statements.
'''

# conditional and 2 adjectives
text = "If you have aries in rising, you are loud and superficial."
expected = [
    'If you have aries in rising, you are loud.',
    'If you have aries in rising, you are superficial.'
]
test(parser, simple_find_parallel_sentence9, text, expected=expected)

# conditional and 2 "nested" adjectives
text = "If you have aries in rising, you tend to be loud and superficial"
expected = [
    'If you have aries in rising, you tend to be loud.',
    'If you have aries in rising, you tend to be superficial.'
]
test(parser, simple_find_parallel_sentence9, text, expected=expected)

input:
Bobby is a good father and great friend. 

output:
['Bobby is a good father.', 'Bobby is great friend.']

expected:
['Bobby is a good father.', 'Bobby is a great friend.'] 

result: FAILING

                    ROOT                          
                     |                             
                     S                            
   __________________|__________________________   
  |        VP                                   | 
  |     ___|_________                           |  
  |    |             NP                         | 
  |    |        _____|_______________           |  
  NP   |       NP          |         NP         | 
  |    |    ___|_____      |     ____|____      |  
 NNP  VBZ  DT  JJ    NN    CC   JJ        NN    . 
  |    |   |   |     |     |    |         |     |  
Bobby  is  a  good father and great     friend  . 



input:
Bobby is not a good father and great friend. 

output:
['Bobby is not a good father.', 'Bobby is not great friend.']

expec

In [12]:
# conditional in front, and list of adjectives
text = "If you have aries in rising, you tend to be loud, superficial, excited, and self-centred."
expected = [
    'If you have aries in rising, you tend to be loud.',
    'If you have aries in rising, you tend to be superficial.',
    'If you have aries in rising, you tend to be excited.',
    'If you have aries in rising, you tend to be self-centred.'
]
test(parser, simple_find_parallel_sentence9, text, expected=expected)

input:
If you have aries in rising, you tend to be loud, superficial, excited, and self-centred. 

output:
['If you have aries in rising you tend to be loud.', 'If you have aries in rising you tend to be superficial.', 'If you have aries in rising you tend to be excited.', 'If you have aries in rising you tend to be self-centred.']

expected:
['If you have aries in rising, you tend to be loud.', 'If you have aries in rising, you tend to be superficial.', 'If you have aries in rising, you tend to be excited.', 'If you have aries in rising, you tend to be self-centred.'] 

result: FAILING

                                                   ROOT                                                               
                                                    |                                                                  
                                                    S                                                                 
          _____________________________________

In [22]:
# condition embedded within adj phrase.
text = "You are trustworthy and reliable because you are calm and confident."
test(parser, simple_find_parallel_sentence9, text, pprint=True)

input:
You are trustworthy and reliable because you are calm and confident. 

                                    ROOT                                      
                                     |                                         
                                     S                                        
  ___________________________________|______________________________________   
 |                                   VP                                     | 
 |    _______________________________|_________                             |  
 |   |               |                        SBAR                          | 
 |   |               |                _________|____                        |  
 |   |               |               |              S                       | 
 |   |               |               |      ________|____                   |  
 |   |               |               |     |             VP                 | 
 |   |               |               |     |    

above case fails so modify algorithm.

In [7]:
def find_sub(t):
    """
    recursively finds parallel components, and regenerates subsentences. 
    
    :param t: an nltk.tree.Tree
    :returns: list. a list representing subsentences.
    """
    # todo: take ordering into account. 
    
    commons_lh = []
    subcomponents = []
    commons_rh = []
    condition_rh = []
    
    if t.height() > 2:
        child_labels = get_child_labels(t)
        if 'SBAR' in child_labels:
            sbar_index = child_labels.index('SBAR')
            condition_rh.extend(t[sbar_index].leaves())
        if 'CC' in child_labels:
            # when height == 2, we only have leaves (string type) left in the tree.
            cc_index = child_labels.index('CC')
            for child in t[0:cc_index]:
                if child.label() in ['ADJP', 'JJ', 'VP', 'VB', 'NP', 'NN']:
                    subcomponents.append(child.leaves())
                elif child.label() not in [',', 'SBAR']:
                    commons_lh.extend(child.leaves())
            for child in t[cc_index+1:len(t)]:
                if child.label() in ['ADJP', 'JJ', 'VP', 'VB', 'NP', 'NN']:
                    subcomponents.append(child.leaves())
                elif child.label() not in [',', 'SBAR']:
                    commons_rh.extend(child.leaves())
            
        else:
            for child in t:
                if type(child) == Tree:
                    if child.label() != 'SBAR':
                        results = find_sub(child)
                        commons_lh.extend(results[0])
                        subcomponents.extend(results[1])
                        commons_rh.extend(results[2])
                        condition_rh.extend(results[3])
                else:
                    # a leaf (a string type)
                    commons_lh.append(child)  # not possible to reach this statement?
    else:
        for child in t:
            # a leaf (a string type)
            if t.label() in ['NN', 'NNS']:
                commons_rh.append(child)  # nouns tend to come after adjectives.
            else:
                commons_lh.append(child)
                
    return commons_lh, subcomponents, commons_rh, condition_rh


def simple_find_parallel_sentence10(t):
    """
    From a sentence, finds, generates and prints parallel sub-sentences describing the same subject, if they exist.

        e.g. "You grasp concepts easily and may become impatient with those who don't learn as quickly."
        can be broken down into "You grasp concepts easily" and "You may become impatient with those who don't learn as quickly."
    
    Looks for parallel noun phrases (NP, NN), parallel verbs (VP, V) and parallel adjectives (ADJP, JJ).
    Parallel phrases means these phrases are children of the same node.
    Also takes parallel sentences (S) into account.
    Also takes conditions ("if" or SBAR) into account.
    
    This version attempts to use a recursive approach to find the parallel parts once it finds the first 
    VP co-occuring (parallel) with an NP.
    
    :param t: an nltk.tree.Tree
    :returns: list of simpler sentences.
    """
    preconditions = []  # conditions. e.g. "if you are x ..."
    commons_lh = []  # phrases that co-occur at the left of component
    components = [] 
    commons_rh = []  # phrases that co-occur at the right of component
    condition_rh = []  # conditions. e.g. "... because you are x."
    
    final_sents = []
    
    labels = get_child_labels(t)
    
    for child in t:
        if child.label() == 'SBAR':
            #print('precondition found.')
            preconditions.extend(child.leaves())
    
    if labels.count('S') >= 2:
        #print(">=2 'S' found on same level.")
        for child in t:
            if child.label() == 'S':
                components.append(child.leaves())
        #print('new sentences are :')
        for sent in components:
            final_sent_arr = []
            final_sent_arr.extend(preconditions)
            #final_sent_arr.extend(t[np_label].leaves()) # skipped
            final_sent_arr.extend(commons_lh)
            final_sent_arr.extend(sent)
            final_sent = " ".join(final_sent_arr) + '.'
            final_sents.append(final_sent)
        return final_sents
    
    elif 'NP' in labels:
        np_label = labels.index('NP')
        #print('NP found.')
        
        if 'VP' in labels:
            #print('NP and VP found on same level (under same node).')
            vp_index = labels.index('VP')
            labels2 = get_child_labels(t[vp_index])
            
            # if a child of the first VP is not VP, NP or ADJP, it is likely some qualifier (e.g. a VBP)?  
            #for child in t[vp_index]:
            #    if child.label() not in ['VP', 'NP', 'ADJP', 'S']:
            #        commons_lh.append(child.leaves())
            
            # check if 'CC' exists; todo: refactor?.
            leaves_pos = t[vp_index].pos()
            cc_exists = False
            for pair in leaves_pos:
                if pair[1] in ['CC']:
                    cc_exists = True
                    break
                else: 
                    cc_exists == False
            
            # recurse until 'CC' child is found or leaf is reached, then return and accumulate any parallel parts found.  
            if cc_exists:
                results = find_sub(t[vp_index]) # recursive step.
                commons_lh.extend(results[0])
                components.extend(results[1])
                commons_rh.extend(results[2])
                condition_rh.extend(results[3])

                #print('new sentences are :')
                for c in components:
                    final_sent_arr = []
                    final_sent_arr.extend(preconditions)
                    final_sent_arr.extend(t[np_label].leaves())
                    final_sent_arr.extend(commons_lh)
                    final_sent_arr.extend(c)
                    final_sent_arr.extend(commons_rh)
                    final_sent_arr.extend(condition_rh)
                    
                    final_sent = " ".join(final_sent_arr) + '.'
                    final_sents.append(final_sent)    
                #print(preconditions)
                #print(commons_lh)
                #print(commons_rh)
                #print(condition_rh)
                return final_sents
            else:
                Exception('Sentence not decomposable.')
        
    else:
        raise Exception('Sentence structure not covered by function.')

In [8]:
# condition embedded within adj phrase.
text = "You are trustworthy and reliable because you are calm and confident."
expected = [
    "You are trustworthy because you are calm and confident.",
    "You are reliable because you are calm and confident."
]
test(parser, simple_find_parallel_sentence10, text, expected=expected)

# new SBAR at front
text = "Although you are shy by nature, you do not hesitate to defend the voiceless and help the poor."
expected = ['Although you are shy by nature you do not hesitate to defend the voiceless.', 
            'Although you are shy by nature you do not hesitate to help the poor.']
test(parser, simple_find_parallel_sentence10, text, expected=expected)

input:
You are trustworthy and reliable because you are calm and confident. 

output:
['You are trustworthy because you are calm and confident.', 'You are reliable because you are calm and confident.']

expected:
['You are trustworthy because you are calm and confident.', 'You are reliable because you are calm and confident.'] 

result: PASSING



input:
Although you are shy by nature, you do not hesitate to defend the voiceless and help the poor. 

output:
['Although you are shy by nature you do not hesitate to defend the voiceless.', 'Although you are shy by nature you do not hesitate to help the poor.']

expected:
['Although you are shy by nature you do not hesitate to defend the voiceless.', 'Although you are shy by nature you do not hesitate to help the poor.'] 

result: PASSING





In [9]:
# NEW TEST CASES.

text = "I saw an ant and an elephant by the fire."
expected = ['I saw an ant by the fire.',
            'I saw an elephant by the fire.'
]
test(parser, simple_find_parallel_sentence10, text, expected=expected)


text = "You are the most wonderful and capable captain the world has ever seen."
expected = [
    "You are the most wonderful captain the world has ever seen.",
    "You are the most capable captain the world has ever seen."
]
test(parser, simple_find_parallel_sentence10, text, expected=expected)

text = "This was a wonderful and mind-opening video."
expected = [
    "This was a wonderful video.",
    "This was a mind-opening video."
]
test(parser, simple_find_parallel_sentence10, text, expected=expected)

text = "The burger was one of the most affordable and satisfying burgers I've ever had."
expected = ["The burger was one of the most affordable burgers I 've ever had.",
            "The burger was one of the most satisfying burgers I 've ever had."
]
test(parser, simple_find_parallel_sentence10, text, expected=expected)


input:
I saw an ant and an elephant by the fire. 

output:
['I saw by the an ant fire.', 'I saw by the an elephant fire.']

expected:
['I saw an ant by the fire.', 'I saw an elephant by the fire.'] 

result: FAILING

                        ROOT                                  
                         |                                     
                         S                                    
  _______________________|__________________________________   
 |                       VP                                 | 
 |    ___________________|_____________________             |  
 |   |               NP                        PP           | 
 |   |        _______|________              ___|___         |  
 NP  |       NP      |        NP           |       NP       | 
 |   |    ___|___    |    ____|_____       |    ___|___     |  
PRP VBD  DT      NN  CC  DT         NN     IN  DT      NN   . 
 |   |   |       |   |   |          |      |   |       |    |  
 I  saw  an     ant a

In [11]:
'''
TEST CASES - simple
'''

# parallel NN
text = "Bobby is a good father and great friend."
expected = [
    'Bobby is a good father.',
    'Bobby is a great friend.'
]
test(parser, simple_find_parallel_sentence10, text, expected=expected)

# parallel "negative" NN 
text = "Bobby is not a good father and great friend."
expected = [
    'Bobby is not a good father.',
    'Bobby is not a great friend.'
]
test(parser, simple_find_parallel_sentence10, text, expected=expected)

# parallel ADJP -> PASS
text = "Bobby is very handsome and super smart"
expected = [
    'Bobby is very handsome.',
    'Bobby is super smart.'
]
test(parser, simple_find_parallel_sentence10, text, expected=expected)

# parallel ADJP AND JJ 
text = "Bobby is very handsome and smart"
expected = [
    'Bobby is very handsome.',
    'Bobby is very smart.'
]
test(parser, simple_find_parallel_sentence10, text, expected=expected)

# parallel VP 
text = "You are quick to grasp new concepts and equally quick to lose interest in an idea or project once your curiosity has been satisfied."
expected = ['You are quick to grasp new concepts.',
            'You are equally quick to lose interest in an idea or project once your curiosity has been satisfied.'
]
test(parser, simple_find_parallel_sentence10, text, expected=expected)


'''
MORE TEST CASES - with conditional statements.
'''

# conditional and 2 adjectives
text = "If you have aries in rising, you are loud and superficial."
expected = [
    'If you have aries in rising, you are loud.',
    'If you have aries in rising, you are superficial.'
]
test(parser, simple_find_parallel_sentence10, text, expected=expected)

# conditional and 2 "nested" adjectives
text = "If you have aries in rising, you tend to be loud and superficial"
expected = [
    'If you have aries in rising, you tend to be loud.',
    'If you have aries in rising, you tend to be superficial.'
]
test(parser, simple_find_parallel_sentence10, text, expected=expected)

input:
Bobby is a good father and great friend. 

output:
['Bobby is a good father.', 'Bobby is great friend.']

expected:
['Bobby is a good father.', 'Bobby is a great friend.'] 

result: FAILING

                    ROOT                          
                     |                             
                     S                            
   __________________|__________________________   
  |        VP                                   | 
  |     ___|_________                           |  
  |    |             NP                         | 
  |    |        _____|_______________           |  
  NP   |       NP          |         NP         | 
  |    |    ___|_____      |     ____|____      |  
 NNP  VBZ  DT  JJ    NN    CC   JJ        NN    . 
  |    |   |   |     |     |    |         |     |  
Bobby  is  a  good father and great     friend  . 



input:
Bobby is not a good father and great friend. 

output:
['Bobby is not a good father.', 'Bobby is not great friend.']

expec

In [77]:
def find_sub(t):
    """
    recursively finds words/phrases (refer to them as 'components') joined by a 'CC' symbol. 
    
    :param t: an nltk.tree.Tree
    :returns: an nltk.tree.Tree or None. Represents the phrase that contains the 'CC' symbol. None is returned if
    no such phrase is found.
    """ 
    # recursion ends once 'CC' symbol is found or leaves are reached.
    
    subtrees = []
    
    if t.height() > 2:
        # if tree height == 2, we only have leaves (string type) left in the tree.
        child_labels = get_child_labels(t)
        if 'CC' in child_labels:
            subtrees.append(t)
        else:
            for child in t:
                if type(child) == Tree:
                    if child.label() != 'SBAR':
                        # can't remember what this condition is for.
                        result = find_sub(child)  # recurse deeper.
                        subtrees.extend(result)  # accumulate results.
    return subtrees


def sent_lineariser(t):
    """
    From a sentence, generates constituent sentences describing the same subject, if they exist.

        e.g. "You grasp concepts easily and may become impatient with those who don't learn as quickly."
        can be broken down into "You grasp concepts easily" and "You may become impatient with those who don't learn as quickly."
    
    Looks for parallel noun phrases (NP, NN), parallel verbs (VP, V) and parallel adjectives (ADJP, JJ).
    Parallel phrases means these phrases are children of the same node.
    Also takes parallel sentences (S) into account.
    Also takes conditions ("if" or SBAR) into account.
    
    This version attempts to use a recursive approach to find the parallel parts once it finds the first 
    VP co-occuring (parallel) with an NP.
    
    :param t: an nltk.tree.Tree
    :returns: list of simpler sentences.
    """
    components = [] 
    left = []
    right = []
    
    final_sents = []
    
    labels = get_child_labels(t)
    
#     for child in t:
#         if child.label() == 'SBAR':
#             #print('precondition found.')
#             preconditions.extend(child.leaves())
    
    if labels.count('S') >= 2:
        #print(">=2 'S' found on same level.")
        for child in t:
            if child.label() == 'S':
                components.append(child.leaves())
        #print('new sentences are :')
        for sent in components:
            final_sent_arr = []
            final_sent_arr.extend(sent)
            final_sent = " ".join(final_sent_arr) + '.'
            final_sents.append(final_sent)
        return final_sents
    
    elif 'NP' in labels:
        np_label = labels.index('NP')
        
        if 'VP' in labels:
            vp_index = labels.index('VP')
            labels2 = get_child_labels(t[vp_index])
            
            # check if 'CC' exists; todo: refactor?.
            leaves_POS = t[vp_index].pos()
            cc_exists = False
            for pair in leaves_POS:
                if pair[1] in ['CC']:
                    cc_exists = True
                    break
                else: 
                    cc_exists == False
            
            # find words/phrases joined by 'CC'.  
            if cc_exists:
                sub_trees = find_sub(t[vp_index])  # recursive step.
                if len(sub_trees) > 1:
                    print(len(sub_trees))
                    raise Exception('Sentence structure not supported.')
                
                # find phrases to the left and right of the components group.
                original_sent_arr = t.leaves() 
                original_sent = " ".join(original_sent_arr)  # doesn't deal with punctuation tokens well, e.g. ','.
                components_phrase_arr = sub_trees[0].leaves()
                components_phrase = " ".join(components_phrase_arr)
                comp_index = original_sent.find(components_phrase)
                left = original_sent[:comp_index]
                right = original_sent[comp_index+len(components_phrase):]
                #print('left ', left)
                #print(components_phrase)
                #print('right ', right)
                
                for child in sub_trees[0]:
                    if child.label() in ['ADJP', 'JJ', 'VP', 'VB', 'NP', 'NN']:
                        components.append(child.leaves())
                
                for c in components:
                    mid = " ".join(c)
                    final_sent = left + mid + right
                    final_sents.append(final_sent) 
                return final_sents
            else:
                Exception('Sentence not decomposable.')
        
    else:
        raise Exception('Sentence structure not covered by function.')

In [78]:
text = "If you have aries in rising, you tend to be loud and superficial."
expected = [
    'If you have aries in rising, you tend to be loud.',
    'If you have aries in rising, you tend to be superficial.'
]
test(parser, sent_lineariser, text, expected=expected, pprint=True)

input:
If you have aries in rising, you tend to be loud and superficial. 

                                          ROOT                                           
                                           |                                              
                                           S                                             
          _________________________________|___________________________________________   
        SBAR                           |   |             |                             | 
  _______|_____                        |   |             |                             |  
 |             S                       |   |             VP                            | 
 |    _________|____                   |   |     ________|___                          |  
 |   |              VP                 |   |    |            S                         | 
 |   |    __________|___               |   |    |            |                         |  
 |   |   |          

In [146]:
def run_tests(func):
    '''
    TEST CASES - simple
    '''

    # parallel NN
    text = "Bobby is a good father and great friend."
    expected = [
        'Bobby is a good father.',
        'Bobby is a great friend.'
    ]
    test(parser, func, text, expected=expected)

    # parallel "negative" NN 
    text = "Bobby is not a good father and great friend."
    expected = [
        'Bobby is not a good father.',
        'Bobby is not a great friend.'
    ]
    test(parser, func, text, expected=expected)

    # parallel ADJP -> PASS
    text = "Bobby is very handsome and super smart."
    expected = [
        'Bobby is very handsome.',
        'Bobby is super smart.'
    ]
    test(parser, func, text, expected=expected)

    # parallel ADJP AND JJ 
    text = "Bobby is very handsome and smart."
    expected = [
        'Bobby is very handsome.',
        'Bobby is very smart.'
    ]
    test(parser, func, text, expected=expected)

    # parallel VP 
    text = "You are quick to grasp new concepts and equally quick to lose interest in an idea or project once your curiosity has been satisfied."
    expected = ['You are quick to grasp new concepts.',
                'You are equally quick to lose interest in an idea or project once your curiosity has been satisfied.'
    ]
    test(parser, func, text, expected=expected)


    '''
    MORE TEST CASES - with conditional statements.
    '''

    # conditional and 2 adjectives
    text = "If you have aries in rising, you are loud and superficial."
    expected = [
        'If you have aries in rising, you are loud.',
        'If you have aries in rising, you are superficial.'
    ]
    test(parser, func, text, expected=expected)

    # conditional and 2 "nested" adjectives
    text = "If you have aries in rising, you tend to be loud and superficial."
    expected = [
        'If you have aries in rising, you tend to be loud.',
        'If you have aries in rising, you tend to be superficial.'
    ]
    test(parser, func, text, expected=expected)
    
    '''
    NEW TEST CASES
    '''

    text = "I saw an ant and an elephant by the fire."
    expected = ['I saw an ant by the fire.',
                'I saw an elephant by the fire.'
    ]
    test(parser, func, text, expected=expected)


    text = "You are the most wonderful and capable captain the world has ever seen."
    expected = [
        "You are the most wonderful captain the world has ever seen.",
        "You are the most capable captain the world has ever seen."
    ]
    test(parser, func, text, expected=expected)

    text = "This was a wonderful and mind-opening video."
    expected = [
        "This was a wonderful video.",
        "This was a mind-opening video."
    ]
    test(parser, func, text, expected=expected)

    text = "The burger was one of the most affordable and satisfying burgers I've ever had."
    expected = ["The burger was one of the most affordable burgers I 've ever had.",
                "The burger was one of the most satisfying burgers I 've ever had."
    ]
    test(parser, func, text, expected=expected)
    
    '''
    MORE TEST CASES
    '''
    # condition embedded within adj phrase.
    text = "You are trustworthy and reliable because you are calm and confident."
    expected = [
        "You are trustworthy because you are calm and confident.",
        "You are reliable because you are calm and confident."
    ]
    test(parser, func, text, expected=expected)

    # new SBAR at front
    text = "Although you are shy by nature, you do not hesitate to defend the voiceless and help the poor."
    expected = ['Although you are shy by nature, you do not hesitate to defend the voiceless.', 
                'Although you are shy by nature, you do not hesitate to help the poor.']
    test(parser, func, text, expected=expected)
    
    

run_tests(sent_lineariser)


input:
Bobby is a good father and great friend. 

output:
['Bobby is a good father .', 'Bobby is great friend .']

expected:
['Bobby is a good father.', 'Bobby is a great friend.'] 

result: FAILING

                    ROOT                          
                     |                             
                     S                            
   __________________|__________________________   
  |        VP                                   | 
  |     ___|_________                           |  
  |    |             NP                         | 
  |    |        _____|_______________           |  
  NP   |       NP          |         NP         | 
  |    |    ___|_____      |     ____|____      |  
 NNP  VBZ  DT  JJ    NN    CC   JJ        NN    . 
  |    |   |   |     |     |    |         |     |  
Bobby  is  a  good father and great     friend  . 



input:
Bobby is not a good father and great friend. 

output:
['Bobby is not a good father .', 'Bobby is not great friend .']

e

output:
['You are the wonderful captain the world has ever seen .', 'You are the capable captain the world has ever seen .']

expected:
['You are the most wonderful captain the world has ever seen.', 'You are the most capable captain the world has ever seen.'] 

result: FAILING

                                 ROOT                                          
                                  |                                             
                                  S                                            
  ________________________________|__________________________________________   
 |                                VP                                         | 
 |    ____________________________|_______                                   |  
 |   |                                    NP                                 | 
 |   |                ____________________|___________________               |  
 |   |               |                                       SBAR           

## Negative notes on test results for `sent_lineariser()` (v11):

### 1. Not capturing adverbs. `simple_find_parallel_sentence10()` was able to.
output:
'Bobby is handsome', 'Bobby is smart'

expected:
'Bobby is very handsome.', 'Bobby is very smart.'

### 2. Generates whitespace before punctuation marks
but this is a relatively unimportant issue.

---

## Positive notes on test results for sent_lineariser():

### 1. Passes a specific case that simple_find_parallel_sentence10() fails. v10 was messing up the ordering.
output:
'I saw an ant by the fire .', 'I saw an elephant by the fire .'

expected:
'I saw an ant by the fire.', 'I saw an elephant by the fire.'

### 2. This implementation ensures that word ordering is preserved.

_conclusion: we should think of how to keep the v12 approach while also dealing with adverbs!_ 

In [172]:
def find_sub(t):
    """
    recursively finds words/phrases (refer to them as 'components') joined by a 'CC' symbol. 
    
    :param t: an nltk.tree.Tree
    :returns: an nltk.tree.Tree or None. Represents the phrase that contains the 'CC' symbol. None is returned if
    no such phrase is found.
    """ 
    # recursion ends once 'CC' symbol is found or leaves are reached.
    
    subtrees = []
    
    if t.height() > 2:
        # if tree height == 2, we only have leaves (string type) left in the tree.
        child_labels = get_child_labels(t)
        if 'CC' in child_labels:
            subtrees.append(t)
        else:
            for child in t:
                if type(child) == Tree:
                    if child.label() != 'SBAR':
                        # can't remember what this condition is for.
                        result = find_sub(child)  # recurse deeper.
                        subtrees.extend(result)  # accumulate results.
    return subtrees


def clean_comma(s):
    # recursion because it is unknown how many ' ,' substrings there might be.
    idx = s.find(' ,')
    if idx > -1:
        s = s[:idx] + s[idx+1:]
        s = clean_comma(s)
    return s

def clean_sent(s):
    '''
    Helper function. Removes whitespace before period '.' tokens and ',' tokens.
    '''
    if len(s) > 2:
        if s[-2:] == ' .':
            s = s[:-2] + '.'
            
    s = clean_comma(s)
    return s


def sent_lineariser2(t):
    """
    From a sentence, generates constituent sentences describing the same subject, if they exist.

        e.g. "You grasp concepts easily and may become impatient with those who don't learn as quickly."
        can be broken down into "You grasp concepts easily" and "You may become impatient with those who don't learn as quickly."
    
    Looks for parallel noun phrases (NP, NN), parallel verbs (VP, V) and parallel adjectives (ADJP, JJ).
    Parallel phrases means these phrases are children of the same node.
    Also takes parallel sentences (S) into account.
    Also takes conditions ("if" or SBAR) into account.
    
    This version attempts to use a recursive approach to find the parallel parts once it finds the first 
    VP co-occuring (parallel) with an NP.
    
    :param t: an nltk.tree.Tree
    :returns: list of simpler sentences.
    """
    components = []  # find a better/more intuitive name for components.
    left = None
    right = None
    
    final_sents = []
    
    labels = get_child_labels(t)
    
#     for child in t:
#         if child.label() == 'SBAR':
#             #print('precondition found.')
#             preconditions.extend(child.leaves())
    
    if labels.count('S') >= 2:
        #print(">=2 'S' found on same level.")
        for child in t:
            if child.label() == 'S':
                components.append(child.leaves())
        #print('new sentences are :')
        for sent in components:
            final_sent_arr = []
            final_sent_arr.extend(sent)
            final_sent = " ".join(final_sent_arr) + '.'
            final_sents.append(final_sent)
        return final_sents
    
    elif 'NP' in labels:
        np_label = labels.index('NP')
        
        if 'VP' in labels:
            vp_index = labels.index('VP')
            labels2 = get_child_labels(t[vp_index])
            
            # check if 'CC' exists; todo: refactor?.
            leaves_POS = t[vp_index].pos()
            cc_exists = False
            for pair in leaves_POS:
                if pair[1] in ['CC']:
                    cc_exists = True
                    break
                else: 
                    cc_exists == False
            
            if cc_exists:
                # find words/phrases joined by 'CC'.
                sub_trees = find_sub(t[vp_index])  # recursive step.
                if len(sub_trees) > 1:
                    print('no. of subtrees found: ', len(sub_trees))
                    raise Exception('Sentence structure not supported.')
                components_tree = sub_trees[0]
                # find phrases to the left and right of the components group.
                original_sent_arr = t.leaves() 
                original_sent = " ".join(original_sent_arr)  # doesn't deal with punctuation tokens well, e.g. ','.
                components_phrase_arr = components_tree.leaves()
                components_phrase = " ".join(components_phrase_arr)
                comp_index = original_sent.find(components_phrase)
                left = original_sent[:comp_index].strip()
                right = original_sent[comp_index+len(components_phrase):].strip()
                #print('left ', left)
                #print(components_phrase)
                #print('right ', right)
                for child in components_tree:
                    if child.label() in ['ADJP', 'JJ', 'VP', 'VB', 'NP', 'NN']:
                        components.append(child.leaves())
                
                # find adverbs if they exist, and preserve ordering.
                # this is for the particular case of adverb co-occuring with conjunct phrases.
                # e.g. You are the [MOST wonderful and capable] captain the world has ever seen.
                # it is assumed that this modifier occurs at extreme left or right of components group.
                components_list = list(components_tree)
                premodifier = None
                postmodifier = None
                if components_list[0].label() in ['RB', 'RBR', 'RBS']:
                    premodifier = components_list[0].leaves()
                if components_list[-1].label() in ['RB', 'RBR', 'RBS']:
                    postmodifier = components_list[-1].leaves()
                if premodifier:
                    premodifier_str = " ".join(premodifier)
                    left = left + ' ' + premodifier_str
                if postmodifier:
                    postmodifier_str = " ".join(postmodifier)
                    right = postmodifier_str + ' ' + right 
                
#                 # remove whitespace before period '.' tokens.
#                 if right != '.':
#                     if len(right) > 2:
#                         if right[-2:] == ' .':
#                             right = right[:-2] + '.'
                
                for leaf in components:
                    mid = " ".join(leaf).strip()   
                    final_sent = left + ' ' + mid + ' ' + right                    
                    final_sent = clean_sent(final_sent)
                    final_sents.append(final_sent)
                return final_sents
            else:
                Exception('Sentence not decomposable.')
        
    else:
        raise Exception('Sentence structure not covered by function.')

In [173]:
# new SBAR at front
text = "Although you are shy by nature, you do not hesitate to defend the voiceless and help the poor."
expected = ['Although you are shy by nature, you do not hesitate to defend the voiceless.', 
            'Although you are shy by nature, you do not hesitate to help the poor.']
test(parser, sent_lineariser2, text, expected=expected)

input:
Although you are shy by nature, you do not hesitate to defend the voiceless and help the poor. 

output:
['Although you are shy by nature, you do not hesitate to defend the voiceless.', 'Although you are shy by nature, you do not hesitate to help the poor.']

expected:
['Although you are shy by nature, you do not hesitate to defend the voiceless.', 'Although you are shy by nature, you do not hesitate to help the poor.'] 

result: PASSING





In [174]:
run_tests(sent_lineariser2)

input:
Bobby is a good father and great friend. 

output:
['Bobby is a good father.', 'Bobby is great friend.']

expected:
['Bobby is a good father.', 'Bobby is a great friend.'] 

result: FAILING

                    ROOT                          
                     |                             
                     S                            
   __________________|__________________________   
  |        VP                                   | 
  |     ___|_________                           |  
  |    |             NP                         | 
  |    |        _____|_______________           |  
  NP   |       NP          |         NP         | 
  |    |    ___|_____      |     ____|____      |  
 NNP  VBZ  DT  JJ    NN    CC   JJ        NN    . 
  |    |   |   |     |     |    |         |     |  
Bobby  is  a  good father and great     friend  . 



input:
Bobby is not a good father and great friend. 

output:
['Bobby is not a good father.', 'Bobby is not great friend.']

expec

## Notes on test results for `sent_lineariser2()` (v12):

### 1. All test cases pass except for those where the off-the-shelf parser failed!
I can only deal with the given parse tree and virtually all known test cases pass! lineariser should be ready for use.

In [None]:
# long sentence
text = "You do not hesitate to defend the voiceless and help the poor because you have a righteous heart."
test(parser, sent_lineariser2, text, expected=expected)