In [1]:
from anytree import Node, RenderTree
from internal import lineariser

In [2]:
# Single Lineariser instance that is passed to every example and to the corpus run below.
lin = lineariser.Lineariser()



In [3]:
# One direct call to linearise(): the recorded output shows a single pass, which
# still leaves 'happy and satisfied' joined in the first returned sentence.
lin.linearise('bob is happy and satisfied, and very full.')

['bob is happy and satisfied.', 'bob is very full.']

In [4]:
# Sample inputs for linearisation.
# NOTE(review): neither old_sentences nor new_sentences is referenced again in the
# visible part of this notebook -- confirm whether this cell is still needed.
old_sentences = [
    'bob is happy and satisfied.',
    'bob is happy and satisfied, and very full.'
]
new_sentences = []

In [5]:
def linearise_hierarchy(n: Node, l: lineariser.Lineariser):
    """
    Recursive algorithm that linearises a sentence into its component sentences.
    Recursively grows the tree that represents the hierarchical components of the sentence,
    according to the Lineariser's rules. The tree is grown in place: every sentence
    returned by ``l.linearise(n.name)`` becomes a child of ``n`` and is linearised in turn.
        e.g. 'bob is happy and satisfied, and very full.' gets the children
             'bob is happy and satisfied.' (itself split into 'bob is happy.' and
             'bob is satisfied.') and 'bob is very full.'
    :param n: Node containing sentence string and, if they exist, its parent and children.
    :param l: lineariser.Lineariser object
    :returns: n, the same Node that was passed in, now the root of the grown tree
    """

    new_sents = l.linearise(n.name)  # n.name contains the sentence string.
    # Identity check against the None singleton (PEP 8). When the result is None
    # no children are attached, so n stays a leaf.
    if new_sents is not None:
        for sent in new_sents:
            # Node(..., parent=n) attaches the new sentence under n before recursing into it.
            linearise_hierarchy(Node(sent, parent=n), l)
    return n

In [6]:
def test(s, lin, tprint=False):
    """
    Linearise a sentence into a tree and optionally print that tree.

    :param s: sentence string to linearise hierarchically.
    :param lin: lineariser.Lineariser
    :param tprint: boolean. set to True to print tree structure.
    :returns: root Node of the tree built by linearise_hierarchy.
    """
    root_node = linearise_hierarchy(Node(s), lin)
    if not tprint:
        return root_node

    # Each rendered row is (prefix, fill, node); only the prefix and the sentence are shown.
    for prefix, _fill, tree_node in RenderTree(root_node):
        print(prefix, tree_node.name, sep="")
    print()
    return root_node

In [7]:
# Build and print the whole tree for the sample sentence, then list only its leaves.
root_node = test('bob is happy and satisfied, and very full.', lin, tprint=True)

for n in root_node.leaves:
    print(n.name) 

bob is happy and satisfied, and very full.
├── bob is happy and satisfied.
│   ├── bob is happy.
│   └── bob is satisfied.
└── bob is very full.

bob is happy.
bob is satisfied.
bob is very full.


In [8]:
# Comma-separated adjective list followed by a second clause. The recorded output
# shows the list is not split cleanly (e.g. 'Bob is happy lively.').
sent = "Bob is happy, funny, and lively, and he loves a good drink."
root_node = test(sent, lin, tprint=True)

Bob is happy, funny, and lively, and he loves a good drink.
├── Bob is happy , funny , and lively.
│   ├── Bob is happy lively.
│   └── Bob is funny lively.
└── he loves a good drink.



In [9]:
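# TODO: Debug. Whitespace was not removed before the commas (e.g. 'Bob is happy , funny , and lively.'), and the leaf sentences keep a stray 'lively' (e.g. 'Bob is happy lively.').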

In [10]:
# Comma-separated adjective list followed by a clause that itself contains a conjunction.
sent = "Bob is happy, funny, and lively, and he is also hilarious and genuine."
root_node = test(sent, lin, tprint=True)

Bob is happy, funny, and lively, and he is also hilarious and genuine.
├── Bob is happy , funny , and lively.
│   ├── Bob is happy lively.
│   └── Bob is funny lively.
└── he is also hilarious and genuine.
    ├── he is also hilarious.
    └── he is also genuine.



In [11]:
# Longer sentence: two conjoined verb phrases, the second ending in a comma-separated list.
sent = "You need to learn to digest every experience and assimilate it without bitterness, regret, spite and resentment."
root_node = test(sent, lin, tprint=True)

You need to learn to digest every experience and assimilate it without bitterness, regret, spite and resentment.
├── You need to learn to digest every experience.
└── You need to learn to assimilate it without bitterness, regret, spite and resentment.
    ├── You need to learn to assimilate it without bitterness.
    ├── You need to learn to assimilate it without regret.
    ├── You need to learn to assimilate it without spite.
    └── You need to learn to assimilate it without resentment.



## Import example astrology corpus.

In [12]:
import json
import nltk
import pandas as pd

# Rebuilt from scratch on every run so that re-executing this cell does not duplicate sentences.
sentences = []

# Specify the path to each JSON file and the feature to pull from it,
# e.g. [(PATH_TO_JSON_FILE, FEATURE_NAME)]
data_list = [
    ('../data/planets-in-signs/planets-in-signs_ascendant_data.json', 'Virgo Rising'),
    ('../data/planets-in-signs/planets-in-signs_moon_data.json', 'Moon in Cancer'),
    ('../data/planets-in-signs/planets-in-signs_mercury_data.json', 'Mercury in Gemini'),
    ('../data/planets-in-signs/planets-in-signs_venus_data.json', 'Venus in Taurus')
]

for fpath, feature in data_list:
    # Read as UTF-8 explicitly instead of relying on the platform's locale default;
    # this matches the encoding this notebook uses when it writes its JSON output.
    with open(fpath, 'r', encoding='utf-8') as f:
        data_dict = json.load(f)

    # Only the first entry stored under the feature is split into sentences.
    # NOTE(review): assumes each feature maps to a sequence whose first item is the text -- confirm.
    group = data_dict[feature]
    sentences.extend(nltk.sent_tokenize(group[0]))

print(len(sentences), '\n')

60 



In [13]:
# for index, sent in enumerate(sentences):
#     print(str(index) + ': ' + sent + '\n')

In [14]:
# One empty result list per corpus sentence, keyed by the sentence's position.
linearized_sentences = {i: [] for i in range(len(sentences))}
print(linearized_sentences)

for i, sent in enumerate(sentences):
    root_node = linearise_hierarchy(Node(sent), lin)
    # The leaves of the tree are the sentences that were not split any further.
    linearized_sentences[i].extend(leaf.name for leaf in root_node.leaves)
print(len(linearized_sentences))

{0: [], 1: [], 2: [], 3: [], 4: [], 5: [], 6: [], 7: [], 8: [], 9: [], 10: [], 11: [], 12: [], 13: [], 14: [], 15: [], 16: [], 17: [], 18: [], 19: [], 20: [], 21: [], 22: [], 23: [], 24: [], 25: [], 26: [], 27: [], 28: [], 29: [], 30: [], 31: [], 32: [], 33: [], 34: [], 35: [], 36: [], 37: [], 38: [], 39: [], 40: [], 41: [], 42: [], 43: [], 44: [], 45: [], 46: [], 47: [], 48: [], 49: [], 50: [], 51: [], 52: [], 53: [], 54: [], 55: [], 56: [], 57: [], 58: [], 59: []}
60


In [15]:
# Show every original sentence index followed by its linearised sentences,
# with a blank line between groups.
for index, leaf_sentences in linearized_sentences.items():
    print(f'{index} :')
    for leaf_sentence in leaf_sentences:
        print(leaf_sentence)
    print()

0 :
People with Virgo rising tend to be practical somewhat self-centered.
People with Virgo rising tend to be analytical somewhat self-centered.
People with Virgo rising tend to be discriminating somewhat self-centered.
People with Virgo rising tend to be fastidious somewhat self-centered.
People with Virgo rising tend to be careful somewhat self-centered.
People with Virgo rising tend to be exacting somewhat self-centered.
People with Virgo rising tend to be attentive to details somewhat self-centered.
People with Virgo rising tend to be methodical somewhat self-centered.
People with Virgo rising tend to be quiet somewhat self-centered.
People with Virgo rising tend to be unassuming somewhat self-centered.
People with Virgo rising tend to be shy somewhat self-centered.
People with Virgo rising tend to be critical somewhat self-centered.
People with Virgo rising tend to be thoughtful somewhat self-centered.

1 :
You have an ingenious mind.
You have an active mind.
You have an alert min

In [16]:
# Persist the linearised corpus as pretty-printed JSON, keeping non-ASCII characters as-is.
output_path = '../data/output/lineariser_output_3.json'
with open(output_path, mode='w', encoding='utf-8') as out_file:
    json.dump(linearized_sentences, out_file, indent=4, ensure_ascii=False)

In [17]:
# Manual error analysis of the linearised corpus, kept as a bare string so that it is
# echoed as this cell's output.
# NOTE(review): the counts below are by-hand observations; nothing in the visible
# notebook computes them.
"""
count no. of correctly linearised sentences over total sentences. how many of the incorrect
ones are due to parser? how many are due to linearisation algorithm?
By observation, 8 of the original sentences were clearly incorrectly linearized. 52/60 = 86.67% accuracy.
Of these mistakes, 
2 are due to implicit references
1 is due to parser being wrong or sentence being ungrammatical
5 seems to be linearisation mistakes
"""

'\ncount no. of correctly linearised sentences over total sentences. how many of the incorrect\nones are due to parser? how many are due to linearisation algorithm?\nBy observation, 8 of the original sentences were clearly incorrectly linearized. 52/60 = 86.67% accuracy.\nOf these mistakes, \n2 are due to implicit references\n1 is due to parser being wrong or sentence being ungrammatical\n5 seems to be linearisation mistakes\n'