In [1]:
from anytree import Node, RenderTree
from internal import lineariser

In [2]:
# Shared Lineariser instance, passed to the linearisation calls in the cells below.
lin = lineariser.Lineariser()



In [3]:
# Sanity check: one call to linearise() returns a list of shorter sentences.
lin.linearise('bob is happy and satisfied, and very full.')

['bob is happy and satisfied.', 'bob is very full.']

In [4]:
# Sample input sentences and an (empty) list for results.
# NOTE(review): neither name is referenced by any other cell visible in this
# notebook — confirm they are still needed.
old_sentences = [
    'bob is happy and satisfied.',
    'bob is happy and satisfied, and very full.'
]
new_sentences = []

In [4]:
def linearise_hierarchy(n: Node, l: lineariser.Lineariser) -> Node:
    """
    Recursive algorithm that linearises a sentence into its component sentences.
    Recursively grows the tree that represents the hierarchical components of the
    sentence, according to the Lineariser's rules. The tree is grown in place:
    every sentence returned by ``l.linearise(n.name)`` is attached to ``n`` as a
    child Node and then linearised in turn. A node stays a leaf when the
    Lineariser returns None (or nothing to iterate over) for its sentence.
        e.g. 'bob is happy and satisfied, and very full.' has the children
             'bob is happy and satisfied.' (itself split into 'bob is happy.'
             and 'bob is satisfied.') and 'bob is very full.'
    :param n: Node containing sentence string and, if they exist, its parent and children.
    :param l: lineariser.Lineariser object
    :returns: n, a tree as Node
    """

    new_sents = l.linearise(n.name)  # n.name contains the sentence string.
    if new_sents is None:  # identity test; nothing to split, n is a leaf.
        return n

    for sent in new_sents:
        # A child identical to its parent would be linearised to the same
        # result again and recurse without bound, so it is not attached.
        if sent == n.name:
            continue
        linearise_hierarchy(Node(sent, parent=n), l)
    return n

In [5]:
def test(s, lin, tprint=False):
    """
    Build the linearisation tree for a single sentence and optionally print it.

    :param s: sentence string to linearise hierarchically.
    :param lin: lineariser.Lineariser
    :param tprint: boolean. set to True to print tree structure (followed by a blank line).
    :returns: root Node of the linearisation tree.
    """
    root_node = linearise_hierarchy(Node(s), lin)
    if not tprint:
        return root_node

    # One line per node: the tree-drawing prefix immediately followed by the sentence.
    for pre, _fill, node in RenderTree(root_node):
        print(pre, node.name, sep="")
    print()
    return root_node

In [6]:
# Print the full tree, then only its leaves (the sentences with no further children).
root_node = test('bob is happy and satisfied, and very full.', lin, tprint=True)

for n in root_node.leaves:
    print(n.name) 

bob is happy and satisfied, and very full.
├── bob is happy and satisfied.
│   ├── bob is happy.
│   └── bob is satisfied.
└── bob is very full.

bob is happy.
bob is satisfied.
bob is very full.


In [7]:
# Comma-separated adjective list followed by a second clause joined with "and".
sent = "Bob is happy, funny, and lively, and he loves a good drink."
root_node = test(sent, lin, tprint=True)

Bob is happy, funny, and lively, and he loves a good drink.
├── Bob is happy , funny , and lively.
│   ├── Bob is happy lively.
│   └── Bob is funny lively.
└── he loves a good drink.



In [8]:
# TODO: debug. Whitespace before commas is not removed (e.g. "happy , funny , and lively."), and the split sentences are malformed (e.g. "Bob is happy lively.").

In [9]:
# Adjective list followed by a second clause that itself contains an "and" coordination.
sent = "Bob is happy, funny, and lively, and he is also hilarious and genuine."
root_node = test(sent, lin, tprint=True)

Bob is happy, funny, and lively, and he is also hilarious and genuine.
├── Bob is happy , funny , and lively.
│   ├── Bob is happy lively.
│   └── Bob is funny lively.
└── he is also hilarious and genuine.
    ├── he is also hilarious.
    └── he is also genuine.



In [10]:
# Two coordinated verb phrases, the second ending in a list of nouns.
sent = "You need to learn to digest every experience and assimilate it without bitterness, regret, spite and resentment."
root_node = test(sent, lin, tprint=True)

You need to learn to digest every experience and assimilate it without bitterness, regret, spite and resentment.
├── You need to learn to digest every experience.
└── You need to learn to assimilate it without bitterness, regret, spite and resentment.
    ├── You need to learn to assimilate it without bitterness.
    ├── You need to learn to assimilate it without regret.
    ├── You need to learn to assimilate it without spite.
    └── You need to learn to assimilate it without resentment.



In [11]:
# Long sentence with several "and" conjunctions, giving a more deeply nested tree.
sent = "You enjoy indulging yourself and the ones you love and it is too easy for you to be extravagant and perhaps to put too much value in material things."
root_node = test(sent, lin, tprint=True)

You enjoy indulging yourself and the ones you love and it is too easy for you to be extravagant and perhaps to put too much value in material things.
├── You enjoy indulging yourself.
└── You enjoy indulging the ones you love and it is too easy for you to be extravagant and perhaps to put too much value in material things.
    ├── You enjoy indulging the ones you love.
    └── it is too easy for you to be extravagant and perhaps to put too much value in material things.
        ├── it is too easy for you to be extravagant.
        └── it is too easy for you to put too much value in material things.



## Import example astrology corpus.

In [2]:
import json
import nltk
import pandas as pd

# Reset on every run so re-executing this cell does not duplicate sentences.
sentences = []

# Specify path to json files and feature to pull,
# e.g. [(PATH_TO_JSON_FILE, FEATURE_NAME)]
data_list = [
    ('../data/planets-in-signs/planets-in-signs_ascendant_data.json', 'Virgo Rising'),
    ('../data/planets-in-signs/planets-in-signs_moon_data.json', 'Moon in Cancer'),
    ('../data/planets-in-signs/planets-in-signs_mercury_data.json', 'Mercury in Gemini'),
    ('../data/planets-in-signs/planets-in-signs_venus_data.json', 'Venus in Taurus')
]

for fpath, feature in data_list:
    # Read explicitly as UTF-8, matching the other file operations in this
    # notebook, instead of relying on the platform's default encoding.
    with open(fpath, 'r', encoding='utf-8') as f:
        data_dict = json.load(f)

    # Only the first entry stored under the feature key is tokenised into sentences.
    group = data_dict[feature]
    sentences.extend(nltk.sent_tokenize(group[0]))

print(len(sentences), '\n')

60 



In [13]:
# for index, sent in enumerate(sentences):
#     print(str(index) + ': ' + sent + '\n')

In [14]:
# Map every sentence index to the list of leaf sentences of its linearisation tree.
linearized_sentences = {idx: [] for idx, _ in enumerate(sentences)}
print(linearized_sentences)

for i, sent in enumerate(sentences):
    root_node = linearise_hierarchy(Node(sent), lin)
    # The leaves are the sentences that had no further children attached.
    linearized_sentences[i].extend(leaf.name for leaf in root_node.leaves)
print(len(linearized_sentences))

{0: [], 1: [], 2: [], 3: [], 4: [], 5: [], 6: [], 7: [], 8: [], 9: [], 10: [], 11: [], 12: [], 13: [], 14: [], 15: [], 16: [], 17: [], 18: [], 19: [], 20: [], 21: [], 22: [], 23: [], 24: [], 25: [], 26: [], 27: [], 28: [], 29: [], 30: [], 31: [], 32: [], 33: [], 34: [], 35: [], 36: [], 37: [], 38: [], 39: [], 40: [], 41: [], 42: [], 43: [], 44: [], 45: [], 46: [], 47: [], 48: [], 49: [], 50: [], 51: [], 52: [], 53: [], 54: [], 55: [], 56: [], 57: [], 58: [], 59: []}
60


In [15]:
# Dump each index followed by its linearised sentences, one per line,
# with a blank line between groups.
for idx, leaf_sentences in linearized_sentences.items():
    print(idx, ':')
    for leaf_sentence in leaf_sentences:
        print(leaf_sentence)
    print()

0 :
People with Virgo rising tend to be practical somewhat self-centered.
People with Virgo rising tend to be analytical somewhat self-centered.
People with Virgo rising tend to be discriminating somewhat self-centered.
People with Virgo rising tend to be fastidious somewhat self-centered.
People with Virgo rising tend to be careful somewhat self-centered.
People with Virgo rising tend to be exacting somewhat self-centered.
People with Virgo rising tend to be attentive to details somewhat self-centered.
People with Virgo rising tend to be methodical somewhat self-centered.
People with Virgo rising tend to be quiet somewhat self-centered.
People with Virgo rising tend to be unassuming somewhat self-centered.
People with Virgo rising tend to be shy somewhat self-centered.
People with Virgo rising tend to be critical somewhat self-centered.
People with Virgo rising tend to be thoughtful somewhat self-centered.

1 :
You have an ingenious mind.
You have an active mind.
You have an alert min

In [16]:
# Persist the raw linearisation result. JSON object keys are always strings,
# so the integer indices are written out as "0", "1", ...
with open('../data/output/lineariser_output_3.json', 'w', encoding='utf-8') as f:
    json.dump(linearized_sentences, f, ensure_ascii=False, indent=4)

In [17]:
# Manual evaluation notes, written as a string expression so the cell echoes them.
# NOTE(review): the note counts 8 incorrectly linearised sentences (2 + 1 + 5),
# but the bad_indices list used later in this notebook has 7 entries — confirm
# which figure is right.
"""
count no. of correctly linearised sentences over total sentences. how many of the incorrect
ones are due to parser? how many are due to linearisation algorithm?
By observation, 8 of the original sentences were clearly incorrectly linearized. 52/60 = 86.67% accuracy.
Of these mistakes, 
2 are due to implicit references
1 is due to parser being wrong or sentence being ungrammatical
5 seems to be linearisation mistakes
"""

'\ncount no. of correctly linearised sentences over total sentences. how many of the incorrect\nones are due to parser? how many are due to linearisation algorithm?\nBy observation, 8 of the original sentences were clearly incorrectly linearized. 52/60 = 86.67% accuracy.\nOf these mistakes, \n2 are due to implicit references\n1 is due to parser being wrong or sentence being ungrammatical\n5 seems to be linearisation mistakes\n'

In [29]:
# Reload the saved raw output; the keys come back as strings ('0', '1', ...).
with open('../data/output/lineariser_output_3.json', 'r', encoding='utf-8') as f:
    raw_lin = json.load(f)

In [30]:
# Indices whose linearisation is discarded.
# NOTE(review): presumably the sentences judged incorrect by manual inspection — confirm.
bad_indices = [0, 21, 22, 32, 33, 34, 37]

for ind in bad_indices:
    # Show the linearised output next to the original sentence, then fall back
    # to the original, unsplit sentence. raw_lin is keyed by strings, hence str(ind).
    display(raw_lin[str(ind)], sentences[ind])
    raw_lin[str(ind)] = [sentences[ind]]  # raw_lin's dict values are lists.

['People with Virgo rising tend to be practical somewhat self-centered.',
 'People with Virgo rising tend to be analytical somewhat self-centered.',
 'People with Virgo rising tend to be discriminating somewhat self-centered.',
 'People with Virgo rising tend to be fastidious somewhat self-centered.',
 'People with Virgo rising tend to be careful somewhat self-centered.',
 'People with Virgo rising tend to be exacting somewhat self-centered.',
 'People with Virgo rising tend to be attentive to details somewhat self-centered.',
 'People with Virgo rising tend to be methodical somewhat self-centered.',
 'People with Virgo rising tend to be quiet somewhat self-centered.',
 'People with Virgo rising tend to be unassuming somewhat self-centered.',
 'People with Virgo rising tend to be shy somewhat self-centered.',
 'People with Virgo rising tend to be critical somewhat self-centered.',
 'People with Virgo rising tend to be thoughtful somewhat self-centered.']

'People with Virgo rising tend to be practical, analytical, discriminating, fastidious, careful, exacting, attentive to details, methodical, quiet, unassuming, shy, critical, thoughtful, and somewhat self-centered.'

['Anyone who lives with you must accept ups.',
 'Anyone who lives with you must accept downs.',
 'Anyone who lives with you must appreciate your need for times of withdrawal.']

'Anyone who lives with you must accept your ups and downs and appreciate your need for times of withdrawal.'

['You are also very sympathetic.',
 'You understand unspoken of others.',
 'You understand feelings of others.',
 'You understand needs of others.']

'You are also very sympathetic and understand the unspoken feelings and needs of others.'

['You have the memory of an elephant.',
 'you must learn to forgive.',
 'you forget past hurts.']

'You have the memory of an elephant, but you must learn to forgive and forget past hurts.'

['If you have Mercury in Gemini, you have extremely adaptable, energetic, active, alert, curious mind.',
 'If you have Mercury in Gemini, you have versatile mind.']

'If you have Mercury in Gemini, you have an extremely adaptable, energetic, active, alert, curious, and versatile mind.'

['You seek knowledge.',
 'through reading or conversation , or through travel and talking with the new people you constantly meet.']

'You seek knowledge, either through reading or conversation, or through travel and talking with the new people you constantly meet.'

['You are clever.',
 'You witty.',
 'You have a joke or comeback ready for any situation.']

'You are clever, witty, and always have a joke or comeback ready for any situation.'

In [31]:
# Inspect the cleaned mapping after the replacements above.
raw_lin

{'0': ['People with Virgo rising tend to be practical, analytical, discriminating, fastidious, careful, exacting, attentive to details, methodical, quiet, unassuming, shy, critical, thoughtful, and somewhat self-centered.'],
 '1': ['You have an ingenious mind.',
  'You have an active mind.',
  'You have an alert mind.'],
 '2': ['Gaining knowledge and putting it to good use are important to you.'],
 '3': ['You strive for perfection.',
  'You can be quite the person to live with or to be around because your standards for yourself and others are so high.'],
 '4': ['At times others can never be "good" enough to meet those high standards.'],
 '5': ["Finding fault with what's wrong with things is your forte."],
 '6': ['Sometimes, though, this can make relationships sour as you often turn your critical eye on the one you love and the things they do.'],
 '7': ['Pessimism and being too self-critical are two faults you should try to improve upon.'],
 '8': ['You may tend to worry too much, especi

In [34]:
# Total number of sentences across all entries after the cleaned linearisation.
count = sum(len(sents) for sents in raw_lin.values())
display(count)

88

In [35]:
# save cleaned, linearized sentences
with open('../data/output/lineariser_output_3_clean.json', 'w', encoding='utf-8') as f:
    json.dump(raw_lin, f, ensure_ascii=False, indent=4)

In [37]:
# Build every unordered pair of sentences from the cleaned set.

import itertools

# Flatten the per-index lists into one list, keeping the dict's iteration order.
s = list(itertools.chain.from_iterable(raw_lin.values()))

pairs_list = list(itertools.combinations(s, 2))
print('no. of sentences: ', len(s))
print('no. of sentence pairs: ', len(pairs_list), '\n')

no. of sentences:  88
no. of sentence pairs:  3828 



In [38]:
# Peek at the first pair to confirm the element shape.
pairs_list[0]  # tuple.

('People with Virgo rising tend to be practical, analytical, discriminating, fastidious, careful, exacting, attentive to details, methodical, quiet, unassuming, shy, critical, thoughtful, and somewhat self-centered.',
 'You have an ingenious mind.')

In [39]:
# One row per sentence pair, as a two-element list [first, second].
data = [[first, second] for first, second in pairs_list]

df = pd.DataFrame(data=data, columns=['first', 'second'])
print(df)
# The output location is asked for interactively, so this cell waits for user input.
path = input('key in file path to save csv to: ')
df.to_csv(path)

                                                  first  \
0     People with Virgo rising tend to be practical,...   
1     People with Virgo rising tend to be practical,...   
2     People with Virgo rising tend to be practical,...   
3     People with Virgo rising tend to be practical,...   
4     People with Virgo rising tend to be practical,...   
...                                                 ...   
3823  Self-control needs to be developed and added t...   
3824  Self-control needs to be developed and added t...   
3825  Guard against possessiveness, jealousy, and ta...   
3826  Guard against possessiveness, jealousy, and ta...   
3827               You aim to please in practical ways.   

                                                 second  
0                           You have an ingenious mind.  
1                              You have an active mind.  
2                               You have an alert mind.  
3     Gaining knowledge and putting it to good use a...  
4