In [1]:
from anytree import Node, RenderTree
from internal import lineariser

In [2]:
# One Lineariser instance, reused by the cells below.
lin = lineariser.Lineariser()



In [3]:
# A single linearise() call performs one level of splitting only: the first sentence in
# the result below still contains "and".
lin.linearise('bob is happy and satisfied, and very full.')

['bob is happy and satisfied.', 'bob is very full.']

In [4]:
def linearise_hierarchy(n: Node, l: lineariser.Lineariser):
    """
    Recursively linearise a sentence into its component sentences.

    Calls ``l.linearise`` on the sentence held in ``n.name``. Every sentence it returns is
    attached to ``n`` as a child Node and linearised in turn, so the tree represents the
    hierarchical components of the sentence according to the Lineariser's rules. Recursion
    stops on a node when ``l.linearise`` returns None or nothing to iterate over; the
    leaves of the finished tree are the fully decomposed sentences.

    :param n: Node whose ``name`` is the sentence string; children are attached to it in place.
    :param l: lineariser.Lineariser object
    :returns: n, the same Node, now the root of the tree grown beneath it.
    """
    new_sents = l.linearise(n.name)  # n.name contains the sentence string.
    # Identity check rather than `!= None`: linearise may return None (presumably when no
    # split applies), and `is not None` cannot be affected by a custom or element-wise
    # comparison on the returned object.
    if new_sents is not None:
        for sent in new_sents:
            # Creating the child with parent=n attaches it to the tree before recursing.
            linearise_hierarchy(Node(sent, parent=n), l)
    return n

In [5]:
def test(s, lin, tprint=False):
    """
    Build the linearisation tree for one sentence and optionally print it.

    :param s: sentence string to linearise hierarchically.
    :param lin: lineariser.Lineariser
    :param tprint: boolean. set to True to print tree structure.
    :returns: root Node of the tree produced by linearise_hierarchy.
    """
    tree = linearise_hierarchy(Node(s), lin)
    if not tprint:
        return tree
    # RenderTree yields (prefix, fill, node) rows; only the prefix and the node are drawn.
    for prefix, _fill, node in RenderTree(tree):
        print("%s%s" % (prefix, node.name))
    print()  # blank line after the rendered tree
    return tree

In [6]:
root_node = test('bob is happy and satisfied, and very full.', lin, tprint=True)

# The leaves of the tree are the fully decomposed sentences; print one per line.
print("\n".join(leaf.name for leaf in root_node.leaves))

bob is happy and satisfied, and very full.
├── bob is happy and satisfied.
│   ├── bob is happy.
│   └── bob is satisfied.
└── bob is very full.

bob is happy.
bob is satisfied.
bob is very full.


In [7]:
# Three-adjective list followed by a second clause: the tree gains an intermediate node
# for the adjective list before it is split into one sentence per adjective.
sent = "Bob is happy, funny, and lively, and he loves a good drink."
root_node = test(sent, lin, tprint=True)

Bob is happy, funny, and lively, and he loves a good drink.
├── Bob is happy , funny , and lively.
│   ├── Bob is happy.
│   ├── Bob is funny.
│   └── Bob is lively.
└── he loves a good drink.



In [8]:
# Coordination mixing adjectives with a noun phrase: split in a single step, with no
# intermediate node (see the rendered tree below).
sent = "Bob is happy, funny, and a great friend."
root_node = test(sent, lin, tprint=True)

Bob is happy, funny, and a great friend.
├── Bob is happy.
├── Bob is funny.
└── Bob is a great friend.



In [9]:
# Parse tree of the previous example as returned by the lineariser; in the output the
# coordinated items all sit under a single UCP node.
parsetree = lin.get_parsetree("Bob is happy, funny, and a great friend.")
parsetree.pretty_print()

                        ROOT                         
                         |                            
                         S                           
  _______________________|_________________________   
 |             VP                                  | 
 |    _________|_________                          |  
 |   |                  UCP                        | 
 |   |     ______________|_____________            |  
 NP  |   ADJP  |   ADJP  |    |        NP          | 
 |   |    |    |    |    |    |    ____|_____      |  
NNP VBZ   JJ   ,    JJ   ,    CC  DT   JJ    NN    . 
 |   |    |    |    |    |    |   |    |     |     |  
Bob  is happy  ,  funny  ,   and  a  great friend  . 



In [10]:
# find_sub returns subtrees of the parse; the first one is the UCP coordination shown in
# the output (presumably the node the split is based on — confirm in lineariser).
subtrees = lin.find_sub(parsetree)
subtrees[0].pretty_print()

                UCP                     
   ______________|____________           
 ADJP  |   ADJP  |   |        NP        
  |    |    |    |   |    ____|_____     
  JJ   ,    JJ   ,   CC  DT   JJ    NN  
  |    |    |    |   |   |    |     |    
happy  ,  funny  ,  and  a  great friend



In [11]:
# Both top-level clauses contain a coordination of their own, so each branch of the tree
# is split a second time.
sent = "Bob is happy, funny, and lively, and he is also hilarious and genuine."
root_node = test(sent, lin, tprint=True)

Bob is happy, funny, and lively, and he is also hilarious and genuine.
├── Bob is happy , funny , and lively.
│   ├── Bob is happy.
│   ├── Bob is funny.
│   └── Bob is lively.
└── he is also hilarious and genuine.
    ├── he is also hilarious.
    └── he is also genuine.



In [12]:
# Two coordinated verb phrases that keep the shared prefix "You need to learn to"; the
# second one ends in a four-item noun list that is split again.
sent = "You need to learn to digest every experience and assimilate it without bitterness, regret, spite and resentment."
root_node = test(sent, lin, tprint=True)

You need to learn to digest every experience and assimilate it without bitterness, regret, spite and resentment.
├── You need to learn to digest every experience.
└── You need to learn to assimilate it without bitterness, regret, spite and resentment.
    ├── You need to learn to assimilate it without bitterness.
    ├── You need to learn to assimilate it without regret.
    ├── You need to learn to assimilate it without spite.
    └── You need to learn to assimilate it without resentment.



In [13]:
# Longer sentence that needs three levels of splitting. Note that the first split leaves
# the independent clause "it is too easy ..." attached to "You enjoy indulging the ones
# you love"; it is only separated one level further down.
sent = "You enjoy indulging yourself and the ones you love and it is too easy for you to be extravagant and perhaps to put too much value in material things."
root_node = test(sent, lin, tprint=True)

You enjoy indulging yourself and the ones you love and it is too easy for you to be extravagant and perhaps to put too much value in material things.
├── You enjoy indulging yourself.
└── You enjoy indulging the ones you love and it is too easy for you to be extravagant and perhaps to put too much value in material things.
    ├── You enjoy indulging the ones you love.
    └── it is too easy for you to be extravagant and perhaps to put too much value in material things.
        ├── it is too easy for you to be extravagant.
        └── it is too easy for you to put too much value in material things.



## Import example astrology corpus.

In [2]:
import json
import nltk
import pandas as pd

sentences = []

# Specify the path to each JSON file and the feature to pull from it,
# e.g. [(PATH_TO_JSON_FILE, FEATURE_NAME)].
data_list = [
    ('../data/planets-in-signs/planets-in-signs_ascendant_data.json', 'Virgo Rising'),
    ('../data/planets-in-signs/planets-in-signs_moon_data.json', 'Moon in Cancer'),
    ('../data/planets-in-signs/planets-in-signs_mercury_data.json', 'Mercury in Gemini'),
    ('../data/planets-in-signs/planets-in-signs_venus_data.json', 'Venus in Taurus')
]

for fpath, feature in data_list:
    # Explicit encoding so the corpus is decoded the same way on every platform;
    # without it open() falls back to the locale's preferred encoding.
    with open(fpath, 'r', encoding='utf-8') as f:
        data_dict = json.load(f)

    # NOTE(review): assumes each feature maps to a sequence whose first item is the
    # description text — confirm against the JSON files.
    group = data_dict[feature]
    sentences.extend(nltk.sent_tokenize(group[0]))

print(len(sentences), '\n')

60 



### Or import sentences from csv/tsv.

In [18]:
import pandas as pd
# Read the TSV straight from its path with an explicit encoding, so decoding does not
# depend on the platform's locale. header=None: the single column is labelled 0.
sents_df = pd.read_csv("../data/Astro1.tsv", sep='\t', header=None, encoding='utf-8')
sents_df.head()

Unnamed: 0,0
0,You never give up and find it difficult to cha...
1,"Whatever you have set your sights on, you refu..."
2,You are patient and usually slow to anger unle...
3,You are reliable and consistent and can handle...
4,"You prefer a regular routine, with definite ho..."


In [19]:
# Column 0 holds the sentences. to_numpy() is the accessor pandas recommends over .values
# and yields the same NumPy array here.
sentences = sents_df[0].to_numpy()

### Linearise sentences

In [20]:
# Pre-create one empty bucket per input sentence, keyed by the sentence's position.
linearized_sentences = dict((idx, []) for idx in range(len(sentences)))
print(linearized_sentences)

# Grow a tree for every sentence and keep only its leaves (the fully decomposed sentences).
for idx, sentence in enumerate(sentences):
    root_node = linearise_hierarchy(Node(sentence), lin)
    linearized_sentences[idx].extend(leaf.name for leaf in root_node.leaves)
print(len(linearized_sentences))

{0: [], 1: [], 2: [], 3: [], 4: [], 5: [], 6: [], 7: [], 8: [], 9: [], 10: [], 11: [], 12: [], 13: [], 14: [], 15: [], 16: [], 17: [], 18: [], 19: []}
20


In [21]:
# Print each original sentence's index, then its decomposed sentences, then a blank line.
for idx, parts in linearized_sentences.items():
    print(f"{idx} :")
    for part in parts:
        print(part)
    print()

0 :
You never give up.
You never find it difficult to change your mind or course in mid-stream.

1 :
Whatever you have set your sights on, you refuse to give up or let go of it.

2 :
You are patient unless someone takes you too far.
You are usually slow to anger unless someone takes you too far.

3 :
You are reliable.
You are consistent.
You can handle more of the workload than most around you.

4 :
You prefer a regular routine, defined responsibilities.
You prefer a regular routine, defined tasks.

5 :
You enjoy building and seeing the results of your hard work.

6 :
Keeping things going is your strength, especially once someone else starts them.

7 :
Money and possessions are important to you for the feelings of security they bring.

8 :
You are handy with your hands.
You may be ambidextrous or mechanically inclined.

9 :
Debate and argument appeal to you.

10 :
You grasp concepts easily.
You may become impatient with those who do n't learn as quickly.

11 :
You are very direct in th

In [23]:
import json
# Ask where to write the grouped output, then serialise the index -> sentences mapping.
# `path` is read again by the reload cell further down.
path = input("input path to save grouped decomposed sentences:\n")
serialised = json.dumps(linearized_sentences, ensure_ascii=False, indent=4)
with open(path, mode='w', encoding='utf-8') as out_file:
    out_file.write(serialised)

input path to save grouped decomposed sentences:
 ../data/Astro1_decomposed.json


In [17]:
# NOTE(review): the figures in this note (52/60) match the 60-sentence JSON corpus printed
# earlier, not the 20-sentence TSV run. This cell's execution count (17) also precedes the
# TSV cells (18-31), so the note is presumably left over from an earlier run — confirm
# before quoting the accuracy.
"""
count no. of correctly linearised sentences over total sentences. how many of the incorrect
ones are due to parser? how many are due to linearisation algorithm?
By observation, 8 of the original sentences were clearly incorrectly linearized. 52/60 = 86.67% accuracy.
Of these mistakes, 
2 are due to implicit references
1 is due to parser being wrong or sentence being ungrammatical
5 seems to be linearisation mistakes
"""

'\ncount no. of correctly linearised sentences over total sentences. how many of the incorrect\nones are due to parser? how many are due to linearisation algorithm?\nBy observation, 8 of the original sentences were clearly incorrectly linearized. 52/60 = 86.67% accuracy.\nOf these mistakes, \n2 are due to implicit references\n1 is due to parser being wrong or sentence being ungrammatical\n5 seems to be linearisation mistakes\n'

In [24]:
# Reload the saved mapping. After the JSON round-trip the keys are strings ('0', '1', ...),
# not the integers used when the dict was built.
with open(path, mode='r', encoding='utf-8') as saved:
    raw_lin = json.loads(saved.read())

In [25]:
bad_indices = [11, 13]  # manually identify the indices of badly decomposed sentences.

# Show each bad decomposition next to its source sentence, then fall back to the
# untouched source sentence for that entry.
for bad_idx in bad_indices:
    key = str(bad_idx)  # raw_lin is keyed by strings after the JSON reload
    original = sentences[bad_idx]
    display(raw_lin[key], original)
    raw_lin[key] = [original]  # raw_lin's dict values are lists.

['You are very direct in thought , action.',
 'speech and everyone knows how you feel.']

'You are very direct in thought, action, and speech and everyone knows how you feel.'

['You tend to be industrious, efficient, practical, exacting, scientific, organized, shy, thrifty.',
 'You tend to be industrious, efficient, practical, exacting, scientific, organized, shy, studious.',
 'You tend to be industrious, efficient, practical, exacting, scientific, organized, shy, economical.',
 'You tend to be industrious, efficient, practical, exacting, scientific, organized, shy, punctual.']

'You tend to be industrious, efficient, practical, exacting, scientific, organized, shy, thrifty, studious, economical, and punctual.'

In [26]:
# Inspect the mapping after the manual clean-up of the bad indices.
raw_lin

{'0': ['You never give up.',
  'You never find it difficult to change your mind or course in mid-stream.'],
 '1': ['Whatever you have set your sights on, you refuse to give up or let go of it.'],
 '2': ['You are patient unless someone takes you too far.',
  'You are usually slow to anger unless someone takes you too far.'],
 '3': ['You are reliable.',
  'You are consistent.',
  'You can handle more of the workload than most around you.'],
 '4': ['You prefer a regular routine, defined responsibilities.',
  'You prefer a regular routine, defined tasks.'],
 '5': ['You enjoy building and seeing the results of your hard work.'],
 '6': ['Keeping things going is your strength, especially once someone else starts them.'],
 '7': ['Money and possessions are important to you for the feelings of security they bring.'],
 '8': ['You are handy with your hands.',
  'You may be ambidextrous or mechanically inclined.'],
 '9': ['Debate and argument appeal to you.'],
 '10': ['You grasp concepts easily.',


In [27]:
# Total number of sentences across all groups after the cleaned linearisation.
count = sum(len(group) for group in raw_lin.values())
display(count)

47

In [28]:
# Persist the cleaned, linearized sentences to a user-supplied location.
final_path = input("input final path to save decomposed sentences.")
cleaned_json = json.dumps(raw_lin, ensure_ascii=False, indent=4)
with open(final_path, mode='w', encoding='utf-8') as out_file:
    out_file.write(cleaned_json)

input final path to save decomposed sentences. ../data/Astro1_decomposed_clean.json


In [29]:
# generate all pairs in the set of sentences.

import itertools

# Flatten the per-sentence groups into one list, keeping their order.
s = [sentence for group in raw_lin.values() for sentence in group]

# Every unordered pair of distinct positions in s.
pairs_list = list(itertools.combinations(s, 2))
print('no. of sentences: ', len(s))
print('no. of sentence pairs: ', len(pairs_list), '\n')

no. of sentences:  47
no. of sentence pairs:  1081 



In [30]:
# Each element of pairs_list is a 2-tuple of sentence strings.
pairs_list[0]  # tuple.

('You never give up.',
 'You never find it difficult to change your mind or course in mid-stream.')

In [31]:
# One row per sentence pair: [first sentence, second sentence].
data = [list(pair) for pair in pairs_list]

df = pd.DataFrame(data=data, columns=['first', 'second'])
print(df)
# NOTE: this rebinds `path`, which earlier held the JSON output location.
path = input('key in file path to save csv to: ')
df.to_csv(path)

                     first                                             second
0       You never give up.  You never find it difficult to change your min...
1       You never give up.  Whatever you have set your sights on, you refu...
2       You never give up.  You are patient unless someone takes you too far.
3       You never give up.  You are usually slow to anger unless someone t...
4       You never give up.                                  You are reliable.
...                    ...                                                ...
1076    You are sensitive.                                    You are gentle.
1077    You are sensitive.                                  You are romantic.
1078  You are sentimental.                                    You are gentle.
1079  You are sentimental.                                  You are romantic.
1080       You are gentle.                                  You are romantic.

[1081 rows x 2 columns]


key in file path to save csv to:  ../data/Astro1_decomposed_clean_pairs.csv
