In [1]:
!pip3 install -q benepar

In [2]:
import pkgutil

# Rewrite the `import tensorflow as tf` line of benepar's base_parser.py so the
# module goes through the TF1 compatibility API with v2 behaviour disabled.
# The package is located without importing it, because importing it is what
# would trigger the unpatched import.
import importlib.util
import os

package = importlib.util.find_spec("benepar")
if package is None or package.origin is None:
    raise ImportError("benepar is not installed; run the install cell first")

# `origin` is the path of benepar/__init__.py; base_parser.py sits next to it.
base_parser = os.path.join(os.path.dirname(package.origin), 'base_parser.py')
print('base_parser directory : ' + base_parser)

with open(base_parser, 'r') as fp:
    original = fp.read()
content = original.replace('import tensorflow as tf', 'import tensorflow.compat.v1 as tf\ntf.disable_v2_behavior()\n')

# Only touch the installed file when the patch actually changes something, so
# re-running the cell is a no-op.
if content != original:
    with open(base_parser, 'w') as fp:
        fp.write(content)

base_parser directory : /opt/conda/lib/python3.7/site-packages/benepar/base_parser.py


In [3]:
import spacy
import re
import nltk
import pprint 
import benepar
import benepar.spacy_plugin
from nltk import Tree
# from nltk import ParentedTree as Tree

# Pretty-printer used by the (commented-out) debugging dumps further down.
pp = pprint.PrettyPrinter(indent=4)
# Name of the benepar constituency model; it is downloaded first, then loaded
# into the pipeline component below.
model = 'benepar_en2'
benepar.download(model)
# spaCy pipeline addressed through the 'en' shortcut.
nlp = spacy.load('en')
# Append the benepar component; the rest of the notebook reads its result from
# `span._.parse_string`.
nlp.add_pipe(benepar.spacy_plugin.BeneparComponent(model))

[nltk_data] Downloading package benepar_en2 to /usr/share/nltk_data...


In [4]:
# utils
def cannonicalize(treestring):
    """Parse a bracketed tree and make every leaf unique.

    `treestring` is either a spaCy ``Span`` (its ``_.parse_string`` is used)
    or a bracketed parse string. Each leaf gets ``__<position>`` appended,
    where position is its 0-based index among the leaves.

    Returns the resulting nltk ``Tree``.
    """
    is_span = isinstance(treestring, spacy.tokens.span.Span)
    bracketed = treestring._.parse_string if is_span else treestring
    tree = Tree.fromstring(bracketed)

    for leaf_no in range(len(tree.leaves())):
        # Drop the last path component to address the node holding the leaf.
        holder = tree[tree.leaf_treeposition(leaf_no)[:-1]]
        holder[0] = holder[0] + "__" + str(leaf_no)
    return tree

def to_nltk_tree(node):
    """Convert a dependency node into a nested nltk ``Tree``.

    A node without left or right children becomes a bare string (its
    ``orth_``); any other node becomes a ``Tree`` labelled with its ``orth_``
    whose children are the converted ``node.children``.
    """
    if node.n_lefts + node.n_rights <= 0:
        return node.orth_
    return Tree(node.orth_, [to_nltk_tree(dependent) for dependent in node.children])
    
def clean_label(label):
    """Strip the ``__<index>`` suffix that ``cannonicalize`` adds to a word.

    Handles both bare leaves (``'dog__3'`` -> ``'dog'``) and head-annotated
    labels (``'NP (dog__3)'`` -> ``'NP (dog)'``). Labels without such a suffix
    are returned unchanged.

    Only a trailing ``__`` followed by digits is removed, so
    - a label that merely ends in ``)`` (e.g. the token ``')'`` or an
      un-canonicalized ``'NP (dog)'``) no longer gains a spurious extra ``)``;
    - a word that itself contains ``__`` is no longer truncated at its first
      occurrence.
    """
    # Keep the optional closing parenthesis (group 1), drop the index suffix.
    return re.sub(r'__\d+(\)?)$', r'\1', label)

def clean(tree):
    """Return a copy of `tree` with ``clean_label`` applied to every node.

    Strings (leaves) are cleaned directly; ``Tree`` nodes are rebuilt with a
    cleaned label and recursively cleaned children. The input is not modified.
    """
    if not isinstance(tree, Tree):
        return clean_label(tree)
    new_label = clean_label(tree.label())
    return Tree(new_label, [clean(subtree) for subtree in tree])
    
def parse_label(tree):
    """Split a node label into ``(tag, head_word)``.

    The label is split on whitespace. The first piece is the tag. The second
    piece, if present, is the head word; when it ends in ``)`` its first and
    last characters are dropped (``'(dog)'`` -> ``'dog'``). When the label has
    a single piece the head word is ``None``.
    """
    pieces = tree.label().split()
    tag = pieces[0]
    if len(pieces) <= 1:
        return tag, None

    head_word = pieces[1]
    if head_word.endswith(')'):
        head_word = head_word[1:-1]
    return tag, head_word

def get_postags(tree):
    """Map each leaf of `tree` to the tag of the node directly above it.

    Returns a dict ``{leaf: tag}``. Leaves are used as keys, so when the same
    leaf string occurs more than once the last occurrence wins.
    """
    postags = {}

    def _collect(subtree):
        for node in subtree:
            if isinstance(node, Tree):
                _collect(node)
                continue
            # `node` is a leaf: the tag lives on the enclosing subtree.
            tag, _ = parse_label(subtree)
            postags[node] = tag

    _collect(tree)
    return postags

def print_cp(tree):
    """Pretty-print a tree after cleaning its labels.

    Accepts a spaCy ``Token`` (printed as the tree rooted at that token), a
    spaCy ``Span`` (rooted at ``span.root``) or an nltk ``Tree``. Any other
    input is silently ignored.
    """
    if isinstance(tree, spacy.tokens.token.Token):
        printable = to_nltk_tree(tree)
    elif isinstance(tree, spacy.tokens.span.Span):
        printable = to_nltk_tree(tree.root)
    elif isinstance(tree, Tree):
        printable = tree
    else:
        return
    clean(printable).pretty_print()
        
def print_dp(tree):
    """Print a dependency analysis as one line per word, then as a tree.

    `tree` is either a spaCy ``Span`` or a ``(dep_tree, deps)`` pair as
    returned by ``dependency_tree``. Each line shows text, relation, head
    text, universal POS tag and the list of children.
    """
    if not isinstance(tree, spacy.tokens.span.Span):
        dep_tree, records = tree
        for record in records:
            print(record['text'], record['dep'], record['head_text'], penn2univ[record['pos']], end=' ')
            print('[' + ', '.join(record['children']) + ']')
        print_cp(dep_tree)
        return

    for token in tree:
        print(token.text, token.dep_, token.head.text, token.pos_, list(token.children))
    print_cp(tree)

In [5]:
# Tab-separated conversion table, one Penn Treebank tag per line:
#   <penn tag> \t => \t <universal POS tag> \t <4th column> \t <5th column>
# Only the first three columns are read below; the remaining columns
# (apparently morphological features and example tokens) are ignored.
# The literal is whitespace-sensitive: the columns must stay tab-separated.
postag_conv = """
#	=>	SYM	_	#
$	=>	SYM	_	$, C$, US$, A$, HK$
''	=>	PUNCT	PunctSide=Fin|PunctType=Quot	'', '
,	=>	PUNCT	PunctType=Comm	,, 2, an
-LRB-	=>	PUNCT	PunctSide=Ini|PunctType=Brck	
-RRB-	=>	PUNCT	PunctSide=Fin|PunctType=Brck	
.	=>	PUNCT	PunctType=Peri	., ?, !
:	=>	PUNCT	_	--, :, ;, ..., -
AFX	=>	ADJ	Hyph=Yes	
CC	=>	CCONJ	_	and, or, but, &, nor
CD	=>	NUM	NumType=Card	million, billion, one, two, three
DT	=>	DET	_	the, a, an, this, some
EX	=>	PRON	AdvType=Ex	there
FW	=>	X	Foreign=Yes	de, perestroika, glasnost, vs., naczelnik
HYPH	=>	PUNCT	PunctType=Dash	
IN	=>	ADP	_	of, in, for, on, that
JJ	=>	ADJ	Degree=Pos	new, other, last, such, first
JJR	=>	ADJ	Degree=Cmp	more, higher, lower, less, better
JJS	=>	ADJ	Degree=Sup	most, least, largest, latest, best
LS	=>	X	NumType=Ord	3, 2, 1, 4, First
MD	=>	VERB	VerbType=Mod	will, would, could, can, may
NIL	=>	X	_	), }
NN	=>	NOUN	Number=Sing	%, company, year, market, share
NNP	=>	PROPN	Number=Sing	Mr., U.S., Corp., New, Inc.
NNPS	=>	PROPN	Number=Plur	Securities, Democrats, Americans, Brothers, Airlines
NNS	=>	NOUN	Number=Plur	years, shares, sales, companies, prices
PDT	=>	DET	AdjType=Pdt	all, such, half, both, nary
POS	=>	PART	Poss=Yes	's, '
PRP	=>	PRON	PronType=Prs	it, he, they, I, we
PRP$	=>	DET	Poss=Yes|PronType=Prs	its, his, their, our, her
RB	=>	ADV	Degree=Pos	n't, not, also, only, as
RBR	=>	ADV	Degree=Cmp	more, earlier, less, higher, further
RBS	=>	ADV	Degree=Sup	most, best, least, hardest, Worst
RP	=>	ADP	PartType=Vbp	up, out, off, down, in
SYM	=>	SYM	_	a, c, \*, \*\*, b
TO	=>	PART	PartType=Inf|VerbForm=Inf	to, na
UH	=>	INTJ	_	yes, well, no, OK, oh
VB	=>	VERB	VerbForm=Inf	be, have, make, buy, get
VBD	=>	VERB	Tense=Past|VerbForm=Fin	said, was, were, had, did
VBG	=>	VERB	Aspect=Prog|Tense=Pres|VerbForm=Part	including, being, according, going, making
VBN	=>	VERB	Aspect=Perf|Tense=Past|VerbForm=Part	been, expected, made, based, sold
VBP	=>	VERB	Tense=Pres|VerbForm=Fin	are, have, do, say, 're
VBZ	=>	VERB	Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	is, has, says, 's, does
WDT	=>	DET	PronType=Int,Rel	which, that, what, whatever, .what
WP	=>	PRON	PronType=Int,Rel	who, what, whom, whoever
WP$	=>	DET	Poss=Yes|PronType=Int,Rel	whose
WRB	=>	ADV	PronType=Int,Rel	when, how, where, why, whenever
``	=>	PUNCT	PunctSide=Ini|PunctType=Quot	``, `, non-``
"""
# Keep the first three tab-separated fields of every non-empty line:
# [penn tag, '=>', universal tag].
conversion_table = [pos.split('\t')[:3] for pos in postag_conv.split('\n') if pos]
# Penn tag -> universal POS tag. Looking up a tag that is not listed in the
# table raises KeyError.
penn2univ = {t[0]:t[2] for t in conversion_table}
# pp.pprint(penn2univ)

In [6]:
# Head-percolation table: one rule per line, whitespace separated, in the form
#   <parent label> <search direction: Left|Right> <priority list of child tags...>
# The first line is a column header and is skipped when parsing.
#
# Every rule must stay on a single line. The NAC rule used to be wrapped over
# two lines, which silently produced a bogus rule named 'JJS' and dropped the
# tail of NAC's priority list. 'WP$' is one tag and must not contain a space.
headrules = """Parent-Non-terminal Direction Priority-List
ADJP Left NNS QP NN $ ADVP JJ VBN VBG ADJP JJR NP JJS DT FW RBR RBS SBAR RB
ADVP Right RB RBR RBS FW ADVP TO CD JJR JJ IN NP JJS NN
CONJP Right CC RB IN
FRAG Right
INTJ Left
LST Right LS :
NAC Left NN NNS NNP NNPS NP NAC EX $ CD QP PRP VBG JJ JJS JJR ADJP FW
PP Right IN TO VBG VBN RP FW
PRN Left
PRT Right RP
QP Left $ IN NNS NN JJ RB DT CD NCD QP JJR JJS
RRC Right VP NP ADVP ADJP PP
S Left TO IN VP S SBAR ADJP UCP NP
SBAR Left WHNP WHPP WHADVP WHADJP S IN DT SQ SINV SBAR FRAG
SBARQ Left SQ S SINV SBARQ FRAG
SINV Left VBZ VBD VBP VB MD VP S SINV ADJP NP
SQ Left VBZ VBD VBP VB MD VP SQ
UCP Right
VP Left VP VBD VBN VBZ VB VBG VBP TO MD ADJP NN NNS
WHADJP Left CC WRB JJ ADJP
WHADVP Right CC WRB
WHNP Left WDT WP WP$ WHADJP WHPP WHNP
WHPP Right IN TO FW
"""

# Drop the header line and blank lines; from here on `headrules` is the list
# of raw rule lines.
headrules = [h for h in headrules.split('\n')[1:] if h]

# rules[label] = {'tag': label, 'dxn': 'Left' | 'Right', 'priority': [tags...]}
rules = {}
for cur in headrules:
    rule = cur.strip().split()
    if len(rule) < 2 or rule[1] not in ('Left', 'Right'):
        # A wrapped or malformed line would otherwise become a silent,
        # nonsensical rule (this is how the NAC bug went unnoticed).
        raise ValueError('malformed head rule: {!r}'.format(cur))
    label, dxn = rule[0], rule[1]
    priority = rule[2:]
    rules[label] = {
        'tag' : label,
        'dxn' : dxn,
        'priority' : priority
    }

# pp.pprint(rules)

In [7]:
def inherit(tree, child):
    """Annotate `tree` with the head word of `child`.

    The label of `tree` is rewritten in place to ``'<tag> (<head word>)'``,
    where the tag comes from `tree` and the head word from `child`. Returns
    the same `tree` object.
    """
    own_tag = parse_label(tree)[0]
    head_word = parse_label(child)[1]
    tree.set_label(own_tag + ' (' + head_word + ')')
    return tree

# assumed all children are headified
def NP_headify(tree):
    """Pick the head child of an NP and copy its head word onto `tree`.

    All children are expected to be head-annotated already. A tree that
    already carries a head word is returned untouched. Search order:

    1. the last child, if it is tagged POS;
    2. the rightmost child tagged NN/NNP/NNPS/NNS/NX/POS/JJR;
    3. the leftmost child tagged NP;
    4. the rightmost child tagged $/ADJP/PRN, then CD, then JJ/JJS/RB/QP;
    5. otherwise the last child.
    """
    current, word = parse_label(tree)
    if word is not None:
        return tree

    assert current == 'NP'

    def _tag(node):
        return parse_label(node)[0]

    def _rightmost(tagset):
        # Scan right-to-left; None when no child carries one of the tags.
        for node in reversed(tree):
            if _tag(node) in tagset:
                return node
        return None

    last_child = tree[-1]
    if _tag(last_child) == 'POS':
        return inherit(tree, last_child)

    head = _rightmost(('NN', 'NNP', 'NNPS', 'NNS', 'NX', 'POS', 'JJR'))

    if head is None:
        head = next((node for node in tree if _tag(node) == 'NP'), None)

    later_choices = (
        ('$', 'ADJP', 'PRN'),
        ('CD',),
        ('JJ', 'JJS', 'RB', 'QP'),
    )
    for tagset in later_choices:
        if head is not None:
            break
        head = _rightmost(tagset)

    if head is None:
        head = last_child
    return inherit(tree, head)

def headify(tree):
    """Annotate every node of `tree` with its lexical head, bottom-up.

    Labels are rewritten **in place** from ``'<tag>'`` to
    ``'<tag> (<head word>)'``; the same tree object is returned. A node that
    already carries a head word is returned unchanged, so calling this twice
    on the same tree is harmless.

    Head selection:
    - a node with a single child takes that child's head (or the leaf itself);
    - NP nodes are delegated to ``NP_headify``;
    - labels listed in ``rules`` are searched in the rule's direction, one
      priority tag at a time. If no priority tag matches, the first child in
      search order is used, i.e. the leftmost child for ``Left`` rules and
      the rightmost child for ``Right`` rules. Previously the leftmost child
      was always used, which made the direction of empty-priority rules such
      as ``FRAG Right`` / ``UCP Right`` meaningless;
    - labels without a rule fall back to the leftmost child.
    """
    current, word = parse_label(tree)
    if word is not None:
        return tree

    if len(tree) == 1:
        only_child = tree[0]
        if isinstance(only_child, Tree):
            tree[0] = headify(only_child)
            _, head = parse_label(tree[0])
        else:
            # Pre-terminal: the leaf itself is the head word.
            head = only_child
        tree.set_label(current + ' (' + head + ')')
        return tree

    # Children first, so that every child exposes its head word below.
    for i in range(len(tree)):
        tree[i] = headify(tree[i])

    if current == 'NP':
        return NP_headify(tree)

    rule = rules.get(current)
    if rule is None:
        return inherit(tree, tree[0])

    indices = list(range(len(tree)))
    if rule['dxn'] == 'Right':
        indices.reverse()

    for ptag in rule['priority']:
        for i in indices:
            if parse_label(tree[i])[0] == ptag:
                return inherit(tree, tree[i])

    # No priority tag matched: default to the first child in search order.
    return inherit(tree, tree[indices[0]])

In [8]:
def dependency_tree(tree):
    """Derive a labelled dependency analysis from a constituency tree.

    The tree is head-annotated with ``headify`` (which rewrites the labels of
    `tree` in place), head/dependent edges are read off every node, and each
    edge is given a relation name by a fixed cascade of rules.

    Leaf strings are used as dictionary keys throughout, so every leaf must be
    unique (``cannonicalize`` produces such trees).

    Returns a pair ``(dep_tree, dependencies)``:
    - ``dep_tree``: an nltk ``Tree`` (or a bare string if the root has no
      dependents) whose nodes are the leaf words, rooted at the head word of
      the whole tree;
    - ``dependencies``: a list of dicts with keys ``text``, ``dep``,
      ``head_text``, ``pos``, ``children`` and ``word_id``, sorted by
      ``word_id``. ``pos`` is the Penn tag; text fields are passed through
      ``clean_label``.

    Raises KeyError if a tag in the tree is missing from ``penn2univ``.
    """
    # adj[word]        -> list of (dependent_word, (parent_label, child_label, side, conj))
    # word_index[word] -> position of the word among the leaves
    adj = {}
    word_index = {}
    for i, word in enumerate(tree.leaves()):
        adj[word] = []
        word_index[word] = i
        
    def get_adj_list(headified_tree):
        """Fill `adj` with one edge per non-head child of every node."""
        nonlocal adj
        # Pre-terminal (single leaf child): nothing to attach.
        if len(headified_tree) == 1 and (not isinstance(headified_tree[0], Tree)):
            return

        # The head child is the first child that shares the parent's head word;
        # head_index stays -1 if no child does.
        head_index = -1
        par_label, par_word = parse_label(headified_tree)
        for i, child in enumerate(headified_tree):
            child_label, child_word = parse_label(child)
            if par_word == child_word:
                head_index = i
                break

        for i, child in enumerate(headified_tree):
            get_adj_list(child)
            if i == head_index: 
                continue
            # A child exactly two positions away from the head, with a CC
            # sitting between them, is treated as a conjunct of the head.
            conj = False
            if abs(i - head_index) == 2:
                j = (i + head_index) // 2
                child_label, child_word = parse_label(headified_tree[j])
                if child_label == 'CC':
                    conj = True
            child_label, child_word = parse_label(child)
            # Index the two-element list with a bool: False -> 'left', True -> 'right'.
            loc = 'left right'.split()[i > head_index]
            edge = (child_word, (par_label, child_label, loc, conj))
            adj[par_word].append(edge)

    # Tags are collected before the tree is head-annotated; headify() then
    # rewrites the labels of `tree` in place.
    postags = get_postags(tree)
    univtags = { k:penn2univ[v] for k,v in postags.items() }
    htree = headify(tree)
    get_adj_list(htree)
    _, root = parse_label(htree)
    
    def get_dependency(parent, child, dxn, conj):
        '''
        Name the relation between a head and one of its dependents.

        parent  = tuple of (word, tag/phrase_label) for the head
        child   = tuple of (word, tag/phrase_label) for the dependent
        dxn     = side on which the dependent lies w.r.t. the head
                  ('left' or 'right')
        conj    = True when the edge was flagged as a coordination

        The checks run in order and the first match wins; '<UNK>' is
        returned when nothing matches.
        '''
        par_word, par_tag = parent
        child_word, child_tag = child
        
        if univtags[child_word] == 'PUNCT':
            return 'punct'
        
        if conj == True:
            return 'conj'
        
        if postags[child_word] == 'CC':
            return 'cc'
        
        if par_tag == 'NP':
            if univtags[child_word] == 'DET':
                return 'det'
            if univtags[child_word] == 'NUM':
                return 'nummod'
            if child_tag == 'PP':
                return 'prep'
            if child_tag.startswith('JJ') or child_tag == 'ADJP':
                return 'amod'
            
        if par_tag == 'PP':
            if child_tag == 'NP':
                return 'pobj'
        
        if par_tag == 'S' or par_tag == 'SQ':
            if child_tag == 'NP':
                return 'nsubj'
            if child_tag == 'VP':
                return 'aux'
        
        if par_tag == 'VP':
            if child_tag == 'S':
                return 'xcomp'
            if child_tag == 'ADJP':
                return 'acomp'
            if child_tag == 'ADVP':
                return 'advmod'
            if child_tag == 'SBAR':
                return 'ccomp'
            if child_tag == 'VP' or univtags[child_word] == 'VERB':
                return 'aux'
            if postags[child_word] == 'TO':
                if dxn == 'left':
                    return 'aux'
                else :
                    return 'prep'
            if child_tag == 'PP':
                return 'prep'
            if child_tag == 'PRT':
                return 'prt'
            if child_tag == 'NP':
                return 'dobj'
            
        # Substring test: matches RB, RBR, RBS and also WRB.
        if 'RB' in postags[child_word]:
            return 'advmod'
            
        if child_tag == 'S':
            return 'xcomp'
            
        return '<UNK>'
    
    # Seed the list with the root, which is recorded as its own head.
    dependencies = [{
                'text' : clean_label(root),
                'dep' : 'ROOT',
                'head_text' : clean_label(root),
                'pos' : postags[root],
                'children' : [clean_label(child) for child, edge in adj[root]],
                'word_id' : word_index[root]
            }]
    
    def form_tree(node):
        """Build the dependency tree below `node`, appending one record per
        dependent to `dependencies` along the way."""
        if len(adj[node]) == 0:
            return node
        ret = Tree(node, [])
        for child, edge in adj[node]:
            node_label, child_label, dxn, conj = edge
            dep = get_dependency((node, node_label), (child, child_label), dxn, conj)
            dep_obj = {
                'text' : clean_label(child),
                'dep' : dep,
                'head_text' : clean_label(node),
                'pos' : postags[child],
                'children' : [clean_label(child) for child, edge in adj[child]],
                'word_id' : word_index[child]
            }
            dependencies.append(dep_obj)
            ret.append(form_tree(child))
        return ret
    
    # form_tree() must run before the sort: it is what fills `dependencies`.
    return form_tree(root), sorted(dependencies, key=lambda x: x['word_id'])

In [9]:
# Test sentences, one per list element. Every element needs its own trailing
# comma: a missing comma makes Python concatenate two adjacent string literals
# into a single element (previously 'His car was broken.' was glued to the
# following sentence with no space in between).
sentences = [
    'I told him that I am going.',
    'He flew to the field and kicked and died painfully.',
    'He walked and kicked but talked cheerfully.',
    'The young and wild kid fell down.',
    'He went into the wild and through the civilization and towards the sun.',
    'The big and scary house shined under the sun.',
    'He eats mangoes and he plays but she runs fast.',
    'He ran quickly over a truck with a car.',
    'He gave me a slap swiftly.',
    'The time for action is now.',
    'It\'s never too late to do something.',
    'Vinken will join the board as a nonexecutive director Nov 29.',
    'I would like to join.',
    'I will not give in to them.',
    'His car was broken.',
    'The man is charming the girls.',
    'The man is charming.',
    'A high price for this would be 40 dollars.',
    'When are we going to visit the temple?',
]
# Run the pipeline once over all sentences joined by single spaces; later
# cells pull the individual sentences back out through `doc.sents`.
doc = nlp(' '.join(sentences))

In [10]:
def latex_print_dp(deps):
    """Render a dependency analysis as a LaTeX ``dependency`` environment.

    `deps` is either a spaCy ``Span`` or a list of dependency records (dicts
    with ``text``, ``dep``, ``head_text``, ``pos``, ``word_id`` and optionally
    ``head_id``), such as the second element returned by ``dependency_tree``.

    Returns the LaTeX source as a string: one ``deptext`` row with the words,
    one with the POS tags, then a ``\\deproot`` for the record whose relation
    is ``ROOT`` and a ``\\depedge`` for every other record.

    Column numbers are 1-based. The dependent column always comes from
    ``word_id``. The head column comes from ``head_id`` when the record has
    one (always the case for a ``Span``); otherwise it is looked up by
    ``head_text``, which is ambiguous if that word occurs more than once.
    """
    if isinstance(deps, spacy.tokens.span.Span):
        span = deps
        # Prefer positional tags (one per leaf, in order) so that repeated
        # words keep their own tag; fall back to the text-keyed lookup if the
        # parse does not line up one-to-one with the tokens.
        tagged = Tree.fromstring(span._.parse_string).pos()
        if len(tagged) == len(span):
            tags = [tag for _, tag in tagged]
        else:
            by_text = get_postags(Tree.fromstring(span._.parse_string))
            tags = [by_text[token.text] for token in span]
        deps = [
            {
                'text' : token.text,
                'dep' : token.dep_,
                'head_text' : token.head.text,
                # Head position relative to the start of the span.
                'head_id' : token.head.i - span.start,
                'pos' : tags[i],
                'children' : [str(child) for child in token.children],
                'word_id' : i
            }
            for i, token in enumerate(span)
        ]
        
    ret = """\\begin{{dependency}}\t
        \\begin{{deptext}}[column sep=0.6cm]
        \t{} \\\\
        \t{} \\\\
        \\end{{deptext}}
        {}
\\end{{dependency}}"""
    s1 = ' \\& '.join([dep['text'] for dep in deps])
    s2 = ' \\& '.join([dep['pos'].replace('$', '\\$') for dep in deps])
    # Text -> column, used only for records that carry no 'head_id'.
    word_index = {dep['text']: 1 + dep['word_id'] for dep in deps}
    typed_deps = []
    for dep in deps:
        child = 1 + dep['word_id']
        if 'head_id' in dep:
            par = 1 + dep['head_id']
        else:
            par = word_index[dep['head_text']]
        label = dep['dep']
        if label != 'ROOT':
            curr = '\\depedge{{{}}}{{{}}}{{{}}}'.format(par, child, label)
        else:
            curr = '\\deproot{{{}}}{{root}}'.format(par)
        typed_deps.append(curr)
    s3 = '\n\t'.join(typed_deps)
    return ret.format(s1, s2, s3)
    
def latex_tree_cp(tree):
    """Render `tree` as a single-line LaTeX qtree (``\\Tree [...]``) string.

    Works on a cleaned copy of `tree`, so the argument is not modified. Node
    labels are wrapped in braces and leaves in ``\\textit{...}``.

    The LaTeX special characters ``# $ % & _`` are backslash-escaped in the
    result, so tags such as ``PRP$`` / ``WP$`` and tokens such as ``%`` give
    valid LaTeX (``latex_print_dp`` escapes ``$`` in its tags as well).
    """
    ctree = clean(tree)

    def _traverse(_tree):
        label = _tree.label()
        _tree.set_label('{' + label + '}')
        for i, child in enumerate(_tree):
            if isinstance(child, Tree):
                _traverse(child)
            else:
                _tree[i] = '\\' + 'textit{' + child + '}'

    _traverse(ctree)
    ret = ctree.pformat_latex_qtree()
    ret = ret.replace('\n', ' ')
    # Drop every backslash so the braces inserted above end up unescaped ...
    ret = re.sub(' +', ' ', ret).replace('\\', '')
    # ... then re-escape the characters that must stay literal in LaTeX.
    ret = re.sub(r'([#$%&_])', r'\\\1', ret)
    # Restore the two command backslashes: the leading \Tree and each \textit.
    ret = '\\' + ret.replace('textit', '\\' + 'textit')
    return ret

In [14]:
# Walk the last sentence of the document through every stage.
sent = list(doc.sents)[-1]
sent_tree = cannonicalize(sent)
# 1. Constituency parse.
print_cp(sent_tree)
print('\n')
# 2. Same tree with head words; headify() rewrites the labels of sent_tree in
#    place, so sent_tree stays head-annotated from here on.
print_cp(headify(sent_tree))
print('\n')
# 3. Dependencies as produced by spaCy.
print_dp(sent)
print('\n')
# 4. Dependencies derived from the constituency tree.
print_dp(dependency_tree(sent_tree))

                     SBARQ                             
   ____________________|_____________________________   
  |                    SQ                            | 
  |      ______________|_____                        |  
  |     |   |                VP                      | 
  |     |   |     ___________|____                   |  
  |     |   |    |                S                  | 
  |     |   |    |                |                  |  
  |     |   |    |                VP                 | 
  |     |   |    |      __________|___               |  
  |     |   |    |     |              VP             | 
  |     |   |    |     |      ________|___           |  
WHADVP  |   NP   |     |     |            NP         | 
  |     |   |    |     |     |         ___|____      |  
 WRB   VBP PRP  VBG    TO    VB       DT       NN    . 
  |     |   |    |     |     |        |        |     |  
 When  are  we going   to  visit     the     temple  ? 



                                     

In [15]:
# LaTeX for spaCy's dependencies, then for the ones derived from the
# constituency tree.
print(latex_print_dp(sent))
print(latex_print_dp(dependency_tree(sent_tree)[1]))
# LaTeX for the constituency tree. sent_tree was already head-annotated in
# place by the previous cell, so this line and the next print the same labels.
print(latex_tree_cp(sent_tree))
print(latex_tree_cp(headify(sent_tree)))
# LaTeX for the derived dependency tree.
print(latex_tree_cp(dependency_tree(sent_tree)[0]))

\begin{dependency}	
        \begin{deptext}[column sep=0.6cm]
        	When \& are \& we \& going \& to \& visit \& the \& temple \& ? \\
        	WRB \& VBP \& PRP \& VBG \& TO \& VB \& DT \& NN \& . \\
        \end{deptext}
        \depedge{4}{1}{advmod}
	\depedge{4}{2}{aux}
	\depedge{4}{3}{nsubj}
	\deproot{4}{root}
	\depedge{6}{5}{aux}
	\depedge{4}{6}{xcomp}
	\depedge{8}{7}{det}
	\depedge{6}{8}{dobj}
	\depedge{4}{9}{punct}
\end{dependency}
\begin{dependency}	
        \begin{deptext}[column sep=0.6cm]
        	When \& are \& we \& going \& to \& visit \& the \& temple \& ? \\
        	WRB \& VBP \& PRP \& VBG \& TO \& VB \& DT \& NN \& . \\
        \end{deptext}
        \depedge{2}{1}{advmod}
	\deproot{2}{root}
	\depedge{2}{3}{nsubj}
	\depedge{2}{4}{aux}
	\depedge{6}{5}{aux}
	\depedge{4}{6}{xcomp}
	\depedge{8}{7}{det}
	\depedge{6}{8}{dobj}
	\depedge{2}{9}{punct}
\end{dependency}
\Tree [.{SBARQ (are)} [.{WHADVP (When)} [.{WRB (When)} \textit{When} ] ] [.{SQ (are)} [.{VBP (are)} \texti

In [16]:
import sys

# Write the full analysis of every sentence to output.txt by temporarily
# pointing sys.stdout at the file.
old_stdout = sys.stdout # keep reference to existing stdout
with open('output.txt', 'w+') as report:
    sys.stdout = report
    try:
        for sent in list(doc.sents):
            sent_tree = cannonicalize(sent)
            print_cp(sent_tree)
            print('\n')
            print_cp(headify(sent_tree))
            print('\n')
            print_dp(sent)
            print('\n')
            print_dp(dependency_tree(sent_tree))
            print('-' * 150)
    finally:
        # Restore stdout even if a sentence fails; otherwise all later
        # notebook output would keep going to the file. Leaving the `with`
        # block closes (and flushes) the file.
        sys.stdout = old_stdout

In [17]:
# requirements
import pkg_resources
import types
def get_imports():
    """Yield the root package name of every module or class bound in globals().

    For a module the name is the first component of ``__name__``; for a class
    it is the first component of ``__module__``. Two import names are mapped
    to their distribution names (``PIL`` -> ``Pillow``, ``sklearn`` ->
    ``scikit-learn``). Names may be yielded more than once.

    Globals that are neither modules nor classes (plain variables, functions,
    ...) are skipped; previously their own variable names were yielded, which
    could match an unrelated installed package by accident.
    """
    # Iterate over a snapshot: this is a generator, so the caller's code runs
    # between yields and may add globals while iteration is still in progress.
    for name, val in list(globals().items()):
        if isinstance(val, types.ModuleType):
            # Split ensures you get root package, 
            # not just imported function
            name = val.__name__.split(".")[0]

        elif isinstance(val, type):
            name = val.__module__.split(".")[0]

        else:
            continue

        # Some packages are weird and have different
        # imported names vs. system names
        if name == "PIL":
            name = "Pillow"
        elif name == "sklearn":
            name = "scikit-learn"

        yield name
# De-duplicated root package names referenced from the notebook's globals.
imports = list(set(get_imports()))

# (project name, version) of every installed distribution whose project name
# matches one of those packages; pip itself is left out.
requirements = [
    (dist.project_name, dist.version)
    for dist in pkg_resources.working_set
    if dist.project_name in imports and dist.project_name != "pip"
]

# Print in requirements.txt style, e.g. "spacy>=2.3.2".
for requirement in requirements:
    print("{}>={}".format(*requirement))

spacy>=2.3.2
nltk>=3.2.4
