In [91]:
import xml.etree.ElementTree as et, os
import collections.abc       as abc
import pandas                as pd
import qgrid

from os.path                 import join
from framenet.util           import flatten, curry, flatmap, take
from framenet.builder        import build
from glob                    import iglob
from collections             import OrderedDict
from typing                  import Callable, List, Sequence
from multipledispatch        import dispatch
from framenet.util           import cata
from pprint                  import pformat, pprint
from framenet.ecg.generation import (root_for, base_for, Tree, T, 
                                     unstack, unstack_one, unstack_all,
                                     EtTree, TestTree as TT)

# For Pandas
pd.options.display.max_rows    = 99
pd.options.display.max_columns = 199

# tagify = lambda tag, item: '%sID' % tag if item == 'ID' else item

# The ICSI namespace we use below, as in 'fn:sentence'
fn = {'fn': 'http://framenet.icsi.berkeley.edu'}

lu10  = root_for['lu']('lu10')
asets = lu10.findall('.//fn:sentence', fn)

# Test: unstacking one XML file

In [92]:
# The ICSI namespace we use below, as in 'fn:sentence'
NS = {'fn': 'http://framenet.icsi.berkeley.edu'}

# This creates a loader for file 'lu10.xml' from the 'lu' folder 
lu10 = root_for['lu']('lu10')

# Find all <sentence ...> tags
lu10_sentences = lu10.findall('.//fn:sentence', fn)

lu10_df = pd.DataFrame(flatmap(unstack_one, asets))
# qgrid.nbinstall(overwrite=True)

In [93]:
qgrid.show_grid(lu10_df, grid_options={'forceFitColumns': False, 'defaultColumnWidth': 100})

## Now, let's define a function that does the above:

Concretely, given an XML element, return a list of dictionaries. We need the list of dictionaries to finally create the `DataFrame` object.

### Algorithm:

1. Gather all the LU files by creating loaders for each one;
1. Extract the <sentence...> elements;
1. Apply the above to each one;
1. There's no 4!!!

In [94]:
from glob    import iglob 
from os.path import join

lu_roots = map(root_for['lu'], iglob(join(base_for['lu'], '*.xml')))
lu_sents = lambda: flatten(r.findall('.//fn:sentence', NS) for r in lu_roots)

ft_roots = map(root_for['fulltext'], iglob(join(base_for['fulltext'], '*.xml')))
ft_sents = lambda: flatten(r.findall('.//fn:sentence', NS) for r in ft_roots)

In [95]:
# Warning: this will take a long time. Only need to do this once!

LU_PICKLE = 'lu.pkl'

if os.access(LU_PICKLE, os.R_OK):
    # Read lu_df back in
    %time lu_df = pd.read_pickle(LU_PICKLE)
else:
    # Save to a file in the current directory
    %time lu_df = pd.DataFrame(list(unstack_all(lu_sents())))
    lu_df.to_pickle(LU_PICKLE)

CPU times: user 2.96 s, sys: 1.94 s, total: 4.91 s
Wall time: 5.04 s


## Let's do the same for the full text annotations

In [96]:
# Same as above
FT_PICKLE = 'fulltext.pkl'

if os.access(FT_PICKLE, os.R_OK):
    # Read lu_df back in
    %time ft_df = pd.read_pickle(FT_PICKLE)
else:
    # Save to a file in the current directory
    %time ft_df = pd.DataFrame(list(unstack_all(ft_sents())))
    ft_df.to_pickle(FT_PICKLE)

CPU times: user 317 ms, sys: 291 ms, total: 608 ms
Wall time: 619 ms


## TODO

1. function :: Frame -> Core FEs
1. Use them to select records

In [97]:
# Read FN data in 
%time fn, fnb = build()

CPU times: user 31 µs, sys: 43 µs, total: 74 µs
Wall time: 78.9 µs


In [98]:
# Load Cause_motion
Cause_motion   = fn.get_frame('Cause_motion')
cm_core_fe_ids = set(int(e.ID) for e in Cause_motion.elements if e.coreType == 'Core')
cm_nc_fe_ids   = set(int(e.ID) for e in Cause_motion.elements if e.coreType != 'Core')

In [99]:
# Test
assert len(Cause_motion.lexicalUnits) == 38
assert cm_core_fe_ids == {228, 229, 230, 231, 232, 234, 4991, 5532, 7031}

In [100]:
# LUs for Cause_motion
fnb.build_lus_for_frame('Cause_motion', fn)

These lexical units have already been built.


In [101]:
# Pick annotation IDs for Cause_motion
cm_aset_ids  = set(int(ann.ID) for ann in Cause_motion.annotations)
assert len(cm_aset_ids) == 823

## Frame + ASet $\rightarrow$ DataFrame

In [86]:
from framenet.util import aget
from collections   import defaultdict

def lus_for(frame):
    "LUs for `frame` as a pd.DataFrame."
    cs = ('annotationSet.ID', 'annotationSet.LU')
    lu = aget('ID', 'lu')
    return pd.DataFrame([dict(zip(cs, lu(ann))) for ann in Cause_motion.annotations])
    

def select_by_annoset(df, aset_ids):
    return df.loc[df['annotationSet.ID'].isin(aset_ids)]



In [102]:
cm_df = select_by_annoset(lu_df, cm_aset_ids)

assert len(cm_df) == 10788

In [103]:
qgrid.show_grid(cm_df, grid_options={'forceFitColumns': False, 'defaultColumnWidth': 100})

In [104]:
cm_plus_df = pd.merge(cm_df, lu_for_cm, how='outer', on='annotationSet.ID')

In [110]:
# Set layer.name to effective Target
cm_plus_df.loc[
    cm_plus_df['label.name'] == 'Target', 
    'label.name'
] = cm_plus_df['annotationSet.LU']

In [111]:
cm_plus_df

Unnamed: 0,annotationSet.ID,annotationSet.status,label.end,label.feID,label.itype,label.name,label.start,layer.name,layer.rank,sentence.ID,sentence.aPos,sentence.corpID,sentence.docID,sentence.paragNo,sentence.sentNo,text.contents,annotationSet.LU
0,4530292,MANUAL,114.0,,,scoot.v,108.0,scoot.v,1,2824493,0,115.0,23013.0,23.0,4,"Cupboard doors were flying , the trash can in ...",scoot.v
1,4530292,MANUAL,106.0,228.0,,Agent,106.0,FE,1,2824493,0,115.0,23013.0,23.0,4,"Cupboard doors were flying , the trash can in ...",scoot.v
2,4530292,MANUAL,119.0,229.0,,Theme,116.0,FE,1,2824493,0,115.0,23013.0,23.0,4,"Cupboard doors were flying , the trash can in ...",scoot.v
3,4530292,MANUAL,136.0,232.0,,Goal,121.0,FE,1,2824493,0,115.0,23013.0,23.0,4,"Cupboard doors were flying , the trash can in ...",scoot.v
4,4530292,MANUAL,106.0,,,Ext,106.0,GF,1,2824493,0,115.0,23013.0,23.0,4,"Cupboard doors were flying , the trash can in ...",scoot.v
5,4530292,MANUAL,119.0,,,Obj,116.0,GF,1,2824493,0,115.0,23013.0,23.0,4,"Cupboard doors were flying , the trash can in ...",scoot.v
6,4530292,MANUAL,136.0,,,Dep,121.0,GF,1,2824493,0,115.0,23013.0,23.0,4,"Cupboard doors were flying , the trash can in ...",scoot.v
7,4530292,MANUAL,106.0,,,NP,106.0,PT,1,2824493,0,115.0,23013.0,23.0,4,"Cupboard doors were flying , the trash can in ...",scoot.v
8,4530292,MANUAL,119.0,,,NP,116.0,PT,1,2824493,0,115.0,23013.0,23.0,4,"Cupboard doors were flying , the trash can in ...",scoot.v
9,4530292,MANUAL,136.0,,,PP,121.0,PT,1,2824493,0,115.0,23013.0,23.0,4,"Cupboard doors were flying , the trash can in ...",scoot.v


In [107]:
by_sid_start = cm_plus_df.groupby(['sentence.ID', 'label.start'], as_index=True)

In [108]:
gs = [g for _, g in by_sid_start]
gs[1]

Unnamed: 0,annotationSet.ID,annotationSet.status,label.end,label.feID,label.itype,label.name,label.start,layer.name,layer.rank,sentence.ID,sentence.aPos,sentence.corpID,sentence.docID,sentence.paragNo,sentence.sentNo,text.contents,annotationSet.LU
1116,642838,MANUAL,11.0,,,Target,2.0,catapult.v,1,580494,92227906,,,,0,I catapulted the tiny beasts across the creek ...,catapult.v


In [83]:
gs[1].pivot(columns='label.name', values='layer.name')

label.name,Target
1116,catapult.v


In [84]:
gs[0].pivot(columns='label.name', values='layer.name')

label.name,Agent,Ext,NP
1102,FE,,
1106,,GF,
1110,,,PT


In [112]:
qgrid.show_grid(cm_plus_df, grid_options={'forceFitColumns': False, 'defaultColumnWidth': 100})

## Now, let's try to do the same without Pandas

In [113]:
import gzip

from framenet.util           import take, iget, groupby
from framenet.data           import annotation as ann
from pickle                  import dump, load
from pprint                  import pformat


@curry
def crit(aset, d):
    return d['annotationSet.ID'] in aset

In [114]:
LU_RECS = 'lu_recs'
lu_recs = ann.get_object(LU_RECS, unstack_all(lu_sents()))
len(lu_recs)

Reading pickle for lu_recs


7658278

## Cause_motion

In [124]:
CM_RECS  = 'cm_recs'
cm_asets = ann.annoset_for('Cause_motion')
cm_crit  = crit(cm_asets)
cm_recs  = ann.get_object(CM_RECS, filter(crit, lu_recs))
assert len(cm_recs) == 10788

Reading pickle for cm_recs


In [118]:
pd.options.display.max_rows=500

# cols1 = ['sentence.ID', 'label.start', 
#          'label.itype', 'label.name', 'label.feID', 'layer.name']

# cols2 = ['sentence.ID', 'label.start', 
#          'layer.name', 'label.name', 'label.feID']

# cols3 = ['sentence.ID', 'label.start', 'label.name', 
#          'label.name:Ext', 'label.name:Dep', 'label.name:Obj']

# cols4 = ['sentence.ID', 'label.start', 
#          'label.name:Ext', 'label.name:Dep', 'label.name:Obj']

cols5 = ['label.start', 'layer.name', 'label.name', 'label.itype']

In [119]:
layers            = ['GF', 'PT', 'FE', 'Target', 'Other']
# cm_recs_in_layers = filter(lambda d: d.get('layer.name', None) in layers, cm_recs)
select_cols1      = iget(*cols5, default=9999)
# group_key_cols    = iget(*cols1[:2], default=9999) 
sentence_id       = iget('sentence.ID') 
label_start       = iget('label.start', default=9999)

def lmap(f, it):
    return [f(i) for i in it]

def lmap_plus(f, it):
    xs = list(map(f, it))
    try:
        return [(n, dict(map(iget(1, 2), g))) for n, g in groupby(iget(0), xs)]
#         return [(n, list(g)) for n, g in groupby(iget(0), xs)]
    except TypeError:
        print('Exception:')
        pprint(xs)
        raise

grouped_by_sentence = groupby(sentence_id, cm_recs)
grouped_by_sentence_and_start = (
    (k, groupby(label_start, g)) 
    for k, g in grouped_by_sentence
)

In [120]:
# Let's see what's in there
grouped_and_groups = [(n, lmap_plus(select_cols1, g)) for n, g in grouped_by_sentence]
# grouped_and_groups = [(n, list(g)) for n, g in grouped_by_sentence]
print(len(grouped_and_groups))

821


In [121]:
from multipledispatch import dispatch
from collections      import Sequence, namedtuple
from framenet.util    import (compose, groupwise, unique, juxt, aget, 
                              reduceby, iget, remove)
from framenet.data.annotation import Pattern, Graph
from pprint           import pprint

gf_tgt = compose(iget('GF', 'Target', default=None), 
                 iget(1))

def gf_or_tgt(group):
    "Turn a group into a pattern."
    f, s = gf_tgt(group) 
    return f or s

# Test
g1 = [(0,  {'FE': 'Agent', 'GF': 'Ext', 'PT': 'NP'}),
      (2,  {'Target': 'Target'}),
      (13, {'FE': 'Theme', 'GF': 'Obj', 'PT': 'NP'}),
      (29, {'FE': 'Path',  'GF': 'Dep', 'PT': 'PP'}),
      (50, {'FE': 'Goal',  'GF': 'Dep', 'PT': 'PP'})]    


pattern = list(map(gf_or_tgt, g1))
pprint(pattern)
    
vertices = Ext, Target, Obj, Dep = ['Ext', 'Target', 'Obj', 'Dep']
edges    = ((Ext, [Target]),
            (Target, [Obj]), 
            (Obj, [Dep]),
            (Dep, [Dep, None]),
            (None, [None]))

matcher = Pattern(Graph(vertices=vertices, edges=edges))

@curry
def match(matcher, group):
    return matcher.match(list(map(gf_or_tgt, group)))

assert match(matcher, g1)

['Ext', 'Target', 'Obj', 'Dep', 'Dep']


In [122]:
groups       = [g for _, g in grouped_and_groups]
#################################################

# pprint(groups[:10])
groups_count = len(groups)
fe           = compose(iget('FE',       default=None), iget(1))
fe_gf        = compose(iget('FE', 'GF', default=None), iget(1))
fes          = [[fe(r) for r in g] for g in groups if match(matcher, g)]
fes_count    = len(fes)

# print(fes[:2])
print('Matching Ext T Obj Dep+ pattern:', len(fes), len(fes) / groups_count)

len([fe for fe in fes if fe[0] == 'Agent']) / fes_count

Matching Ext T Obj Dep+ pattern: 439 0.5347137637028014


0.979498861047836

In [127]:
nonmatching = [tuple(fe(r) for r in g) for g in groups if match(matcher, g)]

from collections import Counter

nm_conts = Counter(nonmatching)
nm_conts.most_common()

[(('Agent', None, 'Theme', 'Goal', None), 187),
 (('Agent', None, 'Theme', 'Path', None), 102),
 (('Agent', None, 'Theme', 'Source', None), 42),
 (('Agent', None, 'Theme', 'Result', None), 15),
 (('Agent', None, 'Theme', 'Manner', 'Goal', None), 11),
 (('Agent', None, 'Theme', 'Path', 'Goal', None), 11),
 (('Agent', None, 'Theme', 'Path', 'Path', None), 6),
 (('Agent', None, 'Theme', 'Manner', 'Path', None), 5),
 (('Agent', None, 'Theme', 'Area', None), 5),
 (('Agent', None, 'Theme', 'Source', 'Goal', None), 5),
 (('Cause', None, 'Theme', 'Goal', None), 4),
 (('Agent', None, 'Theme', 'Path', 'Distance', None), 3),
 (('Agent', None, 'Theme', 'Path', 'Manner', None), 3),
 (('Agent', None, 'Theme', 'Initial_state', None), 3),
 (('Agent', None, 'Goal', 'Theme', None), 2),
 (('Agent', None, 'Theme', 'Goal', 'Manner', None), 2),
 (('Agent', None, 'Theme', 'Manner', 'Goal'), 2),
 (('Agent', None, 'Theme', 'Source', 'Path', None), 2),
 (('Agent', None, 'Theme', 'Depictive', 'Goal', None), 2),


In [128]:
Node   = namedtuple('Node', 'id FE GF')
Link   = namedtuple('Link', 'source target')

layers = [remove(lambda x: x == (None, None), [fe_gf(r) for r in g]) 
          for g in groups if match(g)]
nodes  = lambda group: [Node('%d:%s' % (i, fe), str(fe), str(gf)) 
                        for i, (fe, gf) in enumerate(group)]
links  = lambda node:  [Link(*pair) for pair in groupwise(2, node)] 
nss    = map(nodes, layers)

def to_csv(link_and_count):
    link, count = link_and_count
    return (link.source.id, link.source.FE, link.source.GF,
            link.target.id, link.target.FE, link.target.GF, 
            count)

def cols(link):
    return ['%s_%s' % (st, k) 
            for st, n in zip(('source', 'target'), link)
            for k in n._asdict().keys()]

assert cols(Link(Node('id', 'fe', 'gf'), Node('id2', 'fe2', 'gf2'))) == [
    'source_id', 'source_FE', 'source_GF',
    'target_id', 'target_FE', 'target_GF'
]

# list(take(10, flatmap(links, map(nodes, layers)))
layers[:10]

[[('Agent', 'Ext'), ('Theme', 'Obj'), ('Path', 'Dep'), ('Goal', 'Dep')],
 [('Agent', 'Ext'), ('Theme', 'Obj'), ('Goal', 'Dep')],
 [('Agent', 'Ext'), ('Agent', 'Ext'), ('Theme', 'Obj'), ('Goal', 'Dep')],
 [('Agent', 'Ext'), ('Agent', 'Ext'), ('Theme', 'Obj'), ('Goal', 'Dep')],
 [('Theme', 'Ext'), ('Source', 'Dep'), ('Agent', None)],
 [('Theme', 'Ext'), ('Goal', 'Dep'), ('Agent', None)],
 [('Theme', 'Ext'), ('Goal', 'Dep'), ('Agent', None)],
 [('Theme', 'Ext'), ('Goal', 'Dep'), ('Goal', 'Dep'), ('Agent', None)],
 [('Theme', 'Ext'), ('Source', 'Dep'), ('Goal', 'Dep'), ('Agent', None)],
 [('Theme', 'Ext'), ('Source', 'Dep'), ('Agent', None)]]

In [101]:
from framenet.data.annotation import Pattern, Graph

def normalize(group, normalizer):
    "Normalize group by applying `normalizer` to each key."
    
def as_layers(groups):
    vertices     = Ext, T, Dep = ['Ext', 'Target', 'Dep']
    ext_dep      = Pattern(Graph(vertices, ((Ext, [T]), 
                                            (T, [Dep]), 
                                            (Dep, [Dep, None]), 
                                            (None, [None]))))
    match2       = ext_dep.match
    non_m_layers = [remove(lambda x: x == (None, None), [fe_gf(r) for r in g])
                    for g in groups 
                    if not match2([gf_or_tgt(r) for r in g]) and not match(g)]
    return non_m_layers

In [106]:
print(len(as_layers(groups)))
ls2 = as_layers(groups)
ls2[:10]

201


[[('Agent', 'Ext'), ('Agent', 'Ext'), ('Theme', 'Obj'), ('Goal', 'Dep')],
 [('Agent', 'Ext'), ('Agent', 'Ext'), ('Theme', 'Obj'), ('Goal', 'Dep')],
 [('Theme', 'Obj'), ('Goal', 'Dep'), ('Agent', None)],
 [('Goal', 'Obj'), ('Theme', 'Dep'), ('Agent', None)],
 [('Goal', 'Obj'), ('Theme', 'Dep'), ('Agent', None)],
 [('Theme', 'Obj'), ('Goal', 'Dep'), ('Agent', None)],
 [('Agent', 'Ext'), ('Theme', 'Obj'), ('Goal', None)],
 [('Agent', 'Ext'), ('Theme', 'Obj'), ('Goal', None)],
 [('Agent', 'Ext'), ('Theme', 'Obj'), ('Goal', None)],
 [('Agent', 'Ext'), ('Theme', 'Obj'), ('Manner', 'Dep'), ('Goal', None)]]

In [108]:
nonm1 = Counter(map(tuple, ls2))
nonm1.most_common()

[((('Agent', 'Ext'), ('Goal', 'Dep'), ('Theme', 'Obj')), 29),
 ((('Theme', 'Obj'), ('Goal', 'Dep'), ('Agent', None)), 14),
 ((('Agent', 'Ext'), ('Source', 'Dep'), ('Theme', 'Obj')), 13),
 ((('Agent', 'Ext'), ('Theme', 'Obj'), ('Goal', None)), 10),
 ((('Goal', 'Dep'), ('Theme', 'Obj'), ('Agent', None)), 8),
 ((('Agent', 'Ext'), ('Path', 'Dep'), ('Theme', 'Obj')), 8),
 ((('Theme', 'Obj'), ('Path', 'Dep'), ('Agent', None)), 7),
 ((('Theme', 'Obj'), ('Theme', 'Obj'), ('Agent', 'Ext'), ('Goal', 'Dep')), 6),
 ((('Agent', 'Ext'), ('Agent', 'Ext'), ('Theme', 'Obj'), ('Path', 'Dep')), 5),
 ((('Theme', 'Obj'), ('Source', 'Dep'), ('Agent', None)), 5),
 ((('Theme', 'Obj'), ('Agent', 'Ext'), ('Goal', 'Dep')), 4),
 ((('Goal', 'Dep'), ('Theme', None)), 4),
 ((('Theme', 'Ext'), ('Theme', 'Ext'), ('Path', 'Dep'), ('Agent', None)), 3),
 ((('Agent', 'Ext'), ('Agent', 'Ext'), ('Theme', 'Obj'), ('Goal', 'Dep')), 3),
 ((('Goal', 'Obj'), ('Theme', 'Dep'), ('Agent', None)), 3),
 ((('Agent', 'Ext'), ('Theme', 

In [91]:
def write_csv(fname, layers, base='.'):
    path = join(base, '%s.csv' % fname)
    with open(path, 'w+') as sout:
        ident           = lambda x: x
        count           = lambda ys, _: ys + 1
        link_and_counts = reduceby(ident, 0, count, flatmap(links, map(nodes, layers)))
        for i, lc in enumerate(link_and_counts):
            if i == 0:
                print(','.join( cols(lc[0]) + ['count']), file=sout)
            print(','.join(str(lc) for lc in to_csv(lc)), file=sout)
        print('Written %d records.' % i)

In [92]:
write_csv('ExtTDep+None', as_layers(groups))

Written 68 records.


In [None]:
json = {
    'nodes': list(map(dict, unique(flatmap(nodes, layers)))),
    'links': list(flatmap(links, map(nodes, layers)))
}
len(json['links'])

In [None]:
import json
with open('sankey.json', 'w+b') as sout:
    json.dump(json, sout)

## To do:

* Remove non-core stuff!
* include 
    - CNI
    - Target
* and modify the pattern matcher to select on those
* also: add `Other` (as `layer.name`)
* add `sentence.ID`
* pattern counts, with freq of freq.
    - and histograms
* other frames: `Placing`, `Cause_fluidic_motion`