In [3]:
import xml.etree.ElementTree as et, os
import collections.abc       as abc
import pandas                as pd
import qgrid

from os.path                 import join
from framenet.util           import flatten, curry, flatmap, take
from framenet.builder        import build
from glob                    import iglob
from collections             import OrderedDict
from typing                  import Callable, List, Sequence
from multipledispatch        import dispatch
from framenet.util           import cata
from pprint                  import pformat, pprint
from framenet.ecg.generation import (root_for, base_for, Tree, 
                                     T, unstack, 
                                     EtTree, TestTree as TT)

def tagify(tag, item):
    """Qualify a bare 'ID' attribute name with its tag.

    Returns '<tag>ID' when `item` is exactly the string 'ID'; any other
    `item` is returned unchanged.
    """
    if item == 'ID':
        return '%sID' % tag
    return item

# Small hand-written nested list (scalars followed by nested lists).
# NOTE(review): not referenced by any other visible cell -- presumably a
# scratch fixture for experimenting with flatten/unstack; confirm before removing.
t = [1, ['a', 'b', 'c'],
     2, [3, ['d', 'e']],
     4, [5, [6, ['f']]]]

# The ICSI namespace we use below, as in 'fn:sentence'
fn = {'fn': 'http://framenet.icsi.berkeley.edu'}

# Object for the 'lu10' lexical unit; it must support ElementTree-style
# findall(), since that is how it is queried on the next line.
lu10  = root_for['lu']('lu10')
# Every <sentence> element anywhere below lu10 (despite the name `asets`,
# the XPath selects sentences).
asets = lu10.findall('.//fn:sentence', fn)

In [4]:
def _tag(s):
    return s[s.rindex('}') + 1:]

# Preview the attributes of the first five <sentence> elements, one row each,
# with columns named '<tag>.<attribute>'. Slice before building the dicts so
# only the five displayed rows are constructed.
pd.DataFrame([{'%s.%s' % (_tag(a.tag), k): v for k, v in a.items()} for a in asets[:5]])

Unnamed: 0,sentence.ID,sentence.aPos,sentence.sentNo
0,692522,69015207,0
1,692525,25551231,0
2,692526,62444254,0
3,692527,49421526,0
4,692533,22279664,0


In [3]:
@dispatch(abc.Callable, Tree)
def unstack2(special, tree):
    """Unstack a single tree into a list of rows (lists of node values).

    A leaf yields one row holding just its own value. Otherwise this node's
    value is prepended to every row produced by its children:

    * if `special` is truthy and `special(first_child)` holds, the children
      are unstacked together as one sequence (the Sequence overload);
    * otherwise each child is unstacked on its own and all rows are pooled.
    """
    children = tree.children()
    if not children:
        # Leaf node: a single one-element row.
        return [[tree.value()]]

    first, *_ = children
    if special and special(first):
        # Hand the whole child list to the sequence overload.
        groups = [unstack2(special, children)]
    else:
        groups = [unstack2(special, child) for child in children]

    return [[tree.value()] + row for rows in groups for row in rows]
  

@dispatch(abc.Callable, abc.Sequence)
def unstack2(special, trees):
    """Unstack a sequence of sibling trees into combined rows.

    Every row of the first tree is concatenated with every row obtained from
    the remaining trees (their rows are pooled together, not multiplied with
    one another). An empty sequence yields no rows.

    NOTE(review): when `trees` holds a single tree there are no "remaining"
    rows, so the result is empty and the first tree's rows are dropped.
    Confirm whether that is intended.
    """
    if not trees:
        return []

    first, *others = trees
    # Materialise once: the pooled rows are re-iterated for every row of
    # `first`. Other cells wrap flatmap() results in list(), which suggests
    # it returns a one-shot iterator; without list() only the first outer
    # row would be paired.
    tails = list(flatmap(lambda sub: unstack2(special, sub), others))
    return [head + tail for head in unstack2(special, first) for tail in tails]

# Unstacking one XML file

In [5]:
# NOTE: this cell repeats the setup of the first cell (`fn`, `lu10`);
# `lu10_sentences` is the same query that the first cell stores in `asets`.

# The ICSI namespace we use below, as in 'fn:sentence'
fn = {'fn': 'http://framenet.icsi.berkeley.edu'}

# This creates a loader for file 'lu10.xml' from the 'lu' folder 
lu10           = root_for['lu']('lu10')

# Find all <sentence ...> tags
lu10_sentences = lu10.findall('.//fn:sentence', fn)

def unstack_one(elem):
    """Unstack the whole XML subtree rooted at `elem`.

    Wraps `elem` in an EtTree, unstacks it, and turns each unstacked row into
    a dict (flattening the row into key/value pairs first). Returns the list
    of those dicts.
    """
    # NOTE(review): the meaning of the leading 2 is defined by `unstack`; confirm there.
    rows = unstack(2, EtTree(elem))
    return [dict(flatten(row)) for row in rows]

# One record per unstacked row of every <sentence> in lu10. Use the
# `lu10_sentences` found in this cell rather than `asets` left over from an
# earlier cell, and materialise the records as the later cells do.
lu10_df = pd.DataFrame(list(flatmap(unstack_one, lu10_sentences)))
# qgrid.nbinstall(overwrite=True)

In [20]:
lu10_df['label.feID'].dtype

dtype('float64')

In [10]:
qgrid.show_grid(lu10_df, grid_options={'forceFitColumns': False, 'defaultColumnWidth': 100})

## Now, let's define a function that does the above:

Concretely, given an XML element, return a list of dictionaries. We need the list of dictionaries to finally create the `DataFrame` object.

In [6]:
def unstack_all(elements):
    """Apply `unstack_one` to each element and chain the resulting records.

    Returns whatever `flatmap` returns; callers in this notebook wrap the
    result in list() before building a DataFrame.
    """
    return flatmap(unstack_one, elements)

## Next:

1. Gather all the LU files by creating loaders for each one;
1. Extract the <sentence...> elements;
1. Apply the above to each one;
1. There's no 4!!!

In [12]:
from glob    import iglob 
from os.path import join

# One root per '*.xml' file under the LU base folder. `map` and `iglob` are
# lazy, so root_for['lu'] is only called as `lu_roots` is iterated, and
# `lu_roots` can be consumed only once.
# NOTE(review): here root_for['lu'] receives full paths from iglob, whereas
# earlier it was called with the bare name 'lu10' -- confirm it accepts both.
lu_roots = map(root_for['lu'], iglob(join(base_for['lu'], '*.xml')))
# All <sentence> elements of all LU files, flattened into one stream.
lu_sents = flatten(r.findall('.//fn:sentence', fn) for r in lu_roots)

In [13]:
# Warning: this will take a long time!
lu_df = pd.DataFrame(list(unstack_all(lu_sents)))

In [7]:
# Save to `lu.pkl` (pickle format) in the current directory
lu_df.to_pickle('lu.pkl')

NameError: name 'lu_df' is not defined

In [23]:
lu_df['label.start'].dtype

dtype('float64')

## Let's do the same for the full text annotations

In [41]:
# Same pipeline as for the LU files, applied to the full-text annotation
# folder. `map` and `iglob` are lazy, so `ft_roots` can be consumed only once.
# NOTE(review): root_for['fulltext'] receives full paths from iglob here --
# confirm that is the argument form it expects.
ft_roots = map(root_for['fulltext'], iglob(join(base_for['fulltext'], '*.xml')))
# All <sentence> elements of all full-text files, flattened into one stream.
ft_sents = flatten(r.findall('.//fn:sentence', fn) for r in ft_roots)

In [42]:
ft_df = pd.DataFrame(list(unstack_all(ft_sents)))

In [37]:
pd.options.display.max_rows = 99
ft_df.to_pickle('fulltext.pkl')

In [48]:
ft_df['label.start'].dtype

dtype('O')

## TODO

1. function :: Frame -> Core FEs
1. Use them to select records

In [197]:
# Read lu_df back in
# (pickle files can execute code on load -- only read files this notebook wrote)
lu_df = pd.read_pickle('lu.pkl')

# Read FN data in
# NOTE(review): this rebinds `fn`, which the earlier cells use as the XML
# namespace mapping passed to findall(). Re-running those cells after this
# one will hand them this object instead; a distinct name would avoid that.
fn, fnb = build()

In [198]:
# Look up the Cause_motion frame and split its frame-element IDs by core type.
Cause_motion   = fn.get_frame('Cause_motion')
# IDs (as ints) of elements whose coreType is exactly 'Core' ...
cm_core_fe_ids = [int(e.ID) for e in Cause_motion.elements if e.coreType == 'Core']
# ... and of every other element (any coreType other than 'Core').
cm_nc_fe_ids   = [int(e.ID) for e in Cause_motion.elements if e.coreType != 'Core']

In [202]:
len(Cause_motion.lexicalUnits)

38

In [10]:
cm_core_fe_ids

[228, 229, 230, 231, 232, 234, 4991, 5532, 7031]

In [11]:
# LUs for Cause_motion
fnb.build_lus_for_frame('Cause_motion', fn)

In [12]:
# Pick annotation IDs for Cause_motion
cm_aset_ids  = [int(ann.ID) for ann in Cause_motion.annotations]

In [139]:
# Keep the rows that (a) belong to one of the Cause_motion annotation sets and
# (b) do not carry a non-core FE id. Rows with no feID at all (e.g. the GF/PT
# layer rows shown further down) are not in the non-core list, so they are kept.
cm_df = lu_df.loc[
    lu_df['annotationSet.ID'].isin(cm_aset_ids) 
    & ~ lu_df['label.feID'].isin(cm_nc_fe_ids)
]
# Number of selected records.
len(cm_df)

10689

In [140]:
qgrid.show_grid(cm_df, grid_options={'forceFitColumns': False, 'defaultColumnWidth': 100})

### Note: Let's verify that none of the records in cm_df involve non-core FEs!

In [119]:
assert len(cm_df[cm_df['label.feID'].isin(cm_nc_fe_ids)]) == 0

In [182]:
# Append one-hot indicator columns for `label.name` to cm_df, side by side
# (axis=1). With prefix_sep=':' the new columns are named
# 'label.name:<value>', e.g. 'label.name:Ext'.
cm_ext_df = pd.concat([cm_df, 
                       pd.get_dummies(cm_df.loc[:, ['label.name']], prefix_sep=':')], 
                      axis=1)

In [183]:
pd.options.display.max_rows=500

cols1 = ['sentence.ID', 'label.start', 'label.itype', 'label.name', 'label.feID', 'layer.name']
cols2 = ['sentence.ID', 'label.start', 'label.name', 'layer.name']
cols3 = ['sentence.ID', 'label.start', 'label.name', 
         'label.name:Ext', 'label.name:Dep', 'label.name:Obj']
cols4 = ['sentence.ID', 'label.start', 
         'label.name:Ext', 'label.name:Dep', 'label.name:Obj']

In [196]:
gs = cm_ext_df.loc[:, cols3].groupby(cols3[:3])
gs.sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,label.name:Ext,label.name:Dep,label.name:Obj
sentence.ID,label.start,label.name,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
580494,0.0,Agent,0,0,0
580494,0.0,Ext,1,0,0
580494,0.0,NP,0,0,0
580494,2.0,Target,0,0,0
580494,13.0,NP,0,0,0
580494,13.0,Obj,0,0,1
580494,13.0,Theme,0,0,0
580494,29.0,Dep,0,1,0
580494,29.0,PP,0,0,0
580494,29.0,Path,0,0,0


In [None]:
for n, g in gs:
    print(n, g)

In [156]:
# Group the Cause_motion records by label start offset and label name, and
# show the first itype / feID / layer seen in each group (blank where missing).
# `cols` was never defined in this notebook; the columns shown in the output
# are those of `cols1` without the leading 'sentence.ID'.
gs2 = cm_df.loc[:, cols1[1:]].groupby(['label.start', 'label.name'])
gs2.first().fillna('')

Unnamed: 0_level_0,Unnamed: 1_level_0,label.itype,label.feID,layer.name
label.start,label.name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.0,Agent,,228,FE
0.0,Ant,,,Other
0.0,Cause,,4991,FE
0.0,Dep,,,GF
0.0,Ext,,,GF
0.0,Idiom,,,Sent
0.0,Metaphor,,,Sent
0.0,NP,,,PT
0.0,Obj,,,GF
0.0,PP,,,PT
