In [12]:
import xml.etree.ElementTree as et, os
import collections.abc       as abc
import pandas                as pd
import qgrid

from os.path                 import join
from framenet.util           import flatten, curry, flatmap, take
from framenet.builder        import build
from glob                    import iglob
from collections             import OrderedDict
from typing                  import Callable, List, Sequence
from multipledispatch        import dispatch
from framenet.util           import cata
from pprint                  import pformat, pprint
from framenet.ecg.generation import (root_for, base_for, Tree, 
                                     T, unstack, 
                                     EtTree, TestTree as TT)

# Pandas display limits: show up to 99 rows and 199 columns.
pd.options.display.max_rows    = 99
pd.options.display.max_columns = 199


def tagify(tag, item):
    """Qualify a bare ``'ID'`` attribute name with the tag it belongs to.

    Args:
        tag:  prefix to prepend (formatted with ``%s``).
        item: attribute name to inspect.

    Returns:
        ``'<tag>ID'`` when `item` is exactly ``'ID'``; otherwise `item`
        itself, unchanged.
    """
    return '%sID' % tag if item == 'ID' else item


# The ICSI namespace we use below, as in 'fn:sentence'
fn = {'fn': 'http://framenet.icsi.berkeley.edu'}

# Load the 'lu10' document through the project's `root_for['lu']` helper
# (presumably the parsed XML root of lu10.xml — confirm in
# framenet.ecg.generation).
lu10  = root_for['lu']('lu10')
# Every <sentence> element found below it, resolved in the `fn` namespace.
asets = lu10.findall('.//fn:sentence', fn)

# Test: unstacking one XML file

In [15]:
# The ICSI namespace we use below, as in 'fn:sentence'.
# (Repeats the definition from the first cell so this cell runs on its own.)
fn = {'fn': 'http://framenet.icsi.berkeley.edu'}

# Load 'lu10' via the project's `root_for['lu']` helper.
# NOTE(review): presumably this parses 'lu10.xml' from the 'lu' folder and
# returns its root element — confirm in framenet.ecg.generation.
lu10           = root_for['lu']('lu10')

# Find all <sentence ...> elements below it, in the `fn` namespace
lu10_sentences = lu10.findall('.//fn:sentence', fn)

def unstack_one(elem):
    """Turn the XML subtree rooted at `elem` into a list of flat dicts.

    `elem` is wrapped in an `EtTree` and passed to `unstack` (with 2 as the
    first argument); each item that comes back is run through `flatten` and
    converted to a dict.
    """
    records = []
    for values in unstack(2, EtTree(elem)):
        records.append(dict(flatten(values)))
    return records

# Build the frame from the sentences found in this cell (`lu10_sentences`)
# instead of `asets`, which only exists when the first cell has been run.
# `list(...)` materialises the `flatmap` result before pandas sees it, the
# same way the full LU and full-text frames are built further down.
lu10_df = pd.DataFrame(list(flatmap(unstack_one, lu10_sentences)))

# One-time qgrid setup, kept for reference (uncomment if needed):
# qgrid.nbinstall(overwrite=True)

In [16]:
# Interactive grid view of the lu10 frame: fixed 100-wide columns, no
# forced fit to the available width.
qgrid.show_grid(
    lu10_df,
    grid_options={
        'forceFitColumns': False,
        'defaultColumnWidth': 100,
    },
)

## Now, let's define a function that does the above:

Concretely, given a collection of XML elements, unstack each one and chain the resulting dictionaries into a single sequence. We need these dictionaries to finally create the `DataFrame` object.

In [17]:
def unstack_all(elements):
    """Apply `unstack_one` to every element and chain the results.

    Args:
        elements: iterable of XML elements accepted by `unstack_one`.

    Returns:
        Whatever `flatmap(unstack_one, elements)` returns.
        NOTE(review): presumably a lazy iterable of dicts — callers in this
        notebook wrap it in `list(...)` before building a DataFrame.
    """
    return flatmap(unstack_one, elements)

## Next:

1. Gather all the LU files by creating loaders for each one;
1. Extract the `<sentence ...>` elements;
1. Apply the above to each one;
1. There's no 4!!!

In [18]:
from glob    import iglob 
from os.path import join

# One root per LU file. Paths are sorted because glob order depends on the
# file system; sorting keeps the row order of the resulting frame the same
# on every run. `map` stays lazy, so files are only loaded when consumed.
# NOTE(review): `root_for['lu']` is called with a bare name ('lu10') earlier
# but with full '*.xml' paths here — confirm it accepts both.
lu_roots = map(root_for['lu'], sorted(iglob(join(base_for['lu'], '*.xml'))))

# All <sentence> elements of all LU files.
# NOTE(review): `fn` must still be the namespace dict here; a later cell
# rebinds `fn` to a value returned by `build()`, so run this cell first.
lu_sents = flatten(r.findall('.//fn:sentence', fn) for r in lu_roots)

In [13]:
# Warning: building this frame takes a long time, and only needs to be done
# once. If `lu.pkl` already exists in the current directory it is loaded
# instead of being rebuilt; delete the file to force a fresh build.
if os.path.exists('lu.pkl'):
    lu_df = pd.read_pickle('lu.pkl')
else:
    # Materialise every unstacked record before handing them to pandas.
    lu_df = pd.DataFrame(list(unstack_all(lu_sents)))

    # Save to `lu.pkl` in the current directory
    lu_df.to_pickle('lu.pkl')

## Let's do the same for the full text annotations

In [41]:
# One root per full-text annotation file. Paths are sorted because glob order
# depends on the file system; sorting keeps the row order of the resulting
# frame the same on every run. `map` stays lazy.
ft_roots = map(root_for['fulltext'], sorted(iglob(join(base_for['fulltext'], '*.xml'))))

# All <sentence> elements of all full-text files.
# NOTE(review): `fn` must still be the namespace dict here; a later cell
# rebinds `fn` to a value returned by `build()`, so run this cell first.
ft_sents = flatten(r.findall('.//fn:sentence', fn) for r in ft_roots)

In [42]:
# Collect every unstacked full-text record into a list, then build the frame
# from that list in one go.
ft_df = pd.DataFrame(data=[*unstack_all(ft_sents)])

In [37]:
# Same as above: only need to do this once.
# Pickles `ft_df` to `fulltext.pkl` (relative path, i.e. the current
# working directory).
ft_df.to_pickle('fulltext.pkl')

## TODO

1. function :: Frame -> Core FEs
1. Use them to select records

In [19]:
# Read lu_df back in from the `lu.pkl` pickle written earlier
lu_df = pd.read_pickle('lu.pkl')

# Read FN data in.
# NOTE(review): this rebinds `fn`, which the earlier cells use as the XML
# namespace dict passed to `findall(...)`. Re-running those cells after this
# one hands `findall` the wrong object; consider a distinct name for one of
# the two.
fn, fnb = build()

In [20]:
# Look up the 'Cause_motion' frame in the FrameNet data.
Cause_motion = fn.get_frame('Cause_motion')

# Integer IDs of the frame's elements, split by whether coreType is 'Core'.
cm_core_fe_ids = [
    int(fe.ID)
    for fe in Cause_motion.elements
    if fe.coreType == 'Core'
]
cm_nc_fe_ids = [
    int(fe.ID)
    for fe in Cause_motion.elements
    if not fe.coreType == 'Core'
]

In [21]:
# Number of entries in the frame's `lexicalUnits`.
# NOTE(review): evaluated before `build_lus_for_frame` is called below —
# confirm the collection is already complete at this point.
len(Cause_motion.lexicalUnits)

38

In [22]:
# Display the IDs of the Cause_motion elements whose coreType is 'Core'.
cm_core_fe_ids

[228, 229, 230, 231, 232, 234, 4991, 5532, 7031]

In [23]:
# LUs for Cause_motion: have `fnb` (the second value returned by `build()`)
# build the lexical units of the 'Cause_motion' frame against `fn`.
# NOTE(review): presumably this is what populates `Cause_motion.annotations`
# read in the next cell — confirm in framenet.builder.
fnb.build_lus_for_frame('Cause_motion', fn)

In [24]:
# Integer IDs of every annotation attached to the Cause_motion frame.
cm_aset_ids = [
    int(annotation.ID)
    for annotation in Cause_motion.annotations
]

In [25]:
# Rows whose annotation set belongs to Cause_motion ...
in_cause_motion = lu_df['annotationSet.ID'].isin(cm_aset_ids)
# ... and rows whose label refers to one of the frame's non-core elements.
is_noncore_fe = lu_df['label.feID'].isin(cm_nc_fe_ids)

# Keep the Cause_motion rows, dropping the non-core-element ones.
cm_df = lu_df.loc[in_cause_motion & ~is_noncore_fe]

# Row count, shown as the cell's output.
len(cm_df)

10689

In [26]:
# Interactive grid view of the Cause_motion rows: fixed 100-wide columns, no
# forced fit to the available width.
qgrid.show_grid(
    cm_df,
    grid_options={
        'forceFitColumns': False,
        'defaultColumnWidth': 100,
    },
)

### Note: Let's verify that none of the records in `cm_df` involve non-core FEs!

In [27]:
# No row of cm_df may carry a non-core frame-element ID.
assert not cm_df['label.feID'].isin(cm_nc_fe_ids).any()

In [28]:
# One indicator column per distinct 'label.name' value, named
# 'label.name:<value>'.
label_dummies = pd.get_dummies(cm_df[['label.name']], prefix_sep=':')

# Original columns followed by the indicator columns, side by side.
cm_ext_df = pd.concat([cm_df, label_dummies], axis='columns')

In [29]:
# Raise the display limit to 500 rows from here on.
pd.options.display.max_rows = 500

# Column selections used to inspect `cm_ext_df`; they share a common prefix
# and, for cols3/cols4, the same three indicator columns.
_id_cols    = ['sentence.ID', 'label.start']
_dummy_cols = ['label.name:Ext', 'label.name:Dep', 'label.name:Obj']

cols1 = _id_cols + ['label.itype', 'label.name', 'label.feID', 'layer.name']
cols2 = _id_cols + ['label.name', 'layer.name']
cols3 = _id_cols + ['label.name'] + _dummy_cols
cols4 = _id_cols + _dummy_cols

In [30]:
# Group on the first three entries of cols3 ('sentence.ID', 'label.start',
# 'label.name') and sum the remaining indicator columns within each group.
group_keys = cols3[:3]
gs = cm_ext_df[cols3].groupby(group_keys)
gs.sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,label.name:Ext,label.name:Dep,label.name:Obj
sentence.ID,label.start,label.name,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
580494,0.0,Agent,0,0,0
580494,0.0,Ext,1,0,0
580494,0.0,NP,0,0,0
580494,2.0,Target,0,0,0
580494,13.0,NP,0,0,0
580494,13.0,Obj,0,0,1
580494,13.0,Theme,0,0,0
580494,29.0,Dep,0,1,0
580494,29.0,PP,0,0,0
580494,29.0,Path,0,0,0
