In [1]:
import itertools
import json
import math
from itertools import combinations
from nltk import Tree
from parseq.datasets import CFQDatasetLoader
from parseq.grammar import taglisp_to_tree
from tqdm import tqdm

In [2]:
# Load the "mcd1/modent" variant of CFQ. The rest of the notebook indexes every
# example as (question, taglisp query string, split name).
ds = CFQDatasetLoader().load("mcd1/modent")

CFQDatasetLoader: make data
CFQDatasetLoader: make data in 0.0 sec
loading split 'mcd1'
splitting off a random 10% of 'train' for 'iidvalid' using seed 42


KeyboardInterrupt: 

In [3]:
# Show two sample examples (one indexed from the front, one from the back),
# each followed by the tree parsed from its query string.
for idx in (11, -21):
    print(ds[idx])
    print(taglisp_to_tree(ds[idx][1]))

('Which character was influenced by a writer of M1', '(@R@ (@QUERY (@SELECT distinct ?x0 ) (@WHERE (@COND ?x0 a ns:fictional_universe.fictional_character ) (@COND ?x0 ns:influence.influence_node.influenced_by ?x1 ) (@COND ?x1 ns:film.writer.film m1 ) ) ) )', 'train')
(@R@
  (@QUERY
    (@SELECT (distinct ) (?x0 ))
    (@WHERE
      (@COND
        (?x0 )
        (a )
        (ns:fictional_universe.fictional_character ))
      (@COND
        (?x0 )
        (ns:influence.influence_node.influenced_by )
        (?x1 ))
      (@COND (?x1 ) (ns:film.writer.film ) (m1 )))))
('Who was a executive producer , producer , director , art director , star , writer , and editor of M1', '(@R@ (@QUERY (@SELECT distinct ?x0 ) (@WHERE (@COND ?x0 a ns:people.person ) (@COND ?x0 ns:film.actor.film/ns:film.performance.film m1 ) (@COND ?x0 ns:film.director.film m1 ) (@COND ?x0 ns:film.editor.film m1 ) (@COND ?x0 ns:film.film_art_director.films_art_directed m1 ) (@COND ?x0 ns:film.producer.films_executive_produ

In [140]:
# Print the parsed query trees of the first four examples.
# islice replaces the hand-rolled counter/break and no longer leaves a stray
# global `c` behind in the notebook namespace.
for ex in itertools.islice(ds, 4):
    print(taglisp_to_tree(ex[1]))


(@R@
  (@QUERY
    (@SELECT (count (* )))
    (@WHERE
      (filter (m0 ) (!= ) (m2 ))
      (filter (m0 ) (!= ) (m3 ))
      (@COND (m0 ) (a ) (ns:film.film_art_director ))
      (@COND
        (m0 )
        (@OR
          (ns:people.person.spouse_s/ns:people.marriage.spouse )
          (ns:fictional_universe.fictional_character.married_to/ns:fictional_universe.marriage_of_fictional_characters.spouses))
        (m2 ))
      (@COND
        (m0 )
        (@OR
          (ns:people.person.spouse_s/ns:people.marriage.spouse )
          (ns:fictional_universe.fictional_character.married_to/ns:fictional_universe.marriage_of_fictional_characters.spouses))
        (m3 )))))
(@R@
  (@QUERY
    (@SELECT (distinct ) (?x0 ))
    (@WHERE
      (@COND (?x0 ) (ns:film.film.directed_by ) (?x1 ))
      (@COND (?x0 ) (ns:film.film.edited_by ) (?x1 ))
      (@COND
        (?x0 )
        (@OR
          (ns:film.film.produced_by )
          (ns:film.film.production_companies ))
        (?x1 ))
      (@COND

In [116]:
class DivergenceComputer():
    """Computes divergences between the feature distributions of dataset splits.

    Every example is read as ``example[1]`` (a taglisp string that is parsed with
    ``taglisp_to_tree``) and ``example[2]`` (the key under which its counts are
    grouped, e.g. 'train' or 'test' in the recorded runs).

    Three kinds of features are supported:

    * atoms: one signature string per tree node,
    * compounds: one string per parent-child edge,
    * co-occurrences: one string per ordered pair of atoms of the same tree.

    The divergence between two splits is ``1 - C`` where ``C`` is the Chernoff
    coefficient of their normalized feature distributions.
    """

    # Labels whose children are described with position-free "ARG" slots.
    orderless = {"@QUERY", "@AND", "@OR", "@WHERE"}
    # Orderless labels whose child list is abbreviated to a single "ARG*" slot.
    variablesize = {"@AND", "@OR", "@WHERE"}

    def __init__(self, verbose=False):
        """:param verbose: when True, the divergence methods print every pair of subsets they compare."""
        super(DivergenceComputer, self).__init__()
        self.verbose = verbose

    def _extract_atom(self, x: "Tree"):
        """Return the signature string of a single node: its label plus one slot per child.

        Slots are "ARG*" for labels in ``variablesize``, a plain "ARG" per child for the
        other ``orderless`` labels, and "ARG1 ARG2 ..." for all remaining labels.
        A leaf has no slots and is rendered as "(label )".
        """
        if x.label() in self.orderless:
            if x.label() in self.variablesize:
                childstr = "ARG*"
            else:
                childstr = " ".join(["ARG" for _ in range(len(x))])
        else:
            childstr = " ".join([f"ARG{i+1}" for i in range(len(x))])
        return f"({x.label()} {childstr})"

    def extract_atoms(self, x: "Tree"):
        """Return the atoms of all nodes of ``x`` in post-order (children before their parent)."""
        ret = []
        for child in x:
            ret.extend(self.extract_atoms(child))
        ret.append(self._extract_atom(x))
        return ret

    def extract_compounds(self, x: "Tree"):
        """ This method extracts simple compounds that consist of two elements: parent and child

        :return: tuple ``(compounds, atom)``: ``compounds`` lists one string
            "<parent atom> - <connector> -> <child atom>" for every edge in ``x``
            (edges of a subtree come before the edge that attaches that subtree),
            ``atom`` is the atom of ``x`` itself. A leaf contributes no compounds.
        """
        xstr = self._extract_atom(x)
        compounds = []
        for i, child in enumerate(x):
            childcomps, childatom = self.extract_compounds(child)
            compounds.extend(childcomps)
            # Children of orderless nodes are all attached with the same connector;
            # otherwise the (0-based) child position is part of the connector.
            if x.label() in self.orderless:
                connectstr = "ARG"
            else:
                connectstr = f"ARG-{i}"
            compounds.append(f"{xstr} - {connectstr} -> {childatom}")
        # TODO: compounds that span more than a single parent-child edge.
        return compounds, xstr

    def extract_coocs(self, x: "Tree"):
        """ This method extracts co-occurrences across the entire tree

        :return: one "<atom>,<atom2>" string for every ordered pair of distinct
            positions in the atom list of ``x``.
        """
        atoms = self.extract_atoms(x)
        return [f"{atom},{atom2}" for atom, atom2 in itertools.permutations(atoms, 2)]

    @staticmethod
    def compute_chernoff_coeff(dist1, dist2, alpha=0.5, weights=None):
        """Return sum_k dist1[k]**alpha * dist2[k]**(1 - alpha), optionally weighted per key.

        :param dist1: dict mapping feature -> probability; only its keys are summed over.
        :param dist2: dict mapping feature -> probability; missing keys count as 0.
        :param alpha: exponent applied to ``dist1``; ``dist2`` gets ``1 - alpha``.
        :param weights: optional dict; the term of every key present in it is
            multiplied by the corresponding weight.
        """
        acc = 0
        for k, v1 in dist1.items():
            v2 = dist2.get(k, 0)
            contrib = math.pow(v1, alpha) * math.pow(v2, 1 - alpha)
            if weights is not None and k in weights:
                contrib = contrib * weights[k]
            acc += contrib
        return acc

    def _count_features(self, ds, extractor, max_examples=None, track_example_freqs=False):
        """Count the features returned by ``extractor`` for every example, grouped by ``example[2]``.

        :param extractor: callable mapping a parsed tree to a list of feature strings.
        :param max_examples: if given, only the first ``max_examples`` examples are read.
        :param track_example_freqs: also count in how many examples each feature occurs.
        :return: ``(counts, examplefreqs, numexamples)``; ``examplefreqs`` stays empty
            unless ``track_example_freqs`` is set.
        """
        counts = dict()
        examplefreqs = dict()
        numexamples = 0
        examples = ds if max_examples is None else itertools.islice(ds, max_examples)
        for example in tqdm(examples):
            features = extractor(taglisp_to_tree(example[1]))
            subsetcounts = counts.setdefault(example[2], dict())
            for feature in features:
                subsetcounts[feature] = subsetcounts.get(feature, 0) + 1
            if track_example_freqs:
                for feature in set(features):
                    examplefreqs[feature] = examplefreqs.get(feature, 0) + 1
            numexamples += 1
        return counts, examplefreqs, numexamples

    @staticmethod
    def _normalize_counts(counts):
        """Turn the per-subset counts into relative frequencies, in place, and return them."""
        for subsetcounts in counts.values():
            total = sum(subsetcounts.values())
            for k in subsetcounts:
                subsetcounts[k] = subsetcounts[k] / total
        return counts

    def compute_atom_distributions(self, ds, max_examples=None):
        """Return ``{subset: {atom: relative frequency}}`` over the examples of ``ds``."""
        counts, _, _ = self._count_features(ds, self.extract_atoms, max_examples=max_examples)
        return self._normalize_counts(counts)

    def compute_compound_distributions(self, ds, max_examples=None):
        """Return ``{subset: {compound: relative frequency}}`` over the examples of ``ds``."""
        # extract_compounds returns (compounds, root atom); only the compounds are counted.
        counts, _, _ = self._count_features(
            ds, lambda tree: self.extract_compounds(tree)[0], max_examples=max_examples)
        return self._normalize_counts(counts)

    def compute_cooc_distributions(self, ds, max_examples=None):
        """Return ``(dists, weights)`` for atom co-occurrences.

        ``dists`` is ``{subset: {cooc: relative frequency}}``. ``weights`` maps every
        co-occurrence to its inverse example frequency (1 / number of examples that
        contain it), rescaled so that all weights sum to the number of examples read.
        """
        counts, examplefreqs, numexamples = self._count_features(
            ds, self.extract_coocs, max_examples=max_examples, track_example_freqs=True)
        weights = {k: 1 / freq for k, freq in examplefreqs.items()}     # inverse example frequency
        totalweight = sum(weights.values())
        for k in weights:
            weights[k] = weights[k] * numexamples / totalweight
        return self._normalize_counts(counts), weights

    def _compute_divergences(self, dists, alpha, weights=None):
        """Return ``{"<subset>-<subset2>": 1 - chernoff_coeff}`` for every ordered pair of subsets in ``dists``."""
        divergences = dict()
        for subsetname in dists:
            for subsetname2 in dists:
                if self.verbose:
                    print(f"computing divergence between {subsetname} and {subsetname2}")
                coeff = self.compute_chernoff_coeff(dists[subsetname], dists[subsetname2], alpha, weights=weights)
                divergences[subsetname + "-" + subsetname2] = 1 - coeff
        return divergences

    def _compute_atom_divergences(self, dists):
        """Pairwise divergences between atom distributions (alpha = 0.5)."""
        return self._compute_divergences(dists, 0.5)

    def compute_atom_divergences(self, ds):
        """Compute atom distributions of ``ds`` and return their pairwise divergences."""
        dists = self.compute_atom_distributions(ds)
        return self._compute_atom_divergences(dists)

    def _compute_compound_divergences(self, dists):
        """Pairwise divergences between compound distributions (alpha = 0.1)."""
        return self._compute_divergences(dists, 0.1)

    def compute_compound_divergences(self, ds):
        """Compute compound distributions of ``ds`` and return their pairwise divergences."""
        dists = self.compute_compound_distributions(ds)
        return self._compute_compound_divergences(dists)

    def _compute_cooc_divergences(self, dists, fs):
        """ dists contains distributions per subset, fs contains the per-co-occurrence weights
        (as returned by compute_cooc_distributions) that scale each term of the coefficient (alpha = 0.1)"""
        return self._compute_divergences(dists, 0.1, weights=fs)

    def compute_cooc_divergences(self, ds):
        """Compute co-occurrence distributions of ``ds`` and return their weighted pairwise divergences."""
        dists, dfs = self.compute_cooc_distributions(ds)
        return self._compute_cooc_divergences(dists, dfs)

In [117]:
# Default construction leaves verbose=False, so the divergence loops print nothing.
dc = DivergenceComputer()

In [107]:
# Atom statistics: relative atom frequencies per split, then the divergence for
# every ordered pair of splits (keys have the form "<split>-<split2>").
atom_dists = dc.compute_atom_distributions(ds)
print(atom_dists)       # NOTE(review): dumps the complete nested dict into the cell output
divs = dc._compute_atom_divergences(atom_dists)
divs




100%|██████████| 119679/119679 [00:31<00:00, 3801.52it/s]


{'train': {'(* )': 0.0205044498853523, '(count ARG1)': 0.0205044498853523, '(@SELECT ARG1)': 0.0205044498853523, '(m0 )': 0.023357449236943356, '(!= )': 0.012561133508833294, '(m2 )': 0.028554129874061394, '(filter ARG1 ARG2 ARG3)': 0.012561133508833294, '(m3 )': 0.012529633515992383, '(a )': 0.028569266234257675, '(ns:film.film_art_director )': 0.0016204087226343812, '(@COND ARG1 ARG2 ARG3)': 0.16829955265919258, '(ns:people.person.spouse_s/ns:people.marriage.spouse )': 0.008771725279153346, '(ns:fictional_universe.fictional_character.married_to/ns:fictional_universe.marriage_of_fictional_characters.spouses )': 0.008771725279153346, '(@OR ARG*)': 0.029727811425497404, '(@WHERE ARG*)': 0.03525094653387579, '(@QUERY ARG ARG)': 0.03525094653387579, '(@R@ ARG1)': 0.03525094653387579, '(distinct )': 0.01474649664852349, '(?x0 )': 0.1354606055771351, '(@SELECT ARG1 ARG2)': 0.01474649664852349, '(ns:film.film.directed_by )': 0.008037407264225622, '(?x1 )': 0.06743575740096423, '(ns:film.film

{'train-train': 6.661338147750939e-16,
 'train-test': 0.03815403533540007,
 'train-iidvalid': 8.371979439303345e-05,
 'train-oodvalid': 0.03827450782468789,
 'test-train': 0.03815403533539985,
 'test-test': -4.440892098500626e-16,
 'test-iidvalid': 0.03915245174573945,
 'test-oodvalid': 9.497392817781058e-05,
 'iidvalid-train': 8.371979439314448e-05,
 'iidvalid-test': 0.039152451745739336,
 'iidvalid-iidvalid': 2.220446049250313e-16,
 'iidvalid-oodvalid': 0.03924798371011462,
 'oodvalid-train': 0.038274507824687776,
 'oodvalid-test': 9.497392817769956e-05,
 'oodvalid-iidvalid': 0.03924798371011473,
 'oodvalid-oodvalid': 1.1102230246251565e-16}

In [109]:
# Compound statistics: relative compound frequencies per split, then the
# divergence for every ordered pair of splits. Rebinds `divs` from the atom cell.
# dc.extract_compounds(taglisp_to_tree(ds[0][1]))
comp_dists = dc.compute_compound_distributions(ds)
# print(comp_dists)
divs = dc._compute_compound_divergences(comp_dists)
divs

100%|██████████| 119679/119679 [00:35<00:00, 3361.10it/s]


{'train': {'(count ARG1) - ARG-0 -> (* )': 0.021253661573115274, '(@SELECT ARG1) - ARG-0 -> (count ARG1)': 0.021253661573115274, '(@QUERY ARG ARG) - ARG -> (@SELECT ARG1)': 0.021253661573115274, '(filter ARG1 ARG2 ARG3) - ARG-0 -> (m0 )': 0.000251030837781498, '(filter ARG1 ARG2 ARG3) - ARG-1 -> (!= )': 0.01302010451702854, '(filter ARG1 ARG2 ARG3) - ARG-2 -> (m2 )': 0.0011949407109261173, '(@WHERE ARG*) - ARG -> (filter ARG1 ARG2 ARG3)': 0.01302010451702854, '(filter ARG1 ARG2 ARG3) - ARG-2 -> (m3 )': 0.0005614270763897016, '(@COND ARG1 ARG2 ARG3) - ARG-0 -> (m0 )': 0.004968035972040592, '(@COND ARG1 ARG2 ARG3) - ARG-1 -> (a )': 0.029613158086670092, '(@COND ARG1 ARG2 ARG3) - ARG-2 -> (ns:film.film_art_director )': 0.001679616804818435, '(@WHERE ARG*) - ARG -> (@COND ARG1 ARG2 ARG3)': 0.17444904667647043, '(@OR ARG*) - ARG -> (ns:people.person.spouse_s/ns:people.marriage.spouse )': 0.009092235175187298, '(@OR ARG*) - ARG -> (ns:fictional_universe.fictional_character.married_to/ns:fict

{'train-train': 0.0,
 'train-test': 0.027289698313615274,
 'train-iidvalid': 3.7102500061059196e-05,
 'train-oodvalid': 0.027258467050785606,
 'test-train': 0.03638324704032758,
 'test-test': 1.1102230246251565e-16,
 'test-iidvalid': 0.03647307007538281,
 'test-oodvalid': 5.5925304399462306e-05,
 'iidvalid-train': 4.2381988419548655e-05,
 'iidvalid-test': 0.02900700934426903,
 'iidvalid-iidvalid': 4.440892098500626e-16,
 'iidvalid-oodvalid': 0.029018909420546857,
 'oodvalid-train': 0.03601000591537773,
 'oodvalid-test': 6.160699638446676e-05,
 'oodvalid-iidvalid': 0.03613289254487739,
 'oodvalid-oodvalid': 8.881784197001252e-16}

In [119]:
# print(json.dumps(comp_dists, indent=3))

In [124]:
# Co-occurrence statistics: relative co-occurrence frequencies per split plus the
# per-co-occurrence weights, then the weighted divergence for every ordered pair
# of splits. Rebinds `divs` once more.
# dc.extract_compounds(taglisp_to_tree(ds[0][1]))
cooc_dists, cooc_fs = dc.compute_cooc_distributions(ds)
# print(cooc_dists)
divs = dc._compute_cooc_divergences(cooc_dists, cooc_fs)
divs



 32%|███▏      | 38246/119679 [00:48<01:17, 1054.96it/s]

KeyboardInterrupt: 

In [10]:
# Small hand-written tree used as input for the subgraph enumeration further down.
x = taglisp_to_tree("(A (B C D X Y Z) (E (F G H )))")
print(x)


(A (B (C ) (D ) (X ) (Y ) (Z )) (E (F (G ) (H ))))


In [11]:
def find_all_subgraphs(x:Tree, orderless={"B",}, useholes=False):
    """Enumerate subgraphs of ``x``.

    Returns a pair ``(collected, rooted)``:

    * ``rooted`` holds the subgraphs whose root stands for ``x`` itself,
    * ``collected`` holds everything collected from the children of ``x``
      followed by the entries of ``rooted``.

    With ``useholes`` every root is additionally emitted with the label "*".
    For a node whose label is in ``orderless`` the children are recombined as
    permutations of every size from 1 to ``len(x)``; for any other node one
    rooted subgraph is picked per child, in the original child order.
    In both cases the bare root without children is added as well.
    Progress and intermediate results are printed at every inner node.
    """
    # Leaf: just the node itself (and optionally a hole), nothing to recombine.
    if len(x) == 0:
        rooted = [Tree(x.label(), [])]
        if useholes:
            rooted.append(Tree("*", []))
        return list(rooted), rooted

    # Recurse first: gather the subgraphs of all children and remember, per child,
    # the ones rooted at that child for the recombination below.
    collected = []
    rooted_per_child = []
    for child in x:
        sub_collected, sub_rooted = find_all_subgraphs(child, orderless=orderless, useholes=useholes)
        rooted_per_child.append(sub_rooted)
        collected.extend(sub_collected)

    if x.label() in orderless:
        print(f"recombining orderless at {x.label()}")
        # NOTE(review): this permutes the original child subtrees of x, not the
        # per-child rooted subgraphs gathered above, so children always appear
        # whole in this branch -- confirm that this is intended.
        childcombos = itertools.chain.from_iterable(
            itertools.permutations(x, size) for size in range(1, len(x) + 1))
    else:
        print(f"recombining at {x.label()}")
        childcombos = itertools.product(*rooted_per_child)

    # Labels a recombined root can carry; the hole variant always comes first.
    rootlabels = ["*", x.label()] if useholes else [x.label()]
    rooted = [Tree(rootlabel, list(combo)) for combo in childcombos for rootlabel in rootlabels]
    # Finally the root on its own, without any children.
    rooted.extend(Tree(rootlabel, []) for rootlabel in rootlabels)

    collected.extend(rooted)
    print(collected)
    print(rooted)
    return collected, rooted


def _find_all_holings(x:Tree):
    """Return every variant of ``x`` in which any subset of node labels is replaced by "*".

    Each node is independently either kept or turned into a hole (holes keep
    their children), so a tree with n nodes yields 2**n variants, e.g. the
    leaf ``(A )`` yields ``[(* ), (A )]``. For every combination of child
    variants the hole-rooted tree is listed directly before the one that keeps
    the label of ``x``.
    """
    rootlabels = ["*", x.label()]
    if len(x) == 0:
        return [Tree(rootlabel, []) for rootlabel in rootlabels]
    variants_per_child = [_find_all_holings(child) for child in x]
    return [Tree(rootlabel, childvariants)
            for childvariants in itertools.product(*variants_per_child)
            for rootlabel in rootlabels]


def find_hole_fraq(x):
    """Return the fraction of nodes in ``x`` that are labelled "*"."""
    holecount, nodecount = _find_hole_fraq(x)
    fraction = holecount / nodecount
    return fraction


def _find_hole_fraq(x):
    numholes = 0
    numnodes = 0
    # print(x.label(), x.label() == "*")
    if x.label() == "*":
        numholes += 1
    numnodes += 1
    for child in x:
        _numholes, _numnodes = _find_hole_fraq(child)
        numholes += _numholes
        numnodes += _numnodes
    return numholes, numnodes


def find_all_holings(x:Tree, maxholefraq=0.5):
    """Return the holings of ``x`` whose fraction of "*" nodes is at most ``maxholefraq``.

    With ``maxholefraq == 1`` nothing is filtered and the hole fraction is not
    computed at all.
    """
    return [holing for holing in _find_all_holings(x)
            if maxholefraq == 1 or find_hole_fraq(holing) <= maxholefraq]

In [None]:
# Enumerate the subgraphs of the toy tree; only the first returned list (all
# collected subgraphs) is kept and printed, one subgraph per line.
ret, _ = find_all_subgraphs(x)
for rete in ret:
    print(rete)
    # Disabled: additionally list the holings of every subgraph.
    # print("holings")
    # for holing in find_all_holings(rete):
    #     print(holing)