In [111]:
import json
import math
from nltk import Tree
from parseq.datasets import CFQDatasetLoader
from parseq.grammar import taglisp_to_tree
from tqdm import tqdm

In [71]:
# Load the CFQ dataset with the "mcd1/modent" configuration string.
# Judging by ds[0] printed in the following cell, each example is a
# (question, taglisp query string, split name) tuple; the loader log below
# lists the splits 'train', 'test', 'iidvalid' and 'oodvalid'.
ds = CFQDatasetLoader().load("mcd1/modent")

CFQDatasetLoader: make data
CFQDatasetLoader: make data in 0.0 sec
loading split 'mcd1'
splitting off a random 10% of 'train' for 'iidvalid' using seed 42
doing 'train'
doing 'test'
doing 'iidvalid'
doing 'oodvalid'


100%|██████████| 86169/86169 [00:38<00:00, 2240.71it/s]
100%|██████████| 11968/11968 [00:06<00:00, 1892.12it/s]
100%|██████████| 9574/9574 [00:04<00:00, 2181.02it/s]
100%|██████████| 11968/11968 [00:06<00:00, 1994.15it/s]


In [72]:
# Inspect the first example: the full tuple, its query string (index 1),
# and the tree obtained by parsing that query string with taglisp_to_tree.
print(ds[0])
print(ds[0][1])
print(taglisp_to_tree(ds[0][1]))

('Was M0 a art director that M2 and M3 married', '(@R@ (@QUERY (@SELECT (count * ) ) (@WHERE (filter m0 != m2 ) (filter m0 != m3 ) (@COND m0 a ns:film.film_art_director ) (@COND m0 (@OR ns:people.person.spouse_s/ns:people.marriage.spouse ns:fictional_universe.fictional_character.married_to/ns:fictional_universe.marriage_of_fictional_characters.spouses ) m2 ) (@COND m0 (@OR ns:people.person.spouse_s/ns:people.marriage.spouse ns:fictional_universe.fictional_character.married_to/ns:fictional_universe.marriage_of_fictional_characters.spouses ) m3 ) ) ) )', 'train')
(@R@ (@QUERY (@SELECT (count * ) ) (@WHERE (filter m0 != m2 ) (filter m0 != m3 ) (@COND m0 a ns:film.film_art_director ) (@COND m0 (@OR ns:people.person.spouse_s/ns:people.marriage.spouse ns:fictional_universe.fictional_character.married_to/ns:fictional_universe.marriage_of_fictional_characters.spouses ) m2 ) (@COND m0 (@OR ns:people.person.spouse_s/ns:people.marriage.spouse ns:fictional_universe.fictional_character.married_to/n

In [116]:
class DivergenceComputer():
    """Computes divergences between the subsets (splits) of a dataset.

    Every example is expected to be indexable such that ``example[1]`` is a
    taglisp query string and ``example[2]`` is the name of the subset the
    example belongs to. The query is parsed into a tree, from which three kinds
    of items are extracted:

    * atoms: one string per tree node (label plus argument pattern),
    * compounds: one string per parent -> child edge,
    * co-occurrences: one string per ordered pair of distinct atom positions
      within the same tree.

    For each kind, a relative-frequency distribution is built per subset and
    the divergence between two subsets is ``1 - C`` where ``C`` is the Chernoff
    coefficient ``sum_k p_k ** alpha * q_k ** (1 - alpha)``.
    """

    # Labels whose children are treated as unordered (arguments are not numbered).
    orderless = {"@QUERY", "@AND", "@OR", "@WHERE"}
    # Orderless labels whose number of children is abstracted away ("ARG*").
    variablesize = {"@AND", "@OR", "@WHERE"}

    def __init__(self, verbose=False):
        """
        :param verbose: if True, print a message for every pair of subsets
                        while computing divergences.
        """
        super(DivergenceComputer, self).__init__()
        self.verbose = verbose

    def _extract_atom(self, x: "Tree"):
        """Return the atom string of a single node: ``"(<label> <arg pattern>)"``.

        The argument pattern is ``ARG*`` for variable-size orderless labels,
        one ``ARG`` per child for other orderless labels, and ``ARG1 ARG2 ...``
        otherwise. A node without children yields ``"(<label> )"``.
        """
        label = x.label()
        if label in self.orderless:
            if label in self.variablesize:
                childstr = "ARG*"
            else:
                childstr = " ".join("ARG" for _ in range(len(x)))
        else:
            childstr = " ".join(f"ARG{i+1}" for i in range(len(x)))
        return f"({label} {childstr})"

    def extract_atoms(self, x: "Tree"):
        """Return the atoms of all nodes in the tree in post-order
        (all descendants first, the node itself last)."""
        ret = []
        for child in x:
            # extend() instead of "ret = ret + ..." avoids re-copying the list
            # at every level of the recursion.
            ret.extend(self.extract_atoms(child))
        ret.append(self._extract_atom(x))
        return ret

    def extract_compounds(self, x: "Tree"):
        """ This method extracts simple compounds that consist of two elements: parent and child

        :return: a tuple ``(compounds, atom)`` where ``compounds`` is the list
                 of ``"<parent atom> - <connector> -> <child atom>"`` strings
                 for every edge in the subtree rooted at ``x`` and ``atom`` is
                 the atom string of ``x`` itself. The connector is ``ARG`` for
                 orderless parents and ``ARG-<i>`` (0-based child index)
                 otherwise. A leaf yields ``([], atom)``.
        """
        xstr = self._extract_atom(x)
        is_orderless = x.label() in self.orderless
        compounds = []
        for i, child in enumerate(x):
            childcomps, childatom = self.extract_compounds(child)
            # Compounds from deeper in the tree come first, then the edge
            # connecting this node to the child.
            compounds.extend(childcomps)
            connectstr = "ARG" if is_orderless else f"ARG-{i}"
            compounds.append(f"{xstr} - {connectstr} -> {childatom}")
        return compounds, xstr

    def extract_coocs(self, x: "Tree"):
        """ This method extracts co-occurrences across the entire tree

        Returns one ``"<atom>,<atom2>"`` string for every ordered pair of
        distinct positions in the atom list, so both orders of each pair are
        included and repeated atoms produce repeated entries.
        """
        atoms = self.extract_atoms(x)
        return [
            f"{atom},{atom2}"
            for i, atom in enumerate(atoms)
            for j, atom2 in enumerate(atoms)
            if i != j
        ]

    @staticmethod
    def compute_chernoff_coeff(dist1, dist2, alpha=0.5, weights=None):
        """Compute ``sum_k dist1[k] ** alpha * dist2[k] ** (1 - alpha)``.

        Only keys of ``dist1`` are visited; keys missing from ``dist2`` count
        as probability 0.

        :param dist1: dict mapping item -> probability.
        :param dist2: dict mapping item -> probability.
        :param alpha: exponent applied to ``dist1``; ``1 - alpha`` is applied
                      to ``dist2``.
        :param weights: optional dict mapping item -> multiplier applied to the
                        item's contribution; items without a weight are left
                        unscaled.
        :return: the (optionally weighted) coefficient as a number.
        """
        acc = 0
        for k, v1 in dist1.items():
            v2 = dist2.get(k, 0)
            contrib = math.pow(v1, alpha) * math.pow(v2, 1 - alpha)
            if weights is not None and k in weights:
                contrib = contrib * weights[k]
            acc += contrib
        return acc

    @staticmethod
    def _tally(counts, subsetname, items):
        """Add the occurrences of ``items`` to ``counts[subsetname]`` in place."""
        # setdefault registers the subset even when it contributes no items.
        subsetcounts = counts.setdefault(subsetname, dict())
        for item in items:
            subsetcounts[item] = subsetcounts.get(item, 0) + 1

    @staticmethod
    def _normalize(counts):
        """Turn per-subset counts into relative frequencies, in place."""
        for subsetcounts in counts.values():
            total = sum(subsetcounts.values())
            for k in subsetcounts:
                subsetcounts[k] = subsetcounts[k] / total
        return counts

    def _compute_distributions(self, ds, extract):
        """Build per-subset relative frequencies of the items that
        ``extract(tree)`` returns for the parsed query of every example."""
        counts = dict()
        for example in tqdm(ds):
            self._tally(counts, example[2], extract(taglisp_to_tree(example[1])))
        return self._normalize(counts)

    def compute_atom_distributions(self, ds):
        """Return ``{subset name: {atom: relative frequency}}``."""
        return self._compute_distributions(ds, self.extract_atoms)

    def compute_compound_distributions(self, ds):
        """Return ``{subset name: {compound: relative frequency}}``."""
        # extract_compounds returns (compounds, root atom); only the list of
        # compounds must be counted.
        return self._compute_distributions(
            ds, lambda tree: self.extract_compounds(tree)[0])

    def compute_cooc_distributions(self, ds):
        """Return ``(dists, weights)`` for co-occurrences.

        ``dists`` is ``{subset name: {co-occurrence: relative frequency}}``.
        ``weights`` maps every co-occurrence to its inverse example frequency
        (1 / number of examples, over all subsets, that contain it), rescaled
        so that the weights sum to the number of examples processed.
        """
        counts = dict()
        weights = dict()
        totalex = 0
        for example in tqdm(ds):
            coocs = self.extract_coocs(taglisp_to_tree(example[1]))
            self._tally(counts, example[2], coocs)
            # set(): count every co-occurrence at most once per example.
            for cooc in set(coocs):
                weights[cooc] = weights.get(cooc, 0) + 1
            totalex += 1
        for k in weights:
            weights[k] = 1/weights[k]     # inverse example frequency
        totalweight = sum(weights.values())
        for k in weights:
            weights[k] = weights[k] * totalex / totalweight
        return self._normalize(counts), weights

    def _pairwise_divergences(self, dists, alpha, weights=None):
        """Return ``{"<a>-<b>": 1 - chernoff(dists[a], dists[b])}`` for every
        ordered pair of subsets, including each subset with itself."""
        divergences = dict()
        for subsetname in dists:
            for subsetname2 in dists:
                if self.verbose:
                    print(f"computing divergence between {subsetname} and {subsetname2}")
                coeff = self.compute_chernoff_coeff(
                    dists[subsetname], dists[subsetname2], alpha, weights=weights)
                divergences[subsetname + "-" + subsetname2] = 1 - coeff
        return divergences

    def _compute_atom_divergences(self, dists):
        """Pairwise atom divergences, using alpha = 0.5."""
        return self._pairwise_divergences(dists, 0.5)

    def compute_atom_divergences(self, ds):
        """Compute atom distributions for ``ds`` and their pairwise divergences."""
        dists = self.compute_atom_distributions(ds)
        return self._compute_atom_divergences(dists)

    def _compute_compound_divergences(self, dists):
        """Pairwise compound divergences, using alpha = 0.1."""
        return self._pairwise_divergences(dists, 0.1)

    def compute_compound_divergences(self, ds):
        """Compute compound distributions for ``ds`` and their pairwise divergences."""
        dists = self.compute_compound_distributions(ds)
        return self._compute_compound_divergences(dists)

    def _compute_cooc_divergences(self, dists, fs):
        """ dists contains distributions per subset, fs contains the per-co-occurrence
        weights (as returned by compute_cooc_distributions) that scale each
        co-occurrence's contribution to the coefficient. Uses alpha = 0.1. """
        return self._pairwise_divergences(dists, 0.1, weights=fs)

    def compute_cooc_divergences(self, ds):
        """Compute co-occurrence distributions and weights for ``ds`` and the
        resulting weighted pairwise divergences."""
        dists, dfs = self.compute_cooc_distributions(ds)
        return self._compute_cooc_divergences(dists, dfs)

In [117]:
# Instantiate with the default verbose=False, i.e. no per-pair messages
# are printed while divergences are computed.
dc = DivergenceComputer()

In [107]:
# Atom-level comparison: build the per-split atom distributions, then
# compute 1 - Chernoff coefficient for every ordered pair of splits.
atom_dists = dc.compute_atom_distributions(ds)
# NOTE(review): this prints the complete nested dict for all splits.
print(atom_dists)
divs = dc._compute_atom_divergences(atom_dists)
# Bare expression so the notebook displays the divergence dict.
divs




100%|██████████| 119679/119679 [00:31<00:00, 3801.52it/s]


{'train': {'(* )': 0.0205044498853523, '(count ARG1)': 0.0205044498853523, '(@SELECT ARG1)': 0.0205044498853523, '(m0 )': 0.023357449236943356, '(!= )': 0.012561133508833294, '(m2 )': 0.028554129874061394, '(filter ARG1 ARG2 ARG3)': 0.012561133508833294, '(m3 )': 0.012529633515992383, '(a )': 0.028569266234257675, '(ns:film.film_art_director )': 0.0016204087226343812, '(@COND ARG1 ARG2 ARG3)': 0.16829955265919258, '(ns:people.person.spouse_s/ns:people.marriage.spouse )': 0.008771725279153346, '(ns:fictional_universe.fictional_character.married_to/ns:fictional_universe.marriage_of_fictional_characters.spouses )': 0.008771725279153346, '(@OR ARG*)': 0.029727811425497404, '(@WHERE ARG*)': 0.03525094653387579, '(@QUERY ARG ARG)': 0.03525094653387579, '(@R@ ARG1)': 0.03525094653387579, '(distinct )': 0.01474649664852349, '(?x0 )': 0.1354606055771351, '(@SELECT ARG1 ARG2)': 0.01474649664852349, '(ns:film.film.directed_by )': 0.008037407264225622, '(?x1 )': 0.06743575740096423, '(ns:film.film

{'train-train': 6.661338147750939e-16,
 'train-test': 0.03815403533540007,
 'train-iidvalid': 8.371979439303345e-05,
 'train-oodvalid': 0.03827450782468789,
 'test-train': 0.03815403533539985,
 'test-test': -4.440892098500626e-16,
 'test-iidvalid': 0.03915245174573945,
 'test-oodvalid': 9.497392817781058e-05,
 'iidvalid-train': 8.371979439314448e-05,
 'iidvalid-test': 0.039152451745739336,
 'iidvalid-iidvalid': 2.220446049250313e-16,
 'iidvalid-oodvalid': 0.03924798371011462,
 'oodvalid-train': 0.038274507824687776,
 'oodvalid-test': 9.497392817769956e-05,
 'oodvalid-iidvalid': 0.03924798371011473,
 'oodvalid-oodvalid': 1.1102230246251565e-16}

In [109]:
# Compound-level comparison: per-split distributions over parent -> child
# compounds, then pairwise divergences between splits.
# NOTE(review): this cell carries execution count In [109], i.e. it was run
# before the class cell was last executed (In [116]); the recorded output
# below therefore comes from an earlier definition of DivergenceComputer.
# dc.extract_compounds(taglisp_to_tree(ds[0][1]))
comp_dists = dc.compute_compound_distributions(ds)
# print(comp_dists)
divs = dc._compute_compound_divergences(comp_dists)
# Bare expression so the notebook displays the divergence dict.
divs

100%|██████████| 119679/119679 [00:35<00:00, 3361.10it/s]


{'train': {'(count ARG1) - ARG-0 -> (* )': 0.021253661573115274, '(@SELECT ARG1) - ARG-0 -> (count ARG1)': 0.021253661573115274, '(@QUERY ARG ARG) - ARG -> (@SELECT ARG1)': 0.021253661573115274, '(filter ARG1 ARG2 ARG3) - ARG-0 -> (m0 )': 0.000251030837781498, '(filter ARG1 ARG2 ARG3) - ARG-1 -> (!= )': 0.01302010451702854, '(filter ARG1 ARG2 ARG3) - ARG-2 -> (m2 )': 0.0011949407109261173, '(@WHERE ARG*) - ARG -> (filter ARG1 ARG2 ARG3)': 0.01302010451702854, '(filter ARG1 ARG2 ARG3) - ARG-2 -> (m3 )': 0.0005614270763897016, '(@COND ARG1 ARG2 ARG3) - ARG-0 -> (m0 )': 0.004968035972040592, '(@COND ARG1 ARG2 ARG3) - ARG-1 -> (a )': 0.029613158086670092, '(@COND ARG1 ARG2 ARG3) - ARG-2 -> (ns:film.film_art_director )': 0.001679616804818435, '(@WHERE ARG*) - ARG -> (@COND ARG1 ARG2 ARG3)': 0.17444904667647043, '(@OR ARG*) - ARG -> (ns:people.person.spouse_s/ns:people.marriage.spouse )': 0.009092235175187298, '(@OR ARG*) - ARG -> (ns:fictional_universe.fictional_character.married_to/ns:fict

{'train-train': 0.0,
 'train-test': 0.027289698313615274,
 'train-iidvalid': 3.7102500061059196e-05,
 'train-oodvalid': 0.027258467050785606,
 'test-train': 0.03638324704032758,
 'test-test': 1.1102230246251565e-16,
 'test-iidvalid': 0.03647307007538281,
 'test-oodvalid': 5.5925304399462306e-05,
 'iidvalid-train': 4.2381988419548655e-05,
 'iidvalid-test': 0.02900700934426903,
 'iidvalid-iidvalid': 4.440892098500626e-16,
 'iidvalid-oodvalid': 0.029018909420546857,
 'oodvalid-train': 0.03601000591537773,
 'oodvalid-test': 6.160699638446676e-05,
 'oodvalid-iidvalid': 0.03613289254487739,
 'oodvalid-oodvalid': 8.881784197001252e-16}

In [119]:
# print(json.dumps(comp_dists, indent=3))

In [118]:
# Co-occurrence-level comparison: per-split distributions over ordered atom
# pairs plus per-pair weights (cooc_fs), then weighted pairwise divergences.
# NOTE(review): in the recorded output even the self-divergences
# (e.g. 'train-train') are about 0.95 rather than 0; presumably this is
# because the weights rescale each contribution, so the weighted coefficient
# of a distribution with itself no longer sums to 1 -- confirm this is intended.
# dc.extract_compounds(taglisp_to_tree(ds[0][1]))
cooc_dists, cooc_fs = dc.compute_cooc_distributions(ds)
# print(cooc_dists)
divs = dc._compute_cooc_divergences(cooc_dists, cooc_fs)
# Bare expression so the notebook displays the divergence dict.
divs


100%|██████████| 119679/119679 [03:25<00:00, 582.37it/s]


{'train-train': 0.9461151036740916,
 'train-test': 0.9632821182120375,
 'train-iidvalid': 0.9468536581744926,
 'train-oodvalid': 0.9634449341565349,
 'test-train': 0.9625300960168326,
 'test-test': 0.9583431841513543,
 'test-iidvalid': 0.9635411823303807,
 'test-oodvalid': 0.9606212168434984,
 'iidvalid-train': 0.9505691046379636,
 'iidvalid-test': 0.9653336301053145,
 'iidvalid-iidvalid': 0.9444205806941813,
 'iidvalid-oodvalid': 0.9654799753468964,
 'oodvalid-train': 0.9625731789746448,
 'oodvalid-test': 0.9596544899768175,
 'oodvalid-iidvalid': 0.9636515146777334,
 'oodvalid-oodvalid': 0.9578824769034916}

In [120]:
# Display the co-occurrence weights returned by compute_cooc_distributions
# (rescaled inverse example frequencies).
cooc_fs



{'(!= ),(@OR ARG*)': 0.009196913195379549,
 '(@WHERE ARG*),(!= )': 0.009196913195379549,
 '(@QUERY ARG ARG),(ns:fictional_universe.fictional_character.married_to/ns:fictional_universe.marriage_of_fictional_characters.spouses )': 0.01278533261561277,
 '(@SELECT ARG1),(!= )': 0.017187041495739252,
 '(count ARG1),(@SELECT ARG1)': 0.004327942825991461,
 '(a ),(@R@ ARG1)': 0.0037279130123836937,
 '(m2 ),(@QUERY ARG ARG)': 0.004976349644603648,
 '(ns:people.person.spouse_s/ns:people.marriage.spouse ),(a )': 0.017936079695109616,
 '(m2 ),(!= )': 0.020134184218704856,
 '(m3 ),(filter ARG1 ARG2 ARG3)': 0.0351577059154171,
 '(a ),(@WHERE ARG*)': 0.0037279130123836937,
 '(m0 ),(ns:people.person.spouse_s/ns:people.marriage.spouse )': 0.03580276117871794,
 '(@OR ARG*),(ns:film.film_art_director )': 0.10557663693923007,
 '(@R@ ARG1),(ns:people.person.spouse_s/ns:people.marriage.spouse )': 0.01278533261561277,
 '(@QUERY ARG ARG),(@OR ARG*)': 0.004270151878109565,
 '(* ),(ns:people.person.spouse_s/ns:

In [121]:
# Sanity check: compute_cooc_distributions rescales the weights so that they
# sum to the number of examples processed (up to floating-point error).
sum(cooc_fs.values())

119678.99999999978

In [123]:
# List the (co-occurrence, weight) pairs from highest to lowest weight; since
# the weights are inverse example frequencies, the co-occurrences that appear
# in the fewest examples come first.
sorted(cooc_fs.items(), key=lambda x: x[1], reverse=True)

[('(ns:film.film.starring/ns:film.performance.actor ),(ns:organization.organization.companies_acquired/ns:business.acquisition.company_acquired )',
  286.8517225638881),
 ('(ns:organization.organization.companies_acquired/ns:business.acquisition.company_acquired ),(ns:film.film.starring/ns:film.performance.actor )',
  286.8517225638881),
 ('(ns:film.film.cinematography ),(ns:m.0d060g )', 286.8517225638881),
 ('(ns:m.0d060g ),(ns:film.film.cinematography )', 286.8517225638881),
 ('(ns:m.06mkj ),(ns:film.film.film_art_direction_by )', 286.8517225638881),
 ('(ns:film.film.film_art_direction_by ),(ns:m.06mkj )', 286.8517225638881),
 ('(ns:organization.organization.acquired_by/ns:business.acquisition.acquiring_company ),(ns:m.03_3d )',
  286.8517225638881),
 ('(ns:m.03_3d ),(ns:organization.organization.acquired_by/ns:business.acquisition.acquiring_company )',
  286.8517225638881),
 ('(ns:m.0d0vqn ),(ns:organization.organization.companies_acquired/ns:business.acquisition.company_acquired )'