In [1]:
import REUParsing as rp
import os
from tqdm import tqdm

In [2]:
# Load the Faroese FarPaHC training file from both treebank variants; the two
# paths differ only in the "leaky" / "nonleaky" directory prefix.
leaky_trees, nonleaky_trees = (
    rp.load_sentences(
        f"D:/REU Datasets/Universal Dependencies 2.8.1/{variant}-treebanks/Unlabeled/UD_Faroese-FarPaHC/fo_farpahc-ud-train.conllu"
    )
    for variant in ("leaky", "nonleaky")
)

In [3]:
# Peek at the first loaded sentence to see what kind of object load_sentences returns.
leaky_trees[0]

TokenTree<token={id=4, form=fyrstuni}, children=[...]>

In [4]:
# The depth of a tree from a node
def depth(node):
    """Return the number of edges on the longest downward path from ``node``.

    A node with no children has depth 0; otherwise the depth is one more than
    the deepest of its children.

    Parameters:
        node: any object exposing a ``children`` sequence of like objects.

    Returns:
        int: the height of the subtree rooted at ``node``.
    """
    # Guard clause: a leaf ends the recursion.
    if len(node.children) == 0:
        return 0
    return 1 + max(map(depth, node.children))

In [5]:
def avg_depth(trees):
    """Return the mean of ``depth(tree)`` over ``trees``.

    Parameters:
        trees: a sized collection of tree roots accepted by ``depth``.

    Returns:
        The arithmetic mean depth, or 0 when ``trees`` is empty. The empty case
        uses the same 0 that the feature-extraction loop records for treebanks
        with no sentences, instead of raising ZeroDivisionError.
    """
    if len(trees) == 0:
        return 0
    return sum(depth(tree) for tree in trees) / len(trees)

In [6]:
# Spot-check depth() on a single tree: the second entry of nonleaky_trees.
depth(nonleaky_trees[1])

4

In [7]:
# Mean tree depth of each loaded set, leaky first and non-leaky second.
for tree_set in (leaky_trees, nonleaky_trees):
    print(avg_depth(tree_set))

2.5609756097560976
4.687436159346272


In [8]:
# The number of nodes in a tree
def tree_size(tree):
    """Count the nodes in the subtree rooted at ``tree``, the root included.

    Parameters:
        tree: any object exposing a ``children`` iterable of like objects.

    Returns:
        int: 1 for the root plus the sizes of all child subtrees.
    """
    return 1 + sum(tree_size(subtree) for subtree in tree.children)

In [9]:
def avg_tree_size(trees):
    """Return the mean of ``tree_size(tree)`` over ``trees``.

    Parameters:
        trees: a sized collection of tree roots accepted by ``tree_size``.

    Returns:
        The arithmetic mean node count, or 0 when ``trees`` is empty. The empty
        case uses the same 0 that the feature-extraction loop records for
        treebanks with no sentences, instead of raising ZeroDivisionError.
    """
    if len(trees) == 0:
        return 0
    return sum(tree_size(tree) for tree in trees) / len(trees)

In [10]:
# Spot-check tree_size() on a single tree: the second entry of nonleaky_trees.
tree_size(nonleaky_trees[1])

22

In [11]:
# Mean node count per tree for each loaded set, leaky first and non-leaky second.
for tree_set in (leaky_trees, nonleaky_trees):
    print(avg_tree_size(tree_set))

8.048780487804878
23.252298263534218


In [33]:
def total_relation_length(node, parent = None):
    """Sum the parent-child id distances over the subtree rooted at ``node``.

    Each edge contributes ``abs(child.token["id"] - parent.token["id"])``.

    Parameters:
        node: object with a ``children`` iterable and a ``token`` mapping that
            has an ``"id"`` entry supporting subtraction.
        parent: the node directly above ``node``, or None when ``node`` is the
            root of the traversal. With None, no edge is counted for ``node``
            itself.

    Returns:
        The summed edge lengths below ``node``, plus the ``node``-to-``parent``
        edge when a parent is supplied.
    """
    # Everything contributed by the edges underneath this node.
    length = sum(total_relation_length(child, node) for child in node.children)
    # Add this node's own incoming edge, if it has one.
    if parent is not None:
        length += abs(node.token["id"] - parent.token["id"])
    return length

In [35]:
# Total relation length divided by node count for one sample tree.
sample_tree = nonleaky_trees[1]
total_relation_length(sample_tree) / tree_size(sample_tree)

2.0

In [38]:
def avg_relation_length(trees):
    """Return the total relation length of ``trees`` divided by their total node count.

    Parameters:
        trees: an iterable of tree roots accepted by ``tree_size`` and
            ``total_relation_length``.

    Returns:
        ``sum(total_relation_length) / sum(tree_size)`` over all trees, or 0
        when there are no nodes at all (empty input). The empty case uses the
        same 0 that the feature-extraction loop records for treebanks with no
        sentences, instead of raising ZeroDivisionError.

    NOTE(review): the divisor counts every node, root included, although a
    root adds no relation length of its own. The result is therefore length
    per node, not per relation. The corpus-level aggregation multiplies this
    value by the node count to recover the total, so the definition is kept.
    """
    total_size = 0
    total_length = 0
    for tree in trees:
        total_size += tree_size(tree)
        total_length += total_relation_length(tree)
    if total_size == 0:
        return 0
    return total_length / total_size

In [40]:
# Relation length per node across the whole leaky Faroese training set.
avg_relation_length(leaky_trees)

1.4636363636363636

In [61]:
# Root of the local Universal Dependencies 2.8.1 data; edit this one line to relocate it.
UD_ROOT = "D:/REU Datasets/Universal Dependencies 2.8.1"

# Parallel lists: position i pairs treetype[i] with leakage[i], giving the four
# (leakage, split) combinations processed below.
treetype = ["train", "train", "test", "test"]
leakage = ["leaky", "nonleaky", "leaky", "nonleaky"]

# features[leakage][split][treebank] -> {"num_trees", "avg_depth", "avg_size", "avg_rel_length"}
features = {
    "leaky": {
        "train":{},
        "test":{}
    },
    "nonleaky":{
        "train":{},
        "test":{}
    }
}

for leak_kind, split in zip(leakage, treetype):
    variant_dir = f"{UD_ROOT}/{leak_kind}-treebanks/Unlabeled"
    wanted_suffix = f"-ud-{split}.conllu"
    # Label each bar; otherwise the four progress bars are indistinguishable.
    for treebank in tqdm(os.listdir(variant_dir), desc=f"{leak_kind} {split}"):
        treebank_dir = f"{variant_dir}/{treebank}"
        for file in os.listdir(treebank_dir):
            if not file.endswith(wanted_suffix):
                continue
            trees = rp.load_sentences(f"{treebank_dir}/{file}")

            if len(trees) > 0:
                stats = {
                    "num_trees": len(trees),
                    "avg_depth": avg_depth(trees),
                    "avg_size": avg_tree_size(trees),
                    "avg_rel_length": avg_relation_length(trees),
                }
            else:
                # An empty file would make the averages divide by zero; record zeros instead.
                stats = {
                    "num_trees": 0,
                    "avg_depth": 0,
                    "avg_size": 0,
                    "avg_rel_length": 0,
                }
            features[leak_kind][split][treebank] = stats

100%|████████████████████████████████████████████████████████████████████████████████| 202/202 [01:50<00:00,  1.82it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 202/202 [08:52<00:00,  2.64s/it]
100%|████████████████████████████████████████████████████████████████████████████████| 202/202 [00:20<00:00,  9.68it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 202/202 [01:26<00:00,  2.33it/s]


In [64]:
# Pool the per-treebank averages into one corpus-level figure for every
# (leakage, split) pair. The loop variables are deliberately not named
# `leakage` / `treetype`, so the lists of those names defined by the
# feature-extraction cell are not overwritten.
for leak_kind, per_split in features.items():
    for split, per_treebank in per_split.items():
        total_depth = 0
        total_size = 0
        total_rel_length = 0
        total_num_trees = 0
        for stats in per_treebank.values():
            num_trees = stats["num_trees"]
            # average * count recovers each treebank's raw sum, so the pooled
            # means below are weighted by tree count.
            total_depth += stats["avg_depth"] * num_trees
            total_size += stats["avg_size"] * num_trees
            # avg_rel_length is a per-node figure, so weight it by the
            # treebank's node count (avg_size * num_trees).
            total_rel_length += stats["avg_rel_length"] * stats["avg_size"] * num_trees
            total_num_trees += num_trees

        # No trees for this pair: the divisions below would raise ZeroDivisionError.
        if total_num_trees == 0 or total_size == 0:
            print(f"No {leak_kind} {split} trees found; skipping averages.")
            continue

        total_avg_depth = total_depth / total_num_trees
        total_avg_size = total_size / total_num_trees
        total_avg_rel_length = total_rel_length / total_size
        print(f"Average {leak_kind} {split} tree depth:", total_avg_depth)
        print(f"Average {leak_kind} {split} tree size:", total_avg_size)
        print(f"Average {leak_kind} {split} dependency length:", total_avg_rel_length)

Average leaky train tree depth: 2.2010348035529956
Average leaky train tree size: 7.214471107486183
Average leaky train dependency length: 1.9831152474450007
Average leaky test tree depth: 2.4286077850026087
Average leaky test tree size: 8.416252051866085
Average leaky test dependency length: 2.187663288015869
Average nonleaky train tree depth: 4.837531232831223
Average nonleaky train tree size: 24.12125560368625
Average nonleaky train dependency length: 3.419120111454377
Average nonleaky test tree depth: 4.3977084208896455
Average nonleaky test tree size: 21.580304806565064
Average nonleaky test dependency length: 3.3320623518896624


In [63]:
# Display the full per-treebank feature dictionary.
# NOTE(review): this prints every treebank entry, so the output is very long. The
# execution count (63) also precedes the aggregation cell's (64) although this cell
# appears after it; confirm the notebook still runs top to bottom on a fresh kernel.
features

{'leaky': {'train': {'UD_Afrikaans-AfriBooms': {'num_trees': 59,
    'avg_depth': 2.5762711864406778,
    'avg_size': 10.033898305084746,
    'avg_rel_length': 2.2905405405405403},
   'UD_Ancient_Greek-Perseus': {'num_trees': 5112,
    'avg_depth': 2.1709702660406887,
    'avg_size': 7.752347417840376,
    'avg_rel_length': 2.170275044158466},
   'UD_Ancient_Greek-PROIEL': {'num_trees': 7732,
    'avg_depth': 2.0786342472840147,
    'avg_size': 6.193093636833937,
    'avg_rel_length': 1.7968674950402004},
   'UD_Arabic-NYUAD': {'num_trees': 1070,
    'avg_depth': 1.8822429906542055,
    'avg_size': 5.9392523364485985,
    'avg_rel_length': 1.7601888276947286},
   'UD_Arabic-PADT': {'num_trees': 471,
    'avg_depth': 2.307855626326964,
    'avg_size': 5.008492569002123,
    'avg_rel_length': 1.3098770665536243},
   'UD_Armenian-ArmTDP': {'num_trees': 331,
    'avg_depth': 1.8882175226586102,
    'avg_size': 6.169184290030212,
    'avg_rel_length': 1.7277179236043094},
   'UD_Basque-BDT'