In [279]:
# import statements
import REUParsing as rp
import numpy as np
import pickle
import conllu
import pyconll
import networkx as nx
from tqdm import tqdm
from matplotlib import pyplot as plt
from io import open
import os
import errno
import shutil
import pickle

In [246]:
def remove_contractions_with_modifier(token_list):
    """Drop multiword-token (contraction) lines that lost one of their words.

    After modifiers have been filtered out, a range line such as ``3-5`` may
    refer to words that no longer exist. Such a range line is removed; the
    remaining member words themselves are kept.

    Args:
        token_list: an object that iterates over tokens (mappings with an
            ``'id'`` entry) and offers ``filter(id=<callable>)``, as used on
            the lists produced by ``conllu``. Tuple ids are expected to have
            the form ``(first, separator, last)``.

    Returns:
        Whatever ``token_list.filter`` returns: the tokens without the broken
        range lines.
    """
    # Set lookups keep the membership tests cheap on long sentences.
    present_ids = {token['id'] for token in token_list}

    def lost_a_member(token_id):
        """True when a tuple id refers to at least one id that is gone."""
        if not isinstance(token_id, tuple):
            return False
        first, separator, last = token_id[0], token_id[1], token_id[2]
        # Same end-point check as before; it applies to every tuple id.
        if first not in present_ids or last not in present_ids:
            return True
        if separator == '-':
            # A range covers every word from first to last, so the words in
            # between have to be checked as well (e.g. 2 in "1-3").
            return any(word_id not in present_ids
                       for word_id in range(first, last + 1))
        return False

    contractions_to_remove = {
        token['id'] for token in token_list if lost_a_member(token['id'])
    }

    # keep everything except the range lines collected above
    return token_list.filter(id=lambda x: x not in contractions_to_remove)

In [247]:
def token_list_no_modifiers(token_list):
    """Return the token list with every modifier and its whole subtree removed.

    A token counts as a modifier when its ``'deprel'`` is exactly one of
    ``advmod``, ``discourse``, ``amod`` or ``nummod``. The ids to drop are
    gathered with ``ids_in_subtree`` and the result is produced by
    ``token_list.filter``.
    """
    modifier_relations = ('advmod', 'discourse', 'amod', 'nummod')
    doomed_ids = set()
    for token in token_list:
        if token['deprel'] not in modifier_relations:
            continue
        # the modifier itself plus everything that hangs below it
        doomed_ids.update(ids_in_subtree(token_list, token['id']))
    return token_list.filter(id=lambda token_id: token_id not in doomed_ids)

In [248]:
def ids_in_subtree(token_list, root_subtree_id):
    """Collect the ids of a subtree in breadth-first order.

    The returned list starts with ``root_subtree_id`` and then lists every
    token reachable from it through ``'head'`` links, as reported by
    ``nodes_with_given_head``. Each id appears once.
    """
    ordered = [root_subtree_id]      # doubles as the BFS queue
    seen = {root_subtree_id}
    cursor = 0
    while cursor < len(ordered):
        current = ordered[cursor]
        cursor += 1
        for child_id in nodes_with_given_head(token_list, current):
            if child_id in seen:
                continue
            seen.add(child_id)
            ordered.append(child_id)
    return ordered

In [249]:
def nodes_with_given_head(token_list, given_head):
    """Return, in token order, the ids of all tokens whose 'head' equals given_head."""
    return [token['id'] for token in token_list if token['head'] == given_head]

In [250]:
def new_text_from_token_list(token_list):
    """Rebuild the sentence text for the 'text' metadata field.

    Forms are concatenated in token order. A form is followed by a single
    space unless the token's ``'misc'`` carries ``SpaceAfter=No``.

    * Multiword-token range lines (ids like ``(3, '-', 5)``) contribute their
      surface form once; every word id inside the range, including the ones
      in the middle, is then skipped.
    * Empty nodes (ids like ``(3, '.', 1)``) are not part of the surface
      sentence and are left out.

    Args:
        token_list: iterable of tokens (mappings with ``'id'``, ``'form'`` and
            ``'misc'``; ``'misc'`` is either ``None`` or a mapping).

    Returns:
        The reconstructed text. As before, it ends with a space when the last
        emitted token is not marked ``SpaceAfter=No``.
    """
    pieces = []
    covered_ids = set()  # word ids already represented by a range line
    for token in token_list:
        token_id = token['id']
        if isinstance(token_id, tuple):
            if token_id[1] != '-':
                # empty node: carries no surface text
                continue
            covered_ids.update(range(token_id[0], token_id[2] + 1))
        elif token_id in covered_ids:
            continue

        misc = token['misc']
        glued_to_next = misc is not None and misc.get('SpaceAfter') == 'No'
        pieces.append(token['form'] if glued_to_next else token['form'] + " ")
    return "".join(pieces)

In [268]:
def fixed_numbers(token_lst):
    """Renumber the tokens of an altered tree so ids run 1, 2, 3, ... again.

    The tokens are modified in place and the same list object is returned.

    * Every token with a plain (non-tuple) id gets the next consecutive number.
    * ``'head'`` values are translated to the new numbering; ``0`` (the root)
      and ``None`` are left as they are.
    * ``'deps'`` is set to ``None`` on every token, because the enhanced
      dependencies still point at the old numbers.
    * Range ids ``(first, '-', last)`` have both ends translated. For
      empty-node ids ``(base, '.', sub)`` only ``base`` is translated; ``sub``
      is a counter relative to the base word, not a token id.

    Args:
        token_lst: iterable of mutable tokens with ``'id'`` and ``'head'``.

    Returns:
        ``token_lst`` itself.

    Raises:
        KeyError: if a head or a range end refers to an id that is not in the
            list (for example because that token was removed).
    """
    # None maps to None so that lines without a head need no special case.
    old_to_new_id = {None: None}
    next_id = 0

    # Pass 1: number the ordinary words and remember old -> new.
    for token in token_lst:
        if isinstance(token['id'], tuple):
            continue
        next_id += 1
        old_to_new_id[token['id']] = next_id
        token['id'] = next_id

    # Pass 2: rewrite everything that refers to another token's id.
    for token in token_lst:
        old_head = token['head']
        if old_head != 0:  # 0 is the artificial root and keeps its number
            token['head'] = old_to_new_id[old_head]

        # enhanced dependencies would keep the stale numbers, so drop them
        token['deps'] = None

        if isinstance(token['id'], tuple):
            id_parts = list(token['id'])
            if id_parts[1] == '.':
                # empty node: translate the base word only (0 means "before
                # the first word" and has no entry in the map)
                if id_parts[0] != 0:
                    id_parts[0] = old_to_new_id[id_parts[0]]
            else:
                id_parts[0] = old_to_new_id[id_parts[0]]
                id_parts[2] = old_to_new_id[id_parts[2]]
            token['id'] = tuple(id_parts)

    return token_lst

In [270]:
# Strip the modifiers from one treebank file and write the result next to the notebook.
my_conll_file_location = 'UD2/ud-treebanks-v2.8/UD_German-HDT/de_hdt-ud-test.conllu'

# Both files are managed by the same `with`, so the input is closed as well.
# The output is written as UTF-8 explicitly: the platform default encoding
# cannot represent every character that occurs in the treebanks.
with open(my_conll_file_location, "r", encoding="utf-8") as data_file, \
        open("de_hdt-ud-test-nomod.conllu", "w", encoding="utf-8") as f:
    for token_list in conllu.parse_incr(data_file):
        # remove modifiers, remove contractions that lost a word, then renumber
        new_token_list = fixed_numbers(remove_contractions_with_modifier(token_list_no_modifiers(token_list)))
        # carry the sentence metadata over to the new token list
        new_token_list.metadata = token_list.metadata
        # the 'text' field has to describe the *reduced* sentence, so it is
        # rebuilt from new_token_list (not from the unfiltered token_list)
        new_token_list.metadata['text'] = new_text_from_token_list(new_token_list)

        # serialize back to CoNLL-U and append to the output file
        f.write(new_token_list.serialize())

In [271]:
# Walk over every treebank directory and create a parallel tree of
# modifier-free .conllu files under new_root_addr.

root_addr = 'UD2/ud-treebanks-v2.8/'
new_root_addr = 'UD2_no_modifiers/ud-treebanks-v2.8/'

for lang_dir in tqdm(os.listdir(root_addr)):
    directory = root_addr + lang_dir + '/'
    if not os.path.isdir(directory):
        # stray files in the root (README etc.) are not treebanks
        continue

    # create the matching language directory in the output tree
    new_directory = new_root_addr + lang_dir + '/'
    os.makedirs(new_directory, exist_ok=True)

    for file in os.listdir(directory):
        if not file.endswith(".conllu"):  # change this to "-train.conllu" to restrict the split
            continue
        file_addr = directory + file
        new_file_addr = new_directory + file

        # Input and output share one `with`, so the input is closed too; the
        # output is written as UTF-8 regardless of the platform default.
        with open(file_addr, "r", encoding="utf-8") as data_file, \
                open(new_file_addr, "w", encoding="utf-8") as f:
            for token_list in conllu.parse_incr(data_file):
                # remove modifiers, remove contractions that lost a word, then renumber
                new_token_list = fixed_numbers(remove_contractions_with_modifier(token_list_no_modifiers(token_list)))
                # carry the sentence metadata over to the new token list
                new_token_list.metadata = token_list.metadata
                # rebuild 'text' from the reduced sentence, not from the original one
                new_token_list.metadata['text'] = new_text_from_token_list(new_token_list)

                # serialize back to CoNLL-U and append to the output file
                f.write(new_token_list.serialize())

100%|██████████| 202/202 [11:01<00:00,  3.27s/it] 


In [295]:
def ud_2_graph(tree, parent=1, graph=None):
    """Copy a dependency tree into a graph, one node per token.

    When no graph is supplied, a new ``nx.Graph`` is started with node 0 as an
    artificial root and the next node for ``tree`` itself, joined by an edge
    labelled ``'<root>'``. Each child then becomes a new node (numbered by the
    current node count) attached to ``parent`` with the child's ``deprel`` on
    the edge, and its own children are added recursively.

    Args:
        tree: node exposing ``.token`` (with 'form', 'upos', 'deprel') and
            ``.children``.
        parent: graph index of the node that represents ``tree``.
        graph: graph to extend, or ``None`` to start a new one.

    Returns:
        The graph that was built or extended.
    """
    if graph is None:
        graph = nx.Graph()
        root_index = graph.number_of_nodes()
        graph.add_node(root_index, name='root', upos='ROOT')
        head_index = graph.number_of_nodes()
        graph.add_node(head_index, name=tree.token['form'], upos=tree.token['upos'])
        graph.add_edge(0, parent, deprel='<root>')

    for subtree in tree.children:
        token = subtree.token
        node_index = graph.number_of_nodes()
        graph.add_node(node_index, name=token['form'], upos=token['upos'])
        graph.add_edge(parent, node_index, deprel=token['deprel'])
        graph = ud_2_graph(subtree, node_index, graph)

    return graph

def generate_hashes(UDtrees, edge_attr=None, node_attr=None, quiet=True):
    """Group sentences by the Weisfeiler-Lehman hash of their graph.

    Each sentence is converted with ``ud_2_graph`` and hashed with
    ``nx.weisfeiler_lehman_graph_hash``; ``edge_attr`` and ``node_attr`` are
    passed straight through to that call.

    Args:
        UDtrees: iterable of sentence trees.
        edge_attr: edge attribute name forwarded to the hash function.
        node_attr: node attribute name forwarded to the hash function.
        quiet: when False, iteration is wrapped in a tqdm progress bar.

    Returns:
        dict mapping each hash to the list of sentences that produced it.
    """
    sentences = UDtrees if quiet else tqdm(UDtrees)
    hashes = {}
    for sentence in sentences:
        key = nx.weisfeiler_lehman_graph_hash(
            ud_2_graph(sentence), edge_attr=edge_attr, node_attr=node_attr
        )
        hashes.setdefault(key, []).append(sentence)
    return hashes

def load_sentences(file):
    """Read a CoNLL-U file (UTF-8) and return all of its sentences as a list of trees.

    The result of ``conllu.parse_tree_incr`` is turned into a list while the
    file is still open.
    """
    with open(file, "r", encoding="utf-8") as data:
        return list(conllu.parse_tree_incr(data))

In [298]:
# Tally, for every treebank, how many TEST sentences fall under each tree hash
# (computed on the modifier-free copies) and pickle the result.
root_addr = "UD2_no_modifiers/ud-treebanks-v2.8/"  # root of the modifier-free Universal Dependencies copy
language_hashes = {}
for dirname in tqdm(os.listdir(root_addr)):
    treebank = dirname
    hash_counts = language_hashes.setdefault(treebank, {})
    directory = root_addr + dirname
    for file in os.listdir(directory):
        if not file.endswith("-test.conllu"):  # swap the suffix to tally another split
            continue
        file_addr = directory + "/" + file
        trees = load_sentences(file_addr)
        hashes = generate_hashes(trees, edge_attr=None, node_attr=None, quiet=True)
        for hash_, matching_trees in hashes.items():
            hash_counts[hash_] = hash_counts.get(hash_, 0) + len(matching_trees)
with open("test_no_mod_hashes_none.dict", "wb") as f:  # output filename; change as needed
    pickle.dump(language_hashes, f)

100%|██████████| 202/202 [01:41<00:00,  1.98it/s]


In [299]:
# Tally, for every treebank, how many TRAIN sentences fall under each tree hash
# (computed on the modifier-free copies) and pickle the result.
root_addr = "UD2_no_modifiers/ud-treebanks-v2.8/"  # root of the modifier-free Universal Dependencies copy
language_hashes = {}
for dirname in tqdm(os.listdir(root_addr)):
    treebank = dirname
    hash_counts = language_hashes.setdefault(treebank, {})
    directory = root_addr + dirname
    for file in os.listdir(directory):
        if not file.endswith("-train.conllu"):  # swap the suffix to tally another split
            continue
        file_addr = directory + "/" + file
        trees = load_sentences(file_addr)
        hashes = generate_hashes(trees, edge_attr=None, node_attr=None, quiet=True)
        for hash_, matching_trees in hashes.items():
            hash_counts[hash_] = hash_counts.get(hash_, 0) + len(matching_trees)
with open("train_no_mod_hashes_none.dict", "wb") as f:  # output filename; change as needed
    pickle.dump(language_hashes, f)

100%|██████████| 202/202 [09:53<00:00,  2.94s/it]


In [289]:
# Load a pickled result object from "cross_no_mod.out" into cross_no_mod_leakage.
# NOTE(review): no cell shown here writes "cross_no_mod.out" — confirm which
# script produces it so the notebook can be re-run from a fresh kernel.
# pickle.load can execute arbitrary code; only load files you created yourself.
with open("cross_no_mod.out", "rb") as lf:
    cross_no_mod_leakage = pickle.load(lf)

In [290]:
# Load a pickled result object from "per_no_mod.out" into per_no_mod_leakage.
# NOTE(review): no cell shown here writes "per_no_mod.out" — confirm which
# script produces it so the notebook can be re-run from a fresh kernel.
# pickle.load can execute arbitrary code; only load files you created yourself.
with open("per_no_mod.out", "rb") as lf:
    per_no_mod_leakage = pickle.load(lf)

In [293]:
# show the treebanks ordered from the highest to the lowest stored value
dict(sorted(cross_no_mod_leakage.items(), key=lambda entry: entry[1], reverse=True))

{'UD_Kaapor-TuDeT': 0.8360655737704917,
 'UD_Turkish-Tourism': 0.7720018239854081,
 'UD_Tamil-MWTT': 0.7153558052434458,
 'UD_Warlpiri-UFAL': 0.7090909090909092,
 'UD_Guajajara-TuDeT': 0.6601941747572815,
 'UD_Telugu-MTG': 0.6095890410958904,
 'UD_Akuntsu-TuDeT': 0.5841584158415841,
 'UD_Polish-LFG': 0.5547191661841344,
 'UD_Assyrian-AS': 0.5438596491228069,
 'UD_English-Pronouns': 0.5438596491228069,
 'UD_Tagalog-TRG': 0.5234375,
 'UD_Munduruku-TuDeT': 0.5229357798165137,
 'UD_Faroese-OFT': 0.4817880794701987,
 'UD_Slovenian-SST': 0.47927927927927927,
 'UD_Chukchi-HSE': 0.47011952191235057,
 'UD_Turkish-GB': 0.4440972222222222,
 'UD_Old_Church_Slavonic-PROIEL': 0.43470639789658205,
 'UD_Norwegian-NynorskLIA': 0.41901776384535006,
 'UD_Kiche-IU': 0.41672473867595816,
 'UD_Nayini-AHA': 0.4,
 'UD_Khunsari-AHA': 0.4,
 'UD_Classical_Chinese-Kyoto': 0.3986577181208054,
 'UD_Finnish-OOD': 0.3878416588124411,
 'UD_English-EWT': 0.3842079922965816,
 'UD_Gothic-PROIEL': 0.38289601554907676,
 'U

In [294]:
# show the treebanks ordered from the highest to the lowest stored value
dict(sorted(per_no_mod_leakage.items(), key=lambda entry: entry[1], reverse=True))

{'UD_Turkish-Tourism': 0.8796169630642954,
 'UD_Classical_Chinese-Kyoto': 0.6834451901565997,
 'UD_Telugu-MTG': 0.6575342465753424,
 'UD_Icelandic-Modern': 0.62890625,
 'UD_Latin-LLCT': 0.5395927601809954,
 'UD_Polish-LFG': 0.5356108859293573,
 'UD_Old_French-SRCMF': 0.4250129735339906,
 'UD_Slovenian-SST': 0.38198198198198197,
 'UD_Norwegian-NynorskLIA': 0.38140020898641586,
 'UD_Sanskrit-Vedic': 0.37067209775967414,
 'UD_Finnish-FTB': 0.36475629351901445,
 'UD_Old_East_Slavic-TOROT': 0.34339407744874717,
 'UD_English-EWT': 0.34039480019258544,
 'UD_Korean-GSD': 0.339737108190091,
 'UD_Dutch-LassySmall': 0.3013698630136986,
 'UD_Old_Church_Slavonic-PROIEL': 0.2953549517966696,
 'UD_Turkish-FrameNet': 0.2926829268292683,
 'UD_Turkish-Penn': 0.26296296296296295,
 'UD_Naija-NSC': 0.25102880658436216,
 'UD_Slovak-SNK': 0.25070688030160226,
 'UD_Latin-PROIEL': 0.24444444444444444,
 'UD_Czech-FicTree': 0.2401239349341596,
 'UD_Russian-Taiga': 0.2383654937570942,
 'UD_German-HDT': 0.23511566