In [38]:
# import statements
import numpy as np
import pickle
import conllu
import pyconll
import networkx as nx
from tqdm import tqdm
from matplotlib import pyplot as plt
from io import open
import os
import errno
import shutil
import pickle

In [39]:
def count_of_deprel_tokenlist(token_list, head_pos, dep_pos, deprel):
    """Count tokens in one sentence that carry a given relation to a given kind of head.

    A token is counted when all three hold:
      * its own ``deprel`` equals ``deprel``;
      * its lower-cased ``upos`` equals ``dep_pos`` (so pass ``dep_pos`` in lower case);
      * the ``deprel`` of its head equals ``head_pos``.

    Note that, despite the parameter name, ``head_pos`` is matched against the
    head's dependency relation (e.g. ``'nsubj'``), not its part of speech.
    The artificial root (id 0) is treated as having the relation ``'root'``.

    Args:
        token_list: re-iterable sequence of token mappings with the keys
            ``'id'``, ``'head'``, ``'upos'`` and ``'deprel'``.
        head_pos: relation label the head token must carry.
        dep_pos: lower-case UPOS tag the dependent must carry.
        deprel: relation label the dependent must carry.

    Returns:
        Number of matching tokens (an ``int``).

    Raises:
        KeyError: if a matching candidate points at a head id that is not in
            ``token_list``.
    """
    # Relation label of every token, keyed by id; id 0 stands for the root.
    relation_of = {0: 'root'}
    relation_of.update((token['id'], token['deprel']) for token in token_list)

    # The checks are chained with `and` so the head lookup only happens for
    # tokens that already match on relation and part of speech.
    return sum(
        1
        for token in token_list
        if token['deprel'] == deprel
        and token['upos'].lower() == dep_pos
        and relation_of[token['head']] == head_pos
    )

In [40]:
def count_of_deprel_conllu_file(file, head_pos, dep_pos, deprel):
    """Count occurrences of a dependency relation across a whole CoNLL-U file.

    The file is parsed sentence by sentence with ``conllu.parse_incr`` and the
    per-sentence counts from ``count_of_deprel_tokenlist`` are summed.

    Args:
        file: path of the CoNLL-U file; it is read as UTF-8.
        head_pos: forwarded to ``count_of_deprel_tokenlist``.
        dep_pos: forwarded to ``count_of_deprel_tokenlist``.
        deprel: forwarded to ``count_of_deprel_tokenlist``.

    Returns:
        Total number of matching tokens in the file (0 for an empty file).
    """
    # `with` closes the handle even if parsing fails part-way; previously the
    # file object was left open for the lifetime of the kernel.
    with open(file, "r", encoding="utf-8") as data_file:
        return sum(
            count_of_deprel_tokenlist(token_list, head_pos, dep_pos, deprel)
            for token_list in conllu.parse_incr(data_file)
        )

In [47]:
# Count adjectival modifiers (amod) whose head is a subject vs. an object
# in the English output file.
my_file = 'single-mod-outs-nostrip/en_noobjmod_outs.conllu'
count_subj_amod, count_obj_amod = (
    count_of_deprel_conllu_file(my_file, head_relation, 'adj', 'amod')
    for head_relation in ('nsubj', 'obj')
)

In [48]:
# Subject count on the first line, object count on the second.
print(count_subj_amod, count_obj_amod, sep="\n")

130
237


In [5]:
def remove_contractions_with_modifier(token_list):
    """Drop multiword-token (contraction) lines that lost one of their words.

    After modifiers have been filtered out of a sentence, a range line such
    as ``3-5`` may refer to words that no longer exist. Such a line is
    removed; everything else is kept.

    For range ids, which the conllu parser represents as ``(first, '-', last)``,
    every word id from ``first`` to ``last`` inclusive must still be present.
    Checking only the two endpoints would keep a contraction whose interior
    word had been removed.

    Args:
        token_list: token container that is iterable, yields mappings with
            an ``'id'`` key, and offers ``filter(id=callable)`` (as the
            conllu ``TokenList`` does).

    Returns:
        Whatever ``token_list.filter`` returns: the tokens without the
        stale contraction lines.
    """
    # Set membership keeps each lookup constant-time.
    present_ids = {token['id'] for token in token_list}

    stale_ids = set()
    for token in token_list:
        token_id = token['id']
        if not isinstance(token_id, tuple):
            continue
        first, separator, last = token_id[0], token_id[1], token_id[2]
        if separator == '-':
            # A range covers every word between its endpoints.
            covered = range(first, last + 1)
        else:
            # NOTE(review): other tuple ids (presumably decimal ids of empty
            # nodes) keep the earlier endpoint-only check unchanged.
            covered = (first, last)
        if any(word_id not in present_ids for word_id in covered):
            stale_ids.add(token_id)

    # Keep everything except the contraction lines collected above.
    return token_list.filter(id=lambda x: x not in stale_ids)

In [6]:
def token_list_no_modifiers(token_list, head_pos, dep_pos, deprel):
    """Return the sentence with every matching modifier subtree filtered out.

    A token is a matching modifier when its ``deprel`` equals ``deprel``, its
    lower-cased ``upos`` equals ``dep_pos``, and the ``deprel`` of its head
    equals ``head_pos`` (the head is matched on its relation label, not on
    its part of speech). For each match, the token and everything reported
    by ``ids_in_subtree`` for it are removed.

    Args:
        token_list: token container that is re-iterable, yields mappings with
            ``'id'``, ``'head'``, ``'upos'`` and ``'deprel'`` keys, and offers
            ``filter(id=callable)``.
        head_pos: relation label required on the modifier's head.
        dep_pos: lower-case UPOS tag required on the modifier.
        deprel: relation label required on the modifier.

    Returns:
        The result of ``token_list.filter`` with the collected ids excluded.
    """
    # Relation label per token id; id 0 is the artificial root.
    relation_of = {0: 'root'}
    relation_of.update((token['id'], token['deprel']) for token in token_list)

    doomed_ids = set()
    for token in token_list:
        # Guard clauses, in the same order as the original chained condition.
        if token['deprel'] != deprel:
            continue
        if token['upos'].lower() != dep_pos:
            continue
        if relation_of[token['head']] != head_pos:
            continue
        doomed_ids.update(ids_in_subtree(token_list, token['id']))

    return token_list.filter(id=lambda x: x not in doomed_ids)

In [7]:
def ids_in_subtree(token_list, root_subtree_id):
    """List the ids of a subtree in breadth-first order, starting at its root.

    Children of a node are the tokens whose ``'head'`` equals that node's id,
    taken in the order they appear in ``token_list``.

    Args:
        token_list: iterable of token mappings with ``'id'`` and ``'head'`` keys.
        root_subtree_id: id of the subtree's root token.

    Returns:
        A list beginning with ``root_subtree_id`` followed by every descendant
        id, each exactly once. If no token has that id as head, the list holds
        only ``root_subtree_id``.
    """
    # Index children by head once, so the traversal does not rescan the whole
    # sentence for every node it visits.
    children_of = {}
    for token in token_list:
        children_of.setdefault(token['head'], []).append(token['id'])

    # `visited` doubles as the BFS queue: `cursor` marks the next node to
    # expand, which avoids the O(n) cost of list.pop(0).
    visited = [root_subtree_id]
    seen = {root_subtree_id}
    cursor = 0
    while cursor < len(visited):
        current = visited[cursor]
        cursor += 1
        for child in children_of.get(current, ()):
            if child not in seen:  # also protects against cyclic head links
                seen.add(child)
                visited.append(child)

    return visited

In [8]:
def nodes_with_given_head(token_list, given_head):
    """Return the ids of the direct dependents of ``given_head``.

    Args:
        token_list: iterable of token mappings with ``'id'`` and ``'head'`` keys.
        given_head: id to look for in each token's ``'head'`` field.

    Returns:
        List of ids, in ``token_list`` order, of the tokens whose head equals
        ``given_head``; empty when there are none.
    """
    return [token['id'] for token in token_list if token['head'] == given_head]

In [9]:
def new_text_from_token_list(token_list):
    """Rebuild the surface sentence for the ``text`` metadata field.

    Forms are concatenated in token order. Each form is followed by one
    space unless the token's ``misc`` holds ``SpaceAfter=No``. The result is
    deliberately not stripped, so it ends with a space whenever the last
    emitted token allows a following space.

    Handling of tuple ids (as produced by the conllu parser):
      * a range ``(first, '-', last)`` is a multiword token: its own form is
        emitted, and the forms of all words ``first..last`` are skipped.
        Skipping only the two endpoints would repeat interior words of a
        three-or-more-word range;
      * any other tuple id (decimal ids of empty nodes) is skipped, because
        empty nodes are not part of the surface text.

    Args:
        token_list: iterable of token mappings with ``'id'``, ``'form'`` and
            ``'misc'`` keys; ``misc`` is ``None`` or a dict.

    Returns:
        The reconstructed text (``""`` for an empty sentence).
    """
    pieces = []
    covered_ids = set()  # word ids already represented by a multiword form
    for token in token_list:
        token_id = token['id']
        if isinstance(token_id, tuple):
            if token_id[1] != '-':
                continue
            covered_ids.update(range(token_id[0], token_id[2] + 1))
        elif token_id in covered_ids:
            continue

        misc = token['misc']
        # Only the value "No" suppresses the space; the mere presence of the
        # key is not enough.
        glue_to_next = misc is not None and misc.get('SpaceAfter') == 'No'
        pieces.append(token['form'] if glue_to_next else token['form'] + " ")

    return "".join(pieces)

In [10]:
def fixed_numbers(token_lst):
    """Renumber an edited sentence so that word ids run 1, 2, 3, ... again.

    The tokens are modified in place and the same ``token_lst`` object is
    returned. For every token:
      * a plain integer ``id`` is replaced by its new consecutive number;
      * ``head`` is translated to the new numbering (0, the root, and
        ``None`` stay as they are);
      * ``deps`` is reset to ``None``, because the enhanced dependencies
        still refer to the old numbering;
      * a range id ``(first, '-', last)`` has both ends translated;
      * any other tuple id (decimal id of an empty node) has only its first
        component translated. The last component is a minor index, not a
        word id, so sending it through the word-id table would corrupt it or
        raise ``KeyError``.

    Args:
        token_lst: re-iterable sequence of mutable token mappings with
            ``'id'``, ``'head'`` and ``'deps'`` keys.

    Returns:
        ``token_lst`` itself, after modification.

    Raises:
        KeyError: if a ``head`` or a tuple id refers to a word id that is not
            in ``token_lst``.
    """
    # Old word id -> new word id. None (tokens without a head) and 0 (the
    # artificial root) map to themselves.
    old_to_new_id = {None: None, 0: 0}
    next_id = 0

    # Pass 1: number the ordinary words consecutively. Tuple ids are left
    # alone until the full table is known.
    for token in token_lst:
        if not isinstance(token['id'], tuple):
            next_id += 1
            old_to_new_id[token['id']] = next_id
            token['id'] = next_id

    # Pass 2: rewrite every reference to an old id.
    for token in token_lst:
        token['head'] = old_to_new_id[token['head']]
        token['deps'] = None

        if isinstance(token['id'], tuple):
            first, separator, last = token['id']
            if separator == '-':
                token['id'] = (old_to_new_id[first], separator, old_to_new_id[last])
            else:
                token['id'] = (old_to_new_id[first], separator, last)

    return token_lst

In [22]:
# Strip adjectival modifiers (amod) of objects from the German HDT training
# treebank and write the renumbered sentences to a new CoNLL-U file.
my_conll_file_location = 'data/UD2/ud-treebanks-v2.8/UD_German-HDT/de_hdt-ud-train.conllu'

# Both handles are managed by `with`, so the input is closed as well as the
# output. The output is written as UTF-8 explicitly because it is read back
# as UTF-8 later; the platform default encoding may differ.
with open(my_conll_file_location, "r", encoding="utf-8") as data_file, \
        open("de_hdt-ud-train-obj.conllu", "w", encoding="utf-8") as f:
    for token_list in conllu.parse_incr(data_file):
        # for every token list, remove modifiers, remove contractions with modifiers, and fix the numbers
        temp_list = token_list_no_modifiers(token_list, 'obj', 'adj', 'amod')
        new_token_list = fixed_numbers(remove_contractions_with_modifier(temp_list))
        # carry the original sentence metadata over to the filtered sentence
        new_token_list.metadata = token_list.metadata
        # regenerate the 'text' field so it matches the remaining tokens
        new_token_list.metadata['text'] = new_text_from_token_list(new_token_list)

        # serialize the token list so it is in conllu format
        serialized = new_token_list.serialize()
        # write this new serialized data to the file
        f.write(serialized)

In [23]:
# Count adjectival modifiers (amod) whose head is a subject vs. an object
# in the rewritten German file.
my_file = 'de_hdt-ud-train-obj.conllu'
count_subj_amod, count_obj_amod = (
    count_of_deprel_conllu_file(my_file, head_relation, 'adj', 'amod')
    for head_relation in ('nsubj', 'obj')
)

In [24]:
# Subject count on the first line, object count on the second.
print(count_subj_amod, count_obj_amod, sep="\n")

24960
0
