In [1]:
import xml.etree.ElementTree as ET
import random

In [2]:
# --- Input ---------------------------------------------------------------
# Source corpus (XML).
# NOTE(review): this is an absolute, machine-specific path; it has to be
# adjusted before the notebook can run anywhere else.
xml_file = "/home/max/Documents/mlt/adv_machine/project/data/ukwac_nowiki3466.xml"

# --- Output --------------------------------------------------------------
# Directory prefix for every generated file; it is concatenated directly with
# the file names below, so the trailing slash is required.
path = "../../data/"
corpus_file = "ukwac_slim.txt"
train_file, test_file = "train.csv", "test.csv"
mini_train_file, mini_test_file = "mini_train.csv", "mini_test.csv"

# --- Split parameters ----------------------------------------------------
# Fraction of the (shuffled) lines that is written to the training file.
train_proportion = 0.7
# Number of lines kept for the small development split.
subsample = 100

In [3]:
# Parse the corpus XML file named in the configuration into an element tree.
tree = ET.parse(xml_file)

In [4]:
# Root element of the parsed tree; the later cells iterate over its direct
# children and treat each one as a sentence.
root = tree.getroot()

In [5]:
# Quick overview of the corpus: the direct children of the root are counted
# as sentences, and every <Frame> element anywhere below it as a predicate.
frames = list(root.iter("Frame"))
print("No. of sentences:", len(root))
print("No. of predicates:", len(frames))

No. of sentences: 21187
No. of predicates: 76965


In [6]:
# Extract, for every sentence, its tokens and the predicate/argument
# structure of each <Frame>. A sentence is discarded entirely as soon as one
# of its frames has prd_idx == "None"; `nones` counts those discarded
# sentences (one increment per sentence, because scanning stops there).
data = []
nones = 0
for sentence_node in root:
    # Whitespace-separated tokens of the sentence.
    tokens = sentence_node.find("tokenized").text.split()

    predicates = []
    for frame in sentence_node.iter("Frame"):
        if frame.attrib["prd_idx"] == "None":
            # No usable predicate position: give up on this sentence.
            nones += 1
            break

        predicates.append({
            # Position of the predicate in the token list.
            "idx": int(frame.attrib["prd_idx"]),
            # One entry per <Arg>: its role label, surface phrase and span.
            "arguments": [
                {
                    "role": arg.attrib["role"],
                    "token": arg.attrib["phrase"],
                    "start": int(arg.attrib["span_begin"]),
                    "end": int(arg.attrib["span_end"]),
                }
                for arg in frame.iter("Arg")
            ],
        })
    else:
        # Reached only when the frame loop was not interrupted by `break`,
        # i.e. every frame of the sentence had a predicate index.
        data.append({"tokens": tokens, "srl": predicates})

print("Missing:", nones)


Missing: 208


In [7]:
# Prepare our data: one row per (sentence, predicate) pair, consisting of
#   [tokens, predicate indicator vector, B-/I-/O role tag sequence].
#
# The extraction step stores every predicate index as an int and drops
# sentences without one, so no "None" check is needed here. (The previous
# string comparison was always true, and the append sat outside of it, so a
# failing check would have appended stale vectors.)
mega_list = []
for sentence in data:
    tokens = sentence["tokens"]
    n_tokens = len(tokens)

    for predicate in sentence["srl"]:
        # "1" at the predicate's position, "0" everywhere else.
        pred_vec = ["0"] * n_tokens
        pred_vec[predicate["idx"]] = "1"

        # Tag sequence: "O" by default, "B-<role>" on the first token of an
        # argument span and "I-<role>" on the remaining ones.
        io_sequence = ["O"] * n_tokens
        for arg in predicate["arguments"]:
            role = arg["role"]
            io_sequence[arg["start"]] = "B-" + role
            # NOTE(review): span_end is treated as inclusive here (hence the
            # +1) - TODO confirm against the corpus format. For a one-token
            # span (start == end) this range is empty, so only the B- tag is
            # written.
            for i in range(arg["start"] + 1, arg["end"] + 1):
                io_sequence[i] = "I-" + role

        # Appended inside the predicate loop, right next to the vectors it
        # uses, so a row can never be built from another predicate's data.
        mega_list.append([tokens, pred_vec, io_sequence])

print("Size:", len(mega_list))

Size: 76757


In [8]:
# Sanity check: the three columns of every row must have the same length.
# Each mismatching pair of lengths is printed; no output means all is well.
for toks, marks, tags in mega_list:
    n_toks, n_marks, n_tags = len(toks), len(marks), len(tags)
    for left, right in ((n_toks, n_marks), (n_toks, n_tags), (n_marks, n_tags)):
        if left != right:
            print(left, right)

In [9]:
# Serialise the rows to the corpus text file: one row per line, the three
# columns separated by tabs, the items inside a column separated by spaces.
with open(path+corpus_file, "w") as out:
    out.writelines(
        "\t".join(" ".join(column) for column in row) + "\n"
        for row in mega_list
    )

In [10]:
# Create train and test data from our text file.
#
# The corpus lines are shuffled and the first `train_proportion` of them go
# to the training file, the rest to the test file. The shuffle uses its own
# seeded generator so that re-running the notebook reproduces the same split
# (an unseeded shuffle produced a different split on every run).
split_seed = 42
with open(path+corpus_file, "r") as corpus, open(path+train_file, "w") as train, open(path+test_file, "w") as test:
    c = corpus.readlines()
    random.Random(split_seed).shuffle(c)
    cut = int(len(c)*train_proportion)
    # Lines are re-joined with "\n"; no trailing newline is written.
    train.write("\n".join([line.replace("\n", "") for line in c[:cut]]))
    test.write("\n".join([line.replace("\n", "") for line in c[cut:]]))

In [11]:
# Creating mini versions of train and test data for development purposes.
#
# `subsample` lines are drawn from the shuffled corpus and split with the
# same `train_proportion`. The shuffle uses its own seeded generator so that
# the mini files are identical on every run (an unseeded shuffle produced a
# different subsample each time).
mini_split_seed = 1234
with open(path+corpus_file, "r") as corpus, open(path+mini_train_file, "w") as train, open(path+mini_test_file, "w") as test:
    c = corpus.readlines()
    random.Random(mini_split_seed).shuffle(c)
    c = c[:subsample]
    cut = int(len(c)*train_proportion)
    # Lines are re-joined with "\n"; no trailing newline is written.
    train.write("\n".join([line.replace("\n", "") for line in c[:cut]]))
    test.write("\n".join([line.replace("\n", "") for line in c[cut:]]))