# General overview
```
Europarl (files) --// My code //--> files:
    Bi-Text (SRC ||| TRG)
    SRC
    TRG
    GRAMMAR (TRG)
    IDS

SRC --// My code //--> SRC_numbered (or "SRC_forFSL")

Bi-Text --// Awsome //--> ALIGNMENTS

SRC_numbered --// Open Sesame //--> FSL_numbered.conll

FSL_numbered --// My code //--> New SRC            (realigned)
TRG                             New TRG    
GRAMMAR                         New GRAMMAR
IDS                             New IDS
ALIGNMENTS                      New ALIGNMENTS

FSL_numbered --// My code //--> PP_FSL_numbered (simplified)

PP_FSL  --// My code //--> Frame semantic transfer
New ALIGNMENTS
New GRAMMAR
--------------
New SRC
New TRG
(New IDS)

Frame semantic transfer --> prefered format, xml?

```  

# 1. Preprocessing

In [None]:
import xml.etree.ElementTree as ET

In [None]:
def id2score(string):
    """ Takes an id of the form 'ep-00-02-15.2261' and returns a score. 
    """
    first_split = string.split(".")
    number = first_split[-1]
    second_split = first_split[0].split("-")
    date = tuple(second_split[1:])
    return date, number
    

In [None]:
def extractor(src_file, trg_file, bitxt, src_out, trg_out, trg_dep, ids, stop = 1000, add_id = False):

    src_parser = ET.XMLPullParser(['start', 'end'])
    trg_parser = ET.XMLPullParser(['start', 'end'])

    src_parser.feed("<root>") # we need a root-token; https://stackoverflow.com/questions/38853644/python-xml-parseerror-junk-after-document-element
    trg_parser.feed("<root>")

    line = None
    sub_loop = None
    sub_id = None
    skip_sub = None

    #sentence_pairs = []
    
    if stop != None:
        STOP = stop
        i = 0
        
    with open(src_file, "r") as srcf, open(trg_file, "r") as trgf, open(bitxt, "w") as bit_f, open(src_out, "w") as src_f, open(trg_out, "w") as trg_f, open(trg_dep, "w") as trgdep_f, open(ids, "w") as ids_f:

        while line != "</corpus>":
            line = srcf.readline()
            line = line.strip("\n")

            if line[:11] == "<linken id=":
                welcome = True
                sent_id = line[12:-2]
                src_segment = [sent_id] if add_id else []
                identity = sent_id
                continue
            if line[:3] == "<w " and welcome == True:
                src_parser.feed(line)
                as_list = [x for x in src_parser.read_events()]
                event, element = as_list[-1]
                src_segment.append(element.text)
                continue
            if line == "</linken>":
                welcome = False
                sub_loop = None

                if sent_id == sub_id:
                    loop_hole = True
                    trg_segment = [sub_id] if add_id else []
                    grammar = []

                while sub_loop != "</linken>":

                    sub_loop = trgf.readline()
                    sub_loop = sub_loop.strip("\n")
                    if sub_loop[:11] == "<linken id=":
                        sub_id = sub_loop[12:-2]
                        if sent_id == sub_id:
                            loop_hole = True
                            trg_segment = [sub_id] if add_id else []
                            grammar = []
                            continue
                        else:
                            date, score = id2score(sent_id)
                            sub_date, sub_score = id2score(sub_id)
                            #print("===> IDs",sent_id, "(EN)", sub_id, "(SV)")
                            #if sent_id.split(".")[-1] < sub_id.split(".")[-1]:
                            if date == sub_date and score < sub_score:
                                src_segment = [] if add_id else [sent_id]
                                break
                            else:
                                skip_sub = True
                                continue

                    if sub_loop[:3] == "<w " and loop_hole == True:
                        trg_parser.feed(sub_loop)
                        s_list = [x for x in trg_parser.read_events()]
                        sub_event, sub_element = s_list[-1]
                        trg_segment.append(sub_element.text)
                        g = sub_element.attrib
                        syntax = [g["ref"], g["dephead"], g["deprel"]]
                        grammar.append(";".join(syntax))
                        continue
                    
                    if sub_loop == "</linken>":
                        if skip_sub == True:
                            skip_sub = False
                            sub_loop = None
                            continue
                        loop_hole = False
                        
                        if stop == None:
                        
                            bit_f.write(" ".join(src_segment) + " ||| " + " ".join(trg_segment) + "\n") 
                            src_f.write(" ".join(src_segment) + "\n") 
                            trg_f.write(" ".join(trg_segment) + "\n") 
                            trgdep_f.write(" ".join(grammar) + "\n") 
                            ids_f.write(identity + "\n")
                        
                        else:
                            if i in [n for n in range(0, STOP, 5)]:
                                bit_f.write(" ".join(src_segment) + " ||| " + " ".join(trg_segment) + "\n") 
                                src_f.write(" ".join(src_segment) + "\n") 
                                trg_f.write(" ".join(trg_segment) + "\n") 
                                trgdep_f.write(" ".join(grammar) + "\n")
                                ids_f.write(identity + "\n")
                                
                            i += 1
                        
                        src_segment = []
                        trg_segment = []
                        grammar = []
            
            if stop != None:
                if i > STOP:
                    break

    src_parser.feed("</root>") # perhaps not really required, but true to standard ...
    trg_parser.feed("</root>")      

In [None]:
# INPUT
path     =  "/home/max/corpora/europarl/"
en_file  =  path + "europarl-en.xml"
sv_file  =  path + "europarl-sv.xml"

In [None]:
# OUTPUT
scratch = True
out_dir = "../data/"
bitxt   = out_dir + "bitext_scratch.txt" if scratch else out_dir + "bitext.txt"
src_out = out_dir + "src_scratch.txt" if scratch else out_dir + "src.txt"
trg_out = out_dir + "trg_scratch.txt" if scratch else out_dir + "trg.txt"
trg_dep = out_dir + "trg_dep_scratch.txt" if scratch else out_dir + "trg_dep.txt" #what about json here???
ids     = out_dir + "ids_scratch.txt" if scratch else out_dir + "ids.txt"


In [None]:
extractor(en_file, sv_file, bitxt, src_out, trg_out, trg_dep, ids, stop=10000, add_id=False)

## Enumerate for FSL

In [None]:
file_in = src_out
file_out = src_out[:-4] + "_forFSL.txt"
print(file_in, ">>>", file_out)

In [None]:
with open(file_in, "r") as f:
    lines = f.readlines()

with open(file_out, "w") as f:
    for i, line in enumerate(lines, start = 1):
        f.write(f"#{i} {line}")

# 2. Postprocessing of FSL

## Diagnostics: Output from sesame vs. what went in
The number of sentences that are feed to Open Sesame is note the same number that goes in. IN > OUT. This messes up the synchronization of data in the pipline. 

The most obvious solution is to number sentences. However note that this affects the FSL as the numbers are in some case assigned FEs. These annotations can be ignored, but then note special cases such as:

```
#       B-Element
1       I-Element
Please  I-Element
,       0
...
```

When the tagging of the first two words are ignored from further porcessing the tag of the third word still "pollutes" the data. 

Another idea is to use some similarity-based matching algorithm to identify (exclude) sentences which have not "survived" Open Sesame. However, there is another issue: Open sesmae produces UNK for "the" suggesting which effects sentence matching. Also this fact indicates that thee is somthing wrong with the pre-trained models I use for Open Seseame. Probable action: train Open Sesamae for pre-trained models myself. 

## How many sentnces come out from Open Sesame?

In [None]:
my_file = "../data/fsl_scratch.conll" 

unique_sentences = []
with open(my_file, "r") as output_from_sesame:
    sentence = []
    for line in output_from_sesame:
        line = line.strip("\n")
        line = line.split("\t")
        if line == [""]:
            if " ".join(sentence) not in unique_sentences:
                unique_sentences.append(" ".join(sentence))
            sentence = []
        else:
            #print(line)
            sentence.append(line[1])
print(len(unique_sentences))

## ... and in the numbered version?

In [None]:
# numbered version
my_file = "../data/fsl_scratch_numbered.conll" 

unique_sentences = []
with open(my_file, "r") as output_from_sesame:
    sentence = []
    for line in output_from_sesame:
        line = line.strip("\n")
        line = line.split("\t")
        if line == [""]:
            if " ".join(sentence) not in unique_sentences:
                unique_sentences.append(" ".join(sentence))
            sentence = []
        else:
            #print(line)
            sentence.append(line[1])
print(len(unique_sentences))

## How does it look?

In [None]:
print(unique_sentences[-1])

In [None]:
for x in unique_sentences[:20]:
    print(x)

## Original count (the number of sentnces that goes into Open Sesame)

In [None]:
with open("../data/src_scratch.txt") as f:
    print(len(f.readlines()))

## Does any identitiy marker disapear? (e.g. --> "UNK") If no output: "no".

In [None]:
#test
numbers = [f"#{n}" for n in range(1, 2001)]

for x in unique_sentences:
    x = x.split()
    id_n = x[0]+x[1]
    if id_n not in numbers:
        print(id_n)
        


## Re-alignment based on numbered sentences

In [None]:
dir_in  = "../data/"
dir_out = "../data/realigned/"

# Existing files:
x_src  = "src_scratch_forFSL.txt" # this is the 2000 lines file with numbers
XX_src = "src_scratch.txt"        # this is the 2000 lines file without numbers
x_algn = "alignments_scratch.txt"
x_grmr = "trg_dep_scratch.txt"
x_ids  = "ids_scratch.txt"
x_trg  = "trg_scratch.txt"

# New files
suffix = "_realigned.txt"
new_trg  = dir_out + "trg_scratch" + suffix
new_src  = dir_out + "src_scratch" + suffix
new_algn = dir_out + x_algn[:-4] + suffix
new_grmr = dir_out + x_grmr[:-4] + suffix
new_ids  = dir_out + x_ids[:-4] + suffix

my_iter = iter(unique_sentences) # these are the unique sentences out from Open Sesame - defined above
#print(len(unique_sentences))
LENGTH = 2000
MAX = len(unique_sentences)
missed = []
i = 0

with open(dir_in + x_src, "r") as xs, open(dir_in + XX_src, "r") as XXS, open(dir_in + x_algn, "r") as xa, open(dir_in + x_grmr, "r") as xg, open(dir_in + x_ids, "r") as xi, open(dir_in + x_trg, "r") as xt, open(new_trg, "w") as new_t, open(new_src, "w") as new_s, open(new_algn, "w") as new_a, open(new_grmr, "w") as new_g, open(new_ids, "w") as new_i:
    while i < MAX:
        #print(i, end="\r")
        alignment  = xa.readline()
        grammar    = xg.readline()
        europ_id   = xi.readline()
        target     = xt.readline()
        source     = XXS.readline()
        from_fsl   = next(my_iter)
        from_fsl   = from_fsl.split()
        idn_after  = from_fsl[1]
        src_before = xs.readline().split()
        #print(src_before)
        idn_before = src_before[0][1:]
        
        
        
        # Here comes problem. Since Open Sesame tags the id "# 34",
        # this will f-ck up the statement "if idn_after == idn_before:".
        # In the tagged cases, the statement return False, when we want
        # True.
        
        if idn_after == idn_before: # compare unique_sentences with original source (numbered)
            
            new_t.write(target) # here we could potentially remove the id# if we want to
            new_s.write(source)
            new_a.write(alignment)
            new_g.write(grammar)
            new_i.write(europ_id)
        else:
            #print(idn_after, idn_before)
            while idn_after != idn_before:
                missed.append(idn_before)
                #print(len(missed), end="\r")
                alignment  = xa.readline()
                grammar    = xg.readline()
                europ_id   = xi.readline()
                target     = xt.readline()
                source     = XXS.readline()
                src_before = xs.readline().split()
                idn_before = src_before[0][1:]
            
            #print(idn_after, idn_before)
            new_t.write(target) # here we could potentially remove the id# if we want to
            new_s.write(source)
            new_a.write(alignment)
            new_g.write(grammar)
            new_i.write(europ_id)
        
        i += 1

print(len(missed))

In [None]:
missed

## Re-alignment based on similiarity ... TBC

## Simplify so called conll format to one sentence's annotation per line

**A problem -- Note to Self**
*(4 Jan) I think realignment work, but simplification does not, for some reason. There is no 1:1 mapping between sentences and their annotations. Why?*
Here is the problem to be solved:
1. collected "Resumption of the session"
2. comes to "\n" --> equal to previous? No, so write to file and reset. Set `previous` to "Resumption of ..."
3. collect "Please rise ..."
4. comes to "\n" --> **equal to previous? No** so again write to file and reset ...

... now, after the next collection: `previous == sent`, the list will be popoulated, but the first frame annotation is already printed to file. 

**A decision can only be made when there are two sentences collected**


In [None]:
data_dir = "../data/"
#my_conll_annot = data_dir + "fsl_scratch.conll"
#simplified_fsl = data_dir + "pp_fsl_scratch.txt"
my_conll_annot = data_dir + "fsl_scratch_numbered.conll" 
simplified_fsl = data_dir + "pp_fsl_scratch_numbered.txt"
check_up_file  = data_dir + "conll-pp-check_up.txt"

**Note:** indexing must be the same in transfer. Awsome align start from 0. Open-Sesame output start from 1. My numbering of lines adds 2 more to this. Eurparl annotations for dep rel also strat from 1, but these are only references. 

In [None]:
def read_conll(raw, initializer = 0, sep = ";", adjustment = 2):
    """ 
    :param adjustment: compensates (substracts) word indicies for, e.g., numbering "#1 text"
                       adjustment = 2 adjusts for "# 1 text"
    """
    
    sentence = []
    annotation = []
    fes = []
    fe = None
    
    for i, line in enumerate(raw.split("\n")[adjustment:-1], start = initializer):
        #i = i - adjustment
        
        feat = line.split("\t")

        sentence.append(feat[1]) # word
        
        if feat[-3] != "_": # LU
            lu_idx = str(i)
            frame = feat[-2] # frame
        
        fe_col = feat[-1]
            
        if fe_col == "_":
            if fe != None: # FE
                fes.append(f"{start}:{end+1}#{fe}")
                fe = None
                continue
        else:
            if fe_col[0] == "S":
                fes.append(f"{i}:{i+1}#{fe_col[2:]}")
                continue
            if fe_col[0] == "B":
                start = i
                fe = fe_col[2:]
                continue
            if fe_col[0] == "I":
                end = i
    
    festr = "|".join(fes)
    annotation = f"{frame};{lu_idx};{festr}"
    
    return sentence, annotation    

In [None]:
def terminator(string):
    
    out = []
    
    #for i, x in enumerate(string.split("\n")):
    #    print(i,x)
    
    for line in string.split("\n")[:-1]:

        line = line.split("\t")
        suffix = "-".join(line[-3:])
        if suffix == "_-_-O":
            line = line[1]
        else:
            line = line[1] + " [" + suffix + "]"
        out.append(line)
    
    out = " ".join(out)
    #print(out)

    return out

In [None]:
def simplify(conll_file, output_file, check_file, sep=" "):
    
    with open(conll_file, "r") as original, open(output_file, "w") as output, open(check_file, "w") as check:
        
        raw        = ""
        annot_sent = []
        conll_in   = []
        previous   = None
        buffer     = False
        
        for line in original:
            if line == "\n":
                sent, annot_lu = read_conll(raw)
                conll          = terminator(raw)

                if sent == previous:
                    annot_sent.append(annot_lu)
                    conll_in.append(conll)
                    raw = ""
                    previous = sent
                    buffer = True
                    continue
                else:
                    if buffer:
                        output.write(sep.join(annot_sent) + "\n")
                        annot_sent = []                        
                        annot_sent.append(annot_lu)
                    
                        check.write(" << NEXT ONE >> ".join(conll_in) + "\n")
                        conll_in = []
                        conll_in.append(conll)
                    
                        raw = ""
                        previous = sent
                        buffer = True
                        continue
                    else:
                        annot_sent.append(annot_lu)
                        conll_in.append(conll)
                        raw = ""
                        previous = sent
                        buffer = True
                        continue
            raw += line
        output.write(sep.join(annot_sent) + "\n")              # to print whats left in the buffer
        check.write(" << NEXT ONE >> ".join(conll_in) + "\n")


In [None]:
simplify(my_conll_annot, simplified_fsl, check_up_file)

# 3. Transfer

**NOTE:** Keep track of word indices!
1. Python start from 0
2. Awsome start from 0
3. Open S output start from 1
4. Enumeration prior FSL --> "# 23 text text"

**SOLUTION:** This is solved above by parameters `initalizer` and `adjustment` in `read_conll` function.

Anoter issue on deprel --> constituents: is there need of additional code to fully cover "dependents of dependents". 

In [None]:
def aligner(alignment_dict, idx):
    if idx in alignment_dict:
        return int(alignment_dict[idx])
    else:
        return None


In [None]:
def read_alignment(string):
    d = {}
    for ww in string.split():
        d[ww.split("-")[0]] = ww.split("-")[1] # SRC --> TRG ?
    # d = aligner(d)
    return d

In [None]:
def read_grammar(string):
    """ For every idx, list dependents.
    """
    
    # there can be several ROOT
    
    drs = string.split()
    #print(drs)
    
    ref2idx = {int(dr.split(";")[0]): i for i, dr in enumerate(drs)}
    ref2idx[None] = None
    #print(ref2idx)
    
    #head_of = {}
    h2d  = {idx: [] for idx in [v for v in ref2idx.values()]}
    #print(head_of)
    
    for dr in drs:
        dr = dr.split(";")
        ref  = ref2idx[int(dr[0])]
        head = ref2idx[None if dr[1] == "" else int(dr[1])] # the ROOT has no head
        
        h2d[head].append([ref])
    
    #print("before", h2d)
        
    # Phrases ...
    for head in h2d:
        if h2d[head] == []:
            continue
        constituents = h2d[head]
        update = []
        for constituent in constituents:
            #print("c", constituent)
            for w in constituent:
                new_constituents = h2d[w]
                for nc in new_constituents:
                    for nw in nc:
                        if nw not in constituent:
                            #print(constituent)
                            constituent += [nw]
            update.append(sorted(constituent))
        h2d[head] = sorted(update)
    
    return h2d

In [None]:
# Example
sentence = "Ni lÃ¤t mig aldrig komma till tals .".split()
print(len(sentence))
print("-"*10)
grammar = read_grammar("1;5;SS 2;;ROOT 3;2;SS 4;2;TA 5;2;OO 6;5;OA 7;6;PA 8;2;IP")
print(grammar)
#grammar[1]

In [None]:
def N_align(start, end, constituent, adict):
    count = 0
    for idx in range(start, end):
        if aligner(adict, idx) in constituent:
            count += 1
    return count

In [None]:
def transfer(fsl, alignment, deprel, ids, output_file, log_file, algorithm = "Tonelli2", wasteful = True):
    """ 
    :param fsl: a simplified FSL file
    :param alignment:
    :param deprel:
    :param ids:
    :param output_file:
    :param log_file:
    :param algorithm: 
    :param wasteful:
    """
    
    whereiam = 1
    
    with open(fsl, "r") as FSL, open(alignment, "r") as A, open(deprel, "r") as G, open(ids, "r") as ID, open(output_file, "w") as O, open(log_file, "w") as LOG:
        for line in FSL:
            
            line = line.strip("\n")
            #print(whereiam, line)
            idn = ID.readline().strip("\n")
            dep_rel = G.readline().strip("\n")
            adict = read_alignment(A.readline().strip("\n"))
            #print(whereiam, adict)
            
            log = []
            t_annotations = []
            
            frames = line.split(" ")    # "Frame;idx;start:end#Role|start:end#Role Frame;idx;start:end#Role|start:end#Role"
                                        # there can be multiple frames per sentence
            
            for order, this_frame in enumerate(frames):
                #trg_frame = [] # to populate
                fs_info = this_frame.split(";") # Frame;index_LU;FEs
                frame_name = fs_info[0]
                trg_lu = aligner(adict, fs_info[1]) # index of LU in src --> index of LU in trg
                if trg_lu == None:
                    trg_lu = "NoLUalign"
                    log.append(f"No target alignment--fr{order}")
                    #O.write("\n")
                    #continue # is there any point considering fes when there is no trg alignment?
                
                src_fes = []
                trg_fes = []
                
                if fs_info[-1] == "": #i.e. if there are no frame elements
                    trg_fes_str = "NoFEsinSRC"
                    trg_frame = f"{frame_name};{trg_lu};{trg_fes_str}"
                    t_annotations.append(trg_frame)
                else:
                    if trg_lu == "NoLUalign":
                        trg_fes_str = "FEsinSRCbutNoLU"
                        trg_frame = f"{frame_name};{trg_lu};{trg_fes_str}"
                        t_annotations.append(trg_frame)
                    else:
                        for x in fs_info[-1].split("|"): # pipe(|) separates FEs within frame
                            x = x.split("#")             # hashtag (#) separates span from FE name
                            y = x[0].split(":")          # colon (:) separates start from stop in span
                            y = [int(v) for v in y]      # make start and stop integers
                            z = tuple(y + [x[-1]])       # (start, stop, FE_name)
                            src_fes.append(z)

                        if algorithm == "Yang":
                            pass
                            # do something
                            # Note: must avoid (a) discontinous and (b) overlapping elements in trg

                        if algorithm == "Tonelli2":
                            grammar = read_grammar(dep_rel)
                            if trg_lu not in grammar:
                                trg_fes_str = "NoDepTRGLU"
                            else:
                                lu_trg_dependents = grammar[trg_lu]

                                for start, end, fe_name in src_fes:
                                    best_score = 0
                                    best_candidate = None

                                    for dp in lu_trg_dependents: # every dependent should be a list of word indices
                                        score = N_align(start, end, dp, adict)
                                        if score > best_score:   # what if the same score?
                                            best_score = score
                                            best_candidate = dp  # dp is a list of numbers word indices

                                if best_score != 0:
                                    start = min(best_candidate)  # note again structure of dp: a list of numbers
                                    end   = max(best_candidate)
                                    # Note assumption here:
                                    # Dependents are assumed to be continous,
                                    # while in fact dependets as derived from
                                    # read_grammar() are discontinious.
                                    trg_fe = f"{start}:{end}#{fe_name}" # add 1 to end?
                                    trg_fes.append(trg_fe)

                                #if wasteful == True:
                                #    if len(src_fes) != len(trg_fes):
                                #        O.write("/n")
                                #        log.append(f"Mismatch of frame elements")
                                #        continue

                                if trg_fes == []:
                                    trg_fes_str = "NoTransfer"
                                else:
                                    trg_fes_str = "|".join(trg_fes)
                            
                            trg_frame = f"{frame_name};{trg_lu};{trg_fes_str}"
                            t_annotations.append(trg_frame)
            
            
            
            #print(whereiam, " ".join(t_annotations))
            whereiam += 1

            O.write(" ".join(t_annotations) + "\n")
                            
            if log != []:
                LOG.write(f"{idn}:" + ";".join(log) + "\n")


In [None]:
result  = "../data/results/result.txt"
logfile = "../data/results/log.txt"

fs_annotations = "../data/pp_fsl_scratch_numbered.txt"
alignments     = "../data/realigned/alignments_scratch_realigned.txt"
grammar        = "../data/realigned/trg_dep_scratch_realigned.txt"
ids            = "../data/realigned/ids_scratch_realigned.txt"

transfer(fsl=fs_annotations, 
         alignment=alignments, 
         deprel=grammar, 
         ids=ids, 
         output_file=result, 
         log_file=logfile, 
         algorithm = "Tonelli2", 
         wasteful = True)

## Inspect data

In [None]:
from random import shuffle, seed
seed(1)

def inspector(source, target, alignment, frame_semantics, grammar, conll2pp, sample = None):
    
    with open(source, "r") as f:
        SRC = f.readlines()
        
    with open(target, "r") as f:
        TRG = f.readlines()
        
    with open(alignment, "r") as f:
        ALG = f.readlines()
    
    with open(frame_semantics, "r") as f:
        FSL = f.readlines()
        
    with open(grammar,"r") as f:
        GRM = f.readlines()
    
    with open(conll2pp, "r") as f:
        C2P = f.readlines()
    
    everything = list(zip(SRC, TRG, ALG, FSL, GRM, C2P))
    shuffle(everything)
    if sample != None:
        sample = everything[:sample]
    else:
        sample = everything
    myiter = iter(sample)
    
    return myiter


In [None]:
source    = "../data/realigned/src_scratch_realigned.txt"
target    = "../data/realigned/trg_scratch_realigned.txt"
alignment = "../data/realigned/alignments_scratch_realigned.txt"
frame_sem = "../data/pp_fsl_scratch_numbered.txt"
grammar   = "../data/realigned/trg_dep_scratch_realigned.txt"
conll2pp  = "../data/conll-pp-check_up.txt"

my_data = inspector(source, target, alignment, frame_sem, grammar, conll2pp, sample = 10)

for x in my_data:
    for y in x:
        y = y.strip("\n")
        print(len(y.split()), "\t", y.replace(" << NEXT ONE >> ", "\n\t"))
    print("-"*20)


In [None]:
def compare(file1, file2, out, span):
    with open(file1, "r") as f:
        one = f.readlines()
    with open(file2, "r") as f:
        two = f.readlines()
        
    with open(out, "w") as f:
        for one, two in zip(one, two):
            one = one.strip("\n")
            two = two.strip("\n")
            f.write(f"{one[:span]} <<>> {two[:span]}\n")

In [None]:
compare("../data/realigned/src_scratch_realigned.txt", "../data/conll-pp-check_up.txt", "../data/find_it.txt", 50)

# Evaluation

In [94]:
from operator import itemgetter
def src_evaluate(fsl_src, out):
    
    frames = {}
    frames_per_sent = {}
    fes_per_frame = []
    f_list = []
    missing_fe = 0
    lu_being_fe = {}
    
    i=0
   
    with open(fsl_src, "r") as f:
        lines = f.readlines()
        for line in lines:
            frames_sent = set()
            for frame in line.split():
                frame = frame.split(";")
                name = frame[0]
                frames_sent.add(name)
                f_list.append(name)
                if name in frames:
                    frames[name] += 1
                else:
                    frames[name] = 1
                lu = int(frame[1])
                fes  = frame[2]
                if fes == "":
                    missing_fe += 1
                    fes_per_frame.append(0)
                else:
                    fes_lst = fes.split("|")
                    fes_per_frame.append(len(fes_lst))
                    for fe in fes_lst:
                        fe = fe.split("#")
                        start, stop = tuple([int(x) for x in fe[0].split(":")])
                        fe_typ = fe[1]
                        if start == lu:
                            if stop == start + 1:
                                if fe_typ in lu_being_fe:
                                    lu_being_fe[fe_typ] += 1
                                else: 
                                    lu_being_fe[fe_typ] = 1
            for fsx in frames_sent:
                if fsx in frames_per_sent:
                    frames_per_sent[fsx] += 1
                else:
                    frames_per_sent[fsx] = 1
                
    
    length = len(lines)
    topk = 15
    frames_count = list(frames.items())
    frames_count.sort(key=itemgetter(1), reverse=True)
    
    print(f"Length: {length}")
    print(f"No. of frames (types): {len(frames.keys())}")
    print(f"No. of frame tokens: {len(f_list)}")
    print(f"Frames missing FEs: {missing_fe}")
    print(f"Frame per sentence: {round(len(f_list)/length, 3)}")
    print(f"Frame elements per frame: {round(sum(fes_per_frame)/len(fes_per_frame), 3)}")
    exm = [x for x in fes_per_frame if x > 0]
    print(f"Frame elements per frame (excl. missing): {round(sum(exm)/len(exm), 3)}")
    print(f"No. of frames where LU is also FE: {sum(lu_being_fe.values())}")

    print()
    print(f"Top {topk} frames:")
    for f in frames_count[:topk]:
        print(f"{f[0]}\t{f[1]}\t{frames_per_sent[f[0]]}\t{round((frames_per_sent[f[0]]/length)*100, 1)}")
        #print(f"{f[0]}\t{f[1]}\t{round((f[1]/length)*100, 1)}")

def transfer_eval(transfer_file, trg_file, out_file):
    
    f_status  = {}
    frames_per_sent = {}
    no_lu = 0
    frames = {}
    
    with open(transfer_file, "r") as f:
        lines = f.readlines()
        for line in lines:
            frames_sent = set()
            for frame in line.split():
                frame = frame.split(";")
                name = frame[0]
                frames_sent.add(name)
                
                lu = frame[1]
                if lu == "NoLUalign":
                    no_lu += 1
                else:
                    if name in frames:
                        frames[name] += 1
                    else:
                        frames[name] = 1
                
                fe = frame[2]
                if fe in f_status:
                    f_status[fe] += 1
                else:
                    f_status[fe] = 1
            for fsx in frames_sent:
                if fsx in frames_per_sent:
                    frames_per_sent[fsx] += 1
                else:
                    frames_per_sent[fsx] = 1
    
    length = len(lines)
    frames_count = list(frames.items())
    frames_count.sort(key=itemgetter(1), reverse=True)
    topk = 15
    
    # 'NoFEsinSRC', 'NoTransfer', 'FEsinSRCbutNoLU', 'NoDepTRGLU'
    for s in f_status.keys():
        print(s, f_status[s])
        
    print("No LU Alignment:", no_lu)
    
    print(f"Top {topk} frames:")
    for f in frames_count[:topk]:
        print(f"{f[0]}\t{f[1]}\t{frames_per_sent[f[0]]}\t{round((frames_per_sent[f[0]]/length)*100, 1)}")
        
                    
    

In [95]:
src_evaluate("../data/pp_fsl_scratch_numbered.txt", None)

Length: 1968
No. of frames (types): 446
No. of frame tokens: 12071
Frames missing FEs: 6489
Frame per sentence: 6.134
Frame elements per frame: 0.592
Frame elements per frame (excl. missing): 1.281
No. of frames where LU is also FE: 1958

Top 15 frames:
Origin	1986	1126	57.2
Degree	1024	772	39.2
Causation	563	485	24.6
Means	518	415	21.1
Taking_time	292	263	13.4
Possession	279	251	12.8
Negation	262	247	12.6
Performers_and_roles	233	224	11.4
Proportional_quantity	202	184	9.3
Being_employed	161	153	7.8
Quantified_mass	160	155	7.9
Measure_volume	158	151	7.7
Type	157	146	7.4
Likelihood	156	148	7.5
Organization	138	129	6.6


In [84]:
src_evaluate("../data/results/result.txt", None)

ValueError: invalid literal for int() with base 10: 'NoFEsinSRC'

In [96]:
transfer_eval("../data/results/result.txt", None, None)

NoFEsinSRC 6489
NoTransfer 4344
FEsinSRCbutNoLU 1188
NoDepTRGLU 50
No LU Alignment: 2832
Top 15 frames:
Origin	1218	1126	57.2
Degree	867	772	39.2
Causation	434	485	24.6
Means	385	415	21.1
Taking_time	238	263	13.4
Negation	232	247	12.6
Possession	191	251	12.8
Proportional_quantity	157	184	9.3
Performers_and_roles	150	224	11.4
Measure_volume	138	151	7.7
Likelihood	133	148	7.5
Quantified_mass	129	155	7.9
Organization	126	129	6.6
Being_employed	108	153	7.8
Time_vector	108	126	6.4


# Output format

In [None]:
def xml_annotator(trg, transfer):
    pass
       
    

# Obsolete ... 

In [None]:
def simplify(conll_file, output_file, check_file, sep=" "):
    
    with open(conll_file, "r") as original, open(output_file, "w") as output, open(check_file, "w") as check:
        
        raw = ""
        annot_sent = []
        conll_in   = []
        previous   = None
        
        for line in original:
            if line == "\n":
                sent, annot_lu = read_conll(raw)
                conll          = terminator(raw)
                
                
                
                if sent == previous:
                    annot_sent.append(annot_lu)
                    conll_in.append(conll)
                    raw = ""
                    previous = sent
                    continue
                
                else:
                    
                    
                    annot_sent.append(annot_lu)
                    output.write(sep.join(annot_sent) + "\n")
                    annot_sent = []
                    conll_in.append(conll)
                    check.write(" << NEXT ONE >> ".join(conll_in) + "\n")
                    conll_in = []
                    raw = ""
                    previous = sent
                    continue
            raw += line
