In [None]:
!pip install sentencepiece
!pip install waiting

!mkdir /content/fastalign
!cmake -B/content/fastalign -H/content/drive/MyDrive/tools/fast_align
!make -C/content/fastalign

mkdir: cannot create directory ‘/content/fastalign’: File exists
-- Could NOT find SparseHash (missing: SPARSEHASH_INCLUDE_DIR) 
-- Configuring done
-- Generating done
-- Build files have been written to: /content/fastalign
make: Entering directory '/content/fastalign'
make[1]: Entering directory '/content/fastalign'
make[2]: Entering directory '/content/fastalign'
make[2]: Leaving directory '/content/fastalign'
[ 50%] Built target atools
make[2]: Entering directory '/content/fastalign'
make[2]: Leaving directory '/content/fastalign'
[100%] Built target fast_align
make[1]: Leaving directory '/content/fastalign'
make: Leaving directory '/content/fastalign'


In [None]:
from IPython.lib.display import ScribdDocument
import os
import sentencepiece as spm
from waiting import wait
import threading, subprocess
import concurrent.futures
import time

def get_dictionary(file_name):
    """Count the word and character types of a whitespace-tokenised text file.

    Word frequencies are counted over every token of the file. Character
    frequencies are counted over the *distinct* words only: each word type
    contributes its characters once, however often the word occurs.

    Side effect: ``<file_name>.vocab_word`` and ``<file_name>.vocab_char`` are
    written with one ``item count`` pair per line, unless a file of that name
    already exists (existing files are left untouched).

    Args:
        file_name: path of the text file to read.

    Returns:
        Tuple ``(number of distinct words, number of distinct characters)``.
    """
    word_counts = {}
    with open(file_name, 'r') as corpus:
        for sentence in corpus:
            for token in sentence.split():
                word_counts[token] = word_counts.get(token, 0) + 1

    # Characters are tallied per word type, not per word occurrence.
    char_counts = {}
    for token in word_counts:
        for symbol in token:
            char_counts[symbol] = char_counts.get(symbol, 0) + 1

    for suffix, counts in (('.vocab_word', word_counts), ('.vocab_char', char_counts)):
        vocab_path = file_name + suffix
        if os.path.isfile(vocab_path):
            continue
        with open(vocab_path, 'w+') as vocab_file:
            vocab_file.writelines(
                item + ' ' + str(freq) + '\n' for item, freq in counts.items()
            )

    return len(word_counts), len(char_counts)

def get_tokenized_corpus(input, model_type, vocab_size=100, timeout_seconds=10):
    """Train a SentencePiece model on ``input`` (if needed) and tokenise the file with it.

    All artefacts are named after ``model_prefix``, which is
    ``<input>_<model_type>`` plus ``_<vocab_size>`` for ``bpe``/``unigram``
    models. The trained model is ``<model_prefix>.model``; the tokenised
    corpus is written to ``<model_prefix>`` itself (no extension), one line of
    space-separated pieces per input line. Both steps are skipped when their
    output file already exists.

    Args:
        input: path of the raw text corpus.
        model_type: SentencePiece model type (the callers in this notebook
            pass ``'char'`` or the configured ``model_type``).
        vocab_size: vocabulary size handed to the SentencePiece trainer.
        timeout_seconds: how long to wait for the ``.model`` file to appear.

    Returns:
        Tuple ``(model_prefix, vocab_size_)``. ``vocab_size_`` is the model's
        actual piece count when the corpus was tokenised in this call, and the
        requested ``vocab_size`` when the tokenised file already existed.
    """
    model_prefix = input + '_' + model_type
    if model_type in ['bpe', 'unigram']:
        model_prefix += '_' + str(vocab_size)

    if not os.path.isfile(model_prefix + '.model'):
        print("Training Sentencepiece:", model_prefix)
        cmd = "--input=%s --model_prefix=%s --model_type=%s --vocab_size=%d" % (input, model_prefix, model_type, vocab_size)
        spm.SentencePieceTrainer.train(cmd)

    wait(lambda: os.path.isfile(model_prefix + '.model'), timeout_seconds=timeout_seconds, waiting_for="SentencePiece creates .model")
    vocab_size_ = vocab_size

    if not os.path.isfile(model_prefix):
        sp = spm.SentencePieceProcessor()
        sp.load(model_prefix + '.model')

        # Both handles are context-managed so the input file is closed as well
        # (it used to be opened without ever being closed).
        with open(input, "r") as file_in, open(model_prefix, "w+") as file_out:
            for line in file_in:
                file_out.write(" ".join(sp.encode_as_pieces(line)) + "\n")

        vocab_size_ = sp.GetPieceSize()
    return model_prefix, vocab_size_

def _token_to_word_index(tokens):
    """Map 1-based token positions to the index of the word each token belongs to.

    A token starting with '▁' opens a new word. A token that is exactly '▁'
    opens a new word but gets no entry itself. Tokens seen before any '▁'
    belong to word 0.
    """
    mapping = {}
    word_idx = 0
    for position, tok in enumerate(tokens, start=1):
        if tok.startswith('▁'):
            word_idx += 1
            if tok == '▁':
                continue
        mapping[position] = word_idx
    return mapping


def token_2_word_alignment(src, tgt, align_file, is_word=(False, False)):
    """Project a token-level alignment file onto word-level alignments.

    ``src``, ``tgt`` and ``align_file`` are read in lock-step, one line each
    (reading stops at the shortest file). Every ``i-j`` link of the alignment
    line is rewritten to ``word(i)-word(j)`` using the '▁' word-boundary
    marker of the tokenised sentences; links whose token position has no
    word mapping are dropped and duplicate word links are written once, in
    first-seen order.

    The result is written to ``<align_file>.2word``. If that file already
    exists nothing is recomputed.

    Args:
        src: path of the tokenised source sentences.
        tgt: path of the tokenised target sentences.
        align_file: path of the alignment file with 1-based ``i-j`` links.
        is_word: pair of flags; when a flag is true the corresponding side is
            plain words, so '▁' is prepended to each token to make every
            token its own word.

    Returns:
        The path ``<align_file>.2word``.
    """
    out_path = align_file + '.2word'
    if not os.path.isfile(out_path):
        with open(src, 'r') as src_file, open(tgt, 'r') as tgt_file, \
                open(align_file, 'r') as file_in, open(out_path, 'w+') as file_out:
            for line_src, line_tgt, line_in in zip(src_file, tgt_file, file_in):

                line_srcs = line_src.split()
                if is_word[0]:
                    line_srcs = ['▁' + tok for tok in line_srcs]

                line_tgts = line_tgt.split()
                if is_word[1]:
                    line_tgts = ['▁' + tok for tok in line_tgts]

                token2word_src = _token_to_word_index(line_srcs)
                token2word_tgt = _token_to_word_index(line_tgts)

                word_aligns = []
                seen = set()  # constant-time duplicate check; the list keeps the order
                for align in line_in.split():
                    parts = align.split('-')
                    align_src = int(parts[0])
                    align_tgt = int(parts[1])

                    if align_src in token2word_src and align_tgt in token2word_tgt:
                        align_word = str(token2word_src[align_src]) + '-' + str(token2word_tgt[align_tgt])
                        if align_word not in seen:
                            seen.add(align_word)
                            word_aligns.append(align_word)

                file_out.write(" ".join(word_aligns) + '\n')

    return out_path

def get_test_tokenized(file_name, num_sent_test):
    """Copy the first ``num_sent_test`` lines of ``file_name`` into a sibling file.

    The output lives in the same directory as ``file_name``; its base name is
    the input's base name with every ``'_train'`` removed. Nothing is written
    when the output file already exists.

    Args:
        file_name: path of the tokenised file to read.
        num_sent_test: number of leading lines to copy. Copying stops when the
            running line count equals this value, so a value of 0 or less
            copies the whole file.

    Returns:
        Path of the output file.
    """
    # os.path keeps a bare file name relative; the previous manual
    # "/".join(...) + '/' turned 'x_train' into the root path '/x'.
    directory, base_name = os.path.split(file_name)
    dir_out = os.path.join(directory, base_name.replace('_train', ''))

    if not os.path.isfile(dir_out):
        print('Create ', dir_out)
        with open(file_name, 'r') as file_in, open(dir_out, 'w+') as file_out:
            for idx, line_in in enumerate(file_in, start=1):
                file_out.write(line_in)
                if idx == num_sent_test:
                    break

    return dir_out
            
def _discard_if_superseded(intermediate, successor, min_lines=10):
    """Delete ``intermediate`` once ``successor`` exists and has more than ``min_lines`` lines."""
    if not (os.path.isfile(intermediate) and os.path.isfile(successor)):
        return
    with open(successor, 'r') as f:
        num_line = sum(1 for _ in f)
    if num_line > min_lines:
        # Truncate before unlinking, exactly as before
        # (presumably so a trashed copy holds no data — TODO confirm).
        open(intermediate, 'w').close()
        os.remove(intermediate)


def train_fastalign(vocab_pair, file_src, file_tgt, num_sent_test, timeout_seconds=10):
    """Align ``file_src`` with ``file_tgt`` using fast_align and project the result to words.

    All artefacts are created next to ``file_src`` and share the base name
    ``<basename(file_src)>-<basename(file_tgt)>``. Each step is skipped when
    its output file already exists:

    1. ``<base>``: parallel corpus in ``src ||| tgt`` format (line pairs with
       a blank side are skipped).
    2. ``<base>.align``: fast_align output (0-based links).
    3. ``<base>.align.shift1``: first ``num_sent_test`` lines with both
       indices incremented by one.
    4. ``<base>.align.shift1.2word``: word-level links, written by
       ``token_2_word_alignment``.

    Afterwards each intermediate file is deleted once its successor exists
    and has more than 10 lines.

    NOTE(review): step 1 drops blank line pairs, while step 4 reads the
    original ``file_src``/``file_tgt`` line by line next to the alignment
    file. Corpora containing blank lines would presumably go out of sync;
    verify the inputs have none.

    Args:
        vocab_pair: label ``<src_vocab>-<tgt_vocab>``; a side named ``word``
            is treated as untokenised words in step 4.
        file_src: path of the (tokenised) source corpus.
        file_tgt: path of the (tokenised) target corpus.
        num_sent_test: number of leading sentences kept in step 3.
        timeout_seconds: timeout for each ``wait`` on an output file.

    Returns:
        The path ``<base>.align.shift1.2word``.

    Raises:
        OSError: the fast_align binary could not be started.
        subprocess.CalledProcessError: fast_align exited with a non-zero
            status. In both cases the partial ``.align`` file is removed so
            a later run does not mistake it for a finished result.
    """
    both = os.path.basename(file_src) + '-' + os.path.basename(file_tgt)
    both_dir = os.path.join(os.path.dirname(file_src), both)

    align_path = both_dir + '.align'
    shift_path = both_dir + '.align.shift1'
    word_path = both_dir + '.align.shift1.2word'

    if not os.path.isfile(word_path):

        if not os.path.isfile(both_dir):
            print(vocab_pair, ': Create ', both_dir)
            with open(file_src, 'r') as file_src_, open(file_tgt, 'r') as file_tgt_, \
                    open(both_dir, "w+") as file_both:
                for line_src, line_tgt in zip(file_src_, file_tgt_):
                    if line_src != '\n' and line_tgt != '\n' and line_src != '' and line_tgt != '':
                        file_both.write(line_src.rstrip('\n') + ' ||| ' + line_tgt)

        if not os.path.isfile(align_path) and not os.path.isfile(shift_path):
            # Argument list plus an explicit stdout file instead of a shell
            # string: paths with spaces work and the exit status is checked.
            try:
                with open(align_path, 'w') as align_out:
                    subprocess.run(
                        ["/content/fastalign/fast_align", "-i", both_dir, "-d", "-o", "-v"],
                        stdout=align_out,
                        check=True,
                    )
            except (OSError, subprocess.CalledProcessError):
                if os.path.isfile(align_path):
                    os.remove(align_path)
                raise
            print(vocab_pair, ': Done training ', align_path)
            wait(lambda: os.path.isfile(align_path), timeout_seconds=timeout_seconds, waiting_for="Fastalign returns alignment file")

        if not os.path.isfile(shift_path):
            print(vocab_pair, ': Shift 1 ', shift_path)
            with open(align_path, 'r') as file_in, open(shift_path, 'w+') as file_out:
                for idx, line_in in enumerate(file_in, start=1):
                    aligns = []
                    for align in line_in.split():
                        parts = align.split('-')
                        aligns.append("%d-%d" % (int(parts[0]) + 1, int(parts[1]) + 1))

                    file_out.write(" ".join(aligns) + '\n')
                    if idx == num_sent_test:
                        break

            wait(lambda: os.path.isfile(shift_path), timeout_seconds=timeout_seconds, waiting_for="Shift 1 for alignment file")

        if not os.path.isfile(word_path):
            print(vocab_pair, ': Create ', word_path)
            is_word = [vocab == 'word' for vocab in vocab_pair.split('-')]

            # Use the corpus paths we were given. Re-deriving them by splitting
            # the combined name on '.' and '-' breaks for file names that
            # contain those characters or for a target in another directory.
            final_output = token_2_word_alignment(file_src, file_tgt, shift_path, is_word=is_word)

            wait(lambda: os.path.isfile(final_output), timeout_seconds=timeout_seconds, waiting_for="Token_2_word for alignment file")

        print('-----', vocab_pair, ': DONE ', word_path)
    else:
        print('-----', vocab_pair, ': Already found')

    # Drop each intermediate file once the next stage has been produced.
    _discard_if_superseded(both_dir, align_path)
    _discard_if_superseded(align_path, shift_path)
    _discard_if_superseded(shift_path, word_path)

    return word_path

def _write_mapping(path, mapping):
    """Write ``mapping`` to ``path`` as one ``key value`` pair per line."""
    with open(path, 'w+') as f:
        for key, value in mapping.items():
            f.write('%s %s\n' % (key, value))


def get_alignment_files(train_src, train_tgt,
                        test_src, test_tgt,
                        data_dir,
                        src,
                        tgt,
                        model_type='bpe',
                        vocab_sizes=None,
                        timeout_seconds=180):
    """Run the whole tokenise / align / re-align pipeline for one language pair.

    Steps:

    1. Concatenate test and train data (test first) into
       ``<data_dir><src>_test_train`` and ``<data_dir><tgt>_test_train``.
    2. Build word/char vocabularies with ``get_dictionary``.
    3. Tokenise both sides with a ``char`` model and, when ``vocab_sizes`` is
       given, with one ``model_type`` model per usable vocabulary size
       (strictly between the character and the word vocabulary size).
       The untokenised corpus is registered under the key ``'word'``.
    4. Run ``train_fastalign`` for every source/target tokenisation pair in
       both directions.
    5. Write four index files under ``data_dir``: ``output_<src>_<tgt>``,
       ``output_<tgt>_<src>`` (pair label to word-alignment file) and
       ``output_<src>``, ``output_<tgt>`` (tokenisation label to test-only
       tokenised file, created with ``get_test_tokenized``).

    Args:
        train_src, train_tgt: paths of the training corpora.
        test_src, test_tgt: paths of the test corpora.
        data_dir: output prefix; it is concatenated directly with file names,
            so it is expected to end with a path separator.
        src, tgt: labels used in the generated file names.
        model_type: SentencePiece model type used for ``vocab_sizes``.
        vocab_sizes: optional iterable of candidate vocabulary sizes.
        timeout_seconds: timeout passed to every ``wait`` and to
            ``train_fastalign``.

    Returns:
        None. Results are reported through the files listed above.

    Raises:
        ValueError: ``test_src`` contains no lines.
    """
    corpus_src = data_dir + src + '_test_train'
    corpus_tgt = data_dir + tgt + '_test_train'

    # Concatenate test and train set
    cmd_format = "cat %s %s > %s%s_test_train"
    cmd_src = cmd_format % (test_src, train_src, data_dir, src)
    cmd_tgt = cmd_format % (test_tgt, train_tgt, data_dir, tgt)

    # makedirs copes with nested and already existing directories, which a
    # plain shell `mkdir` does not.
    if data_dir:
        os.makedirs(data_dir, exist_ok=True)

    print(cmd_src)
    print(cmd_tgt)

    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        # Pass the callable and its argument separately. submit(os.system(cmd))
        # ran the command eagerly and then submitted its integer exit status
        # as the "function" to call.
        cat_jobs = [executor.submit(os.system, cmd_src),
                    executor.submit(os.system, cmd_tgt)]
        for job in cat_jobs:
            job.result()

    wait(lambda: os.path.isfile(corpus_src), timeout_seconds=timeout_seconds, waiting_for="Os creates source file")
    wait(lambda: os.path.isfile(corpus_tgt), timeout_seconds=timeout_seconds, waiting_for="Os creates target file")

    #---------------------------------------------------------------------------
    # Get vocabulary size and create vocabulary file.
    start_time = time.time()

    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        # Submit both jobs before collecting either result, so they can overlap.
        dict_src = executor.submit(get_dictionary, corpus_src)
        dict_tgt = executor.submit(get_dictionary, corpus_tgt)
        vocab_word_size_src, vocab_char_size_src = dict_src.result()
        vocab_word_size_tgt, vocab_char_size_tgt = dict_tgt.result()

    print("Source vocab:", vocab_word_size_src, vocab_char_size_src)
    print("Target vocab:", vocab_word_size_tgt, vocab_char_size_tgt)

    if vocab_sizes is not None:
        vocab_src_sizes = [size for size in vocab_sizes if size > vocab_char_size_src and size < vocab_word_size_src]
        vocab_tgt_sizes = [size for size in vocab_sizes if size > vocab_char_size_tgt and size < vocab_word_size_tgt]

        print("Search space:", vocab_sizes)
        print("Search space for Source:", vocab_src_sizes)
        print("Search space for Target:", vocab_tgt_sizes)

    print("Preparing vocabulary finised in %f"%(time.time()-start_time))

    #---------------------------------------------------------------------------
    # Tokenise both corpora at every granularity.
    start_time = time.time()
    files_src = {}
    files_tgt = {}

    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        output_src = executor.submit(get_tokenized_corpus, corpus_src, 'char')
        output_tgt = executor.submit(get_tokenized_corpus, corpus_tgt, 'char')

        token_char_src, size_src = output_src.result()
        token_char_tgt, size_tgt = output_tgt.result()

    files_src['char'] = token_char_src
    files_tgt['char'] = token_char_tgt

    if vocab_sizes is not None:
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
            output_src = [executor.submit(get_tokenized_corpus, corpus_src, model_type, vocab_src) for vocab_src in vocab_src_sizes]
            output_tgt = [executor.submit(get_tokenized_corpus, corpus_tgt, model_type, vocab_tgt) for vocab_tgt in vocab_tgt_sizes]

            # Keyed by the vocabulary size reported by get_tokenized_corpus.
            for output in concurrent.futures.as_completed(output_src):
                tokenized_path, reported_size = output.result()
                files_src[str(reported_size)] = tokenized_path

            for output in concurrent.futures.as_completed(output_tgt):
                tokenized_path, reported_size = output.result()
                files_tgt[str(reported_size)] = tokenized_path

    files_src['word'] = corpus_src
    files_tgt['word'] = corpus_tgt

    print("Tokenization files:")
    print(files_src)
    print(files_tgt)
    print("Tokenization finised in %f"%(time.time()-start_time))

    #---------------------------------------------------------------------------
    # Align every tokenisation pair in both directions.
    start_time = time.time()

    with open(test_src, 'r') as test_src_file:
        num_sent_test = sum(1 for _ in test_src_file)
    if num_sent_test == 0:
        # Previously this surfaced as an unbound local variable.
        raise ValueError("test_src contains no sentences: %s" % test_src)

    files_align2word_src_tgt = {}
    files_align2word_tgt_src = {}
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        outputs_src_tgt = {}
        outputs_tgt_src = {}

        for src_ in files_src:
            for tgt_ in files_tgt:
                outputs_src_tgt[src_+'-'+tgt_] = executor.submit(train_fastalign, src_+'-'+tgt_, files_src[src_], files_tgt[tgt_], num_sent_test, timeout_seconds)

        for tgt_ in files_tgt:
            for src_ in files_src:
                outputs_tgt_src[tgt_+'-'+src_] = executor.submit(train_fastalign, tgt_+'-'+src_, files_tgt[tgt_], files_src[src_], num_sent_test, timeout_seconds)

        for vocab_pair in outputs_src_tgt:
            files_align2word_src_tgt[vocab_pair] = outputs_src_tgt[vocab_pair].result()

        for vocab_pair in outputs_tgt_src:
            files_align2word_tgt_src[vocab_pair] = outputs_tgt_src[vocab_pair].result()

    print("Alignment-word output files:")
    print("Src-Tgt:", len(files_align2word_src_tgt), files_align2word_src_tgt)
    print("Tgt-Src:", len(files_align2word_tgt_src), files_align2word_tgt_src)

    _write_mapping(data_dir+"output_%s_%s"%(src,tgt), files_align2word_src_tgt)
    _write_mapping(data_dir+"output_%s_%s"%(tgt,src), files_align2word_tgt_src)

    # Cut the test-only part out of every tokenised corpus.
    files_test_src = {}
    files_test_tgt = {}

    for vocab in files_src:
        files_test_src[vocab] = get_test_tokenized(files_src[vocab], num_sent_test)

    for vocab in files_tgt:
        files_test_tgt[vocab] = get_test_tokenized(files_tgt[vocab], num_sent_test)

    _write_mapping(data_dir+"output_%s"%(src), files_test_src)
    _write_mapping(data_dir+"output_%s"%(tgt), files_test_tgt)

    print("Training and Re-aligning finised in %f"%(time.time()-start_time))

In [None]:
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import matplotlib.colors as mcolors
import json
import concurrent.futures

def clean_alignment(line):
    """Return ``line`` with every newline character removed."""
    return "".join(line.split("\n"))

def clean_sentence(line):
    """Return ``line`` with every newline character removed."""
    without_newlines = line.translate({ord("\n"): None})
    return without_newlines

def get_prediction(prediction_file_name):
    """Read a predicted-alignment file into a list of link lists.

    Args:
        prediction_file_name: path of a UTF-8 text file with one sentence per
            line, each line holding whitespace-separated alignment links.

    Returns:
        One list of link strings per line of the file (an empty list for a
        blank line).
    """
    # Context manager so the handle is closed (it used to be left open).
    with open(prediction_file_name, 'r', encoding='utf-8') as prediction_file:
        return [clean_alignment(line).split() for line in prediction_file]

    
def get_reference(reference_file_name, ref_only=True):
    """Read a reference-alignment file that stores two lines per sentence.

    The file is consumed in pairs of consecutive lines. The first line of a
    pair is used as the "sure" links and the second as the "possible" links;
    the combined reference of a sentence is the sure links followed by the
    possible links.

    Args:
        reference_file_name: path of the UTF-8 reference file.
        ref_only: when true, return only the combined reference lists.

    Returns:
        ``ref`` when ``ref_only`` is true, otherwise the tuple
        ``(ref, sure, possible)``. Each is a list with one list of link
        strings per sentence.

    Raises:
        ValueError: the file has an odd number of lines and therefore cannot
            be split into pairs.
    """
    # Context manager so the handle is closed (it used to be left open).
    with open(reference_file_name, 'r', encoding='utf-8') as reference_file:
        reference_lines = reference_file.readlines()

    if len(reference_lines) % 2 != 0:
        raise ValueError(
            "reference file must contain two lines per sentence, got %d lines: %s"
            % (len(reference_lines), reference_file_name)
        )

    ref = []
    sure = []
    possible = []

    # Even-indexed lines pair with the odd-indexed line that follows them.
    for s, p in zip(reference_lines[0::2], reference_lines[1::2]):
        sure_links = clean_alignment(s).split()
        possible_links = clean_alignment(p).split()
        sure.append(sure_links)
        possible.append(possible_links)
        ref.append(sure_links + possible_links)

    if ref_only:
        return ref
    return ref, sure, possible
        
def get_corpus_file(corpus_file_name):
    """Read a corpus file into a list of token lists.

    Args:
        corpus_file_name: path of a UTF-8 text file with one sentence per line.

    Returns:
        One list of whitespace-separated tokens per line of the file (an
        empty list for a blank line).
    """
    # Context manager so the handle is closed (it used to be left open).
    with open(corpus_file_name, 'r', encoding='utf-8') as corpus_file:
        return [clean_sentence(line).split() for line in corpus_file]

def calculate_AER(reference_file_name, prediction_file_name):
    """Compute an alignment error rate from a reference file and a prediction file.

    Sentences are paired position by position (stopping at the shorter
    input). With ``A`` the predicted links, the score is
    ``1 - (2 * |A in sure| + |A in possible|) / (|A| + |sure|)``, where
    "sure" and "possible" are the two link lists returned per sentence by
    ``get_reference``. Each predicted link is tested individually, so a link
    repeated in the prediction is counted each time.

    Returns:
        The error rate as a float.

    Raises:
        ZeroDivisionError: there are neither predicted nor sure links.
    """
    _, sure_set, fuzzy_set = get_reference(reference_file_name, ref_only=False)
    prediction_set = get_prediction(prediction_file_name)

    matched_sure = 0.
    matched_fuzzy = 0.
    total_predicted = 0.
    total_sure = 0.

    for sure_links, fuzzy_links, predicted_links in zip(sure_set, fuzzy_set, prediction_set):
        matched_sure += float(sum(1 for link in predicted_links if link in sure_links))
        matched_fuzzy += float(sum(1 for link in predicted_links if link in fuzzy_links))
        total_predicted += float(len(predicted_links))
        total_sure += float(len(sure_links))

    return 1. - (matched_sure*2 + matched_fuzzy) / (total_predicted + total_sure)

def calculate_scores(tp, fp, tn, fn):
    """Derive accuracy, precision, recall and F1 from confusion-matrix counts.

    Any score whose denominator is zero is reported as ``0.``.

    Args:
        tp, fp, tn, fn: true-positive, false-positive, true-negative and
            false-negative counts.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``.
    """
    total = tp + fp + tn + fn
    acc = (tp+tn)/total if total != 0 else 0.

    predicted_positive = tp+fp
    precision = tp/predicted_positive if predicted_positive != 0. else 0.

    actual_positive = tp+fn
    recall = tp/actual_positive if actual_positive != 0. else 0.

    pr_sum = precision+recall
    f1 = (2*precision*recall)/pr_sum if pr_sum != 0 else 0.

    return acc, precision, recall, f1

def analyse_reference(reference_set,
                      sure_set, possible_set,
                      source_set, target_set):
    """Collect corpus-level statistics about a set of reference alignments.

    ``source_set``, ``target_set`` and ``reference_set`` are walked in
    parallel, one entry per sentence. For every sentence each position pair
    ``(i, j)`` with ``1 <= i <= len(source)`` and ``1 <= j <= len(target)``
    is rendered as the string ``"i-j"`` and looked up in the sentence's
    reference entry with ``in``.

    Link categories used for the counters (a "link" is a pair found in ``ref``):

    * one2one: neither its source nor its target position has another link.
    * one2many: its source position has another link, its target does not.
    * many2one: its target position has another link, its source does not.
    * many2many: both its source and its target position have another link.

    Args:
        reference_set: per sentence, a container of ``"i-j"`` link strings
            (presumably the combined sure + possible links — verify against
            the caller).
        sure_set: per sentence, the sure links; only their total count is used.
        possible_set: per sentence, the possible links; only their total
            count is used.
        source_set: per sentence, the sequence of source words.
        target_set: per sentence, the sequence of target words.

    Returns:
        A dict of counters, plus the per-sentence word and character length
        lists and the mean per-sentence ratio of unaligned source / target
        words. The keys are listed at the end of the function.

    Raises:
        ZeroDivisionError: a source or target sentence is empty, because the
            per-sentence null ratio divides by the sentence length.
    """

    #-------------------------------------
    # total_num_link sums len(source) * len(target) over sentences;
    # total_null_link sums len(source) + len(target).
    total_num_link = 0
    total_null_link = 0
    
    num_word_source = 0
    num_word_target = 0

    # Per-sentence lengths in words and in characters (sum of word lengths).
    len_word_source = []
    len_word_target = []

    len_char_source = []
    len_char_target = []
    
    # Count for Reference
    num_align_ref = 0
    num_no_ref = 0
    
    num_sure = 0
    num_fuzzy = 0
    
    # *_source / *_target variants: see the comments where they are updated;
    # some count links, others count distinct word positions.
    num_align_ref_one2one = 0
    num_align_ref_one2many_source = 0
    num_align_ref_one2many_target = 0
    num_align_ref_many2one_source = 0
    num_align_ref_many2one_target = 0
    num_align_ref_many2many = 0
    num_align_ref_many2many_source = 0
    num_align_ref_many2many_target = 0
    
    num_no_ref_no = 0
    num_no_ref_null = 0


    # Null
    num_source2null_ref = 0
    num_target2null_ref = 0
    num_source2notnull_ref = 0
    num_target2notnull_ref = 0
    
    num_source2null_ref_ratio_list = []
    num_target2null_ref_ratio_list = []
    
    # Total number of entries across all sure / possible lines.
    for line in sure_set:
        for s in line:
            num_sure+= 1
    
    for line in possible_set:
        for p in line:
            num_fuzzy+= 1
    
    # NOTE: idx is not used inside the loop body.
    for idx, [source, target, ref] in enumerate(zip(source_set, target_set, reference_set)):
        
        source_len = len(source)
        target_len = len(target)

        len_word_source.append(source_len)
        len_word_target.append(target_len)

        len_char_source_ = 0
        for word in source:
            len_char_source_+=len(word)
        len_char_source.append(len_char_source_)

        len_char_target_ = 0
        for word in target:
            len_char_target_+=len(word)
        len_char_target.append(len_char_target_)

        
        total_num_link += ((source_len) * (target_len))
        total_null_link += ((source_len) + (target_len))
        
        num_word_source += source_len
        num_word_target += target_len
        

        #---------------------------
        # Null
        num_source2null_ref_sent = 0
        num_target2null_ref_sent = 0
        
        # A source position is "null" when it has no link to any target position.
        for idx_source in range(1, source_len+1):
            source_to_null_ref = True
            for idx_target in range(1, target_len+1):
                align_check = str(idx_source) +'-'+ str(idx_target)
                if align_check in ref:
                    source_to_null_ref = False
                    
            if source_to_null_ref:
                num_source2null_ref+=1
                num_source2null_ref_sent+=1
            else:
                num_source2notnull_ref+=1
                    
            
        # Same test from the target side.
        for idx_target in range(1, target_len+1):
            target_to_null_ref = True
            for idx_source in range(1, source_len+1):
                align_check = str(idx_source) +'-'+ str(idx_target)
                if align_check in ref:
                    target_to_null_ref = False
                    
            if target_to_null_ref:
                num_target2null_ref+=1
                num_target2null_ref_sent+=1
            else:
                num_target2notnull_ref+=1
                

                
        # Share of unaligned words in this sentence; divides by the sentence
        # length, so an empty sentence raises ZeroDivisionError.
        num_source2null_ref_ratio_list.append(num_source2null_ref_sent/source_len)
        num_target2null_ref_ratio_list.append(num_target2null_ref_sent/target_len)
        
        # Distinct word positions per category, reset for every sentence.
        num_align_ref_one2many_source_list = []
        num_align_ref_many2one_target_list = []
        num_align_ref_many2many_source_list = []
        num_align_ref_many2many_target_list = []
        
        # NOTE: these four link lists are filled below but never read in this function.
        align_ref_one2one_list = []
        align_ref_one2many_list = []
        align_ref_many2one_list = []
        align_ref_many2many_list = []
        
        # First pass: classify every position pair as one2one / one2many /
        # many2one link, or as a non-link.
        for idx_source in range(1, source_len+1):
            for idx_target in range(1, target_len+1):
                align_check = str(idx_source) +'-'+ str(idx_target)
                
                # Count number of links in Ref
                if align_check in ref:
                    num_align_ref +=1
                    
                    # check_one2many: the same source position has another link.
                    # check_many2one: the same target position has another link.
                    check_one2many = False
                    check_many2one = False
                    for idx_target_ in range(1, target_len+1):
                        align_check_ = str(idx_source) +'-'+ str(idx_target_)
                        if align_check_ in ref and align_check_ != align_check:
                            check_one2many = True
                    for idx_source_ in range(1, source_len+1):
                        align_check_ = str(idx_source_) +'-'+ str(idx_target)
                        if align_check_ in ref and align_check_ != align_check:
                            check_many2one = True
                    
                    if check_one2many is True and check_many2one is False:
                        # Counts links here; the distinct source positions are
                        # collected in the list and added up after the loops.
                        num_align_ref_one2many_target +=1
                        align_ref_one2many_list.append(align_check)
                        if idx_source not in num_align_ref_one2many_source_list:
                            num_align_ref_one2many_source_list.append(idx_source)
                    if check_many2one is True and check_one2many is False:
                        # Counts links here; the distinct target positions are
                        # collected in the list and added up after the loops.
                        num_align_ref_many2one_source +=1
                        align_ref_many2one_list.append(align_check)
                        if idx_target not in num_align_ref_many2one_target_list:
                            num_align_ref_many2one_target_list.append(idx_target)
                    if check_one2many is False and check_many2one is False:
                        num_align_ref_one2one +=1
                        align_ref_one2one_list.append(align_check)
                        
                # Count number of links not in Ref
                if align_check not in ref:
                    num_no_ref +=1
                    
                    source_to_null = True
                    for idx_target_ in range(1, target_len+1):
                        align_check_ = str(idx_source) +'-'+ str(idx_target_)
                        if align_check_ in ref:
                            source_to_null = False
                    
                    target_to_null = True
                    for idx_source_ in range(1, source_len+1):
                        align_check_ = str(idx_source_) +'-'+ str(idx_target)
                        if align_check_ in ref:
                            target_to_null = False
                            
                    # "null" non-link: both words are completely unaligned;
                    # every other non-link is counted as "no".
                    if source_to_null and target_to_null:
                        num_no_ref_null +=1
                        
                    else:
                        num_no_ref_no +=1
                
                        
        # Second pass: many2many links. It runs after the first pass because it
        # consults the complete one2many / many2one position lists built there.
        for idx_source in range(1, source_len+1):
            for idx_target in range(1, target_len+1):
                align_check = str(idx_source) +'-'+ str(idx_target)
                # Count number of many2many links in Ref
                if align_check in ref:
                    
                    check_one2many = False
                    check_many2one = False
                    for idx_target_ in range(1, target_len+1):
                        align_check_ = str(idx_source) +'-'+ str(idx_target_)
                        if align_check_ in ref and align_check_ != align_check:
                            check_one2many = True
                    for idx_source_ in range(1, source_len+1):
                        align_check_ = str(idx_source_) +'-'+ str(idx_target)
                        if align_check_ in ref and align_check_ != align_check:
                            check_many2one = True
                            
                    if check_one2many is True and check_many2one is True:
                        # A position already recorded as one2many source
                        # (or many2one target) is not recorded again here.
                        if idx_source not in num_align_ref_many2many_source_list \
                        and idx_source not in num_align_ref_one2many_source_list:
                            num_align_ref_many2many_source_list.append(idx_source)
                        if idx_target not in num_align_ref_many2many_target_list \
                        and idx_target not in num_align_ref_many2one_target_list:
                            num_align_ref_many2many_target_list.append(idx_target)
                        num_align_ref_many2many+=1
                        align_ref_many2many_list.append(align_check)
                        
                        
        # Fold this sentence's distinct-position counts into the corpus totals.
        num_align_ref_one2many_source += len(num_align_ref_one2many_source_list)
        num_align_ref_many2one_target += len(num_align_ref_many2one_target_list)
        
        num_align_ref_many2many_source += len(num_align_ref_many2many_source_list)
        num_align_ref_many2many_target += len(num_align_ref_many2many_target_list)
    
    
    # Mean over sentences of the per-sentence unaligned-word ratio.
    num_source2null_ref_ratio_mean = np.mean(num_source2null_ref_ratio_list)
    num_target2null_ref_ratio_mean = np.mean(num_target2null_ref_ratio_list)
    
    
    #-------------------------------------
    values = {"num_word_source": num_word_source, 
              "num_word_target": num_word_target,
              "total_num_link": total_num_link,
              "num_sure": num_sure, 
              "num_fuzzy": num_fuzzy,
              
              "num_align_ref": num_align_ref, 
              "num_no_ref": num_no_ref, 
              "num_no_ref_no": num_no_ref_no, 
              "num_no_ref_null": num_no_ref_null,

              "num_align_ref_one2one": num_align_ref_one2one, 
              "num_align_ref_one2many_source": num_align_ref_one2many_source, 
              "num_align_ref_one2many_target": num_align_ref_one2many_target,
              "num_align_ref_many2one_source": num_align_ref_many2one_source, 
              "num_align_ref_many2one_target": num_align_ref_many2one_target,
              "num_align_ref_many2many": num_align_ref_many2many, 
              "num_align_ref_many2many_source": num_align_ref_many2many_source, 
              "num_align_ref_many2many_target": num_align_ref_many2many_target,

              "total_null_link": total_null_link, 
              "num_source2null_ref": num_source2null_ref, 
              "num_target2null_ref": num_target2null_ref, 
              "num_source2notnull_ref": num_source2notnull_ref, 
              "num_target2notnull_ref": num_target2notnull_ref,
              "num_source2null_ref_ratio_mean": num_source2null_ref_ratio_mean, 
              "num_target2null_ref_ratio_mean": num_target2null_ref_ratio_mean,
              
              "len_word_source": len_word_source,
              "len_word_target": len_word_target,
              
              "len_char_source": len_char_source,
              "len_char_target": len_char_target

              }
    
    return values

def analyse_prediction(prediction_set,
                       reference_set,
                       sure_set, fuzzy_set,
                       source_set,
                       target_set):
    """Compare predicted alignment links with the reference and collect counts and scores.

    A link is the string ``"i-j"`` built from a 1-based source position ``i`` and a
    1-based target position ``j``. For every sentence, each candidate pair with
    ``1 <= i <= len(source)`` and ``1 <= j <= len(target)`` is tested with ``in``
    against that sentence's prediction and reference; links whose indices fall
    outside these ranges are never visited by the per-sentence loops.

    Args:
        prediction_set: per-sentence containers of predicted links.
        reference_set: per-sentence containers of reference links.
        sure_set: per-sentence containers of sure links (used for AER only).
        fuzzy_set: per-sentence containers of fuzzy links (used for AER only).
        source_set: per-sentence source sequences; only ``len()`` is used.
        target_set: per-sentence target sequences; only ``len()`` is used.

        NOTE(review): the containers are presumably lists/sets of ``"i-j"`` strings;
        if they were plain strings, ``in`` would do substring matching -- confirm
        against ``get_prediction`` / ``get_reference``.

    Returns:
        dict: the counters listed at the end of the function (link counts by shape,
        TP/FP/FN/TN over candidate pairs and their breakdowns, null-alignment
        counts), the four values returned by ``calculate_scores`` for links and for
        null alignment, the mean per-sentence unaligned ratios, and ``aer``.

    Raises:
        ZeroDivisionError: if a sentence has length 0 (unaligned-ratio division), or
            if the predictions and the sure links are all empty (AER division).

    Relies on the module-level names ``np`` and ``calculate_scores``.
    """

    # Count for Prediction
    num_align_pred = 0
    num_no_pred = 0
    num_align_pred_one2one = 0
    num_align_pred_one2many_source = 0
    num_align_pred_one2many_target = 0
    num_align_pred_many2one_source = 0
    num_align_pred_many2one_target = 0
    num_align_pred_many2many = 0
    num_align_pred_many2many_source = 0
    num_align_pred_many2many_target = 0

    num_no_pred_no = 0
    num_no_pred_null = 0

    #TP
    num_true_align_tp = 0

    num_true_align_tp_one2one_pred = 0
    num_true_align_tp_one2many_pred = 0
    num_true_align_tp_many2one_pred = 0
    num_true_align_tp_many2many_pred = 0

    #TN
    num_true_no_tn = 0
    num_true_no_tn_no_in_pred = 0
    num_true_no_tn_null_in_pred = 0

    #FN
    num_false_no_fn = 0
    num_false_no_fn_no_in_pred = 0
    num_false_no_fn_null_in_pred = 0

    #FP
    num_false_align_no_fp = 0
    num_false_align_no_fp_one2one_pred = 0
    num_false_align_no_fp_one2many_pred = 0
    num_false_align_no_fp_many2one_pred = 0
    num_false_align_no_fp_many2many_pred = 0

    num_false_align_no_fp_no_in_ref = 0
    num_false_align_no_fp_null_in_ref = 0

    # Null
    num_source2null_pred = 0
    num_target2null_pred = 0
    num_source2notnull_pred = 0
    num_target2notnull_pred = 0

    num_source2null_pred_tp = 0
    num_source2null_pred_fp = 0
    num_source2null_pred_tn = 0
    num_source2null_pred_fn = 0

    num_target2null_pred_tp = 0
    num_target2null_pred_fp = 0
    num_target2null_pred_tn = 0
    num_target2null_pred_fn = 0

    # One ratio per sentence; averaged after the loop.
    num_source2null_pred_ratio_list = []
    num_target2null_pred_ratio_list = []

    # Combined (source + target) null-alignment confusion counts.
    num_true_null_tp = 0
    num_false_not_null_fp = 0
    num_false_null_fn = 0
    num_true_not_null_tn = 0

    # zip() stops at the shortest of the four inputs; `idx` is not used in the body.
    for idx, [source, target, pred, ref] in enumerate(zip(source_set, target_set,
                                                                      prediction_set, reference_set)):

        source_len = len(source)
        target_len = len(target)

        #---------------------------
        # Null
        num_source2null_pred_sent = 0
        num_target2null_pred_sent = 0

        # For every source position: does it have any link in ref / in pred?
        for idx_source in range(1, source_len+1):
            source_to_null_ref = True
            source_to_null_pred = True

            for idx_target in range(1, target_len+1):
                align_check = str(idx_source) +'-'+ str(idx_target)
                if align_check in ref:
                    source_to_null_ref = False
                if align_check in pred:
                    source_to_null_pred = False


            if source_to_null_pred:
                num_source2null_pred+=1
                num_source2null_pred_sent+=1

            else:
                num_source2notnull_pred+=1

            # Confusion counts with "unaligned" as the positive class:
            # tp = unaligned in both, tn = aligned in both,
            # fn = unaligned in ref only, fp = unaligned in pred only.
            if source_to_null_ref and source_to_null_pred:
                num_true_null_tp+=1
                num_source2null_pred_tp+=1

            if not source_to_null_ref and not source_to_null_pred:
                num_true_not_null_tn+=1
                num_source2null_pred_tn+=1

            if source_to_null_ref and not source_to_null_pred:
                num_false_null_fn+=1
                num_source2null_pred_fn+=1

            if not source_to_null_ref and source_to_null_pred:
                num_false_not_null_fp+=1
                num_source2null_pred_fp+=1


        # Same check for every target position.
        for idx_target in range(1, target_len+1):
            target_to_null_ref = True
            target_to_null_pred = True

            for idx_source in range(1, source_len+1):
                align_check = str(idx_source) +'-'+ str(idx_target)
                if align_check in ref:
                    target_to_null_ref = False
                if align_check in pred:
                    target_to_null_pred = False

            if target_to_null_pred:
                num_target2null_pred+=1
                num_target2null_pred_sent+=1
            else:
                num_target2notnull_pred+=1

            if target_to_null_ref and target_to_null_pred:
                num_true_null_tp+=1
                num_target2null_pred_tp+=1
            if not target_to_null_ref and not target_to_null_pred:
                num_true_not_null_tn+=1
                num_target2null_pred_tn+=1
            if target_to_null_ref and not target_to_null_pred:
                num_false_null_fn+=1
                num_target2null_pred_fn+=1
            if not target_to_null_ref and target_to_null_pred:
                num_false_not_null_fp+=1
                num_target2null_pred_fp+=1


        # Share of unaligned positions in this sentence; divides by the sentence
        # length, so an empty sentence raises ZeroDivisionError here.
        num_source2null_pred_ratio_list.append(num_source2null_pred_sent/source_len)
        num_target2null_pred_ratio_list.append(num_target2null_pred_sent/target_len)


        # Per-sentence lists of distinct word positions taking part in each link shape.
        num_align_pred_one2many_source_list = []
        num_align_pred_many2one_target_list = []
        num_align_pred_many2many_source_list = []
        num_align_pred_many2many_target_list = []

        # Per-sentence lists of predicted links by shape; read again in the TP/FP pass.
        align_pred_one2one_list = []
        align_pred_one2many_list = []
        align_pred_many2one_list = []
        align_pred_many2many_list = []

        # Pass 1: classify predicted links as one2one / one2many / many2one and
        # classify the pairs that are not predicted.
        for idx_source in range(1, source_len+1):
            for idx_target in range(1, target_len+1):
                align_check = str(idx_source) +'-'+ str(idx_target)

                # Count number of links in Prediction
                if align_check in pred:
                    num_align_pred +=1

                    # check_one2many: this source position has another predicted link.
                    # check_many2one: this target position has another predicted link.
                    check_one2many = False
                    check_many2one = False
                    for idx_target_ in range(1, target_len+1):
                        align_check_ = str(idx_source) +'-'+ str(idx_target_)
                        if align_check_ in pred and align_check_ != align_check:
                            check_one2many = True
                    for idx_source_ in range(1, source_len+1):
                        align_check_ = str(idx_source_) +'-'+ str(idx_target)
                        if align_check_ in pred and align_check_ != align_check:
                            check_many2one = True

                    # The target side is counted once per link, the source side
                    # once per distinct source position (via the list).
                    if check_one2many is True and check_many2one is False:
                        num_align_pred_one2many_target +=1
                        align_pred_one2many_list.append(align_check)
                        if idx_source not in num_align_pred_one2many_source_list:
                            num_align_pred_one2many_source_list.append(idx_source)
                    # Mirror case: source side per link, target side per distinct position.
                    if check_many2one is True and check_one2many is False:
                        num_align_pred_many2one_source +=1
                        align_pred_many2one_list.append(align_check)
                        if idx_target not in num_align_pred_many2one_target_list:
                            num_align_pred_many2one_target_list.append(idx_target)
                    if check_one2many is False and check_many2one is False:
                        num_align_pred_one2one +=1
                        align_pred_one2one_list.append(align_check)

                # Count number of links not in Prediction
                if align_check not in pred:
                    num_no_pred+=1

                    source_to_null = True
                    for idx_target_ in range(1, target_len+1):
                        align_check_ = str(idx_source) +'-'+ str(idx_target_)
                        if align_check_ in pred:
                            source_to_null = False

                    target_to_null = True
                    for idx_source_ in range(1, source_len+1):
                        align_check_ = str(idx_source_) +'-'+ str(idx_target)
                        if align_check_ in pred:
                            target_to_null = False

                    # "null": neither word has any predicted link; otherwise "no".
                    if source_to_null and target_to_null:
                        num_no_pred_null +=1
                    else:
                        num_no_pred_no +=1

        # Pass 2: many2many links. Runs after pass 1 because the exclusions below
        # read the one2many / many2one position lists filled there.
        for idx_source in range(1, source_len+1):
            for idx_target in range(1, target_len+1):
                align_check = str(idx_source) +'-'+ str(idx_target)

                # Count number of many2many links in Prediction
                if align_check in pred:

                    check_one2many = False
                    check_many2one = False
                    for idx_target_ in range(1, target_len+1):
                        align_check_ = str(idx_source) +'-'+ str(idx_target_)
                        if align_check_ in pred and align_check_ != align_check:
                            check_one2many = True
                    for idx_source_ in range(1, source_len+1):
                        align_check_ = str(idx_source_) +'-'+ str(idx_target)
                        if align_check_ in pred and align_check_ != align_check:
                            check_many2one = True

                    if check_one2many is True and check_many2one is True:
                        # A position already recorded as one2many source (resp.
                        # many2one target) is not recorded again as many2many.
                        if idx_source not in num_align_pred_many2many_source_list \
                        and idx_source not in num_align_pred_one2many_source_list:
                            num_align_pred_many2many_source_list.append(idx_source)
                        if idx_target not in num_align_pred_many2many_target_list \
                        and idx_target not in num_align_pred_many2one_target_list:
                            num_align_pred_many2many_target_list.append(idx_target)
                        num_align_pred_many2many+=1
                        align_pred_many2many_list.append(align_check)

        # Pass 3: confusion matrix over all candidate pairs, with breakdowns.
        for idx_source in range(1, source_len+1):
            for idx_target in range(1, target_len+1):
                align_check = str(idx_source) +'-'+ str(idx_target)


                # Count number of links in Prediction and in Ref: TP
                if align_check in pred and align_check in ref:
                    num_true_align_tp +=1

                    if align_check in align_pred_one2one_list:
                        num_true_align_tp_one2one_pred+=1
                    if align_check in align_pred_one2many_list:
                        num_true_align_tp_one2many_pred+=1
                    if align_check in align_pred_many2one_list:
                        num_true_align_tp_many2one_pred+=1
                    if align_check in align_pred_many2many_list:
                        num_true_align_tp_many2many_pred+=1


                # Count number of links not in Prediction and not in Ref: TN
                if align_check not in pred and align_check not in ref:
                    num_true_no_tn+=1

                    source_to_null = True
                    for idx_target_ in range(1, target_len+1):
                        align_check_ = str(idx_source) +'-'+ str(idx_target_)
                        if align_check_ in pred:
                            source_to_null = False

                    target_to_null = True
                    for idx_source_ in range(1, source_len+1):
                        align_check_ = str(idx_source_) +'-'+ str(idx_target)
                        if align_check_ in pred:
                            target_to_null = False

                    if source_to_null and target_to_null:
                        num_true_no_tn_null_in_pred +=1
                    else:
                        num_true_no_tn_no_in_pred +=1

                # Count number of links in Prediction and not in Ref: FP
                if align_check in pred and align_check not in ref:
                    num_false_align_no_fp+=1

                    if align_check in align_pred_one2one_list:
                        num_false_align_no_fp_one2one_pred+=1
                    if align_check in align_pred_one2many_list:
                        num_false_align_no_fp_one2many_pred+=1
                    if align_check in align_pred_many2one_list:
                        num_false_align_no_fp_many2one_pred+=1
                    if align_check in align_pred_many2many_list:
                        num_false_align_no_fp_many2many_pred+=1

                    # Unlike the TN/FN breakdowns, this one looks at the reference:
                    # are both words left without any reference link?
                    source_to_null = True
                    for idx_target_ in range(1, target_len+1):
                        align_check_ = str(idx_source) +'-'+ str(idx_target_)
                        if align_check_ in ref:
                            source_to_null = False

                    target_to_null = True
                    for idx_source_ in range(1, source_len+1):
                        align_check_ = str(idx_source_) +'-'+ str(idx_target)
                        if align_check_ in ref:
                            target_to_null = False

                    if source_to_null and target_to_null:
                        num_false_align_no_fp_null_in_ref +=1
                    else:
                        num_false_align_no_fp_no_in_ref +=1

                # Count number of links not in Prediction and in Ref: FN
                if align_check not in pred and align_check in ref:
                    num_false_no_fn+=1

                    source_to_null = True
                    for idx_target_ in range(1, target_len+1):
                        align_check_ = str(idx_source) +'-'+ str(idx_target_)
                        if align_check_ in pred:
                            source_to_null = False

                    target_to_null = True
                    for idx_source_ in range(1, source_len+1):
                        align_check_ = str(idx_source_) +'-'+ str(idx_target)
                        if align_check_ in pred:
                            target_to_null = False

                    if source_to_null and target_to_null:
                        num_false_no_fn_null_in_pred +=1
                    else:
                        num_false_no_fn_no_in_pred +=1

        # Add this sentence's distinct-position counts to the corpus totals.
        num_align_pred_one2many_source += len(num_align_pred_one2many_source_list)
        num_align_pred_many2one_target += len(num_align_pred_many2one_target_list)

        num_align_pred_many2many_source += len(num_align_pred_many2many_source_list)
        num_align_pred_many2many_target += len(num_align_pred_many2many_target_list)

    num_source2null_pred_ratio_mean = np.mean(num_source2null_pred_ratio_list)
    num_target2null_pred_ratio_mean = np.mean(num_target2null_pred_ratio_list)

    # calculate_scores is defined elsewhere in the notebook; arguments are passed
    # in the order (tp, fp, tn, fn).
    acc, precision, recall, f1 = calculate_scores(num_true_align_tp, num_false_align_no_fp, num_true_no_tn, num_false_no_fn)

    null_acc, null_precision, null_recall, null_f1 = calculate_scores(num_true_null_tp, num_false_not_null_fp, num_true_not_null_tn, num_false_null_fn)

    # Alignment error rate, computed over every predicted link (no index-range filter).
    sure_correct = 0.
    fuzzy_correct = 0.
    count_alignment = 0.
    count_sure = 0.

    for sure, fuzzy, prediction in zip(sure_set, fuzzy_set, prediction_set):
        for align in prediction:
            if align in sure:
                sure_correct+=1.
            if align in fuzzy:
                fuzzy_correct+=1.

        count_alignment += float(len(prediction))
        count_sure += float(len(sure))

    # NOTE(review): this equals the usual AER only if `fuzzy` holds the possible
    # links that are not also sure -- confirm against get_reference.
    # Raises ZeroDivisionError when there are no predicted and no sure links.
    aer = 1. - (sure_correct*2 + fuzzy_correct)/ (count_alignment + count_sure)

    values = {"num_align_pred": num_align_pred,
              "num_no_pred": num_no_pred,
              "num_no_pred_no": num_no_pred_no,
              "num_no_pred_null": num_no_pred_null,

              "num_true_align_tp": num_true_align_tp,
              "num_false_align_no_fp": num_false_align_no_fp,
              "num_false_align_no_fp_no_in_ref": num_false_align_no_fp_no_in_ref,
              "num_false_align_no_fp_null_in_ref": num_false_align_no_fp_null_in_ref,
              "num_false_no_fn": num_false_no_fn,
              "num_false_no_fn_no_in_pred": num_false_no_fn_no_in_pred,
              "num_false_no_fn_null_in_pred": num_false_no_fn_null_in_pred,
              "num_true_no_tn": num_true_no_tn,
              "num_true_no_tn_no_in_pred": num_true_no_tn_no_in_pred,
              "num_true_no_tn_null_in_pred": num_true_no_tn_null_in_pred,

              "acc": acc,
              "precision": precision,
              "recall": recall,
              "f1": f1,
              "aer": aer,

              "num_align_pred_one2one": num_align_pred_one2one,
              "num_align_pred_one2many_source": num_align_pred_one2many_source,
              "num_align_pred_one2many_target": num_align_pred_one2many_target,
              "num_align_pred_many2one_source": num_align_pred_many2one_source,
              "num_align_pred_many2one_target": num_align_pred_many2one_target,
              "num_align_pred_many2many": num_align_pred_many2many,
              "num_align_pred_many2many_source": num_align_pred_many2many_source,
              "num_align_pred_many2many_target": num_align_pred_many2many_target,

              "num_true_align_tp_one2one_pred": num_true_align_tp_one2one_pred,
              "num_true_align_tp_one2many_pred": num_true_align_tp_one2many_pred,
              "num_true_align_tp_many2one_pred": num_true_align_tp_many2one_pred,
              "num_true_align_tp_many2many_pred": num_true_align_tp_many2many_pred,
              "num_false_align_no_fp_one2one_pred": num_false_align_no_fp_one2one_pred,
              "num_false_align_no_fp_one2many_pred": num_false_align_no_fp_one2many_pred,
              "num_false_align_no_fp_many2one_pred": num_false_align_no_fp_many2one_pred,
              "num_false_align_no_fp_many2many_pred": num_false_align_no_fp_many2many_pred,

              "num_source2null_pred": num_source2null_pred,
              "num_target2null_pred": num_target2null_pred,
              "num_source2notnull_pred": num_source2notnull_pred,
              "num_target2notnull_pred": num_target2notnull_pred,

              "num_source2null_pred_tp": num_source2null_pred_tp,
              "num_source2null_pred_fp": num_source2null_pred_fp,
              "num_source2null_pred_fn": num_source2null_pred_fn,
              "num_source2null_pred_tn": num_source2null_pred_tn,
              "num_target2null_pred_tp": num_target2null_pred_tp,
              "num_target2null_pred_fp": num_target2null_pred_fp,
              "num_target2null_pred_fn": num_target2null_pred_fn,
              "num_target2null_pred_tn": num_target2null_pred_tn,

              "num_source2null_pred_ratio_mean": num_source2null_pred_ratio_mean,
              "num_target2null_pred_ratio_mean": num_target2null_pred_ratio_mean,
              "num_true_null_tp": num_true_null_tp,
              "num_false_not_null_fp": num_false_not_null_fp,
              "num_false_null_fn": num_false_null_fn,
              "num_true_not_null_tn": num_true_not_null_tn,

              "null_acc": null_acc,
              "null_precision": null_precision,
              "null_recall": null_recall,
              "null_f1": null_f1,

              }

    return values

def get_len(file_name):
    """Measure every line of a text file in tokens and in characters.

    Each line is split on whitespace. For line ``k`` the first returned list
    holds the number of tokens and the second the summed length of those
    tokens (whitespace is not counted).

    Args:
        file_name: path of the text file to read.

    Returns:
        tuple[list[int], list[int]]: ``(tokens_per_line, chars_per_line)``,
        one entry per line of the file, in file order.
    """
    tokens_per_line = []
    chars_per_line = []

    with open(file_name, 'r') as handle:
        for row in handle:
            tokens = row.split()
            tokens_per_line.append(len(tokens))
            chars_per_line.append(sum(len(token) for token in tokens))

    return tokens_per_line, chars_per_line
    
def get_subword_statistics(reference_file_name, source_file_name, target_file_name, file_align_dirs, file_token_source_dirs, file_token_target_dirs):
    """Analyse the reference and every vocabulary-pair prediction listed in an index file.

    Args:
        reference_file_name: passed to ``get_reference(..., ref_only=False)``.
        source_file_name: passed to ``get_corpus_file`` for the source side.
        target_file_name: passed to ``get_corpus_file`` for the target side.
        file_align_dirs: index file; each line is ``<src_vocab>-<tgt_vocab> <value>``
            where ``<value>`` is handed to ``get_prediction``.
        file_token_source_dirs: index file; each line is ``<src_vocab> <path>`` and
            ``<path>`` is read by ``get_len``.
        file_token_target_dirs: same layout for the target side.

    Returns:
        dict: ``"reference"`` maps to the result of ``analyse_reference``; every
        vocabulary pair maps to the result of ``analyse_prediction`` extended with
        ``len_word_source``, ``len_char_source``, ``len_word_target`` and
        ``len_char_target``.

    Raises:
        IndexError: if an index line has fewer than two fields, or a pair name
            has no ``-``.
        ValueError: if a vocabulary name other than ``char``/``word`` is not an integer.
    """
    def read_two_column_index(path):
        # First whitespace-separated field is the key, second one the value.
        index = {}
        with open(path, 'r') as handle:
            for row in handle:
                fields = row.split()
                index[fields[0]] = fields[1]
        return index

    def ordered_vocab_names(names):
        # Numeric vocabulary sizes in ascending order, framed by 'char' and 'word'.
        sizes = sorted(int(name) for name in names)
        return ['char'] + [str(size) for size in sizes] + ['word']

    stats = {}

    src_names = []
    tgt_names = []
    file_align_dict = {}
    with open(file_align_dirs, 'r') as handle:
        for row in handle:
            fields = row.split()
            pair_name = fields[0]
            pair_parts = pair_name.split('-')
            src_name, tgt_name = pair_parts[0], pair_parts[1]

            if src_name not in ('char', 'word') and src_name not in src_names:
                src_names.append(src_name)
            if tgt_name not in ('char', 'word') and tgt_name not in tgt_names:
                tgt_names.append(tgt_name)

            file_align_dict[pair_name] = fields[1]

    # Not read again below; the int() conversion still rejects malformed names.
    vocs_src = ordered_vocab_names(src_names)
    vocs_tgt = ordered_vocab_names(tgt_names)

    file_src_dict = read_two_column_index(file_token_source_dirs)
    file_tgt_dict = read_two_column_index(file_token_target_dirs)

    reference_set, sure_set, fuzzy_set = get_reference(reference_file_name, ref_only=False)
    source_set = get_corpus_file(source_file_name)
    target_set = get_corpus_file(target_file_name)

    def analyse_one_pair(vocab_pair, alignment_entry, references, sures, fuzzies, sources, targets):
        # Statistics of one vocabulary pair plus the line lengths of its tokenised files.
        predictions = get_prediction(alignment_entry)
        pair_stats = analyse_prediction(predictions, references, sures, fuzzies, sources, targets)

        pair_parts = vocab_pair.split('-')
        src_name, tgt_name = pair_parts[0], pair_parts[1]

        src_word_lens, src_char_lens = get_len(file_src_dict[src_name])
        tgt_word_lens, tgt_char_lens = get_len(file_tgt_dict[tgt_name])

        pair_stats.update({
            "len_word_source": src_word_lens,
            "len_char_source": src_char_lens,
            "len_word_target": tgt_word_lens,
            "len_char_target": tgt_char_lens,
        })

        print(vocab_pair, end=' ')

        return pair_stats

    with concurrent.futures.ThreadPoolExecutor() as pool:
        reference_job = pool.submit(analyse_reference, reference_set, sure_set, fuzzy_set, source_set, target_set)

        # Queue every pair before waiting on any result.
        pair_jobs = {}
        for pair_name, alignment_entry in file_align_dict.items():
            pair_jobs[pair_name] = pool.submit(analyse_one_pair, pair_name, alignment_entry,
                                               reference_set, sure_set, fuzzy_set, source_set, target_set)

        stats["reference"] = reference_job.result()

        for pair_name, job in pair_jobs.items():
            stats[pair_name] = job.result()

    return stats

def get_len_n_gram(file_name):
    """Count the distinct tokens of each character length in a tokenised text file.

    Every whitespace-separated token has all ``'▁'`` characters removed before
    it is measured; tokens that become empty (length 0) are left out of the
    result.

    Args:
        file_name: path of the text file to read.

    Returns:
        dict[int, int]: token length -> number of distinct tokens of that
        length, with keys in order of first appearance in the file.
    """
    # Sets make the "seen before?" test constant-time; with lists it grew with
    # the number of distinct tokens of that length.
    distinct_by_len = {}
    with open(file_name, 'r') as f:
        for line in f:
            for word in line.split():
                w = word.replace('▁', '')
                distinct_by_len.setdefault(len(w), set()).add(w)

    return {length: len(tokens)
            for length, tokens in distinct_by_len.items()
            if length != 0}

def add_subword_statistics_n_gram(subword_statistic_file_name, file_align_dirs, source_file_name, target_file_name, file_token_source_dirs, file_token_target_dirs):
    """Load saved statistics and attach n-gram length tables to every entry.

    Args:
        subword_statistic_file_name: JSON file holding the statistics dict; it must
            contain ``"reference"`` and one ``"<src_vocab>-<tgt_vocab>"`` entry for
            every combination of the source and target vocabulary names.
        file_align_dirs: index file whose first field per line is
            ``<src_vocab>-<tgt_vocab>``; only the vocabulary names are used.
        source_file_name: text file analysed for the reference ``n_gram_src``.
        target_file_name: text file analysed for the reference ``n_gram_tgt``.
        file_token_source_dirs: index file, ``<src_vocab> <path>`` per line.
        file_token_target_dirs: index file, ``<tgt_vocab> <path>`` per line.

    Returns:
        dict: the loaded statistics, where ``"reference"`` and every vocabulary
        pair gained the keys ``n_gram_src`` and ``n_gram_tgt`` (results of
        ``get_len_n_gram``).

    Raises:
        IndexError: if an index line has fewer than two fields, or a pair name
            has no ``-``.
        ValueError: if a vocabulary name other than ``char``/``word`` is not an integer.
        KeyError: if a vocabulary name or pair is missing from the index files
            or from the loaded statistics.
    """
    with open(subword_statistic_file_name, 'r') as f:
        stats = json.load(f)

    vocs_src = []
    vocs_tgt = []
    with open(file_align_dirs, 'r') as f:
        for line in f:
            pair = line.split()[0].split('-')
            voc_src, voc_tgt = pair[0], pair[1]
            if voc_src not in vocs_src + ['char', 'word']:
                vocs_src.append(voc_src)
            if voc_tgt not in vocs_tgt + ['char', 'word']:
                vocs_tgt.append(voc_tgt)

    # Numeric vocabulary sizes in ascending order, framed by 'char' and 'word'.
    vocs_src = ['char'] + [str(v) for v in sorted(int(v) for v in vocs_src)] + ['word']
    vocs_tgt = ['char'] + [str(v) for v in sorted(int(v) for v in vocs_tgt)] + ['word']

    file_src_dict = {}
    with open(file_token_source_dirs, 'r') as f:
        for line in f:
            fields = line.split()
            file_src_dict[fields[0]] = fields[1]

    file_tgt_dict = {}
    with open(file_token_target_dirs, 'r') as f:
        for line in f:
            fields = line.split()
            file_tgt_dict[fields[0]] = fields[1]

    statsRef = stats["reference"]

    with concurrent.futures.ThreadPoolExecutor() as executor:
        # One job per distinct file: the table of a tokenised file does not depend
        # on which vocabulary it is paired with. All jobs are queued before any
        # result is awaited so the pool can actually overlap them.
        ref_src_job = executor.submit(get_len_n_gram, source_file_name)
        ref_tgt_job = executor.submit(get_len_n_gram, target_file_name)
        src_jobs = {voc: executor.submit(get_len_n_gram, file_src_dict[voc]) for voc in vocs_src}
        tgt_jobs = {voc: executor.submit(get_len_n_gram, file_tgt_dict[voc]) for voc in vocs_tgt}

        statsRef['n_gram_src'] = ref_src_job.result()
        statsRef['n_gram_tgt'] = ref_tgt_job.result()

        for voc_src in vocs_src:
            for voc_tgt in vocs_tgt:
                pair_stats = stats[voc_src + '-' + voc_tgt]
                # Copy so that entries sharing a vocabulary do not share one dict object.
                pair_stats['n_gram_src'] = dict(src_jobs[voc_src].result())
                pair_stats['n_gram_tgt'] = dict(tgt_jobs[voc_tgt].result())

    return stats
    
def get_data_directory(src, tgt):
    """Return the test-corpus and result paths for one translation direction.

    Supported directions are English paired with ``'ro'``, ``'cz'``, ``'fr'``,
    ``'ge'``, ``'ja'`` or ``'vi'``, in either order. The code ``'ge'`` selects
    the ``en-de`` folder, whose file and directory names use ``de``.

    Args:
        src: source language code.
        tgt: target language code.

    Returns:
        tuple[str, str, str, str, str, str]:
        ``(test_src, test_tgt, test_align, align_dir, src_dir, tgt_dir)``.

    Raises:
        ValueError: if ``(src, tgt)`` is not one of the supported directions.
    """
    main_dir = "/content/drive/MyDrive/WordAlignmentCorpora/"
    result_dir = "/content/drive/MyDrive/subwordOptimization/"

    # non-English code -> (folder, tag used in names, English test file,
    #                      other-language test file, alignment file template);
    # the template's {} receives "<src_tag>-<tgt_tag>".
    corpora = {
        'ro': ("en-ro", "ro",
               "corp.test.ro-en.cln.en.low", "corp.test.ro-en.cln.ro.low",
               "test.{}.ali.startFrom1"),
        'cz': ("en-cz", "cz",
               "testing.en-cz.en.low", "testing.en-cz.cz.low",
               "testing.{}.alignment.fixed"),
        'fr': ("en-fr", "fr",
               "testing.low.en", "testing.low.fr",
               "testing.{}.align"),
        'ge': ("en-de", "de",
               "corp.test.de-en.en.low.ngoho", "corp.test.de-en.de.low.ngoho",
               "alignmentDeEn.fixed.ali.startFrom1.{}.ngoho"),
        'ja': ("en-ja", "ja",
               "testing.en-ja.en", "testing.en-ja.ja",
               "{}.align.from1.final"),
        'vi': ("en-vi", "vi",
               "testing.en-vi.low.en", "testing.en-vi.low.vi",
               "testing.{}.align.from1"),
    }

    if src == 'en' and tgt in corpora:
        folder, tag, english_file, other_file, align_template = corpora[tgt]
        src_tag, tgt_tag = 'en', tag
        src_file, tgt_file = english_file, other_file
    elif tgt == 'en' and src in corpora:
        folder, tag, english_file, other_file, align_template = corpora[src]
        src_tag, tgt_tag = tag, 'en'
        src_file, tgt_file = other_file, english_file
    else:
        # Fail with a clear message instead of an unbound local on return.
        raise ValueError(
            "unsupported language pair: src={!r}, tgt={!r}".format(src, tgt))

    corpus_root = main_dir + folder + "/"
    output_root = result_dir + folder + "/output_"

    test_src = corpus_root + src_file
    test_tgt = corpus_root + tgt_file
    test_align = corpus_root + align_template.format(src_tag + '-' + tgt_tag)
    align_dir = output_root + src_tag + '_' + tgt_tag
    src_dir = output_root + src_tag
    tgt_dir = output_root + tgt_tag

    return test_src, test_tgt, test_align, align_dir, src_dir, tgt_dir


## Run

In [None]:
# English -> Romanian: build the alignment files for every BPE vocabulary size.

# Create the shared results folder only when it is missing, so re-running the
# cell no longer fails with "mkdir: ... File exists". os.mkdir (unlike
# os.makedirs) does not create parent folders: if Drive is not mounted this
# raises instead of leaving a local /content/drive tree behind.
if not os.path.isdir("/content/drive/MyDrive/subwordOptimization/"):
    os.mkdir("/content/drive/MyDrive/subwordOptimization/")

train_src = "/content/drive/MyDrive/WordAlignmentCorpora/en-ro/train.merg.en-ro.cln.en.utf8.low.lenSent50"
train_tgt = "/content/drive/MyDrive/WordAlignmentCorpora/en-ro/train.merg.en-ro.cln.ro.utf8.low.lenSent50"
test_src = "/content/drive/MyDrive/WordAlignmentCorpora/en-ro/corp.test.ro-en.cln.en.low"
test_tgt = "/content/drive/MyDrive/WordAlignmentCorpora/en-ro/corp.test.ro-en.cln.ro.low"
data_dir="/content/drive/MyDrive/subwordOptimization/en-ro/"

get_alignment_files(train_src, train_tgt,
                    test_src, test_tgt,
                    data_dir,
                    src='en', tgt='ro',
                    model_type='bpe',
                    vocab_sizes=[100, 200, 500, 1000, 2000, 4000, 8000, 16000, 32000, 48000])

In [None]:
# English -> Japanese: build the alignment files for every BPE vocabulary size.

# Create the shared results folder only when it is missing, so re-running the
# cell no longer fails with "mkdir: ... File exists". os.mkdir (unlike
# os.makedirs) does not create parent folders: if Drive is not mounted this
# raises instead of leaving a local /content/drive tree behind.
if not os.path.isdir("/content/drive/MyDrive/subwordOptimization/"):
    os.mkdir("/content/drive/MyDrive/subwordOptimization/")

train_src = "/content/drive/MyDrive/WordAlignmentCorpora/en-ja/kyoto-train.cln.low.en"
train_tgt = "/content/drive/MyDrive/WordAlignmentCorpora/en-ja/kyoto-train.cln.low.ja"
test_src = "/content/drive/MyDrive/WordAlignmentCorpora/en-ja/testing.en-ja.en"
test_tgt = "/content/drive/MyDrive/WordAlignmentCorpora/en-ja/testing.en-ja.ja"
data_dir="/content/drive/MyDrive/subwordOptimization/en-ja/"

get_alignment_files(train_src, train_tgt,
                    test_src, test_tgt,
                    data_dir,
                    src='en', tgt='ja',
                    model_type='bpe',
                    vocab_sizes=[100, 200, 500, 1000, 2000, 4000, 8000, 16000, 32000, 48000])

In [None]:
# English -> Czech: build the alignment files for every BPE vocabulary size.

# Create the shared results folder only when it is missing, so re-running the
# cell no longer fails with "mkdir: ... File exists". os.mkdir (unlike
# os.makedirs) does not create parent folders: if Drive is not mounted this
# raises instead of leaving a local /content/drive tree behind.
if not os.path.isdir("/content/drive/MyDrive/subwordOptimization/"):
    os.mkdir("/content/drive/MyDrive/subwordOptimization/")

train_src = "/content/drive/MyDrive/WordAlignmentCorpora/en-cz/training.en-cz.en.tok.low.cln"
train_tgt = "/content/drive/MyDrive/WordAlignmentCorpora/en-cz/training.en-cz.cz.tok.low.cln"
test_src = "/content/drive/MyDrive/WordAlignmentCorpora/en-cz/testing.en-cz.en.low"
test_tgt = "/content/drive/MyDrive/WordAlignmentCorpora/en-cz/testing.en-cz.cz.low"
data_dir="/content/drive/MyDrive/subwordOptimization/en-cz/"

get_alignment_files(train_src, train_tgt,
                    test_src, test_tgt,
                    data_dir,
                    src='en', tgt='cz',
                    model_type='bpe',
                    vocab_sizes=[100, 200, 500, 1000, 2000, 4000, 8000, 16000, 32000, 48000])

In [None]:
# English-Vietnamese run of get_alignment_files.
# Create the shared output root idempotently: a bare `mkdir` prints
# "cannot create directory ... File exists" on every re-run of this cell.
# (`os` comes from the notebook's imports cell.)
os.makedirs("/content/drive/MyDrive/subwordOptimization/", exist_ok=True)

# Training and test corpora for the en-vi pair, plus the directory handed to
# get_alignment_files as `data_dir`.
train_src = "/content/drive/MyDrive/WordAlignmentCorpora/en-vi/train.low.en"
train_tgt = "/content/drive/MyDrive/WordAlignmentCorpora/en-vi/train.low.vi"
test_src = "/content/drive/MyDrive/WordAlignmentCorpora/en-vi/testing.en-vi.low.en"
test_tgt = "/content/drive/MyDrive/WordAlignmentCorpora/en-vi/testing.en-vi.low.vi"
data_dir="/content/drive/MyDrive/subwordOptimization/en-vi/"

# One call covers every listed vocabulary size with the 'bpe' model type.
get_alignment_files(train_src, train_tgt,
                        test_src, test_tgt,
                        data_dir,
                        src='en', tgt='vi',
                        model_type='bpe',
                        vocab_sizes=[100, 200, 500, 1000, 2000, 4000, 8000, 16000, 32000, 48000])

In [None]:
# English-German run of get_alignment_files.
# Create the shared output root idempotently: a bare `mkdir` prints
# "cannot create directory ... File exists" on every re-run of this cell.
# (`os` comes from the notebook's imports cell.)
os.makedirs("/content/drive/MyDrive/subwordOptimization/", exist_ok=True)

# Training and test corpora for the en-de pair, plus the directory handed to
# get_alignment_files as `data_dir`.
train_src = "/content/drive/MyDrive/WordAlignmentCorpora/en-de/corp.train.de-en.low.cln.en.final.lenSent50"
train_tgt = "/content/drive/MyDrive/WordAlignmentCorpora/en-de/corp.train.de-en.low.cln.de.final.lenSent50"
test_src = "/content/drive/MyDrive/WordAlignmentCorpora/en-de/corp.test.de-en.en.low.ngoho"
test_tgt = "/content/drive/MyDrive/WordAlignmentCorpora/en-de/corp.test.de-en.de.low.ngoho"
data_dir="/content/drive/MyDrive/subwordOptimization/en-de/"

# One call covers every listed vocabulary size with the 'bpe' model type.
get_alignment_files(train_src, train_tgt,
                        test_src, test_tgt,
                        data_dir,
                        src='en', tgt='de',
                        model_type='bpe',
                        vocab_sizes=[100, 200, 500, 1000, 2000, 4000, 8000, 16000, 32000, 48000])

In [None]:
# English-French run of get_alignment_files.
# Create the shared output root idempotently: a bare `mkdir` prints
# "cannot create directory ... File exists" on every re-run of this cell.
# (`os` comes from the notebook's imports cell.)
os.makedirs("/content/drive/MyDrive/subwordOptimization/", exist_ok=True)

# Training and test corpora for the en-fr pair, plus the directory handed to
# get_alignment_files as `data_dir`.
train_src = "/content/drive/MyDrive/WordAlignmentCorpora/en-fr/europarl-v7.en-fr.cln.low.en.lenSent50"
train_tgt = "/content/drive/MyDrive/WordAlignmentCorpora/en-fr/europarl-v7.en-fr.cln.low.fr.lenSent50"
test_src = "/content/drive/MyDrive/WordAlignmentCorpora/en-fr/testing.low.en"
test_tgt = "/content/drive/MyDrive/WordAlignmentCorpora/en-fr/testing.low.fr"
data_dir="/content/drive/MyDrive/subwordOptimization/en-fr/"

# One call covers every listed vocabulary size with the 'bpe' model type.
get_alignment_files(train_src, train_tgt,
                        test_src, test_tgt,
                        data_dir,
                        src='en', tgt='fr',
                        model_type='bpe',
                        vocab_sizes=[100, 200, 500, 1000, 2000, 4000, 8000, 16000, 32000, 48000])

In [None]:
# Every non-English code is evaluated in both directions against 'en'.
# The resulting list has the same entries, in the same order, as a
# hand-written en->xx / xx->en listing.
lang_pairs = []
for other_lang in ['ro', 'cz', 'fr', 'ge', 'ja', 'vi']:
    lang_pairs.append({'src': 'en', 'tgt': other_lang})
    lang_pairs.append({'src': other_lang, 'tgt': 'en'})

for lang_pair in lang_pairs:
    src_lang, tgt_lang = lang_pair['src'], lang_pair['tgt']
    # One JSON report per direction; the path is computed once and reused.
    stats_path = '/content/drive/MyDrive/subword_stats_' + src_lang + '_' + tgt_lang

    print("Create file", stats_path)

    # Look up the test data and output locations for this direction.
    (test_src, test_tgt, test_align,
     align_dir, src_dir, tgt_dir) = get_data_directory(src=src_lang, tgt=tgt_lang)

    subword_statistics = get_subword_statistics(
        test_align, test_src, test_tgt, align_dir, src_dir, tgt_dir
    )

    # Serialise the statistics as indented JSON text.
    serialized_stats = json.dumps(subword_statistics, indent=4)
    with open(stats_path, 'w+') as stats_file:
        stats_file.write(serialized_stats)

    print("Done", stats_path)
        

In [None]:
import os


def _count_nonblank_lines(path):
    """Return how many lines of `path` are something other than a bare newline."""
    with open(path, "r") as handle:
        # Compare each line of the inspected file itself; the previous code
        # compared the outer listing line, so blank lines were counted too.
        return sum(1 for text in handle if text != "\n")


def _remove_if_empty(path):
    """Delete `path` when it is a regular file with no non-blank lines.

    Returns True when the file was removed, False otherwise.
    """
    if not os.path.isfile(path):
        return False
    line_count = _count_nonblank_lines(path)
    if line_count != 0:
        return False

    print(line_count, path)
    # Kept from the original cell: truncate first, then unlink.
    open(path, 'w').close()
    os.remove(path)
    return True


# Walk the en-ja listing file; the second whitespace-separated token of each
# line is treated as a file path (assumed from usage - confirm against the
# code that writes this listing).
with open('/content/drive/MyDrive/subwordOptimization/en-ja/output_en_ja', 'r') as f:
    for line in f:
        fields = line.split()
        if len(fields) < 2:
            # Blank or malformed listing line: nothing to inspect.
            continue
        listed_path = fields[1]

        # Check the listed file and the two sibling paths obtained by
        # stripping the '.2word' and '.shift1.2word' suffix markers.
        candidate_paths = (
            listed_path,
            listed_path.replace('.2word', ''),
            listed_path.replace('.shift1.2word', ''),
        )
        for candidate in candidate_paths:
            _remove_if_empty(candidate)

                