In [3]:
# Character codes for baseline punctuation split model
# Sentence-ending punctuation marks, listed by Unicode code point.
ending_punc = [
    '\u0964', # Devanagari danda
    '\u061F', # Arabic question mark
    '\u002E', # full stop
    '\u3002', # ideographic full stop
    '\u0021', # exclamation mark
    '\u06D4', # Arabic full stop
    '\u17D4', # Khmer sign khan
    '\u003F', # question mark
    '\uFF61', # halfwidth ideographic full stop
    '\uFF0E', # fullwidth full stop
    '\u2026', # horizontal ellipsis
]

# Quote and bracket characters, listed by Unicode code point.
# Note that the list holds both left and right quotation marks.
closing_punc = [
    '\u3011', # right black lenticular bracket
    '\u00BB', # right-pointing double angle quotation mark
    '\u201D', # right double quotation mark
    '\u300F', # right white corner bracket
    '\u2018', # left single quotation mark
    '\u0022', # quotation mark
    '\u300D', # right corner bracket
    '\u201C', # left double quotation mark
    '\u0027', # apostrophe
    '\u2019', # right single quotation mark
    '\u0029'  # right parenthesis
]

# Characters collected under the name `list_set`, listed by Unicode code point.
# NOTE(review): presumably list/bullet markers — confirm against the code that consumes them.
list_set = [
    '\u30fb', # katakana middle dot
    '\uFF65', # halfwidth katakana middle dot
    '\u002a', # asterisk
    '\u002d', # hyphen-minus
    '\u4e00' # CJK ideograph "one" (a letter, not punctuation)
]

5-element Vector{Char}:
 '・': Unicode U+30FB (category Po: Punctuation, other)
 '･': Unicode U+FF65 (category Po: Punctuation, other)
 '*': ASCII/Unicode U+002A (category Po: Punctuation, other)
 '-': ASCII/Unicode U+002D (category Pd: Punctuation, dash)
 '一': Unicode U+4E00 (category Lo: Letter, other)

In [4]:
# Split rule for English: a callable object that decides whether the boundary
# between a left context and a right context is a candidate split site.
struct PunctuationSpace
end

# Returns `true` when `right_context` does not start with an ASCII digit and
# `left_context` contains one of `?`, `!`, `.` optionally followed by further
# `.`, `?`, `!`, `"`, `)` or `'` characters; returns `false` otherwise.
# An empty `right_context` counts as "does not start with a digit".
function (obj :: PunctuationSpace)(left_context, right_context)
    left_context_regex = r".*[?!.][.?!\")\']*"
    # A digit directly to the right means this is not a candidate site.
    # `isdigit` covers every digit '0' through '9'.
    if !isempty(right_context) && isdigit(first(right_context))
        return false
    end
    # The pattern is unanchored, so the punctuation may appear anywhere in the
    # left context.
    return occursin(left_context_regex, left_context)
end

In [5]:
# Tokenizer
# Two-way mapping between words and their integer indices.
#   itos - index-to-string list (position = index)
#   stoi - string-to-index dictionary
# Called without arguments, the vocabulary starts with the four special
# tokens "<unk>", "<eos>", "<mos>" and "<pad>" at indices 1 through 4.
struct Vocabulary
    itos
    stoi
    function Vocabulary(
        itos = ["<unk>", "<eos>", "<mos>", "<pad>"],
        stoi = Dict("<unk>" => 1, "<eos>" => 2, "<mos>" => 3, "<pad>" => 4),
    )
        return new(itos, stoi)
    end
end

# The size of a vocabulary is the number of entries in its index-to-string list.
function Base.length(v :: Vocabulary)
    return length(v.itos)
end

# Registers `word` in the vocabulary if it is not present yet.
# A new word receives the next free index (current size + 1) and is appended
# to the index-to-string list; a word that is already known is left untouched.
function add_word_to_vocab(v :: Vocabulary, word)
    # `haskey` tests membership directly instead of comparing a lookup result
    # against `nothing` with `==`.
    if !haskey(v.stoi, word)
        v.stoi[word] = length(v) + 1
        push!(v.itos, word)
    end
end
# Reads the file at `file_path` line by line and records every
# whitespace-separated word in the vocabulary `v`.
function build_vocab(v :: Vocabulary, file_path)
    open(file_path) do f
        for line in eachline(f), word in split(line)
            add_word_to_vocab(v, strip(word))
        end
    end
end
# Returns the unique index of the word.
# A word that is not in the vocabulary maps to the index of "<unk>".
# The previous fallback was the constant 3, which is "<mos>" in the default
# vocabulary. If the vocabulary has no "<unk>" entry, index 1 is used.
function embed_word(v :: Vocabulary, word)
    unknown_index = get(v.stoi, "<unk>", 1)
    return get(v.stoi, word, unknown_index)
end
# Looks up the word stored at position `embedding` of the index-to-string list.
get_word(v :: Vocabulary, embedding) = v.itos[embedding]

# Undoes subword tokenization of `input_string`: every space is removed first,
# then every "\u2581" marker is turned into a space. The vocabulary argument
# is not consulted.
function detokenize(v :: Vocabulary, input_string)
    return replace(replace(input_string, ' ' => ""), "\u2581" => ' ')
end
# Splits `input_string` on whitespace.
# With `out_type == Int64` (the default) the result is the list of vocabulary
# indices of the words; with any other `out_type` it is the list of words.
function encode(v :: Vocabulary, input_string; out_type=Int64)
    words = split(input_string)
    if out_type == Int64
        return Any[embed_word(v, w) for w in words]
    end
    return words
end
    
# Maps each index in `input_array` back to its word and returns the words
# joined by single spaces.
function decode(v :: Vocabulary, input_array)
    words = Any[get_word(v, index) for index in input_array]
    return join(words, ' ')
end

# Turns a tokenized sentence back into text.
#   technique == "replace" (default): `sentence` is a string; all spaces are
#       removed and every "▁" marker becomes a space.
#   any other technique: `sentence` is passed to `decode`, i.e. it is treated
#       as a list of vocabulary indices.
# NOTE(review): this name is the same as `Base.merge`; this definition does
# not extend it — confirm that shadowing is intended.
function merge(v :: Vocabulary, sentence; technique = "replace")
    if technique == "replace"
        # `replace` on strings takes `pattern => replacement` pairs; the
        # three-positional-argument form is not available in Julia 1.x.
        without_spaces = replace(sentence, ' ' => "")
        return replace(without_spaces, "▁" => " ")
    else
        return decode(v, sentence)
    end
end

# This method creates the embeddings of the context arrays.
# `contexts` is an iterable of `(left, right, label)` triples.
# Returns `(con_arr, lab_arr)`:
#   con_arr - for every triple, the vocabulary indices of the words of `left`
#             followed by those of `right`
#   lab_arr - for every triple, 0 when the label is "<eos>" and 1 otherwise
function context_to_tensor(v :: Vocabulary, contexts)
    con_arr = []
    lab_arr = []
    for (left, right, label) in contexts
        tens = []
        # Left-context words come first, right-context words after them.
        for word in Iterators.flatten((split(left), split(right)))
            push!(tens, embed_word(v, word))
        end
        push!(con_arr, tens)
        push!(lab_arr, label == "<eos>" ? 0 : 1)
    end
    return (con_arr, lab_arr)
end

context_to_tensor (generic function with 1 method)

In [6]:
# Reads the document and constructs left and right contexts.
# The document is split into words; for every word position one left context
# (the `left_context_size` words ending at that position) and one right
# context (the `right_context_size` words after it) are produced, each joined
# by single spaces. Positions outside the document are filled with "<pad>".
# Returns `(left_contexts, right_contexts)`; both are empty for an empty
# document.
function split_test_file(document, tokenizer, left_context_size, right_context_size)
    document = encode(tokenizer, document, out_type=String)
    left_contexts = []
    right_contexts = []
    if length(document) > 0
        left_temp = ["<pad>" for i in 1 : left_context_size - 1]
        push!(left_temp, document[1])
        # Clamp the initial window to the document length; a document shorter
        # than the window is padded below instead of raising a BoundsError.
        last_right = min(length(document), right_context_size + 1)
        right_temp = [x for x in document[2 : last_right]]

        while length(right_temp) < right_context_size
            push!(right_temp, "<pad>")
        end
        # Index of the next document word to enter the right context.
        temp_index = right_context_size + 2

        for _ in document
            push!(left_contexts, join(left_temp, " "))
            push!(right_contexts, join(right_temp, " "))
            # Slide both windows one word to the right.
            deleteat!(left_temp, 1)
            push!(left_temp, right_temp[1])
            deleteat!(right_temp, 1)
            # `<=` so that the last word of the document is included too.
            if temp_index <= length(document)
                push!(right_temp, document[temp_index])
                temp_index += 1
            else
                push!(right_temp, "<pad>")
            end
        end
    end
    return left_contexts, right_contexts
end
            
            

split_test_file (generic function with 1 method)

In [7]:
# Undoes subword tokenization: drops every space, then turns each '\u2581'
# marker into a space.
function detokenize(input_string)
    without_spaces = replace(input_string, ' ' => "")
    return replace(without_spaces, '\u2581' => ' ')
end
# Wraps a model for evaluation and caches the settings read from it.
# `context_size` is the sum of the left and right context sizes.
struct EvalModel
    model
    tokenizer
    left_context_size
    right_context_size
    context_size
    function EvalModel(model)
        left_size = model.left_context_size
        right_size = model.right_context_size
        return new(model, model.tokenizer, left_size, right_size, left_size + right_size)
    end
end
# Takes the content and extracts left and right contexts,
# then detects candidate sites and divides them into batches
# for model prediction.
#
# Arguments:
#   obj        - EvalModel supplying the tokenizer and the context sizes
#   content    - text to process
#   batch_size - number of candidate sites in a full batch
#   candidates - callable `(left, right)` returning true for a candidate site
#
# Returns a list of `(context_batch, index_batch)` tuples, or `[]` when the
# content produces no contexts. If the number of candidate sites is not a
# multiple of `batch_size`, the leftover sites form one final, smaller batch.
function batchify(obj :: EvalModel, content, batch_size, candidates)
    left_contexts, right_contexts = split_test_file(content, obj.tokenizer, obj.left_context_size, obj.right_context_size)
    if isempty(left_contexts)
        return []
    end

    # `lines` holds the left and right contexts of the candidate sites,
    # `indices` holds the positions of those sites in the content.
    lines = []
    indices = []
    for (index, (left, right)) in enumerate(zip(left_contexts, right_contexts))
        if candidates(detokenize(join(left, ' ')), detokenize(join(right, ' ')))
            push!(lines, (left, right, "<eos>"))
            push!(indices, index)
        end
    end

    data, _ = context_to_tensor(obj.tokenizer, lines)

    nbatch = div(length(data), batch_size)
    remainder = length(data) % batch_size
    full_count = nbatch * batch_size

    # Arrange the sites that fill complete batches as an
    # nbatch × batch_size matrix; every row is one batch.
    full_data = reshape(data[1:full_count], :, batch_size)
    full_indices = reshape(indices[1:full_count], :, batch_size)

    batches = []
    for i in 1:size(full_data, 1)
        push!(batches, (Array(full_data[i, :])', full_indices[i, :]))
    end

    if remainder > 0
        # The leftover contexts must be paired with their own indices, not
        # with the indices of the first full batch.
        leftover = full_count + 1 : full_count + remainder
        push!(batches, (Array(data[leftover])', indices[leftover]))
    end

    return batches
end

# Segments `content` by inserting a newline after every candidate site.
#
# Arguments:
#   obj           - EvalModel supplying the tokenizer and the context sizes
#   content       - text to segment
#   batch_size    - batch size handed to `batchify`
#   candidates    - callable `(left, right)` selecting candidate sites; it is
#                   passed to `batchify`, which calls it, so the default
#                   `nothing` only works for content without any contexts
#   min_sent_size - accepted for interface compatibility; not used here
#
# Returns `strip(content)` when there is no candidate site; otherwise the
# words of `content` joined by spaces with "\n" after each candidate site.
function parallel_evaluation(obj :: EvalModel, content, batch_size; candidates = nothing, min_sent_size = 3)
    batches = batchify(obj, content, batch_size, candidates)
    # No model prediction happens here: every candidate site returned by
    # `batchify` is treated as a split point. Some indices may occur more than
    # once, so they are gathered in a set.
    eos = Set()
    for (_, indices) in batches
        union!(eos, indices)
    end
    if isempty(eos)
        return strip(content)
    end

    this_content = encode(obj.tokenizer, content, out_type = String)
    output = []
    # For each candidate site a newline is added after it.
    for (index, word) in enumerate(this_content)
        push!(output, word)
        if index in eos
            push!(output, "\n")
        end
    end

    output = join(output, " ")
    # Joining put a space in front of every newline; drop it.
    output = replace(output, r" \n" => "\n")
    return output
end
                    

parallel_evaluation (generic function with 1 method)

In [8]:
# Baseline model description: stores a tokenizer together with the sizes of
# the left and right contexts. It is constructed positionally as
# `AlwaysSplitModel(tokenizer, left_context_size, right_context_size)`.
struct AlwaysSplitModel
    tokenizer
    left_context_size
    right_context_size
end

In [9]:
# The function for evaluating the model, creates the final segmented file.
#
# Arguments:
#   eval_model  - model wrapper handed to `parallel_evaluation`
#   input_file  - path of the text file to segment, read line by line
#   output_file - path of the file to write (opened with mode "w")
#   batch_size  - batch size handed to `parallel_evaluation`
#   candidates  - candidate-site rule handed to `parallel_evaluation`
#
# Every input line is segmented and written as its own output line(s).
# Returns `output_file`.
function evaluate_model(eval_model, input_file, output_file, batch_size; candidates=nothing)
    # The do-block form closes both files even when an error is raised.
    open(output_file, "w") do output
        open(input_file, "r") do input
            for line in eachline(input)
                segmented = parallel_evaluation(eval_model, line, batch_size, candidates = candidates)
                segmented === nothing && continue
                try
                    # Write the whole result once, terminated by a newline so
                    # that consecutive input lines do not run together.
                    println(output, strip(segmented))
                catch e
                    # Best-effort fallback kept from the original: if writing
                    # to the file fails, send the text to standard output.
                    print(string(segmented, "\n"))
                end
            end
        end
    end
    return output_file
end

evaluate_model (generic function with 1 method)

In [12]:
# Driver: build the vocabulary from the input file itself, then segment that
# same file and write the result to "always_split_result2.en".
tokenizer = Vocabulary()
input_file = "test.en"
build_vocab(tokenizer, input_file)
# Candidate split sites are selected by the English punctuation rule.
candidate_generator = PunctuationSpace()
# Context window: one token to the left and one token to the right.
always_split_model = AlwaysSplitModel(tokenizer, 1, 1)
eval_model = EvalModel(always_split_model)
# Candidate sites are processed in batches of 16.
evaluate_model(eval_model, input_file, "always_split_result2.en", 16; candidates = candidate_generator)

"always_split_result2.en"