In [1]:
using Random
using DataStructures
using NBInclude
@nbinclude("dcj_algo.ipynb")

In [2]:
# returns (duplicates, count): an ordered dictionary mapping each duplicated char (lowercased) --> num instances,
# and the number of distinct duplicated chars (an earlier version returned a 2 x N array instead of the dictionary)

"""
    find_dups_in_str(str::String)

Count, case-insensitively, the characters that occur more than once in `str`.

Returns a tuple `(duplicates, n)`: `duplicates` is an `OrderedDict` mapping each
repeated character (in lowercase) to its total number of occurrences, keyed in the
order in which each character was first seen to repeat; `n` is `length(duplicates)`.
"""
function find_dups_in_str(str::String)
    seen = Set{Char}()
    duplicates = OrderedDict{Char, Int}()

    for raw in str
        letter = lowercase(raw)
        if letter ∉ seen
            # first sighting: remember it, nothing to count yet
            push!(seen, letter)
            continue
        end
        # the first repeat means two occurrences so far; each later repeat adds one
        duplicates[letter] = get(duplicates, letter, 1) + 1
    end

    return duplicates, length(duplicates)
end

find_dups_in_str (generic function with 1 method)

In [3]:
"""
    generate_map(num_dups::Int, dup_to_num_instances::OrderedDict{Char, Int})

Draw one random map for a genome with duplicated genes.

The result is a vector of length `num_dups`. Entry `i` belongs to the `i`-th key of
`dup_to_num_instances` (in dictionary order) and holds a 1-based permutation rank
drawn uniformly from `1:factorial(multiplicity)`.
"""
function generate_map(num_dups::Int, dup_to_num_instances::OrderedDict{Char, Int})
    choices = Array{Int}(undef, num_dups)

    # one slot per duplicated gene, visited in dictionary order
    for (slot, (_, occurrences)) in enumerate(pairs(dup_to_num_instances))
        # an integer is selected uniformly from [1, occurrences!]
        choices[slot] = rand(1:factorial(occurrences))
    end

    return choices
end

# S = "aaabbc"
# S_dupchar_to_multiplicity, S_num_dups = find_dups_in_str(S)
# generate_map(S_num_dups, S_dupchar_to_multiplicity)

generate_map (generic function with 1 method)

In [4]:
# DP implementation of factorial
"""
    factorial(n::Int, factorials::Vector{Int})

Memoized factorial of `n`.

`factorials` is a cache in which slot `i` holds `i!` once computed and `0` while
still unknown; it is filled in as a side effect. When `n` lies outside the cache
the value is still computed, just not stored.

# Throws
- `DomainError` if `n` is negative.
- `OverflowError` if `n!` does not fit in an `Int` (previously the product
  silently wrapped around).

NOTE(review): this adds a method to a `Base` function using only `Base` types
(type piracy). The name is kept because callers in this file rely on it.
"""
function Base.factorial(n::Int, factorials::Vector{Int})
    n >= 0 || throw(DomainError(n, "`n` must be non-negative"))
    if n <= 1
        return 1
    end

    # only touch the cache when slot n actually exists
    cacheable = checkbounds(Bool, factorials, n)
    if cacheable && factorials[n] != 0
        return factorials[n]
    end

    # checked multiplication: raise instead of wrapping around on overflow
    fact = Base.Checked.checked_mul(n, factorial(n - 1, factorials))
    if cacheable
        factorials[n] = fact
    end
    return fact
end

# returns the kth permutation, in lexicographical order, of the first n ints
# k ∈ [1, n!]
"""
    nth_lex_permutation(n::Int, k::Int)

Return the `k`-th permutation (1-based, lexicographical order) of `1:n` as a
`Vector{Int}`. For `n <= 0` an empty vector is returned.

# Throws
- `ArgumentError` if `k` is outside `[1, n!]` (previously this surfaced as a
  `BoundsError` from inside the loop).
- `OverflowError` (from `Base.factorial`) if `(n-1)!` does not fit in an `Int`;
  previously the factorial silently wrapped around and produced wrong results.
"""
function nth_lex_permutation(n::Int, k::Int)
    permutation = Vector{Int}()
    if n <= 0
        return permutation  # nothing to permute
    end

    # (n-1)! permutations share each possible leading element, so the index of
    # the leading element must be below n for k to be a valid rank
    leading_block = factorial(n - 1)
    if k < 1 || div(k - 1, leading_block) >= n
        throw(ArgumentError("k must be in [1, n!] for n = $n, got $k"))
    end

    rank = k - 1                # switch to a 0-based rank
    remaining = collect(1:n)    # values not yet placed, kept in ascending order
    sizehint!(permutation, n)

    for i in 1:n
        # factorial number system: the quotient selects the next element,
        # the remainder is the rank among the permutations of what is left
        index, rank = divrem(rank, factorial(n - i))
        push!(permutation, splice!(remaining, index + 1))
    end

    return permutation
end


# n = 5  # num duplicates 
# k = 45  #  rank of the permutation, in lexicographical order, of the first n ints; k ∈ [1, n!]

# perm = nth_lex_permutation(n, k)
# println(perm)

nth_lex_permutation (generic function with 1 method)

In [5]:
"""
    new_char(alphabet::Set{Char})

Pick, uniformly at random, a lowercase letter `'a'`–`'z'` that is not yet in
`alphabet`, add it to `alphabet` (the set is mutated), and return it.

Throws `ArgumentError` when every lowercase letter is already taken. The check
looks at the letters themselves rather than at `length(alphabet)`, so sets that
also hold other characters (e.g. uppercase genes) neither trigger a premature
error nor an endless retry loop.
"""
function new_char(alphabet::Set{Char})
    unused = setdiff('a':'z', alphabet)
    if isempty(unused)
        throw(ArgumentError("Ran out of unique characters in the alphabet"))
    end

    c = rand(unused)
    push!(alphabet, c)
    return c
end

"""
    convert_perm_to_ordered_unique_chars(dupchar, dup_perm, dupchar_to_unique_char)

Translate a permutation of indices into the corresponding sequence of unique
characters: element `i` of `dup_perm` is replaced by
`dupchar_to_unique_char[dupchar][i]`. Returns a new `Vector{Char}` in the order
given by `dup_perm`.
"""
function convert_perm_to_ordered_unique_chars(dupchar::Char, dup_perm::Vector{Int}, dupchar_to_unique_char::Dict{Char, Vector{Char}})
    # the dictionary lookup stays inside the comprehension so that an empty
    # permutation never touches the dictionary
    return Char[dupchar_to_unique_char[dupchar][position] for position in dup_perm]
end

"""
    deduplicate_genome(map, dup_genome, dupchar_to_multiplicity, dupchar_to_unique_char)

Rewrite `dup_genome` so that every occurrence of a duplicated gene becomes a
distinct character, and return the result as a `String`.

# Arguments
- `map`: one 1-based permutation rank per key of `dupchar_to_multiplicity`, in
  dictionary order; rank `k` selects the `k`-th lexicographical permutation of
  that gene's copies.
- `dup_genome`: the genome; the case of each character is carried over to its
  replacement.
- `dupchar_to_multiplicity`: duplicated gene (lowercase) --> number of copies.
- `dupchar_to_unique_char`: duplicated gene --> its unique replacement
  characters. When empty it is filled in place (the first copy keeps the gene's
  own letter, the others get fresh letters from `new_char`); when non-empty it
  is only read, so a later genome reuses the same assignment.

Letters are reserved case-insensitively when building the assignment, so a gene
that occurs only in uppercase cannot be handed out again as a "unique" letter.
"""
function deduplicate_genome(map::Array{Int}, dup_genome::String, dupchar_to_multiplicity::OrderedDict{Char, Int}, dupchar_to_unique_char::Dict{Char, Vector{Char}})
    if isempty(dupchar_to_unique_char)  # first genome seen: assign unique chars to each duplicated gene
        # case only encodes how a gene appears, so reserve letters in lowercase
        alphabet = Set{Char}(lowercase(c) for c in dup_genome)

        # duplicated gene (multiplicity k) --> [k unique chars]; position = copy index used by the permutation
        for (dupchar, mult) in pairs(dupchar_to_multiplicity)
            unique_chars = [dupchar]
            for _ in 2:mult
                push!(unique_chars, new_char(alphabet))
            end
            dupchar_to_unique_char[dupchar] = unique_chars
        end
    end

    # for each duplicated gene, the unique chars in the order its copies will be
    # consumed: map entry --> lexicographical permutation --> ordered unique chars
    remaining = Dict{Char, Vector{Char}}()
    for (idx, (dupchar, mult)) in enumerate(pairs(dupchar_to_multiplicity))
        dup_perm = nth_lex_permutation(mult, map[idx])
        remaining[dupchar] = convert_perm_to_ordered_unique_chars(dupchar, dup_perm, dupchar_to_unique_char)
    end

    # walk the genome, replacing each copy of a duplicated gene by the next
    # unique char of its ordering while preserving the original case
    dedup_genome = Vector{Char}()
    sizehint!(dedup_genome, length(dup_genome))
    for char in dup_genome
        gene = lowercase(char)
        if haskey(remaining, gene)
            unique_char = popfirst!(remaining[gene])
            push!(dedup_genome, isuppercase(char) ? uppercase(unique_char) : unique_char)
        else
            push!(dedup_genome, char)
        end
    end

    return join(dedup_genome)
end


# P = "abbAac"
# P_dup_to_num_instances, P_num_dups = find_dups_in_str(P)
# P_map = generate_map(P_num_dups, P_dup_to_num_instances)
# dupchar_to_unique_char = Dict{Char, Vector{Char}}()

# deduplicate_genome(P_map, P, P_dup_to_num_instances, dupchar_to_unique_char)

deduplicate_genome (generic function with 1 method)

In [10]:
# mapping source S --> target P; generates num_maps random maps (RM) of S

"""
    randommap(S::String, P::String, num_maps::Int)

Random-map experiment from source genome `S` to target genome `P`.

Steps, each reported on standard output:
1. find the duplicated genes of both genomes;
2. give the target the all-ones map and draw `num_maps` random maps for the source;
3. deduplicate the target, then the source once per map, sharing one
   gene --> unique-chars assignment;
4. compute `calculate_distance(target, source, "none")` for every deduplicated
   source and print the list of distances.

Nothing is returned; the results are only printed.
"""
function randommap(S::String, P::String, num_maps::Int)
    println("SRC ", S, " --> TARGET ", P, "\n")

    ## GENERATE MAPS

    src_mult, src_ndups = find_dups_in_str(S)
    tgt_mult, tgt_ndups = find_dups_in_str(P)

    # target: one fixed, arbitrary map (rank 1 for every duplicated gene)
    printstyled("target", color=:magenta)
    println("\ndup genes --> multiplicity")
    println(tgt_mult, "\n")

    tgt_map = ones(Int, tgt_ndups)
    println("target map")
    print(tgt_map, "\n")

    # source: num_maps random maps
    printstyled("\nsource", color=:magenta)
    println("\ndup genes --> multiplicity")
    println(src_mult, "\n")

    src_maps = Array{Array{Int}}(undef, num_maps)
    println("source maps")
    for slot in 1:num_maps
        src_maps[slot] = generate_map(src_ndups, src_mult)
        println(src_maps[slot])
    end

    ## DEDUPLICATION
    printstyled("\ndeduplication\n", color=:magenta)

    # filled while deduplicating the target, then reused for every source map
    gene_to_unique = Dict{Char, Vector{Char}}()

    tgt_dedup = deduplicate_genome(tgt_map, P, tgt_mult, gene_to_unique)
    println("deduplicated target ", tgt_dedup, "\n")

    src_dedups = Vector{String}()
    for src_map in src_maps
        push!(src_dedups, deduplicate_genome(src_map, S, src_mult, gene_to_unique))
        println("deduplicated source ", last(src_dedups))
    end

    ## DCJ DISTANCE
    distances = Int[calculate_distance(tgt_dedup, candidate, "none") for candidate in src_dedups]

    print("\ndcj distances ", distances)
end


# Example run: source and target genomes with duplicated genes
# (both contain a×5, b×4, c×2, d×2).
src = "aaaaabbbbcddc"
target = "ccbabbdbaadaa"
num_maps = 15  # number of random source maps to draw

# prints the maps, the deduplicated genomes and the DCJ distance for each source map
randommap(src, target, num_maps)

SRC aaaaabbbbcddc --> TARGET ccbabbdbaadaa

[35mtarget[39m
dup genes --> multiplicity
OrderedDict('c' => 2, 'b' => 4, 'a' => 5, 'd' => 2)

target map
[1, 1, 1, 1]

[35msource[39m
dup genes --> multiplicity
OrderedDict('a' => 5, 'b' => 4, 'd' => 2, 'c' => 2)

source maps
[73, 2, 2, 2]
[38, 19, 2, 1]
[14, 7, 1, 2]
[86, 3, 2, 1]
[106, 10, 1, 1]
[45, 1, 2, 1]
[112, 5, 2, 2]
[62, 7, 1, 2]
[39, 18, 2, 1]
[87, 9, 2, 2]
[98, 14, 1, 1]
[4, 21, 2, 1]
[66, 21, 1, 2]
[41, 8, 2, 1]
[31, 8, 1, 1]

[35mdeduplication[39m
deduplicated target cpbavqdnfxowj

deduplicated source wafxjbvnqpodc
deduplicated source fwajxnbvqcodp
deduplicated source awfjxvbqnpdoc
deduplicated source wxajfbqvncodp
deduplicated source jfxwavqnbcdop
deduplicated source fjxawbvqncodp
deduplicated source jxfwabnvqpodc
deduplicated source xwajfvbqnpdoc
deduplicated source fwxajqnvbcodp
deduplicated source wxfajvqbnpodc
deduplicated source jafwxqbnvcdop
deduplicated source afwjxnvbqcodp
deduplicated source xwjfanvbqpdoc
dedupl