In [15]:
using Random
using DataStructures
using Plots
using Statistics
using Combinatorics

using NBInclude
@nbinclude("dcj_algo.ipynb")
@nbinclude("testing_diameter.ipynb")
# @nbinclude("testing_maps.ipynb")

In [16]:
"""
    find_dups_in_str(str::String)

Count the characters of `str` that occur more than once, ignoring case.

The separator characters '.' and ',' are skipped. Every other character is
lowercased before counting, so `'a'` and `'A'` count as the same character.

# Returns
- an `OrderedDict{Char, Int}` mapping each duplicated (lowercased) character to
  its total number of occurrences; entries appear in the order in which each
  character was seen for the second time
- the number of distinct duplicated characters (the length of that dictionary)
"""
function find_dups_in_str(str::String)
    seen = Set{Char}()
    dup_counts = OrderedDict{Char, Int}()  # duplicated char --> number of occurrences

    for raw in str
        # '.' and ',' are separators, not genes
        (raw == '.' || raw == ',') && continue

        gene = lowercase(raw)
        if gene ∉ seen
            # first occurrence: remember it, nothing to count yet
            push!(seen, gene)
            continue
        end

        # second occurrence starts the count at 2; later ones increment it
        dup_counts[gene] = get(dup_counts, gene, 1) + 1
    end

    return dup_counts, length(dup_counts)
end

find_dups_in_str (generic function with 1 method)

In [17]:
"""
    generate_map(num_dups, dup_to_num_instances, S_M_set)

Draw a random map that is not already contained in `S_M_set`.

A map is a `Vector{Int}` with one entry per duplicated character, in the
iteration order of `dup_to_num_instances`. The entry for a character with
multiplicity `k` is drawn uniformly from `1:factorial(k)` (1-based, so it can be
used directly as a permutation index).

# Arguments
- `num_dups`: number of duplicated characters; must equal
  `length(dup_to_num_instances)`.
- `dup_to_num_instances`: duplicated character --> multiplicity.
- `S_M_set`: maps that were already generated. It is only read, never modified.

# Throws
- `ArgumentError` if `num_dups` does not match the dictionary, or if every
  possible map is already present in `S_M_set` (previously this case looped
  forever).
"""
function generate_map(num_dups::Int, dup_to_num_instances::OrderedDict{Char, Int}, S_M_set::Set{Array{Int}})
    num_dups == length(dup_to_num_instances) || throw(ArgumentError(
        "num_dups ($num_dups) does not match the number of duplicated characters " *
        "($(length(dup_to_num_instances)))"))

    # number of admissible values at each map position, in dictionary order
    perm_counts = Int[factorial(num_instances) for num_instances in values(dup_to_num_instances)]

    # Rejection sampling below can only terminate if an unused map still exists.
    # The cheap size comparison keeps the common case O(1); the exact count only
    # runs when the set is large enough that exhaustion is actually possible.
    total_maps = prod(big, perm_counts; init=big(1))
    if length(S_M_set) >= total_maps
        num_used = count(S_M_set) do m
            ndims(m) == 1 && length(m) == num_dups &&
                all(1 <= v <= k for (v, k) in zip(m, perm_counts))
        end
        num_used >= total_maps && throw(ArgumentError(
            "All $(total_maps) possible maps have already been generated"))
    end

    while true
        # for each position of the map (each gene with > 1 instances), an integer
        # is selected uniformly from 1:occ(α, S)!
        candidate = Int[rand(1:k) for k in perm_counts]
        if candidate ∉ S_M_set
            return candidate
        end
    end
end

# S = "aabc"
# S_dupchar_to_multiplicity, S_num_dups = find_dups_in_str(S)
# S_M_set = Set{Array{Int}}()
# map = generate_map(S_num_dups, S_dupchar_to_multiplicity, S_M_set)
# push!(S_M_set, map)
# map = generate_map(S_num_dups, S_dupchar_to_multiplicity, S_M_set)
# push!(S_M_set, map)
# map = generate_map(S_num_dups, S_dupchar_to_multiplicity, S_M_set) # should throw error bc alr generated all possible maps

generate_map (generic function with 1 method)

In [18]:
"""
    new_char(alphabet::Set{Char})

Pick a lowercase letter from `'a':'z'` that is not yet in `alphabet`, add it to
`alphabet`, and return it. The letter is chosen uniformly at random among the
unused ones.

`alphabet` may contain characters outside `'a':'z'` (uppercase letters,
separators, ...); they are ignored when deciding which letters are still free.

# Throws
- `ArgumentError` if all 26 lowercase letters are already in `alphabet`.
"""
function new_char(alphabet::Set{Char})
    # Work from the letters that are actually free rather than comparing
    # length(alphabet) to 26, which is wrong as soon as the set holds any
    # character that is not a lowercase letter.
    unused = setdiff('a':'z', alphabet)
    isempty(unused) && throw(ArgumentError("Ran out of unique characters in the alphabet"))

    c = rand(unused)
    push!(alphabet, c)
    return c
end


"""
    deduplicate_genome(map, dup_genome, dupchar_to_multiplicity, dupchar_to_unique_chars)

Replace every occurrence of a duplicated character in `dup_genome` by a distinct
character, using `map` to decide which occurrence receives which character.

# Arguments
- `map`: one permutation index per duplicated character, in the iteration order
  of `dupchar_to_multiplicity`; entry `i` is passed to `nthperm` for the `i`-th
  duplicated character.
- `dup_genome`: genome string. Uppercase characters are matched through their
  lowercase form and the replacement is uppercased again; characters that are
  not duplicated are copied unchanged.
- `dupchar_to_multiplicity`: duplicated (lowercase) character --> multiplicity.
- `dupchar_to_unique_chars`: duplicated character --> the unique characters that
  replace its occurrences. If it is empty it is filled in place (the first
  unique character is the duplicated character itself, the others come from
  `new_char`), so the same assignment can be reused for later calls.

# Returns
The deduplicated genome as a `String`.
"""
function deduplicate_genome(map::Array{Int}, dup_genome::String, dupchar_to_multiplicity::OrderedDict{Char, Int}, dupchar_to_unique_chars::OrderedDict{Char, Vector{Char}})
    if isempty(dupchar_to_unique_chars)  # no assignment yet: derive one from this genome
        # Reserve the lowercase form of every letter in the genome. A gene that
        # only occurs in uppercase must still block its lowercase letter,
        # otherwise a freshly drawn character could collide with that gene.
        alphabet = Set{Char}(c for c in lowercase(dup_genome) if 'a' <= c <= 'z')

        # duplicated char (multiplicity k) --> k unique chars; the position in the
        # vector is what the permutation from `map` reorders
        for (dupchar, mult) in pairs(dupchar_to_multiplicity)
            unique_chars = [dupchar]
            for _ in 2:mult
                push!(unique_chars, new_char(alphabet))
            end
            dupchar_to_unique_chars[dupchar] = unique_chars
        end
    end

    # For each duplicated char, order its unique chars by the map[idx]-th
    # lexicographic permutation. Duplicated chars must be visited in the order of
    # dupchar_to_multiplicity because that is how `map` is laid out.
    remaining = Dict{Char, Vector{Char}}()
    for (idx, dupchar) in enumerate(keys(dupchar_to_multiplicity))
        remaining[dupchar] = nthperm(dupchar_to_unique_chars[dupchar], map[idx])
    end

    # Walk the genome and hand out the permuted unique chars in order of occurrence.
    dedup_genome = Vector{Char}()
    for char in dup_genome
        gene = lowercase(char)
        if haskey(remaining, gene)
            unique_char = popfirst!(remaining[gene])
            # keep the orientation (case) of the original occurrence
            push!(dedup_genome, isuppercase(char) ? uppercase(unique_char) : unique_char)
        else
            push!(dedup_genome, char)
        end
    end

    return join(dedup_genome)
end



# P = "abbAac"
# P_dup_to_num_instances, P_num_dups = find_dups_in_str(P)
# P_map = generate_map(P_num_dups, P_dup_to_num_instances, Set{Array{Int64}}())
# dupchar_to_unique_char = OrderedDict{Char, Vector{Char}}()

# deduplicate_genome(P_map, P, P_dup_to_num_instances, dupchar_to_unique_char)

deduplicate_genome (generic function with 1 method)

In [36]:
"""
    randommap(S, P, num_maps, mode)

Map source genome `S` onto target genome `P` using random maps: generate
`num_maps` distinct random maps for `S`, deduplicate `S` under each of them,
and compute the distance of each deduplicated source to the deduplicated target
with `calculate_distance` (defined outside this cell).

# Arguments
- `S`, `P`: source and target genome strings.
- `num_maps`: number of random maps to generate. `0`, or any value larger than
  the number of distinct maps that exist for `S`, means "all of them". The
  number of distinct maps is the product of `factorial(multiplicity)` over the
  duplicated characters of `S`.
- `mode`: `"info"` prints the intermediate dictionaries and maps; any other
  value (e.g. `"none"`) prints only the summary lines.

# Returns
`(P_dedup, min_dist, min_src, min_map, max_dist, max_src, max_map)`: the
deduplicated target, then the smallest distance with the deduplicated source
and map that produced it, then the same for the largest distance. On ties the
first generated map wins.

# Throws
- `ArgumentError` if `num_maps` is negative.
"""
function randommap(S::String, P::String, num_maps::Int, mode::String)
    num_maps >= 0 || throw(ArgumentError("num_maps must be non-negative, got $num_maps"))
    verbose = mode == "info"

    printstyled("SRC " * S * " --> TARGET " * P * "\n", color=:cyan)


    ## GENERATE MAPS
    P_dupchar_to_multiplicity, P_num_dups = find_dups_in_str(P)
    S_dupchar_to_multiplicity, S_num_dups = find_dups_in_str(S)

    # Number of distinct maps for the source: one independent choice among
    # multiplicity! permutations per duplicated character, hence a product
    # (which is 1, the empty map, when the source has no duplicates). BigInt
    # keeps the count exact even when it does not fit in an Int.
    max_maps = prod((factorial(big(n)) for n in values(S_dupchar_to_multiplicity)); init=big(1))

    if num_maps == 0 || num_maps > max_maps
        # println("WARNING: Number of maps > # possible unique maps for src --> capping at ", max_maps)
        num_maps = Int(max_maps)
    end

    # target map: the target always uses the first permutation for every duplicate
    P_map = ones(Int, P_num_dups)

    if verbose
        printstyled("target", color=:magenta)
        println("\ndup genes --> multiplicity")
        println(P_dupchar_to_multiplicity, "\n")

        # arbitrary map p for target string
        println("target map")
        print(P_map, "\n")
    end

    # source maps: kept both in generation order (S_M) and in a set for the
    # uniqueness check done by generate_map
    S_M = Array{Array{Int}}(undef, num_maps)
    S_M_set = Set{Array{Int}}()

    if verbose
        printstyled("\nsource", color=:magenta)
        println("\ndup genes --> multiplicity")
        println(S_dupchar_to_multiplicity, "\n")
        println("source maps")
    end
    for i in 1:num_maps
        s_map = generate_map(S_num_dups, S_dupchar_to_multiplicity, S_M_set)
        S_M[i] = s_map
        push!(S_M_set, s_map)
        if verbose
            println(s_map)
        end
    end


    ## DEDUPLICATION
    if verbose
        printstyled("\ndeduplication\n", color=:magenta)
    end

    # Filled by the first deduplicate_genome call (on the target) and then
    # reused for every source, so both genomes share the same unique characters.
    dupchar_to_unique_char = OrderedDict{Char, Vector{Char}}()

    P_dedup = deduplicate_genome(P_map, P, P_dupchar_to_multiplicity, dupchar_to_unique_char)

    S_dedup_list = Vector{String}()
    dcj_dist_list = Vector{Int}()

    for s_map in S_M
        s_dedup = deduplicate_genome(s_map, S, S_dupchar_to_multiplicity, dupchar_to_unique_char)
        push!(S_dedup_list, s_dedup)

        d = calculate_distance(P_dedup, s_dedup, "none")
        push!(dcj_dist_list, d)

        println("[deduplicated] ", s_dedup, " --> ", P_dedup, " ", d)
    end


    ## DCJ DISTANCE
    # findmin/findmax return the index of the first extreme value, so the
    # reported source and map always belong to the reported distance.
    min_dist, min_idx = findmin(dcj_dist_list)
    max_dist, max_idx = findmax(dcj_dist_list)

    min_src = S_dedup_list[min_idx]
    min_map = S_M[min_idx]
    max_src = S_dedup_list[max_idx]
    max_map = S_M[max_idx]

    print("\n")
    # dcj_dist_list,
    return P_dedup, min_dist, min_src, min_map, max_dist, max_src, max_map
end


# src = "aaAAaAbbbBbbccd"  # 6a, 6b, 2c, 1d
# target = "abcdabbaacbabab"
# num_maps = 100

# src = ".a.,aab,bc"  
# target = "ba,a,b,ca"
# num_maps = 8

# n = 3
# x = 2
# num_maps = 0
# src, target = generate_genomes_with_xdup(n, x)

# target, min_dist, min_src, min_map, max_dist, max_src, max_map = randommap(src, target, num_maps, "none")

randommap (generic function with 1 method)