In [1]:
using Random
using DataStructures
using Plots
using Statistics
using Combinatorics

using NBInclude
@nbinclude("dcj_algo.ipynb")
@nbinclude("testing_diameter.ipynb")

In [2]:
"""
    setmode(mode::String)

Translate a verbosity mode name into its integer level.

Returns `1` for `"info"` and `0` for every other string (for example `"none"`).
"""
function setmode(mode::String)
    return mode == "info" ? 1 : 0
end

setmode (generic function with 1 method)

In [3]:
"""
    string_to_genomearr(genome::String)

Parse a genome string into one vector of gene names per chromosome.

Chromosomes are separated by `,` and genes within a chromosome by `'`.
A chromosome written between two `.` characters (for example `.a'b.`) keeps a
`"."` marker as its first and last element, and `..` becomes `[".", "."]`.
Any other chromosome is returned as its plain list of genes.

An empty `genome` yields an empty genome array.

# Throws
- `ArgumentError` if a chromosome is empty (stray or trailing comma), or if a
  chromosome starts with `.` but does not also end with `.`.
"""
function string_to_genomearr(genome::String)
    genome_array = Vector{Vector{String}}()
    isempty(genome) && return genome_array

    for c in split(genome, ",")
        if isempty(c)
            throw(ArgumentError("empty chromosome in genome \"$genome\""))
        end

        if startswith(c, ".")
            if length(c) < 2 || !endswith(c, ".")
                throw(ArgumentError(
                    "chromosome \"$c\" starts with '.' but does not end with '.'"))
            end

            # chop counts characters rather than bytes, so gene names that end
            # in a multi-byte character are sliced correctly.
            inner = chop(c; head=1, tail=1)

            genes = String["."]
            # ".." has nothing between the markers; do not insert an empty gene.
            isempty(inner) || append!(genes, split(inner, "'"))
            push!(genes, ".")
        else
            genes = String.(split(c, "'"))
        end

        push!(genome_array, genes)
    end

    return genome_array
end

# Example input for the parser: one chromosome without `.` end markers in which
# gene "a" occurs twice. The commented-out line is an alternative example with
# several chromosomes, some of them written between `.` markers.
# src = ".a.,a'ab,..,B'c'b"  
src = "a'b'c'a"
g_arr = string_to_genomearr(src)

1-element Vector{Vector{String}}:
 ["a", "b", "c", "a"]

In [4]:
"""
    find_dups_in_genome_arr(genome::Vector{Vector{String}})

Collect the genes that occur more than once in `genome`, ignoring case and
skipping the `"."` marker.

Returns a tuple `(duplicates, n)`. `duplicates` is an `OrderedDict` mapping each
lowercased duplicated gene to its total number of occurrences, with keys in the
order in which each gene was first seen a second time. `n` is the number of
entries in `duplicates`.
"""
function find_dups_in_genome_arr(genome::Vector{Vector{String}})
    first_sightings = Set{String}()
    multiplicity = OrderedDict{String, Int}()

    for chromosome in genome, raw_gene in chromosome
        raw_gene == "." && continue
        name = lowercase(raw_gene)

        if name ∉ first_sightings
            push!(first_sightings, name)
        else
            # The first repeat records 2 occurrences; each later repeat adds one.
            multiplicity[name] = get(multiplicity, name, 1) + 1
        end
    end

    return multiplicity, length(multiplicity)
end

# find_dups_in_genome_arr(g_arr)

find_dups_in_genome_arr (generic function with 1 method)

In [5]:
"""
    generate_map(num_dups, dup_to_num_instances, S_M_set, mapidx_to_str)

Draw a random map that is not yet contained in `S_M_set`.

A map is a `Vector{Int}` with one entry per duplicated gene, taken in the
iteration order of `dup_to_num_instances`. Entry `i` is a 1-based rank drawn
uniformly from `1:factorial(k)`, where `k` is the number of occurrences of the
`i`-th duplicated gene.

# Arguments
- `num_dups`: number of duplicated genes; must equal `length(dup_to_num_instances)`.
- `dup_to_num_instances`: duplicated gene => number of occurrences.
- `S_M_set`: maps that must not be returned again. It is only read here; adding
  the returned map to it is left to the caller.
- `mapidx_to_str`: if empty on entry, it is filled with map position => gene name.

# Throws
- `ArgumentError` if `num_dups` disagrees with `dup_to_num_instances`, or if
  every possible map is already in `S_M_set` (sampling could never succeed).
"""
function generate_map(num_dups::Int, dup_to_num_instances::OrderedDict{String, Int}, S_M_set::Set{Array{Int}}, mapidx_to_str::Dict{Int, String})
    if num_dups != length(dup_to_num_instances)
        throw(ArgumentError(
            "num_dups ($num_dups) does not match the number of duplicated genes " *
            "($(length(dup_to_num_instances)))"))
    end

    # Record position => gene only when the caller's dictionary starts out empty.
    if isempty(mapidx_to_str)
        for (idx, dupgene) in enumerate(keys(dup_to_num_instances))
            mapidx_to_str[idx] = dupgene
        end
    end

    # Entry i of a map is drawn from 1:rank_limits[i]; computed once, not per retry.
    rank_limits = Int[factorial(n) for n in values(dup_to_num_instances)]

    # Total number of distinct maps, in BigInt so the product cannot wrap around.
    total_maps = big(1)
    for limit in rank_limits
        total_maps *= limit
    end

    # Rejection sampling below only terminates while at least one map is unused.
    # The cheap size comparison comes first; the exact count runs only when the
    # set is large enough to possibly contain every map.
    is_candidate(m) = ndims(m) == 1 && length(m) == num_dups &&
        all(1 <= m[i] <= rank_limits[i] for i in 1:num_dups)
    if length(S_M_set) >= total_maps && count(is_candidate, S_M_set) >= total_maps
        throw(ArgumentError(
            "all $total_maps possible maps have already been generated"))
    end

    while true
        candidate = Int[rand(1:limit) for limit in rank_limits]
        if candidate ∉ S_M_set
            @debug "generated a new map" candidate
            return candidate
        end
    end
end

# S = "aabc"
# S_dupchar_to_multiplicity, S_num_dups = find_dups_in_str(S)
# S_M_set = Set{Array{Int}}()
# map = generate_map(S_num_dups, S_dupchar_to_multiplicity, S_M_set)
# push!(S_M_set, map)
# map = generate_map(S_num_dups, S_dupchar_to_multiplicity, S_M_set)
# push!(S_M_set, map)
# map = generate_map(S_num_dups, S_dupchar_to_multiplicity, S_M_set) # should throw error bc alr generated all possible maps

generate_map (generic function with 1 method)

In [6]:
"""
    generate_maps(S_dupstr_to_multiplicity, S_num_dups,
                  P_dupstr_to_multiplicity, P_num_dups, num_maps, m)

Create the target map and `num_maps` distinct random source maps.

# Arguments
- `S_dupstr_to_multiplicity`, `S_num_dups`: duplicated source genes (gene =>
  number of occurrences) and how many there are.
- `P_dupstr_to_multiplicity`, `P_num_dups`: the same for the target; the
  dictionary is only used for the informational printout.
- `num_maps`: number of source maps wanted. `0` means "as many as exist"; a
  value above the number of possible maps is capped to that number.
- `m`: verbosity level; `m >= 1` prints the multiplicities and the target map.

# Returns
`(P_map, S_M, mapidx_to_str)`: the target map (rank `1` for every duplicated
target gene), the vector of source maps, and the map position => gene name
dictionary filled in by `generate_map`.

# Throws
- `ArgumentError` if `num_maps` is negative.
- `OverflowError` if `num_maps == 0` but the number of possible maps does not
  fit in an `Int`.
"""
function generate_maps(S_dupstr_to_multiplicity::OrderedDict{String, Int}, S_num_dups::Int, P_dupstr_to_multiplicity::OrderedDict{String, Int}, P_num_dups::Int, num_maps::Int, m::Int)
    num_maps >= 0 || throw(ArgumentError("num_maps must be non-negative, got $num_maps"))

    # Number of distinct source maps = product of factorial(multiplicity).
    # Plain Int multiplication wraps around silently, so the product saturates
    # at typemax(Int) instead of producing a bogus (possibly negative) cap.
    max_maps = 1
    saturated = false
    for mult in values(S_dupstr_to_multiplicity)
        max_maps, saturated = Base.Checked.mul_with_overflow(max_maps, factorial(mult))
        if saturated
            max_maps = typemax(Int)
            break
        end
    end

    if num_maps == 0 && saturated
        throw(OverflowError(
            "the number of possible source maps exceeds typemax(Int); " *
            "pass an explicit num_maps"))
    end
    if num_maps == 0 || num_maps > max_maps
        num_maps = max_maps
        println("warning: capping maps at ", max_maps)
    end

    # target map
    P_map = ones(Int, P_num_dups)

    if m >= 1
        printstyled("target", color=:magenta)
        println("\ndup genes --> multiplicity")
        println(P_dupstr_to_multiplicity, "\n")

        # arbitrary map p for the target
        println("target map")
        print(P_map, "\n")
    end

    # source maps: kept both in generation order (S_M) and in a set (S_M_set)
    # so that generate_map can reject maps that were already produced
    S_M = Array{Array{Int}}(undef, num_maps)
    S_M_set = Set{Array{Int}}()

    if m >= 1
        printstyled("\nsource", color=:magenta)
        println("\ndup genes --> multiplicity")
        println(S_dupstr_to_multiplicity, "\n")
    end

    mapidx_to_str = Dict{Int, String}()
    for i in eachindex(S_M)
        s_map = generate_map(S_num_dups, S_dupstr_to_multiplicity, S_M_set, mapidx_to_str)
        S_M[i] = s_map
        push!(S_M_set, s_map)
    end

    return P_map, S_M, mapidx_to_str
end

generate_maps (generic function with 1 method)

In [7]:
"""
    reorder_dupgenes_lexicographically(map, dupgene_to_multiplicity)

Build, for every duplicated gene, its list of numbered copies (`gene1`,
`gene2`, ...) and permute that list in place with `nthperm!`, using the rank
stored at the gene's position in `map`.

Genes are processed in the iteration order of `dupgene_to_multiplicity`, so
entry `i` of `map` belongs to the `i`-th key. Returns an `OrderedDict` from
each duplicated gene to its permuted list of copies.
"""
function reorder_dupgenes_lexicographically(map::Array{Int}, dupgene_to_multiplicity::OrderedDict{String, Int})
    ordering = OrderedDict{String, Vector{String}}()

    for (position, (gene, copies)) in enumerate(dupgene_to_multiplicity)
        labels = String[string(gene, k) for k in 1:copies]
        ordering[gene] = labels

        # map[position] is the rank of the permutation applied to these copies
        nthperm!(labels, map[position])
    end

    return ordering
end

"""
    deduplicate_genome(genome, dupgene_to_multiplicity, map)

Rewrite `genome` as a string in which every gene carries a copy number.

Duplicated genes (matched case-insensitively) take their numbered copies in the
order produced by `reorder_dupgenes_lexicographically(map, dupgene_to_multiplicity)`,
one copy per occurrence; a copy is upper-cased when the occurrence consists only
of uppercase characters. `"."` is copied unchanged and every other gene gets
the suffix `1`. Genes are concatenated without a separator and chromosomes are
joined with `,`.
"""
function deduplicate_genome(genome::Vector{Vector{String}}, dupgene_to_multiplicity::OrderedDict{String, Int}, map::Array{Int})
    remaining = reorder_dupgenes_lexicographically(map, dupgene_to_multiplicity)
    chromosome_strings = String[]

    for chrom in genome
        buf = IOBuffer()
        for gene in chrom
            key = lowercase(gene)
            if haskey(remaining, key)
                # consume the next unused copy of this duplicated gene
                queue = remaining[key]
                label = queue[1]
                popfirst!(queue)
                print(buf, all(isuppercase, gene) ? uppercase(label) : label)
            elseif gene == "."
                print(buf, gene)
            else
                print(buf, gene, "1")
            end
        end
        push!(chromosome_strings, String(take!(buf)))
    end

    return join(chromosome_strings, ",")
end



# P = "a1b1b2A2a3c1"
# P_dup_to_num_instances, P_num_dups = find_dups_in_str(P)
# P_map = generate_map(P_num_dups, P_dup_to_num_instances, Set{Array{Int64}}(), Dict{Int, String}() )

# deduplicate_genome(P, P_dup_to_num_instances, P_map)

deduplicate_genome (generic function with 1 method)

In [8]:
"""
    generate_random_maps_and_calc_distances(S, P, num_maps, m)

Generate random maps for the source genome `S`, deduplicate `S` under each map
and the target genome `P` under its own map, and compute the distance between
each deduplicated source and the deduplicated target with `calculate_distance`
(not defined in this section).

`m >= 1` prints one line per source map.

# Returns
`(P_map, P_dedup, map_to_dedupstr_dcjdist, S_dupgene_to_multiplicity, mapidx_to_gene)`,
where `map_to_dedupstr_dcjdist` maps each source map to the tuple
`(deduplicated source string, distance)`.
"""
function generate_random_maps_and_calc_distances(S::Vector{Vector{String}}, P::Vector{Vector{String}}, num_maps::Int, m::Int)
    verbose = m >= 1

    target_dups, target_dup_count = find_dups_in_genome_arr(P)
    source_dups, source_dup_count = find_dups_in_genome_arr(S)

    # maps
    P_map, source_maps, mapidx_to_gene = generate_maps(
        source_dups, source_dup_count, target_dups, target_dup_count, num_maps, m)

    # deduplication and distance calculation
    verbose && printstyled("deduplication of RM\n", color=:magenta)

    P_dedup = deduplicate_genome(P, target_dups, P_map)

    results = Dict{Vector{Int}, Tuple{String, Int}}()
    for source_map in source_maps
        source_dedup = deduplicate_genome(S, source_dups, source_map)
        distance = calculate_distance(P_dedup, source_dedup, "none")
        results[source_map] = (source_dedup, distance)

        verbose && println(source_map, " ", source_dedup, " --> ", P_dedup, " ", distance)
    end

    return P_map, P_dedup, results, source_dups, mapidx_to_gene
end

generate_random_maps_and_calc_distances (generic function with 1 method)

In [9]:
"""
    randommap(S::String, P::String, num_maps::Int, mode::String)

Entry point for mapping a source genome string `S` onto a target genome string
`P` with `num_maps` random maps.

`mode` is `"info"` for verbose output; any other value (such as `"none"`) is
quiet. Returns `(P_dedup, map_to_dedupstr_dcjdist)`: the deduplicated target
string and the per-map results produced by
`generate_random_maps_and_calc_distances`.
"""
function randommap(S::String, P::String, num_maps::Int, mode::String)
    m = setmode(mode)
    m >= 1 && printstyled(string("\nSRC ", S, " --> TARGET ", P, "\n"), color=:cyan)

    source_genome = string_to_genomearr(S)
    target_genome = string_to_genomearr(P)

    # only the deduplicated target and the per-map results are passed on
    outcome = generate_random_maps_and_calc_distances(source_genome, target_genome, num_maps, m)
    return outcome[2], outcome[3]
end

randommap (generic function with 1 method)

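Input format: use `'` to separate genes within a chromosome and `,` to separate chromosomes. A chromosome written between two `.` characters (e.g. `.a'a'b.`) keeps `.` as its first and last element.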

In [1]:
# # src = ".a.,a'ab,b'c"  
# # target = "ab,a,b,c'a"
# # num_maps = 8

# # n = 3
# # x = 2
# # num_maps = 0
# # src, target = generate_genomes_with_xdup(n, x)


# src = "a'a'b'c"
# target = ".a'a'b.,c"
# num_maps = 2

# target, map_to_dedupstr_dcjdist = randommap(src, target, num_maps, "info")