In [1]:
using NBInclude
@nbinclude("dcj_algo.ipynb")

In [2]:
# returns the first n labels of the following sequence
# a b c ... x y z aa ab ac ... ax ay az ba bb bc ... bx by bz ca cb cc ...
# (each label is computed from its index; the full sequence is never built)

"""
    first_n_letters_no_limits(n::Int)

Return the first `n` labels of the sequence
`a, b, ..., z, aa, ab, ..., az, ba, ..., zz, aaa, ...` as a `Vector{String}`.

Each label is derived directly from its zero-based position, so `n` is not limited
to 26. An `n` of zero or less yields an empty vector.
"""
function first_n_letters_no_limits(n::Int)
    alphabet = 'a':'z'
    letters = String[]

    for i in 1:n
        idx = i - 1  # zero-based position of this label in the sequence
        letter = ""

        # Bijective base-26: peel off the least-significant letter, then shift down.
        # The extra `- 1` is what makes "aa" follow "z" (there is no zero digit).
        while idx >= 0
            place_value = idx % 26 + 1
            letter = string(alphabet[place_value], letter)
            idx = idx ÷ 26 - 1
        end

        push!(letters, letter)
    end

    return letters
end

first_n_letters_no_limits (generic function with 1 method)

In [3]:
# generates target genomes that contain the first n letters and adds x duplicate letters 
# x >= 1
"""
    generate_target_with_xdup(n::Int, x::Int)

Build a single-chromosome target genome string: start from the `n` labels returned by
`first_n_letters_no_limits(n)`, then perform `x` insertions, each placing a randomly
chosen label from that original set at a random position (including either end).
The genes are joined with `'` as the separator.
"""
function generate_target_with_xdup(n::Int, x::Int)
    gene_pool = first_n_letters_no_limits(n)
    chromosome = copy(gene_pool)

    for _ in 1:x
        # Draw the duplicated gene first, then its insertion slot.
        dup_gene = gene_pool[rand(1:length(gene_pool))]
        slot = rand(1:(length(chromosome) + 1))
        insert!(chromosome, slot, dup_gene)
    end

    return join(chromosome, "'")
end

# target = generate_target_with_xdup(5, 2)
# print(target)

generate_target_with_xdup (generic function with 1 method)

In [4]:
"""
    add_gene_counts(input_string::AbstractString)

Append an occurrence number to every gene in a genome string.

Chromosomes are separated by `,` and genes within a chromosome by `'`. Occurrences are
counted across the whole input, so the k-th appearance of gene `g` becomes `gk`. The
`'` separators are not written to the output; the `,` between chromosomes is kept.
A `.` token is copied through without a number, and empty tokens are skipped.

Returns the rewritten genome as a `String`, e.g. `"b'c'a'b'c"` -> `"b1c1a1b2c2"`.
"""
function add_gene_counts(input_string::AbstractString)
    gene_count = Dict{String, Int}()
    result = IOBuffer()

    for (chrom_no, chrom) in enumerate(split(input_string, ","))
        # Write the separator before every chromosome but the first, so there is no
        # trailing comma to strip afterwards.
        chrom_no > 1 && print(result, ",")

        for gene in split(chrom, "'")
            isempty(gene) && continue

            # `split` yields strings, so the marker must be compared as a string;
            # comparing against the Char '.' is never true.
            if gene == "."
                print(result, gene)
                continue
            end

            key = String(gene)
            occurrence = get(gene_count, key, 0) + 1
            gene_count[key] = occurrence
            print(result, gene, occurrence)
        end
    end

    return String(take!(result))
end

# input_string = "he'll'o"
# input_string = "abc,abc"
# result = add_gene_counts(input_string)
# println(result)  # Output: "he1ll1o1"

add_gene_counts (generic function with 1 method)

In [5]:
# Random 3-gene target with 2 duplicated genes, plus the same genome with every
# gene occurrence numbered.
tar = generate_target_with_xdup(3, 2)
tar_wcounts = add_gene_counts(tar)
foreach(println, (tar, tar_wcounts))

b'c'a'b'c
b1c1a1b2c2


In [6]:
"""
    swap_two_adj(adj_list, gid_to_loc)

Choose two distinct adjacencies of `adj_list` at random, choose one end (`left` or
`right`) of each at random, and pass those two ends together with both indices to
`update_documentation`, which produces the two replacement adjacencies.

Returns `(old_adj1, old_adj2, new_adj1, new_adj2)`, where the old adjacencies are the
entries read from `adj_list` before `update_documentation` is called.

# Throws
- `ArgumentError` if `adj_list` holds fewer than two adjacencies. Without this check a
  one-element list would spin forever looking for a second, different index.
"""
function swap_two_adj(adj_list::Vector{Adjacency}, gid_to_loc::DefaultDict{Int, Vector{Int}})
    n_adj = length(adj_list)
    n_adj >= 2 || throw(ArgumentError(
        "swap_two_adj needs at least two adjacencies, got " * string(n_adj)))

    randidx1 = rand(1:n_adj)
    randidx2 = rand(1:n_adj)
    # Redraw until the second index differs from the first.
    while randidx1 == randidx2
        randidx2 = rand(1:n_adj)
    end

    adj_to_change1 = adj_list[randidx1]
    adj_to_change2 = adj_list[randidx2]

    # Decide independently for each adjacency which of its ends takes part in the swap.
    adj1_swapleft = rand(Bool)
    adj2_swapleft = rand(Bool)

    adj1_ge = adj1_swapleft ? adj_to_change1.left : adj_to_change1.right
    adj2_ge = adj2_swapleft ? adj_to_change2.left : adj_to_change2.right

    # NOTE(review): presumably this rewrites adj_list/gid_to_loc in place; confirm
    # against the definition in dcj_algo.ipynb.
    new_adj1, new_adj2 = update_documentation(adj1_ge, adj2_ge, randidx1, randidx2, adj_list, gid_to_loc)

    return adj_to_change1, adj_to_change2, new_adj1, new_adj2
end

"""
    split_adj(adj_list, gid_to_loc, telo_idxs)

Split one randomly chosen adjacency into two adjacencies that each pair one of its
ends with a telomere. The left half overwrites the chosen slot in `adj_list`; the
right half is appended. The right end's new index is registered through
`assign_ge_idx_to_gid_to_locdict`, and each resulting slot that does not compare
(via `!==`) identical to a telomere-telomere adjacency is added to `telo_idxs`.

Returns `(original_adj, left_half, right_half)`.
"""
function split_adj(adj_list, gid_to_loc, telo_idxs)
    split_idx = rand(1:length(adj_list))
    adj_to_split = adj_list[split_idx]

    left_half = Adjacency(adj_to_split.left, GeneEnd(Telomere()))
    right_half = Adjacency(adj_to_split.right, GeneEnd(Telomere()))

    adj_list[split_idx] = left_half
    push!(adj_list, right_half)
    appended_idx = length(adj_list)

    assign_ge_idx_to_gid_to_locdict(adj_to_split.right, Ref{Int}(appended_idx), gid_to_loc)

    # NOTE(review): `!==` tests identity; if Adjacency is a mutable struct this is
    # always true for a freshly built value -- confirm against the type definition.
    for idx in (split_idx, appended_idx)
        if adj_list[idx] !== Adjacency(GeneEnd(Telomere()), GeneEnd(Telomere()))
            push!(telo_idxs, idx)
        end
    end

    return adj_to_split, left_half, right_half
end


split_adj (generic function with 1 method)

In [7]:
# each adj must have one telo & one gene (o/w DCJ op doesn't do anything)
"""
    combine_telomeres(adj_list, gid_to_loc, telo_idxs)

Remove two indices from `telo_idxs`, join the non-telomere ends of the adjacencies at
those indices into one adjacency, and leave a telomere-telomere adjacency behind.
The merged adjacency is stored at the first removed index, the telomere pair at the
second, and the second gene end's location is re-registered at the first index via
`assign_ge_idx_to_gid_to_locdict`.

Returns `(old_adj1, old_adj2, merged_adj, telomere_pair)`.
"""
function combine_telomeres(adj_list::Vector{Adjacency}, gid_to_loc::DefaultDict{Int, Vector{Int}}, telo_idxs::Set{Int})
    # NOTE(review): pop! on a Set removes an arbitrary element, which is not
    # necessarily a uniformly random one.
    keep_idx = pop!(telo_idxs)
    vacated_idx = pop!(telo_idxs)

    first_adj, second_adj = adj_list[keep_idx], adj_list[vacated_idx]

    gene_end_a = other_adjacency_end(GeneEnd(Telomere()), first_adj)
    gene_end_b = other_adjacency_end(GeneEnd(Telomere()), second_adj)

    merged = Adjacency(gene_end_a, gene_end_b)
    telomere_pair = Adjacency(GeneEnd(Telomere()), GeneEnd(Telomere()))

    adj_list[keep_idx] = merged
    adj_list[vacated_idx] = telomere_pair

    # gene_end_b now lives in the slot that holds the merged adjacency.
    assign_ge_idx_to_gid_to_locdict(gene_end_b, Ref{Int}(keep_idx), gid_to_loc)

    return first_adj, second_adj, merged, telomere_pair
end


combine_telomeres (generic function with 1 method)

In [18]:
"""
    apply_dcjop(adj_list, gid_to_loc, m, telo_idxs)

Apply one randomly selected operation to `adj_list`: split one adjacency, swap ends
between two adjacencies, or (only offered when `telo_idxs` holds at least two indices)
combine two telomeres. When `m >= 1`, print the operation name, the adjacencies
before and after, and the whole adjacency list.

Returns `nothing`.
"""
function apply_dcjop(adj_list::Vector{Adjacency}, gid_to_loc::DefaultDict{Int, Vector{Int}}, m::Int, telo_idxs::Set{Int})
    num_options = length(telo_idxs) >= 2 ? 3 : 2
    choice = rand(1:num_options)

    # A split only touches one existing adjacency, so the second "before" stays unset.
    before2 = nothing
    if choice == 1
        before1, after1, after2 = split_adj(adj_list, gid_to_loc, telo_idxs)
        op = "SPLIT ONE ADJ"
    elseif choice == 2
        before1, before2, after1, after2 = swap_two_adj(adj_list, gid_to_loc)
        op = "SWAP TWO ADJ"
    else
        before1, before2, after1, after2 = combine_telomeres(adj_list, gid_to_loc, telo_idxs)
        op = "COMBINE 2 TELOS"
    end

    m >= 1 || return nothing

    print("DCJ ", op, " :: ")
    show(before1, true)
    before2 === nothing || show(before2, true)

    print(" --> ")

    show(after1, true)
    show(after2, true)
    show(adj_list)
    println()
    return nothing
end


apply_dcjop (generic function with 2 methods)

In [19]:
"""
    apply_x_dcjops(target::String, x::Int, m::Int)

Convert `target` into an adjacency list (via `string_to_genome`, `genome_to_adj_list`
and `process_adj_list`), then call `apply_dcjop` on it `x` times. When `m >= 1` the
initial adjacency list is printed first, and `m` is forwarded to every `apply_dcjop`
call.

Returns `(adj_list, gid_to_loc)` after all operations.
"""
function apply_x_dcjops(target::String, x::Int, m::Int)
    # Fresh id bookkeeping for the parser: a counter starting at 1 and two empty maps.
    target_genome = string_to_genome(
        target, Ref{Int}(1), Dict{Int, String}(), Dict{String, Int}(), true)
    adj_list = genome_to_adj_list(target_genome)
    gid_to_loc = process_adj_list(adj_list)

    m >= 1 && (show(adj_list); println())

    # NOTE(review): starts empty, so telomeres already present in the target are
    # presumably not tracked here -- confirm this is intended.
    single_telo_idxs = Set{Int}()
    foreach(_ -> apply_dcjop(adj_list, gid_to_loc, m, single_telo_idxs), 1:x)

    return adj_list, gid_to_loc
end


apply_x_dcjops (generic function with 1 method)

In [20]:
# Scramble the counted target with 10 random DCJ operations (m = 1 enables the
# per-operation printout), then rebuild and print the resulting genome string.
m = 1
adj_list, gid_to_loc = apply_x_dcjops(tar_wcounts, 10, m)

inorder_adj_list = reorder_adjs(adj_list, gid_to_loc)
show(inorder_adj_list)
src_genome_str = adjlist_to_genomestr(inorder_adj_list)

print('\n', src_genome_str, '\n')
 

(b1:h,c1:t)(c1:h,a1:t)(a1:h,b2:t)(b2:h,c2:t)(c2:h,b1:t)
DCJ SPLIT ONE ADJ :: [34m(b2:h,c2:t)[39m --> [34m(b2:h,.)[39m[34m(c2:t,.)[39m(b1:h,c1:t)(c1:h,a1:t)(a1:h,b2:t)(b2:h,.)(c2:h,b1:t)(c2:t,.)
DCJ SWAP TWO ADJ :: [34m(a1:h,b2:t)[39m[34m(b2:h,.)[39m --> [34m(a1:h,.)[39m[34m(b2:t,.)[39m(b1:h,c1:t)(c1:h,a1:t)(a1:h,.)(b2:t,.)(c2:h,b1:t)(c2:t,.)
DCJ SPLIT ONE ADJ :: [34m(c2:t,.)[39m --> [34m(c2:t,.)[39m[34m(.,.)[39m(b1:h,c1:t)(c1:h,a1:t)(a1:h,.)(b2:t,.)(c2:h,b1:t)(c2:t,.)(.,.)
DCJ SPLIT ONE ADJ :: [34m(c2:t,.)[39m --> [34m(c2:t,.)[39m[34m(.,.)[39m(b1:h,c1:t)(c1:h,a1:t)(a1:h,.)(b2:t,.)(c2:h,b1:t)(c2:t,.)(.,.)(.,.)
DCJ COMBINE 2 TELOS :: [34m(b2:t,.)[39m[34m(c2:t,.)[39m --> [34m(b2:t,c2:t)[39m[34m(.,.)[39m(b1:h,c1:t)(c1:h,a1:t)(a1:h,.)(b2:t,c2:t)(c2:h,b1:t)(.,.)(.,.)(.,.)
DCJ SPLIT ONE ADJ :: [34m(.,.)[39m --> [34m(.,.)[39m[34m(.,.)[39m(b1:h,c1:t)(c1:h,a1:t)(a1:h,.)(b2:t,c2:t)(c2:h,b1:t)(.,.)(.,.)(.,.)(.,.)
DCJ SWAP TWO ADJ :: [34m(a1:h,.)[39m[34m(c