In [1]:
using NBInclude
@nbinclude("dcj_algo.ipynb")

In [2]:
# returns the first n labels of the following (conceptually unbounded) sequence
# a b c ... x y z aa ab ac ... ax ay az ba bb bc ... bx by bz ca cb cc ...
# (each label is computed directly from its position; only the first n are built)

# Build the first `n` labels of the sequence a, b, ..., z, aa, ab, ..., az, ba, ...
# and return them, in order, as a `Vector{String}`.
# Returns an empty vector when `n <= 0` (the range `1:n` is empty).
function first_n_letters_no_limits(n::Int)
    alphabet = 'a':'z'
    letters = String[]              # concretely typed instead of an untyped `[]`
    sizehint!(letters, max(n, 0))

    for i in 1:n
        idx = i - 1                 # zero-based position in the sequence
        letter = ""

        # Peel off one letter per iteration, least-significant first, and
        # prepend it. The `- 1` after the division is what makes "aa" follow
        # "z" (there is no "zero" letter) and what eventually drives `idx`
        # below zero to end the loop.
        while idx >= 0
            letter = string(alphabet[idx % 26 + 1], letter)
            idx = idx ÷ 26 - 1
        end
        push!(letters, letter)
    end

    return letters
end

first_n_letters_no_limits (generic function with 1 method)

In [3]:
# generates target genomes that contain the first n letters and adds x duplicate letters 
# x >= 1
# Build a single-chromosome target genome string: start from the first `n`
# labels produced by `first_n_letters_no_limits`, then perform `x` random
# insertions, each inserting a copy of a randomly chosen base label at a
# random position (including the very end). Genes are joined with "'".
function generate_target_with_xdup(n::Int, x::Int)
    base_genes = first_n_letters_no_limits(n)
    chromosome = copy(base_genes)

    for _ in 1:x
        # Draw the gene first, then the position, so the RNG is consumed in
        # the same order on every call.
        dup_gene = base_genes[rand(1:length(base_genes))]
        insert_at = rand(1:(length(chromosome) + 1))
        insert!(chromosome, insert_at, dup_gene)
    end

    return join(chromosome, "'")
end

# target = generate_target_with_xdup(5, 2)
# print(target)

generate_target_with_xdup (generic function with 1 method)

In [4]:
# Append a running occurrence number to every gene in `input_string`.
#
# Chromosomes are separated by "," and genes within a chromosome by "'".
# The n-th occurrence of a gene (counted across all chromosomes) is emitted
# as the gene text followed by n; the "'" separators are dropped and the
# chromosomes are re-joined with ",". A token that is exactly "." or "," is
# copied through without a number.
#
# Example: "he'll'o" -> "he1ll1o1", "abc,abc" -> "abc1,abc2".
function add_gene_counts(input_string::String)
    gene_count = Dict{String, Int}()
    labelled_chroms = String[]

    for chrom in split(input_string, ",")
        buf = IOBuffer()
        for gene in split(chrom, "'")
            # `split` yields strings, so the markers must be compared as
            # strings; comparing against the Chars '.' / ',' is always false.
            if gene == "." || gene == ","
                print(buf, gene)
                continue
            end
            occurrence = get(gene_count, gene, 0) + 1
            gene_count[gene] = occurrence
            print(buf, gene, occurrence)
        end
        push!(labelled_chroms, String(take!(buf)))
    end

    # Joining avoids having to trim a trailing "," afterwards.
    return join(labelled_chroms, ",")
end

# input_string = "he'll'o"
# input_string = "abc,abc"
# result = add_gene_counts(input_string)
# println(result)  # Output: "he1ll1o1"

add_gene_counts (generic function with 1 method)

In [5]:
# Collect the distinct gene tokens found in `s` into a `Set{String}`.
#
# The string is scanned left to right:
#   * '.' and ',' are skipped (they do not end the token being built);
#   * any other non-digit character is lowercased and appended to the
#     current token;
#   * a run of digits is appended to the current token, which is then
#     stored and reset.
# A token still being built when the string ends is stored as well.
#
# Example: "abc123,def456ghi789" -> Set(["abc123", "def456", "ghi789"]).
function find_genes(s::String)
    result = Set{String}()
    pending = Char[]                # characters of the token being built

    # Walk valid character indices; `length(s)` counts characters while
    # `s[i]` indexes bytes, so the two must not be mixed for non-ASCII text.
    i = firstindex(s)
    stop = lastindex(s)
    while i <= stop
        c = s[i]
        if c == '.' || c == ','
            i = nextind(s, i)
        elseif isdigit(c)
            # The digit run closes the current token.
            while i <= stop && isdigit(s[i])
                push!(pending, s[i])
                i = nextind(s, i)
            end
            push!(result, String(pending))
            empty!(pending)
        else
            push!(pending, lowercase(c))
            i = nextind(s, i)
        end
    end

    isempty(pending) || push!(result, String(pending))

    return result
end

# s = "abc123,def456ghi789"
# println(find_genes(s))
# Output should be Set(["abc123", "def456", "ghi789"]) (a Set, so element order is not guaranteed)


find_genes (generic function with 2 methods)

In [6]:
# True when exactly one of the two ends of `adj` carries a `Telomere()` gene.
function has_onetelo(adj::Adjacency)
    left_is_telo = adj.left.gene == Telomere()
    right_is_telo = adj.right.gene == Telomere()
    return xor(left_is_telo, right_is_telo)
end

# True when both ends of `adj` carry a `Telomere()` gene.
function has_twotelos(adj::Adjacency)
    if adj.left.gene != Telomere()
        return false
    end
    return adj.right.gene == Telomere()
end

has_twotelos (generic function with 1 method)

In [7]:
# Pick two adjacencies at random, exchange one end between them, and write the
# two resulting adjacencies back into `adj_list` at the same two indices.
# `gid_to_loc` and `telo_idxs` are updated in place.
# Returns (old adjacency 1, old adjacency 2, new adjacency 1, new adjacency 2).
function swap_two_adj(adj_list::Vector{Adjacency}, gid_to_loc::DefaultDict{Int, Vector{Int}}, telo_idxs::Set{Int})
    # swap two non-telo adj || one non-telo adj + one telo adj 
    # First pick: resample until the adjacency has no telomere end at all.
    # NOTE(review): this rejection loop never exits if `adj_list` contains no
    # telomere-free adjacency.
    randidx1 = rand(1:length(adj_list))
    while has_twotelos(adj_list[randidx1]) || has_onetelo(adj_list[randidx1])
        randidx1 = rand(1:length(adj_list))
    end 

    # Second pick: must differ from the first and must not be telomere/telomere.
    # NOTE(review): the `break` branch looks unreachable -- the loop body only
    # runs when randidx2 == randidx1 (a telomere-free adjacency) or when the
    # candidate has two telomeres, and `has_onetelo` is false in both cases.
    # The loop also never exits if no other eligible adjacency exists.
    randidx2 = rand(1:length(adj_list))
    while randidx1 == randidx2 || has_twotelos(adj_list[randidx2])
       if has_onetelo(adj_list[randidx2])
            break
       else 
            randidx2 = rand(1:length(adj_list))
        end 
    end

    # Keep the original adjacencies; they are returned to the caller.
    adj_to_change1 = adj_list[randidx1]
    adj_to_change2 = adj_list[randidx2]

    # For each adjacency, choose at random which end (left or right) is
    # brought together in the first new adjacency.
    adj1_swapleft = rand(Bool)
    adj2_swapleft = rand(Bool)

    if adj1_swapleft 
        adj1_ge = adj_to_change1.left 
    else 
        adj1_ge = adj_to_change1.right 
    end 
    if adj2_swapleft 
        adj2_ge = adj_to_change2.left 
    else 
        adj2_ge = adj_to_change2.right 
    end 
    
    # new_adj1 joins the two chosen ends; new_adj2 joins what is left over.
    # (`other_adjacency_end` is defined elsewhere -- presumably it returns the
    # end of the adjacency that is not the one passed in.)
    new_adj1 = Adjacency(adj1_ge, adj2_ge)
    new_adj2 = Adjacency(other_adjacency_end(adj1_ge, adj_to_change1), other_adjacency_end(adj2_ge, adj_to_change2))

    adj_list[randidx1] = new_adj1
    adj_list[randidx2] = new_adj2

    # Two gene ends changed index: adj2_ge moved to randidx1 and the leftover
    # end of adjacency 1 moved to randidx2. The helper is defined elsewhere --
    # presumably it records the new index for that gene end in `gid_to_loc`.
    assign_ge_idx_to_gid_to_locdict(adj2_ge, Ref{Int}(randidx1), gid_to_loc)
    assign_ge_idx_to_gid_to_locdict(other_adjacency_end(adj1_ge, adj_to_change1), Ref{Int}(randidx2), gid_to_loc)    

    # Keep `telo_idxs` in step with which of the two indices now holds an
    # adjacency with exactly one telomere end.
    # NOTE(review): the first branch looks unreachable here -- adjacency 1 was
    # telomere-free and adjacency 2 was not telomere/telomere, so at most one
    # telomere end is shared between the two new adjacencies.
    if has_onetelo(new_adj1) && has_onetelo(new_adj2)  # split adj w no telos 
        push!(telo_idxs, randidx1)
        push!(telo_idxs, randidx2)
    elseif has_onetelo(new_adj1) && !has_onetelo(new_adj2) 
        # the telomere end now sits at randidx1 only
        delete!(telo_idxs, randidx1)
        delete!(telo_idxs, randidx2)
        push!(telo_idxs, randidx1)
    elseif has_onetelo(new_adj2) && !has_onetelo(new_adj1) 
        # the telomere end now sits at randidx2 only
        delete!(telo_idxs, randidx1)
        delete!(telo_idxs, randidx2)
        push!(telo_idxs, randidx2)
    end 

    return adj_to_change1, adj_to_change2, new_adj1, new_adj2
end 


# Cut one randomly chosen adjacency into two: each of its ends is paired with
# a fresh telomere end. The first half overwrites the chosen slot, the second
# half is appended to `adj_list`. `gid_to_loc` and `telo_idxs` are updated in
# place. Returns (original adjacency, first half, second half).
function split_adj(adj_list::Vector{Adjacency}, gid_to_loc::DefaultDict{Int, Vector{Int}}, telo_idxs::Set{Int})
    # Resample until the chosen adjacency is not telomere/telomere.
    chosen_idx = rand(1:length(adj_list))
    while has_twotelos(adj_list[chosen_idx])
        chosen_idx = rand(1:length(adj_list))
    end
    original = adj_list[chosen_idx]

    left_half = Adjacency(original.left, GeneEnd(Telomere()))
    right_half = Adjacency(original.right, GeneEnd(Telomere()))

    adj_list[chosen_idx] = left_half
    push!(adj_list, right_half)
    appended_idx = length(adj_list)

    # Only the right end changed index (it moved to the appended slot).
    assign_ge_idx_to_gid_to_locdict(original.right, Ref{Int}(appended_idx), gid_to_loc)

    # Track each half unless it ended up telomere/telomere (which happens when
    # the original end was itself a telomere).
    # NOTE(review): an index already present in `telo_idxs` is never removed
    # here, even if its adjacency just became telomere/telomere -- confirm
    # that cannot happen for tracked indices.
    has_twotelos(left_half) || push!(telo_idxs, chosen_idx)
    has_twotelos(right_half) || push!(telo_idxs, appended_idx)

    return original, left_half, right_half
end


# each adj must have one telo & one gene (o/w DCJ op doesn't do anything)
# Fuse two tracked telomere adjacencies: their non-telomere ends are joined
# into one adjacency (stored at the first popped index) and the second popped
# index is overwritten with a telomere/telomere adjacency.
# Requires `telo_idxs` to hold at least two indices; both are removed from it.
# Returns (old adjacency 1, old adjacency 2, merged adjacency, telomere pair).
function combine_telomeres(adj_list::Vector{Adjacency}, gid_to_loc::DefaultDict{Int, Vector{Int}}, telo_idxs::Set{Int})
    # `pop!` on a Set removes whichever element the Set yields; no RNG is used.
    keep_idx = pop!(telo_idxs)
    clear_idx = pop!(telo_idxs)

    old_keep = adj_list[keep_idx]
    old_clear = adj_list[clear_idx]

    # Ask each adjacency for the end opposite its telomere end.
    gene_end_keep = other_adjacency_end(GeneEnd(Telomere()), old_keep)
    gene_end_clear = other_adjacency_end(GeneEnd(Telomere()), old_clear)

    merged_adj = Adjacency(gene_end_keep, gene_end_clear)
    telos = Adjacency(GeneEnd(Telomere()), GeneEnd(Telomere()))

    adj_list[keep_idx] = merged_adj
    adj_list[clear_idx] = telos

    # Only the second gene end changed index (it moved into the merged slot).
    assign_ge_idx_to_gid_to_locdict(gene_end_clear, Ref{Int}(keep_idx), gid_to_loc)

    return old_keep, old_clear, merged_adj, telos
end


combine_telomeres (generic function with 1 method)

In [8]:
# Apply one randomly chosen DCJ operation to `adj_list` in place.
# SPLIT and SWAP are always candidates; COMBINE is added as a third candidate
# only when `telo_idxs` holds at least two indices. When `m >= 1`, a one-line
# trace of the operation is printed to stdout. Returns `nothing`.
function apply_dcjop(adj_list::Vector{Adjacency}, gid_to_loc::DefaultDict{Int, Vector{Int}}, m::Int, telo_idxs::Set{Int})
    num_options = length(telo_idxs) >= 2 ? 3 : 2
    choice = rand(1:num_options)

    # `replaced` holds the adjacencies that were consumed: one for a split,
    # two for a swap or a combine.
    if choice == 1
        old1, new1, new2 = split_adj(adj_list, gid_to_loc, telo_idxs)
        replaced = (old1,)
        op = "SPLIT"
    elseif choice == 2
        old1, old2, new1, new2 = swap_two_adj(adj_list, gid_to_loc, telo_idxs)
        replaced = (old1, old2)
        op = "SWAP"
    else
        old1, old2, new1, new2 = combine_telomeres(adj_list, gid_to_loc, telo_idxs)
        replaced = (old1, old2)
        op = "COMBINE"
    end

    # Quiet mode: nothing to print.
    m >= 1 || return nothing

    print("DCJ ", op, " :: ")
    for old_adj in replaced
        show(old_adj, true)
    end

    print(" --> ")

    show(new1, true)
    show(new2, true)
    print("   ", telo_idxs, "    ")
    show(adj_list)
    println()
    return nothing
end


apply_dcjop (generic function with 1 method)

In [9]:
# Turn `target` into an adjacency list, apply `x` random DCJ operations to it,
# and return the resulting genome rendered back to a string.
# When `m >= 1` the initial adjacency list is printed, and `m` is forwarded to
# `apply_dcjop` for each operation.
function apply_x_dcjops(target::String, x::Int, m::Int)
    # Bookkeeping handed to the parser (gene name <-> integer id).
    next_id = Ref{Int}(1)
    names_by_id = Dict{Int, String}()
    ids_by_name = Dict{String, Int}()

    genome = string_to_genome(target, next_id, names_by_id, ids_by_name, true)
    adj_list = genome_to_adj_list(genome)
    gid_to_loc = process_adj_list(adj_list)

    if m >= 1
        show(adj_list)
        println()
    end

    # Starts empty: only adjacencies produced by the operations below are
    # ever added to it.
    tracked_telo_idxs = Set{Int}()
    for _ in 1:x
        apply_dcjop(adj_list, gid_to_loc, m, tracked_telo_idxs)
    end

    return adjlist_to_str(adj_list, gid_to_loc)
end


apply_x_dcjops (generic function with 1 method)

In [11]:
# m = 1

# # generate target string 
# tar = generate_target_with_xdup(3, 2)
# tar_wcounts = add_gene_counts(tar) 
# println(tar_wcounts)

# # apply x dcj ops to target --> src 
# src_genome_str = apply_x_dcjops(tar_wcounts, 20, m)   # returns the source genome as a single string