In [1]:
using NBInclude
@nbinclude("logging.ipynb")
@nbinclude("datastructs.ipynb")
@nbinclude("helpers.ipynb")

find_adj (generic function with 1 method)

In [2]:
# update documentation after a dcj operation

"""
    reassign_locs(pq, excluding_pq, u_idx, v_idx, gid_to_l)

Re-register the gene ends of two freshly written adjacencies in `gid_to_l`.

The caller is expected to have already stored the adjacencies so that
`src_adjs[u_idx] = pq` and `src_adjs[v_idx] = excluding_pq`. For every
non-telomere gene end of `pq` (resp. `excluding_pq`),
`assign_ge_idx_to_gid_to_locdict` is called with a `Ref` holding `u_idx`
(resp. `v_idx`). Telomere ends are skipped.

# Arguments
- `pq`, `excluding_pq`: the two adjacencies produced by a DCJ operation.
- `u_idx`, `v_idx`: indices at which those adjacencies are stored.
- `gid_to_l`: dictionary updated through `assign_ge_idx_to_gid_to_locdict`.

# Returns
`nothing`.
"""
function reassign_locs(pq::Adjacency, excluding_pq::Adjacency, u_idx::Int, v_idx::Int, gid_to_l::DefaultDict{Int, Vector{Int}})
    for (adj, adj_idx) in ((pq, u_idx), (excluding_pq, v_idx))
        # One Ref per adjacency, shared by its left and right gene ends.
        idx = Ref{Int}(adj_idx)
        for ge in (adj.left, adj.right)
            # Compare the gene itself: a gene end is never equal to a bare
            # `Telomere()`, so testing the gene end would never skip anything.
            if ge.gene != Telomere()
                assign_ge_idx_to_gid_to_locdict(ge, idx, gid_to_l)
            end
        end
    end
    return nothing
end 

# Store the two adjacencies produced by a DCJ operation in `src_adjs`.
# An incoming index of 0 means "no existing slot": the vector is grown by one
# element and the index of that new slot is used instead.
# Returns the (possibly newly allocated) indices as `(u_idx, v_idx)`.
function update_src_adjs(p_pq::Adjacency, excluding_p_pq::Adjacency, u_idx::Int, v_idx::Int, src_adjs::Vector{Adjacency})
    if iszero(u_idx)
        push!(src_adjs, p_pq)
        u_idx = lastindex(src_adjs)
    end
    if iszero(v_idx)
        # Placeholder only; the real value is written just below.
        push!(src_adjs, p_pq)
        v_idx = lastindex(src_adjs)
    end

    setindex!(src_adjs, p_pq, u_idx)
    setindex!(src_adjs, excluding_p_pq, v_idx)

    return (u_idx, v_idx)
end 

# Apply one DCJ operation to the source bookkeeping structures.
#
# When `q` is a telomere gene end, adjacency u is replaced by {p} and (u \ {p}).
# Otherwise adjacencies u and v are replaced by {p, q} and (u \ {p}) U (v \ {q}).
# The new adjacencies are written into `src_adjs` (via `update_src_adjs`) and
# their gene-end locations are refreshed in `src_gid_to_l` (via `reassign_locs`).
# Returns the two new adjacencies as a tuple.
function update_documentation(p::GeneEnd, q::GeneEnd, u_idx::Int, v_idx::Int, src_adjs::Vector{Adjacency}, src_gid_to_l::DefaultDict{Int, Vector{Int}})
    joined, remainder = if q == GeneEnd(Telomere())
        # {p} on its own, and whatever was paired with p in u
        lone_p = Adjacency(p, GeneEnd(Telomere()))
        partner_of_p = other_adjacency_end(p, src_adjs[u_idx])
        (lone_p, Adjacency(partner_of_p, GeneEnd(Telomere())))
    else
        # {p, q}, and the two leftover ends of u and v joined together
        merged = Adjacency(p, q)
        leftover_u = other_adjacency_end(p, src_adjs[u_idx])
        leftover_v = other_adjacency_end(q, src_adjs[v_idx])
        (merged, Adjacency(leftover_u, leftover_v))
    end

    # write the adjacencies first: an index of 0 is resolved to a real slot here
    slot_u, slot_v = update_src_adjs(joined, remainder, u_idx, v_idx, src_adjs)
    # then refresh gene-end locations using the resolved slots
    reassign_locs(joined, remainder, slot_u, slot_v, src_gid_to_l)

    return joined, remainder
end 


update_documentation (generic function with 1 method)

In [3]:
# find dcj operations and distance

"""
    find_dcj_dist_ops(src_adjs, tar_adj_list, src_gid_to_l, tar_gid_to_l, mode, id_to_str)

Transform the source adjacencies into the target adjacencies, counting the DCJ
operations that were applied.

Two passes are made over `tar_adj_list`:

1. For every target adjacency `{p, q}` with no telomere end, the source
   adjacencies containing `p` and `q` are looked up with `find_adj`. If they
   are stored at different indices, one DCJ operation is applied through
   `update_documentation`.
2. For every target adjacency with a telomere end, the source adjacency
   containing the non-telomere end is looked up. If its other end is not a
   telomere, one DCJ operation is applied.

`src_adjs` and `src_gid_to_l` are mutated in place (through
`update_documentation`).

# Arguments
- `src_adjs`: source adjacencies; updated in place.
- `tar_adj_list`: target adjacencies; only read.
- `src_gid_to_l`: location dictionary for the source; updated in place.
- `tar_gid_to_l`: not used by this function.
- `mode`: verbosity level; `>= 1` prints each operation, `>= 2` adds progress
  output, `>= 3` additionally prints reordered adjacency lists and the
  location dictionary.
- `id_to_str`: only passed to `show` when `mode >= 3`.

# Returns
The number of DCJ operations applied.
"""
function find_dcj_dist_ops(src_adjs::Vector{Adjacency}, tar_adj_list::Vector{Adjacency}, src_gid_to_l::DefaultDict{Int, Vector{Int}}, tar_gid_to_l::DefaultDict{Int, Vector{Int}}, mode::Int, id_to_str::Dict{Int, String})
    count = 0  
    telomere_idxs = Vector{Int}()
    # Only replaced by a reordered copy when mode >= 3; until then this is
    # the same object as `src_adjs`.
    updated_adj_list = src_adjs

    if mode >= 2
        println("indexing through target adj list...\n") 
        println("processing adj with two gene ends... ")
    end 

    # for each adj {p, q} in target genome 
    for (i, adj) in pairs(tar_adj_list) 
        p = adj.left  
        q = adj.right  
        
        if p.gene == Telomere() || q.gene == Telomere()  # telomeres handled in next loop
            push!(telomere_idxs, i)  

        else 
            if mode >= 2
                print("\nADJ ", i, " :::: ")
                show(adj, false)
                print("\n")
            end 
            if mode >= 3
                print("current source adjacencies :: ")
                show(updated_adj_list)
                println()
            end 
            
            #  u = ge of adj from genome A that == p
            #  v = ge of adj from genome A that == q
            u_idx = find_adj(p, src_gid_to_l, src_adjs)  
            v_idx = find_adj(q, src_gid_to_l, src_adjs)  

            #  if u != v, replace u and v in A by {p, q} and (u\{p}) U (v\{q})
            if u_idx != v_idx  
                # saved before the update so the operation can be printed
                adj_to_change1 = src_adjs[u_idx]
                adj_to_change2 = src_adjs[v_idx]

                if mode >= 3
                    prev_adj_list = copy(updated_adj_list)
                end

                pq, excluding_pq = update_documentation(p, q, u_idx, v_idx, src_adjs, src_gid_to_l)
                
                if mode >= 1
                    print("DCJ :: ")                    
                    show(adj_to_change1, true)
                    show(adj_to_change2, true)

                    print(" --> ")
                    
                    show(pq, true) 
                    show(excluding_pq, true)
                    println()
                end 
                
                if mode == 2
                    print("updated src adjs ")
                    show(src_adjs)
                    println()
                end 

                if mode >= 3
                    print("\nprev adj list ::: ")
                    show(prev_adj_list, adj_to_change1, adj_to_change2)
                    print("\nupdated adj list ::: ")
                    updated_adj_list = reorder_adjs(src_adjs, src_gid_to_l)
                    show(updated_adj_list, pq, excluding_pq)
                    
                    println("\nupdated src_gid_to_location")
                    show(src_gid_to_l, id_to_str)
                end 

                count += 1 
            end   
        end
    end 

    if mode >= 2
        println("\nprocessing telomeres... ")
    end 

    # for each telomere {p} in genome B 
    for idx in telomere_idxs
        tar_telo_adj = tar_adj_list[idx]
        p_non_telo = other_adjacency_end(GeneEnd(Telomere()), tar_telo_adj)
        
        if mode >= 2
            print("\nADJ ", idx, " :::: ")
            show(tar_telo_adj, false)
            print("\n")
        end 

        if mode >= 3
            print("current source adjacencies :: ")
            show(updated_adj_list)
            println()
        end         

        #  u = ge of adj from genome A that == p 
        u_idx = find_adj(p_non_telo, src_gid_to_l, src_adjs)  
        
        #  if u is an adjacency, then replace u in A by {p} and (u\{p}) 
        if other_adjacency_end(p_non_telo, src_adjs[u_idx]).gene != Telomere()
            # saved before the update so the operation can be printed
            adj_to_change = src_adjs[u_idx]

            if mode >= 3
                prev_adj_list = copy(updated_adj_list)
            end 
            
            # v_idx = 0: the second adjacency gets a new slot in src_adjs
            p_adj, excluding_p = update_documentation(p_non_telo, GeneEnd(Telomere()), u_idx, 0, src_adjs, src_gid_to_l)
            
            if mode >= 1 
                print("DCJ :: ")
                show(adj_to_change, true)

                print(" --> ")
            
                show(p_adj, true) 
                show(excluding_p, true)
                println()
            end 

            if mode >= 3
                print("\nprev adj list ::: ")
                # Use the saved pre-operation adjacency: src_adjs[u_idx] has
                # already been overwritten with the new one at this point.
                show(prev_adj_list, adj_to_change)
                print("\nupdated adj list ::: ")
                updated_adj_list = reorder_adjs(src_adjs, src_gid_to_l)
                show(updated_adj_list, p_adj, excluding_p)
                
                println("\nupdated src_gid_to_location")
                show(src_gid_to_l, id_to_str)
            end 
            
            if mode == 2 
                print("updated src adjs ")
                show(src_adjs)
                println()
            end 

            count += 1 
        end 
    end 

    return count
end 


find_dcj_dist_ops (generic function with 1 method)

In [4]:
"""
    calculate_distance(src::String, target::String, mode::String)

Find the minimum DCJ distance to get from the source genome to the target genome.

O(n) time, O(n) space where n = number of genes in genome.

# Arguments
- `src`, `target`: source and target genomes (balanced = have the same
  genes/letters of the same multiplicity; telomeres don't matter).
    - genome = list of chromosome(s); chromosomes are separated by a comma
      (no spaces)
    - chromosome = list of genes; chromosomes can be circular (no periods) or
      linear (sandwiched between two periods)
    - gene = string with non-duplicated letter(s) (letter == gene), followed by
      its duplicate value in the whole genome (0 if there are no duplicates);
      letters may be capitalized to denote reversed orientation of the gene
      (which strand of the DNA double helix it's on)

  e.g.,

      genome      a0b1c0b0,b2
      chromosome  [a0 b1 c0 b0] [b2]
      genes       "a0" "b1" "c0" "b0" "b2"

- `mode`: verbosity.
    - `"info"`  - informative
    - `"debug"` - prints information useful for debugging
    - `"trace"` - prints everything --- note: makes algo O(n^2)
    - anything else (e.g. `"none"`) - doesn't print

# Returns
The `num_dcj_ops` field of the `AdjacencyGraph` built from the two genomes.
"""
function calculate_distance(src::String, target::String, mode::String)
    # map the verbosity name onto the integer level used internally;
    # unrecognised names fall back to 0 (silent)
    mode_int = if mode == "info"
        1
    elseif mode == "debug"
        2
    elseif mode == "trace"
        3
    else
        0
    end

    # check_conditions(src, target)

    # id tables shared by both genomes; the target is parsed first
    id_counter = Ref{Int}(1)
    id_to_str = Dict{Int, String}()
    str_to_id = Dict{String, Int}()

    target_genome = string_to_genome(target, id_counter, id_to_str, str_to_id, true)
    src_genome = string_to_genome(src, id_counter, id_to_str, str_to_id, false)
    
    ag = AdjacencyGraph(src_genome, target_genome, mode_int)

    return ag.num_dcj_ops
end


calculate_distance (generic function with 1 method)

In [6]:
# src = ".a0.,.b0.,.c0.,.d0."
# target="A0B0C0D0"
# mode = "info"
# calculate_distance(src, target, mode)