In [1]:
using NBInclude
@nbinclude("dcj_datastructs.ipynb")

print_adj_set_as_adj_list (generic function with 1 method)

In [2]:
# process adj list 
# create dictionary of gene ID --> index/location of gene's head and tail 

# Record the adjacency-list position of a single gene end.
#
#   ge         - gene end to record; ends whose gene equals `Telomere()` are skipped.
#   idx        - reference holding the adjacency-list index currently being visited.
#   gid_to_loc - gene ID => two-element vector; slot 1 receives the index for a tail
#                end, slot 2 the index for a head end.
function process_adj_list_helper(ge::GeneEnd, idx::Ref{Int}, gid_to_loc::DefaultDict{Int, Vector{Int}})
    # Telomere ends are not indexed.
    ge.gene == Telomere() && return

    # tail -> slot 1, head -> slot 2
    slot = (ge.head == true) ? 2 : 1
    gid_to_loc[ge.gene.id][slot] = idx[]
end

# Build the gene-ID => location index for an adjacency list.
#
# Returns a DefaultDict mapping each gene ID to a two-element vector
# [tail index, head index], where each index is the position in `adj_list`
# of the adjacency that holds that end. Entries start out as zeros.
function process_adj_list(adj_list::Vector{Adjacency})
    locations = DefaultDict{Int, Vector{Int}}(() -> zeros(Int, 2))
    cursor = Ref{Int}(1)

    for (position, adj) in enumerate(adj_list)
        cursor[] = position
        # left end first, then right end, both recorded at the same position
        for gene_end in (adj.left, adj.right)
            process_adj_list_helper(gene_end, cursor, locations)
        end
    end

    return locations
end

process_adj_list (generic function with 1 method)

In [3]:
# helpers for dcj operations and distance

# Locate a target-genome gene end inside the source adjacency list.
#
# Arguments:
#   target_ge    - gene end to look for; its gene ID is read, so it must not be a
#                  telomere end.
#   src_gid_to_l - gene ID => [tail index, head index] into `src_adj_list`.
#   tar_gid_to_l - the same kind of mapping for the target genome; not used by the
#                  lookup, kept in the signature so existing callers keep working.
#   src_adj_list - adjacency list of the source genome.
#
# Returns a two-element vector `[src_ge, src_ge_idx]`: the matching gene end as it is
# stored in the source list, and the index of the source adjacency that holds it.
function find_tar_ge_in_src_adjlist(target_ge::GeneEnd, src_gid_to_l::DefaultDict{Int, Vector{Int}}, tar_gid_to_l::DefaultDict{Int, Vector{Int}}, src_adj_list::Vector{Adjacency})
    # tail -> slot 1, head -> slot 2 of the location vector
    th_idx = (target_ge.head == true) ? 2 : 1
    gene_id = target_ge.gene.id

    src_ge_idx = src_gid_to_l[gene_id][th_idx]
    adj = src_adj_list[src_ge_idx]

    # Pick the side that really is the requested end. Comparing gene IDs alone is not
    # enough: the left side may be a telomere (every other `.id` access in this file is
    # guarded by a telomere check, so it presumably has no ID to read), and an adjacency
    # such as (a:h, a:t) has the same gene on both sides, so head/tail must match too.
    left = adj.left
    left_matches = !(left.gene == Telomere()) &&
                   left.gene.id == gene_id &&
                   left.head == target_ge.head
    src_ge = left_matches ? left : adj.right

    return [src_ge, src_ge_idx]
end

# Given one end `ge` of adjacency `adj`, return the end on the opposite side.
# When `ge` does not equal the left end, the left end is returned.
function other_adjacency_end(ge::GeneEnd, adj::Adjacency)
    return adj.left == ge ? adj.right : adj.left
end

# Return the right end of `adj` when its left end is a telomere,
# otherwise return the left end.
function non_telo_end(adj::Adjacency)
    left_is_telomere = adj.left.gene == Telomere()
    return left_is_telomere ? adj.right : adj.left
end

# Join two gene ends into one Adjacency, ordered by their list positions:
# `u` goes on the left only when `u_idx` is strictly smaller than `v_idx`;
# in every other case (including equal indices) `v` goes on the left.
function combine_ge(u::GeneEnd, u_idx::Int, v::GeneEnd, v_idx::Int)
    u_idx < v_idx && return Adjacency(u, v)
    return Adjacency(v, u)
end

# Re-record, in `gid_to_l`, the positions of every gene end stored in `adj_list`
# between `u_idx` and `v_idx` (inclusive, in either order).
function reassign_locs(gid_to_l, adj_list, u_idx, v_idx)
    lo = min(u_idx, v_idx)
    hi = max(u_idx, v_idx)

    for (offset, adj) in enumerate(view(adj_list, lo:hi))
        # translate the 1-based offset inside the window back to a list index
        position = Ref{Int}(lo + offset - 1)

        for gene_end in (adj.left, adj.right)
            process_adj_list_helper(gene_end, position, gid_to_l)
        end
    end
end



reassign_locs (generic function with 1 method)

In [4]:
# Apply one DCJ step to the source adjacency set:
# replace the adjacencies holding u and v by {p, q} and (u\{p}) U (v\{q}).
#
#   p, q           - the two ends of the target adjacency being created.
#   u, v           - the corresponding gene ends as found in the source list.
#   u_idx, v_idx   - positions in `src_adj_list` of the adjacencies holding u and v.
#   src_adj_list   - source adjacency list (only read here).
#   src_adj_set    - source adjacency set (modified in place).
#
# Returns `src_adj_set` (the result of the final `push!`).
function update_adj_set(p::GeneEnd, q::GeneEnd, u::GeneEnd, v::GeneEnd, u_idx::Int, v_idx::Int, src_adj_list::Vector{Adjacency}, src_adj_set::Set{Adjacency})
    joined = combine_ge(p, u_idx, q, v_idx)

    old_u_adj = src_adj_list[u_idx]
    old_v_adj = src_adj_list[v_idx]

    # the ends left over once u and v are taken out of their old adjacencies
    leftover = combine_ge(
        other_adjacency_end(u, old_u_adj), u_idx,
        other_adjacency_end(v, old_v_adj), v_idx,
    )

    # drop the two old adjacencies ...
    for stale in (old_u_adj, old_v_adj)
        delete!(src_adj_set, stale)
    end

    # ... and insert the two new ones
    push!(src_adj_set, joined)
    return push!(src_adj_set, leftover)
end

# Write the result of a DCJ step back into the adjacency list and refresh the
# gene-ID => location dictionary for the affected range.
#
# NOTE(review): this function takes no parameters yet reads `src_adj_list`, `u_idx`,
# `v_idx`, `pq`, `excluding_pq` and `src_gid_to_l`. None of them is defined at top
# level in the visible cells, so calling it only works if those names exist as
# globals - presumably it is unfinished; its only visible call site is commented out.
function update_adj_list_gid_loc_dict()
    print("\nDCJ Operation (Cycle) :: ")
    show(src_adj_list, u_idx, v_idx)

    print("\n")
    show(pq)
    show(excluding_pq)

    # The adjacency at the smaller index becomes pq, the other one excluding_pq;
    # the adjacencies strictly between the two positions are reversed in place.
    u_first = u_idx < v_idx
    lo, hi = u_first ? (u_idx, v_idx) : (v_idx, u_idx)

    src_adj_list[u_idx] = u_first ? pq : excluding_pq
    src_adj_list[v_idx] = u_first ? excluding_pq : pq
    reverse!(view(src_adj_list, lo+1:hi-1))

    # update gid-->loc dict for everything between the two positions
    reassign_locs(src_gid_to_l, src_adj_list, u_idx, v_idx)
end

# find dcj operations and distance
#
# Walks the target adjacency list and, for every target adjacency {p, q} without a
# telomere end, rewrites the source adjacency set so that it contains {p, q}.
# Prints progress/debug output along the way.
#
# Arguments:
#   src_adj_list   - source genome adjacency list (only read in the active code).
#   tar_adj_list   - target genome adjacency list that is iterated over.
#   src_gid_to_l   - gene ID => [tail index, head index] into src_adj_list.
#   tar_gid_to_l   - same mapping for the target; only forwarded to
#                    find_tar_ge_in_src_adjlist.
#   src_adj_set    - source adjacency set, modified in place by update_adj_set.
#   target_adj_set - not used in the body.
#
# Returns the number of update_adj_set calls performed.
#
# NOTE(review): the call that would write each operation back into src_adj_list /
# src_gid_to_l is commented out below, so lookups for later target adjacencies still
# use the original source positions - confirm this is intended.

function find_dcj_dist_ops(src_adj_list::Vector{Adjacency}, tar_adj_list::Vector{Adjacency}, src_gid_to_l::DefaultDict{Int, Vector{Int}}, tar_gid_to_l::DefaultDict{Int, Vector{Int}}, src_adj_set::Set{Adjacency}, target_adj_set::Set{Adjacency})
    count = 0  
    # cycles = 0 
    # odd_paths = 0 
    # indices of target adjacencies that contain a telomere end; only consumed by the
    # disabled second loop further down
    telomere_idxs = Vector{Int}()

    print("indexing through target adj list...")
    
    # for each adj {p, q} in target genome 
    for (i, adj) in pairs(tar_adj_list) 
        print("\n\n")
        show(src_adj_list)
         
        print("\nADJ ", i, " :::: ")
        show(adj)
        print("\n")

        p = adj.left  
        q = adj.right  

        if p.gene == Telomere() || q.gene == Telomere()  # telomeres handled in next loop
            push!(telomere_idxs, i)  
        else 
            #  let u be element of genome A that contains p 
            #  let v be element of genome A that contains q 
            u, u_idx = find_tar_ge_in_src_adjlist(p, src_gid_to_l, tar_gid_to_l, src_adj_list)  
            v, v_idx = find_tar_ge_in_src_adjlist(q, src_gid_to_l, tar_gid_to_l, src_adj_list)  
            
            #  if u != v, replace u and v in A by {p, q} and (u\{p}) U (v\{q})
            #  (the test compares the positions of the two source adjacencies)
            if u_idx != v_idx             
                update_adj_set(p, q, u, v, u_idx, v_idx, src_adj_list, src_adj_set)
            
                print("updated adj set ::: ")
                show(src_adj_set)
                print("\nupdated adj list ::: ")
                print_adj_set_as_adj_list(src_adj_set)

                #  update_adj_list_gid_loc_dict()
                
                # one operation per rewritten pair of source adjacencies
                count += 1 
            end   
        end
    end 

    # NOTE(review): the telomere pass below is disabled. As written it passes
    # (adjacency, end) to other_adjacency_end, whose definition takes (ge, adj), and it
    # refers to `p`, which this loop never assigns - fix both before re-enabling.
    # for each telomere {p} in genome B 
    # for idx in telomere_idxs
    #     telo_adj = tar_adj_list[idx]
    #     telo_ge = other_adjacency_end(telo_adj, Telomere())

    #     #  let u be element of genome A that contains p 
    #     u, u_idx = find_tar_ge_in_src_adjlist(p, src_gid_to_l, tar_gid_to_l, src_adj_list)  

    #     #  if u is an adjacency, then replace u in A by {p} and (u\{p})
    #     if other_adjacency_end(tar_adj_list[u_idx], u) != Telomere() 
    #         # update_adj_set(p, u, u_idx,src_adj_list, src_adj_set)
    #         # update_adj_list_gid_loc_dict()

    #         count += 1 
    #     end 
    # end 

    return count 
end 

find_dcj_dist_ops (generic function with 1 method)

In [5]:
# Example run: compare the genome string "abc" against ".abc.".
# In the recorded output below, "abc" is printed as a chain of adjacencies whose last
# one closes back onto a:t, while the '.' characters of ".abc." appear as '.' ends.
# NOTE(review): calculate_distance is not defined in the visible cells - presumably it
# comes from the included notebook; confirm.
src = "abc"
target = ".abc."

calculate_distance(src, target)

SRC ADJ LIST
(a:h,b:t)(b:h,c:t)(c:h,a:t)
|
v
TARGET ADJ LIST
(.,a:t)(a:h,b:t)(b:h,c:t)(c:h,.)