In [1]:
using NBInclude
@nbinclude("datastructs.ipynb")

In [2]:
# chromosome 

# parse string of format "{str}{int}{str}{int}...{str}{int}" into an array of genes 
# each gene follows the format of {str}{int} 
# e.g., "a1b1c2" --> ["a1", "b1", "c2"]

# chromstr is ::SubString{String} || ::String
# Split a chromosome string into tokens.
#
# A '.' (telomere) is always emitted as its own "." token and terminates any
# token being accumulated. Every other character is appended to the current
# token; a token is closed when a letter appears after at least one digit has
# been seen in it (i.e. "<letters><digits>" followed by the next letter).
#
# Returns a Vector{String}; an empty input yields an empty vector.
function find_genes(chromstr)
    genes = String[]
    curr_gene = ""

    # true once the token being accumulated has started its digit part
    seen_ints = false

    for g in chromstr
        if g == '.'  # telomere
            if !isempty(curr_gene)
                push!(genes, curr_gene)
            end
            push!(genes, string(g))
            curr_gene = ""
            # The telomere ends the current token, so its digit state must not
            # leak into the next one (otherwise ".a1.bc1" would split "bc1"
            # into "b" and "c1").
            seen_ints = false
        else  # non-telomere
            if isdigit(g) && !seen_ints
                seen_ints = true
            elseif seen_ints && isletter(g)
                # seen_ints is only set right before a digit is appended, so
                # curr_gene is guaranteed non-empty here.
                push!(genes, curr_gene)
                curr_gene = ""
                seen_ints = false
            end
            curr_gene *= g
        end
    end

    # add last accumulated string if not empty
    if !isempty(curr_gene)
        push!(genes, curr_gene)
    end

    return genes
end



# Build a Chromosome from its string form, as tokenized by find_genes.
#
# Arguments:
#   genes_str  - chromosome text; "." tokens are telomeres, every other token
#                is "<letters><digits>"
#   id_counter - next gene id to hand out; advanced only when target is true
#   id_to_str  - id --> normalized gene string; filled only when target is true
#   str_to_id  - normalized gene string --> id; filled when target is true,
#                looked up otherwise
#   target     - true: this chromosome defines the id dictionaries;
#                false: ids are read from str_to_id (a gene missing from it
#                raises the KeyError of the dictionary lookup)
#
# Each gene is normalized to lowercase(letters) * string(number). `rev` is true
# when the letters of the token equal their uppercase form.
#
# Throws ArgumentError when a telomere is present on only one end or in the
# interior, when a gene token has no digits, or (target only) when a gene
# repeats.
function Chromosome(genes_str::SubString{String}, id_counter::Ref{Int64}, id_to_str::Dict{Int, String}, str_to_id::Dict{String, Int64}, target::Bool)
    content = AbstractGene[]

    genes = find_genes(genes_str)

    # A telomere on just one end would be accepted by the per-token check
    # below; reject it up front, before any dictionary is modified.
    if !isempty(genes) && (genes[1] == ".") != (genes[end] == ".")
        throw(ArgumentError("Linear chromosome must start and end with telomeres."))
    end

    for (i, token) in pairs(genes)
        if token == "."
            if i == 1 || i == length(genes)
                push!(content, Telomere())
                continue
            end
            throw(ArgumentError("Linear chromosome must start and end with telomeres."))
        end

        gene_letters = filter(isletter, token)
        gene_digits = filter(isdigit, token)
        if isempty(gene_digits)
            # parse(Int, "") would also raise ArgumentError, but without
            # saying which token is at fault
            throw(ArgumentError("Gene \"$(token)\" has no numeric suffix."))
        end

        rev = gene_letters == uppercase(gene_letters)
        dna = lowercase(gene_letters)
        dup = parse(Int, gene_digits)

        gene = dna * string(dup)

        if target  # defines str_to_id dictionary
            # check for the duplicate first so a failed parse does not leave
            # a stray entry in id_to_str
            if haskey(str_to_id, gene)
                throw(ArgumentError("Duplicate gene."))
            end
            id = id_counter[]
            id_to_str[id] = gene
            str_to_id[gene] = id
            id_counter[] += 1
        else  # source str, doesn't define str_to_id dictionary
            id = str_to_id[gene]
        end

        push!(content, Gene(id, dna, dup, rev))
    end
    return Chromosome(content)
end

Chromosome

In [3]:
# genome 

# str -> genome 
# Parse a genome string into a Genome.
#
# `s` is split on "," and every piece is turned into a Chromosome, in order,
# with the shared id_counter / id_to_str / str_to_id / target arguments. The
# resulting Genome is built from the chromosome list and the two dictionaries.
function string_to_genome(s, id_counter, id_to_str, str_to_id, target)
    # one chromosome per comma-separated piece
    build(piece) = Chromosome(piece, id_counter, id_to_str, str_to_id, target)
    chromosomes = Chromosome[build(piece) for piece in split(s, ",")]

    return Genome(chromosomes, id_to_str, str_to_id)
end



string_to_genome (generic function with 1 method)

In [4]:
# genome to adjacency list 

# Append the adjacencies of one chromosome to adj_list.
#
# linear == true : the walk starts and ends on a telomere end, and only
#                  genes[2] .. genes[end-1] are visited.
# linear == false: the walk starts on one end of genes[1], visits
#                  genes[2] .. genes[end], and closes on the other end of
#                  genes[1].
#
# `chrom` is not read by this function. Returns adj_list (the result of the
# final push!).
function chrom_to_adj(linear::Bool, chrom::Chromosome, genes::Vector{AbstractGene}, adj_list::Vector{Adjacency})
    # open end waiting to be paired with the next gene
    pending = linear ? GeneEnd(Telomere()) : GeneEnd(genes[1], !genes[1].reverse)
    last_idx = linear ? length(genes) - 1 : length(genes)

    for k in 2:last_idx
        current = genes[k]
        flipped = current.reverse

        # if reversed, the end facing left is the head (& the other is the tail)
        push!(adj_list, Adjacency(pending, GeneEnd(current, flipped)))
        pending = GeneEnd(current, !flipped)
    end

    # close the walk: telomere for linear, back to the first gene for circular
    closing = linear ? GeneEnd(Telomere()) : GeneEnd(genes[1], genes[1].reverse)
    return push!(adj_list, Adjacency(pending, closing))
end


# Collect the adjacencies of every chromosome in genome.data into one list.
#
# A chromosome whose first entry equals Telomere() is handled as linear;
# any other chromosome is handled as circular.
function genome_to_adj_list(genome::Genome)
    adjacencies = Adjacency[]

    for chromosome in genome.data
        members = chromosome.genes
        is_linear = members[1] == Telomere()
        chrom_to_adj(is_linear, chromosome, members, adjacencies)
    end

    return adjacencies
end

genome_to_adj_list (generic function with 1 method)

In [5]:
# default dict stores gene ID --> location of gene's head and tail
 
# Print one line per entry of `dict` to standard output, in the form
# "<id_to_str[key]> <key> => <value>".
function Base.show(dict::DefaultDict{Int, Vector{Int}}, id_to_str::Dict{Int, String})
    for entry in dict
        gid, loc = entry
        label = id_to_str[gid]
        println(label, " $(gid) => $(loc)")
    end
end

In [6]:
# adj list --> dictionary of gene ID to index/location of gene's head and tail 

# Record the current index idx[] as the location of gene end `ge`.
#
# Telomere ends are ignored. For any other end, the index is stored in the
# gene's entry of gid_to_loc: slot 2 when ge.head is true, slot 1 otherwise.
function assign_geidx_to_gidtolocdict(ge::GeneEnd, idx::Ref{Int}, gid_to_loc::DefaultDict{Int, Vector{Int}})
    ge.gene == Telomere() && return

    slot = ge.head == true ? 2 : 1
    gid_to_loc[ge.gene.id][slot] = idx[]
end


# Map every gene id to the positions (1-based index into adj_list) of the
# adjacencies that hold its two ends.
#
# Each entry is a 2-element vector: tail position at index 1, head position
# at index 2. Entries start as zeros.
function adjlist_to_gidtoloc(adj_list:: Vector{Adjacency})
    locations = DefaultDict{Int, Vector{Int}}(() -> zeros(Int, 2))
    cursor = Ref{Int}(1)

    for (position, adjacency) in enumerate(adj_list)
        cursor[] = position
        for gene_end in (adjacency.left, adjacency.right)
            assign_geidx_to_gidtolocdict(gene_end, cursor, locations)
        end
    end

    return locations
end
     

adjlist_to_gidtoloc (generic function with 1 method)

In [7]:
# adjacency graph 


# Compute the DCJ operation count between `src` and `target` and wrap it in
# an AdjacencyGraph.
#
# mode >= 1 prints both adjacency lists and the final operation count;
# mode >= 2 additionally prints the gene-id --> location dictionaries.
#
# Throws ArgumentError when the two genomes do not share the same
# id --> string dictionary.
function AdjacencyGraph(src::Genome, target::Genome, mode::Int)
    # Validate before doing any work: the id --> str map is used for the
    # debug output below, so it must be known to be consistent first.
    if src.id_to_str != target.id_to_str
        throw(ArgumentError("Source and target genomes should have the same ID-->str dict"))
    end

    # process genomes (not explicitly creating the adj graph)
    src_adj_list = genome_to_adj_list(src)
    target_adj_list = genome_to_adj_list(target)

    if mode >= 1
        show(src_adj_list)
        print(" --> ")
        show(target_adj_list)
        println()
    end

    src_geneid_to_location = adjlist_to_gidtoloc(src_adj_list)
    target_geneid_to_location = adjlist_to_gidtoloc(target_adj_list)

    if mode >= 2
        println()
        print("SRC GENE & ID --> LOCATION DICT\n")
        show(src_geneid_to_location, src.id_to_str)
        print("\n", "TARGET GENE & ID --> LOCATION DICT\n")
        # label the target's locations with the target's own id map
        show(target_geneid_to_location, target.id_to_str)
        print("\n*************\n\n")
    end

    num_dcj_ops = find_dcj_dist_ops(src_adj_list, target_adj_list, src_geneid_to_location, target_geneid_to_location, mode, target.id_to_str)

    if mode >= 1
        print("\n#ops = ", num_dcj_ops)
        print("\n----------------------------------------\n")
    end

    return AdjacencyGraph(num_dcj_ops)
end

AdjacencyGraph