# Calculating the double-cut-and-join (DCJ) distance between two signed genomes with circular and/or linear chromosomes and no duplicate genes

Credit:
- https://github.com/mlliou112/py-dcj (python version)
- ChatGPT for converting to Julia 

In [34]:
using Base.Iterators
using Parameters
using DataStructures

# Common supertype of the elements stored in a chromosome's gene list
# (`Telomere` and `Gene` below both subtype it).
abstract type AbstractGene end

In [35]:
# telomere

# Field-less marker for a chromosome end. The string parser in this file creates one
# for every '.' character it reads.
struct Telomere <: AbstractGene
end

# A telomere is equal to any other telomere and to nothing else.
# `Telomere` is a concrete type, so an `isa` test on `b` is the same as comparing types.
Base.:(==)(a::Telomere, b) = b isa Telomere

# Print a telomere as "." on stdout.
#
# The other one-argument `show` methods in this file print their text, and
# `show(::GeneEnd)` calls this method expecting output. Returning the string "."
# (the previous behaviour) printed nothing at all, so the text is now printed.
function Base.show(t::Telomere)
    print(".")
    return nothing
end

In [36]:
# gene

# A single oriented gene. `@with_kw` (Parameters.jl) provides the keyword constructor
# and the default value on `reverse`.
@with_kw struct Gene <: AbstractGene
    id::Int  # numeric identifier; this is the field gene equality is based on
    dna::Char  # lowercase letter naming the gene
    reverse::Bool=false  # true when the gene was written as an uppercase letter
end

# Two genes are equal when they share the same `id`; `dna` and `reverse` are not
# compared. Both arguments are already constrained to `Gene`, so no type test is needed.
Base.:(==)(a::Gene, b::Gene) = a.id == b.id

# Keep `hash` in step with `==`: values that compare equal must hash equally, otherwise
# lookups in `Dict`/`Set` keyed by genes can miss equal entries. Only `id` takes part,
# mirroring the comparison above.
Base.hash(g::Gene, h::UInt) = hash(g.id, hash(:Gene, h))

# Print the gene's letter on stdout: uppercased when the gene is reversed,
# exactly as stored in `dna` otherwise.
function Base.show(m::Gene)
    letter = m.reverse ? uppercase(m.dna) : m.dna
    print(letter)
end

In [37]:
# chromosome

# One chromosome: an ordered list of genes and telomeres. The adjacency builder in
# this file treats a list whose first element is a telomere as linear and any other
# list as circular.
mutable struct Chromosome
    genes::Vector{AbstractGene}  # elements in the order they were read from the string
end

"""
    Chromosome(genes::String, id_counter, id_to_char, char_to_id, target::Bool)

Build a `Chromosome` from its string form, one element per character.

A `'.'` becomes a `Telomere`. Any other character becomes a `Gene` whose `dna` is the
lowercased character and whose `reverse` flag is set when the character is uppercase.

# Arguments
- `genes`: the chromosome text.
- `id_counter`: next unused gene ID; advanced by one for every gene when `target` is true.
- `id_to_char`: when `target` is true, receives `id => character as written`.
- `char_to_id`: maps the lowercase letter to its ID. Written when `target` is true,
  read when it is false.
- `target`: `true` while parsing the target genome (IDs are assigned), `false` while
  parsing the source genome (IDs are looked up).

# Throws
- `KeyError` if `target` is false and a letter has no entry in `char_to_id`.
"""
function Chromosome(
    genes::String,
    id_counter::Ref{Int64},
    id_to_char::Dict{Int, Char},
    char_to_id::Dict{Char, Int64},
    target::Bool,
)
    content = AbstractGene[]

    for gene in genes
        if gene == '.'
            push!(content, Telomere())
            continue
        end

        dna = lowercase(gene)
        rev = isuppercase(gene)

        if target
            id = id_counter[]
            id_to_char[id] = gene
            # `char_to_id` is keyed by the lowercase letter, so the lookup must use
            # `dna`. Testing the raw character missed uppercase repeats and silently
            # overwrote the first mapping.
            # TODO dups; char --> array of IDs
            if !haskey(char_to_id, dna)
                char_to_id[dna] = id
            end
            id_counter[] += 1
        else
            # Source string: reuse the ID assigned while parsing the target.
            # Assumes no duplicate genes. TODO: dups
            id = char_to_id[dna]
        end

        push!(content, Gene(id, dna, rev))
    end

    return Chromosome(content)
end

Chromosome

In [38]:
# Genome

# A genome: its chromosomes plus the two lookup tables that were filled while the
# genome strings were parsed.
mutable struct Genome
    data::Vector{Chromosome}  # chromosomes in the order they appeared in the string
    id_to_char::Dict{Int, Char}  # gene ID -> character
    char_to_id::Dict{Char, Int}  # character -> gene ID
end

"""
    ==(a::Genome, b::Genome)

Return `true` when both genomes have the same number of chromosomes, every pair of
corresponding chromosomes holds equal gene lists, and both lookup tables are equal.

The previous implementation compared `a.data` with itself, so the chromosomes never
took part in the comparison. Gene lists are compared directly because `Chromosome` is
a mutable struct with no `==` of its own visible here, so comparing the chromosome
objects would fall back to identity. Element comparison uses the `==` methods defined
for `Gene` and `Telomere`.
"""
function Base.:(==)(a::Genome, b::Genome)
    length(a.data) == length(b.data) || return false
    for (ca, cb) in zip(a.data, b.data)
        ca.genes == cb.genes || return false
    end
    return a.id_to_char == b.id_to_char && a.char_to_id == b.char_to_id
end

In [39]:
# str -> genome 

"""
    string_to_genome(s, id_counter, id_to_char, char_to_id, target)

Convert the genome string `s` to a `Genome`.

The string is a concatenation of chromosomes:
- a linear chromosome starts and ends with `'.'` (both telomeres are kept);
- a circular chromosome starts with a gene and is closed by repeating exactly that
  character (case-sensitive); the closing repeat is not stored.

`id_counter`, `id_to_char`, `char_to_id` and `target` are forwarded unchanged to the
`Chromosome` string constructor for every chromosome found.

# Throws
- `ArgumentError` if the string ends inside an unclosed circular chromosome, or inside
  a linear chromosome that already contains at least one gene.
"""
function string_to_genome(s, id_counter, id_to_char, char_to_id, target)
    chrom_list_str = String[]

    chrom = ""
    new_chrom = true
    linear = false

    for g in s
        if new_chrom
            # first character decides the chromosome kind
            linear = g == '.'
            chrom = string(g)
            new_chrom = false
        elseif linear
            chrom *= g
            if g == '.'  # closing telomere is part of the chromosome
                push!(chrom_list_str, chrom)
                new_chrom = true
            end
        elseif g == first(chrom)  # circular chromosome closed by its first gene
            push!(chrom_list_str, chrom)
            new_chrom = true
        else
            chrom *= g
        end
    end

    # End-of-input validation happens after the loop rather than by comparing an index
    # with `length(s)`: string indices are byte offsets, so that comparison missed the
    # last character of non-ASCII input.
    if !new_chrom
        if !linear
            throw(ArgumentError("Circular chromosome must start and end with the same gene (case-sensitive)."))
        elseif chrom != "."
            throw(ArgumentError("Linear chromosome must start and end with telomeres."))
        end
        # NOTE(review): a lone trailing "." is still dropped silently, as before —
        # confirm whether it should be rejected instead.
    end

    # convert the chromosome strings to Chromosome values
    chrom_list = Chromosome[]
    for c_str in chrom_list_str
        push!(chrom_list, Chromosome(c_str, id_counter, id_to_char, char_to_id, target))
    end

    return Genome(chrom_list, id_to_char, char_to_id)
end


string_to_genome (generic function with 1 method)

In [40]:
# gene end (used in adj)

# One extremity of a gene (or a telomere), used as an endpoint of an adjacency.
# `@with_kw` (Parameters.jl) provides the keyword constructor and the default on `head`.
@with_kw struct GeneEnd
    gene::AbstractGene  # the gene this end belongs to, or a Telomere
    head::Bool=true  # true for the head end, false for the tail end
end

# Convenience constructor for a telomere end. `head` carries no meaning for a
# telomere, so it is pinned to `false`.
GeneEnd(T::Telomere) = GeneEnd(T, false)

# Equality of gene ends:
#   * two telomere ends are always equal (their `head` flag is ignored);
#   * a telomere end never equals a gene end;
#   * two gene ends are equal when gene `id` and `head` both match.
function Base.:(==)(a::GeneEnd, b::GeneEnd)
    a_telo = a.gene isa Telomere
    b_telo = b.gene isa Telomere
    if a_telo || b_telo
        return a_telo && b_telo
    end
    return a.gene.id == b.gene.id && a.head == b.head
end

# Keep `hash` in step with `==` so that equal ends land in the same `Dict`/`Set` slot:
# every telomere end hashes alike, gene ends hash on (`id`, `head`) only.
function Base.hash(ge::GeneEnd, h::UInt)
    ge.gene isa Telomere && return hash(:TelomereEnd, h)
    return hash(ge.head, hash(ge.gene.id, hash(:GeneEnd, h)))
end

# Print a gene end on stdout: "." for a telomere, otherwise the gene letter followed
# by ":h" (head) or ":t" (tail).
#
# The telomere case prints "." itself, the same text `show(::Adjacency)` uses.
# Delegating to the one-argument `show(::Telomere)` printed nothing, because that
# method returned the string instead of writing it.
function Base.show(ge::GeneEnd)
    if ge.gene == Telomere()
        print(".")
        return
    end

    suffix = ge.head ? ":h" : ":t"
    print(string(ge.gene.dna) * suffix)
end

In [41]:
# adjacency
# An adjacency: an unordered pair of gene ends. The `==` defined for this type treats
# (left, right) and (right, left) as the same adjacency.
mutable struct Adjacency 
    left::GeneEnd
    right::GeneEnd
end

# Adjacencies are unordered pairs: equal when the ends match either straight or swapped.
# NOTE(review): no matching `hash` is visible in this part of the file, so
# `Set{Adjacency}` membership may not follow this definition — confirm before relying
# on it.
function Base.:(==)(a::Adjacency, b::Adjacency)
    same_order = a.left == b.left && a.right == b.right
    swapped = a.left == b.right && a.right == b.left
    return same_order || swapped
end

# Print an adjacency on stdout as "(left,right)". Each end is rendered as "." for a
# telomere, or as the gene letter plus ":h" / ":t" for head / tail.
function Base.show(adj::Adjacency)
    label(ge) = ge.gene == Telomere() ? "." : ge.gene.dna * (ge.head ? ":h" : ":t")

    print("(", label(adj.left), ",", label(adj.right), ")")
end


# logging 

# Coloured variant used for logging: prints the adjacency as "(left,right)" in blue
# when `blue` is true and in green otherwise. Ends are rendered as "." for a telomere,
# or as the gene letter plus ":h" / ":t" for head / tail.
function Base.show(adj::Adjacency, blue::Bool)
    label(ge) = ge.gene == Telomere() ? "." : ge.gene.dna * (ge.head ? ":h" : ":t")

    text = "(" * label(adj.left) * "," * label(adj.right) * ")"
    printstyled(text; color = blue ? :blue : :green)
end

In [42]:
# showing/printing collections of adj

# Print every adjacency of the list, in order and with no separator.
function Base.show(adj_list::Vector{Adjacency})
    foreach(show, adj_list)
end

# Print every adjacency of the set with no separator, in the set's iteration order.
# This method used to be defined twice with identical bodies; one definition is enough.
function Base.show(adj_set::Set{Adjacency})
    for adj in adj_set
        show(adj)
    end
end

# logging

# Logging helper: print the list, rendering every adjacency equal to `changed_adj1`
# or `changed_adj2` with the coloured `show(adj, true)` and the rest plainly.
function Base.show(adj_list::Vector{Adjacency}, changed_adj1::Adjacency, changed_adj2::Adjacency)
    for adj in adj_list
        highlighted = adj == changed_adj1 || adj == changed_adj2
        highlighted ? show(adj, true) : show(adj)
    end
end

# Logging helper: print the list, rendering every adjacency equal to `changed_adj`
# with the coloured `show(adj, true)` and the rest plainly.
function Base.show(adj_list::Vector{Adjacency}, changed_adj::Adjacency)
    for adj in adj_list
        adj == changed_adj ? show(adj, true) : show(adj)
    end
end

In [43]:
# Append the adjacencies of one chromosome to both `adj_list` and `adj_set`.
#
# Walks `genes` left to right keeping the "open" end of the previous element and
# pairing it with the near end of the next gene. For a linear chromosome the walk
# starts and finishes at a telomere end and skips the last element of `genes`; for a
# circular one it starts at the far end of `genes[1]` and finishes by closing back
# onto the near end of `genes[1]`. A gene's near end is its head exactly when the gene
# is reversed. `chrom` is accepted but not used here.
function genome_to_adj_listset_helper(linear::Bool, chrom::Chromosome, genes::Vector{AbstractGene}, adj_list::Vector{Adjacency}, adj_set::Set{Adjacency})
    record!(adj) = (push!(adj_list, adj); push!(adj_set, adj))

    last_inner = linear ? length(genes) - 1 : length(genes)
    open_end = linear ? GeneEnd(Telomere()) : GeneEnd(genes[1], !genes[1].reverse)

    for idx in 2:last_inner
        g = genes[idx]
        record!(Adjacency(open_end, GeneEnd(g, g.reverse)))
        open_end = GeneEnd(g, !g.reverse)
    end

    closing_end = linear ? GeneEnd(Telomere()) : GeneEnd(genes[1], genes[1].reverse)
    record!(Adjacency(open_end, closing_end))
end

# Collect the adjacencies of every chromosome of `genome`.
# Returns `(adj_list, adj_set)`: the same adjacencies as a vector and as a set.
# A chromosome whose first element is a telomere is processed as linear, any other
# chromosome as circular.
function genome_to_adj_listset(genome::Genome)
    adj_list = Adjacency[]
    adj_set = Set{Adjacency}()

    for chrom in genome.data
        genes = chrom.genes
        is_linear = genes[1] == Telomere()
        genome_to_adj_listset_helper(is_linear, chrom, genes, adj_list, adj_set)
    end

    return adj_list, adj_set
end

genome_to_adj_listset (generic function with 1 method)

In [44]:
# adjacency graph 

# Result holder for a source/target comparison. The graph itself is not stored; only
# the value returned by `find_dcj_dist_ops` in the two-genome constructor is kept.
mutable struct AdjacencyGraph
    num_dcj_ops::Int  # number of DCJ operations
end

# Build an `AdjacencyGraph` for a source and a target genome.
#
# Steps, in order:
#   1. derive adjacency list + set for both genomes;
#   2. print both adjacency lists to stdout;
#   3. run `process_adj_list` on each list (defined elsewhere; judging by the variable
#      names it maps gene IDs to locations — confirm);
#   4. throw `ArgumentError` if the genomes' `id_to_char` tables differ;
#   5. store the result of `find_dcj_dist_ops` (defined elsewhere) in the new object.
# The adjacency graph is not materialised explicitly.
function AdjacencyGraph(src::Genome, target::Genome)
    src_adjs, src_adjset = genome_to_adj_listset(src)
    tgt_adjs, tgt_adjset = genome_to_adj_listset(target)

    print("SRC ADJ LIST\n")
    show(src_adjs)

    print("\n|\nv\nTARGET ADJ LIST\n")
    show(tgt_adjs)

    print("\n\n*************\n")

    src_locations = process_adj_list(src_adjs)
    tgt_locations = process_adj_list(tgt_adjs)

    src.id_to_char == target.id_to_char ||
        throw(ArgumentError("ERROR: source and target genomes should have the same ID-->char dict"))

    # use the adjacency-graph representation to find the DCJ distance and operations
    ops = find_dcj_dist_ops(src_adjs, tgt_adjs, src_locations, tgt_locations, src_adjset, tgt_adjset)

    return AdjacencyGraph(ops)
end

AdjacencyGraph

In [45]:
"""
assumptions 
genomes A and B are balanced (same letters of the same multiplicity; telomeres don't matter) 
no empty chromosomes (2 telomeres ..)
"""

# id_to_char = Dict{Int, Char}()  # TODO 

"""
    calculate_distance(src::String, target::String)

Return the number of DCJ operations computed for the genome strings `src` and `target`.

The target is parsed first so that it assigns the gene IDs; the source is parsed
second and looks its genes up in the tables the target filled in.

Assumes the two genomes are balanced (same letters with the same multiplicity,
telomeres aside) and contain no empty chromosomes.
"""
function calculate_distance(src::String, target::String)
    next_id = Ref{Int}(1)
    id_to_char = Dict{Int, Char}()
    char_to_id = Dict{Char, Int}()

    # target first: it populates the ID tables the source parse reads
    target_genome = string_to_genome(target, next_id, id_to_char, char_to_id, true)
    src_genome = string_to_genome(src, next_id, id_to_char, char_to_id, false)

    return AdjacencyGraph(src_genome, target_genome).num_dcj_ops
end

calculate_distance (generic function with 1 method)

In [46]:
"""
    reset_documentation(id_counter, id_to_char, char_to_id)

Reset the shared parsing state in place: set `id_counter` back to 1 and empty both
lookup dictionaries. Returns `nothing`.

The previous body assigned fresh objects to the local argument names. That only
rebinds the locals, so the caller's counter and dictionaries were left untouched.
"""
function reset_documentation(id_counter::Ref{Int}, id_to_char::Dict{Int, Char}, char_to_id::Dict{Char, Int})
    id_counter[] = 1
    empty!(id_to_char)
    empty!(char_to_id)
    return nothing
end


reset_documentation (generic function with 1 method)