# Calculating DCJ distance between two unsigned, circular and/or linear genomes with no duplicates

Credit:
- https://github.com/mlliou112/py-dcj (python version)
- ChatGPT for converting to Julia 

In [1]:
using Base.Iterators
using Parameters
using DataStructures

abstract type AbstractGene end

In [2]:
"""
Representation for a telomere in DCJ model
"""

@with_kw struct Telomere <: AbstractGene
    reverse::Bool=false  # TODO
end

function Base.isequal(a::Telomere, b)
    return typeof(a) == typeof(b)
end

function Base.show(t::Telomere)
    return "."
end

# x = Telomere()
# show(x)

In [3]:
"""
Representation for DNA fragment (a gene) in DCJ model.

    Parameters
    ----------
    id: int (required)
        unique id

    dna: str
        What dna fragment does this gene represent?

    reverse: boolean
        Is the DNA fragment reversed?
"""
@with_kw struct Gene <: AbstractGene
    id::Int
    dna::String=""
    reverse::Bool=false
end

function Base.isequal(a::Gene, b::Gene)
    return typeof(a) == typeof(b) && a.id == b.id
end

function Base.show(m::Gene)
    if m.reverse
        print("-" * string(m.id))
    else
        print(string(m.ud))
    end
end

# y = Gene(1, "d", true)
# show(y)

In [4]:
 """The abstraction of a chromosome in DCJ model, a list of telomeres and genes

    Parameters
    ----------
    genes: list of genes or string (req)
        The object to be converted into a circular or linear chromosome.

    """
mutable struct Chromosome
    genes::Vector{AbstractGene}
end

function Chromosome(genes::String, id_counter::Ref{Int64}, id_to_char::Dict{Int, Char}, char_to_id::Dict{Char, Int64}, target::Bool)
    content = AbstractGene[]

    if !((genes[1] == '.') == (genes[end] == '.'))
        throw(ArgumentError("Linear geneosome must start and end with telomeres."))
    end

    for (i, gene) in pairs(genes)
        dna = string(gene)
        rev = isuppercase(gene)
        telomere = gene == '.'        
        if telomere 
            push!(content, Telomere())
        else  # gene 
            id = id_counter[]

            if target  
                id_to_char[id] = gene
                if gene in keys(char_to_id)
                    push!(char_to_id[lowercase(gene)], id)
                else 
                    char_to_id[lowercase(gene)] = id
                end 
                id_counter[] += 1

            else  # source str 
                ## assuming no duplicates
                id = char_to_id[lowercase(gene)]
            end
            push!(content, Gene(id, dna, rev))
        end 
    end    
    
    return Chromosome(content)
end


# x = "b"
# id_counter = Ref{Int}(1)

# Chromosome(x, id_counter, Dict{Int, Char}(), Dict{Char, Int64}(), true)

Chromosome

In [5]:
# Genome
mutable struct Genome
    data::Vector{Chromosome}
end

In [6]:
function genome_to_string(A)
end


function string_to_genome(s, id_counter, id_to_char, char_to_id, target)
     """
    Converts string to type Genome 
    """

    chrom_list_str = Vector{String}()

    chrom = ""
    new_chrom = true 
    linear = false

    for (i, g) in pairs(s)
        # if creating a new chromosome 
        if new_chrom 
            if chrom == ""
                linear = g == '.'
            else  # chrom isn't empty 
                linear = chrom[1] == '.'
            end 

            chrom = chrom * g
            new_chrom = false  
            if i == length(s) # last element is circular chrom 
                push!(chrom_list_str, chrom)
               
            end 
        
        # appending more genes to chromosome 
        else  
            if linear  
                chrom = chrom * g 
                if g == '.' || i == length(s)  # find another telomere or end of str
                    push!(chrom_list_str, chrom) 
                    chrom = ""
                    new_chrom = true 
                end 
            else  # circular
                if g =='.'  # close circular chrom (won't incude telomere in chrom)
                    push!(chrom_list_str, chrom)
                    chrom = "."  # put telomere in next chrom 
                    new_chrom = true 
                elseif i == length(s)  # reach end of str 
                    chrom = chrom * g
                    push!(chrom_list_str, chrom)
                    chrom = "."
                    new_chrom = true 
                else 
                    chrom = chrom * g
                end 
            end 
        end 
    end

    # convert chrom_list_str (list of strings) to chrom_list
    chrom_list = Chromosome[]
    
    for c_str in chrom_list_str
        c = Chromosome(c_str, id_counter, id_to_char, char_to_id, target)
        push!(chrom_list, c)
    end 

    return Genome(chrom_list)
end 

string_to_genome (generic function with 1 method)

In [7]:
"""
Gene head or tail end. Used in adjacencies
"""
@with_kw struct GeneEnd
    gene::AbstractGene
    head::Bool=true
end

"""Adjacency Data Structure"""
mutable struct Adjacency 
    left::GeneEnd
    right::GeneEnd
end

function Base.isequal(a::GeneEnd, b::GeneEnd)
    return a.gene == b.gene && a.head == b.head
end

function Base.show(ge::GeneEnd)
    if ge.head
        print(string(ge.gene))
    else
        print(string(ge.gene) * "*")
    end
end

function Base.show(adj::Adjacency)
    left = ""
    right = ""
    
    if adj.left.gene == Telomere() 
        left = "."
    else 
        left_gene = adj.left
        (left_gene.head == true) ? left = left_gene.gene.dna * ":h" : left = left_gene.gene.dna * ":t" 
    end
    
    if adj.right.gene == Telomere() 
        right = "."
    else 
        right_gene = adj.right
        (right_gene.head == true) ? right = right_gene.gene.dna * ":h" : right = right_gene.gene.dna * ":t"
    end 

    print("(", left, ",", right, ")")
end 

function Base.show(adj_list::Vector{Adjacency})
    for adj in adj_list 
        show(adj) 
    end 
end 

# function other_adjacency_end(me::GeneEnd, adj::Adjacency)
#     if adj.left == me
#         return adj.right
#     else
#         return adj.left
#     end
# end

In [8]:
function genome_to_adj_list_helper(chrom::Chromosome, genes::Vector{AbstractGene}, reversed::Bool, adj_list::Vector{Adjacency})
    linear = genes[1] == Telomere()
    
    # handle first gene end 
    if linear 
        left_end_adj = GeneEnd(Telomere(), false)
    else  # circular 
        left_end_adj = GeneEnd(genes[1], !reversed)
    end 

    # handle chrom bt first and last 
    for gene in genes
        if gene == Telomere() || gene == genes[1] 
            continue 
        end 
        reversed = gene.reverse  
        
        right_gene_adj = GeneEnd(gene, reversed)  # if reversed, right_end_adj is head (& left_end_adj is tail) 
        adj = Adjacency(left_end_adj, right_gene_adj)

        push!(adj_list, adj)

        left_end_adj = GeneEnd(gene, !reversed)
    end 

    # handle last gene end 
    if linear 
        right_end_adj = GeneEnd(Telomere(), false)

    else   
        right_end_adj = GeneEnd(genes[1], reversed)
    end

    return left_end_adj, right_end_adj

end 


function genome_to_adj_list(genome::Genome)
    adj_list = Vector{Adjacency}()

    for chrom in genome.data
        genes = chrom.genes
        reversed = genes[1].reverse  
        
        if length(genes) == 1  # single gene edge case   
            left_end_adj = GeneEnd(genes[1], reversed)  # if reversed, left_end_adj is head (& right_end_adj is tail) 
            right_end_adj = GeneEnd(genes[1], !reversed) 
        else # > 1 gene in chrom  
            left_end_adj, right_end_adj = genome_to_adj_list_helper(chrom, genes, reversed, adj_list)
        end 
        adj = Adjacency(left_end_adj, right_end_adj)
        push!(adj_list, adj)
    end 
    return adj_list
end 

genome_to_adj_list (generic function with 1 method)

In [11]:
function process_adj_list_helper(ge::GeneEnd, idx::Ref{Int}, gene_to_location::DefaultDict{Gene, Vector{Int}})
    if ge.gene == Telomere() 
        return
    end  

    if ge.head == true
        gene_to_location[ge.gene][2] = idx[]
    else 
        gene_to_location[ge.gene][1] = idx[]
    end
end 

function process_adj_list(adj_list:: Vector{Adjacency})
    gene_to_location = DefaultDict{Gene, Vector{Int}}(() -> zeros(Int, 2)) # tail = idx 1, head = 2 in array
    idx = Ref{Int}(1)

    for adj in adj_list
        process_adj_list_helper(adj.left, idx, gene_to_location)
        process_adj_list_helper(adj.right, idx, gene_to_location)
        idx[] += 1
    end 
    
    return gene_to_location 
end 

process_adj_list (generic function with 1 method)

In [12]:
mutable struct AdjacencyGraph
    src_adj_list::Vector{Adjacency}
    target_adj_list::Vector{Adjacency}
    
    src_gene_to_location::DefaultDict{Gene, Vector{Int}}
    target_gene_to_locationJ::DefaultDict{Gene, Vector{Int}}

    # cycles::Int
    # odd_paths::Int
    
    # commonGenes::Set{AbstractGene}
    # ab_paths::Int
    # a_runs::Int
    # b_runs::Int
    # run_potential::Int
end

function AdjacencyGraph(src::Genome, target::Genome)

    # process genomes (not explicitly creating the adj graph)    
    src_adj_list = genome_to_adj_list(src)        
    target_adj_list = genome_to_adj_list(target)        
  
    # show(src_adj_list)
    # print("\n")
    # show(target_adj_list)
    
    src_gene_to_location = process_adj_list(src_adj_list)
    target_gene_to_location = process_adj_list(target_adj_list)
    

    
    # return AdjacencyGraph(src_adj_list, target_adj_list, src_gene_to_location, target_gene_to_location)
end 

"""
genomes A and B have the same letters of the same multiplicity; telomeres don't matter 
no empty chromosomes (2 telomeres ..)
"""

function calculate_distance(src::String, target::String)
    # check_conditions(src, target)

    id_counter = Ref{Int}(1)
    id_to_char = Dict{Int, Char}()
    char_to_id = Dict{Char, Int}()

    target_genome = string_to_genome(target, id_counter, id_to_char, char_to_id, true)
    src_genome = string_to_genome(src, id_counter, id_to_char, char_to_id, false)

    ag = AdjacencyGraph(src_genome, target_genome)

    # dcj_distance = length(ag.commonGenes) - ag.cycles - ag.ab_paths / 2
    # operations = TODO 
end

src = ".ba.cde"
target = "abcde"
calculate_distance(src, target)

DefaultDict{Gene, Vector{Int64}, var"#12#13"} with 5 entries:
  Gene(5, "e", false) => [4, 5]
  Gene(3, "c", false) => [2, 3]
  Gene(4, "d", false) => [3, 4]
  Gene(1, "a", false) => [5, 1]
  Gene(2, "b", false) => [1, 2]