# Calculating DCJ distance between two signed, circular and/or linear genomes with no duplicates

Credit:
- Data structures adapted from https://github.com/mlliou112/py-dcj
- ChatGPT, used to help convert the code to Julia

In [5]:
using Base.Iterators
using Parameters
using DataStructures

# Common supertype for the entries of a chromosome's gene list
# (subtyped in this file by `Telomere` and `Gene`).
abstract type AbstractGene end

In [6]:
# telomere

# Marker for the end of a linear chromosome. It has no fields, so every
# instance is interchangeable with every other.
struct Telomere <: AbstractGene
end

# A telomere equals `b` exactly when `b` is a telomere as well.
Base.:(==)(a::Telomere, b) = b isa Telomere

# Write a telomere as "." to `io`. Defining the two-argument method is what
# makes `print`, `string`, interpolation and display pick up this format.
function Base.show(io::IO, t::Telomere)
    print(io, ".")
    return nothing
end

# One-argument form kept for existing callers: writes to `stdout`.
Base.show(t::Telomere) = show(stdout, t)

In [23]:
# gene

# A single oriented gene. `@with_kw` supplies a keyword constructor that
# honours the default given on `reverse`.
@with_kw struct Gene <: AbstractGene
    id::Int  # numeric identifier; the only field compared by `==` on genes
    dna::String  # lowercase char 
    dup::Int  # numeric suffix of the gene label (printed right after `dna`)
    reverse::Bool=false  # true for a reversed gene; such genes print in uppercase
end

# Two genes are equal when they carry the same id; orientation, label and
# copy number are deliberately ignored.
function Base.:(==)(a::Gene, b::Gene)
    return a.id == b.id
end

# Keep hashing consistent with `==`: genes that compare equal must hash alike,
# otherwise `Dict`/`Set` lookups keyed by genes would disagree with `==`.
Base.hash(g::Gene, h::UInt) = hash(g.id, hash(:Gene, h))

# Write a gene to `io` as its letters followed by its copy number; the letters
# are uppercased when the gene is reversed. Defining the two-argument method is
# what makes `print`, `string`, interpolation and display use this format.
function Base.show(io::IO, gene::Gene)
    label = gene.reverse ? uppercase(gene.dna) : gene.dna
    print(io, label, gene.dup)
    return nothing
end

# One-argument form kept for existing callers: writes to `stdout`.
Base.show(gene::Gene) = show(stdout, gene)

# Return the printable label of `gene`: its letters (uppercased when the gene
# is reversed) followed by its copy number.
function return_ge_str(gene::Gene)
    letters = gene.reverse ? uppercase(gene.dna) : gene.dna
    return letters * string(gene.dup)
end

return_ge_str (generic function with 1 method)

In [8]:
# str is ::SubString{String} || ::String
# Split a chromosome string into tokens.
#
# Each "." (telomere) becomes its own token. Every other run of characters is
# cut into gene tokens: a token ends when a letter follows at least one digit,
# so "a1B12" yields ["a1", "B12"]. Characters that are neither letters nor
# digits stay attached to the token being built.
#
# `str` may be any string type (e.g. `String` or `SubString{String}`).
# Returns a `Vector{String}` of tokens in input order.
function find_genes(str)
    genes = String[]
    curr_gene = ""
    seen_ints = false  # true once the token being built has reached its digits

    for g in str
        if g == '.'  # telomere
            isempty(curr_gene) || push!(genes, curr_gene)
            push!(genes, ".")
            curr_gene = ""
            # A telomere always ends the current token, so the digit flag must
            # not leak into the next gene (it used to split names like "ab2").
            seen_ints = false
            continue
        end

        if seen_ints && isletter(g)
            # a letter after digits starts the next gene
            isempty(curr_gene) || push!(genes, curr_gene)
            curr_gene = ""
            seen_ints = false
        elseif isdigit(g)
            seen_ints = true
        end
        curr_gene *= g
    end

    # flush the last accumulated token, if any
    isempty(curr_gene) || push!(genes, curr_gene)

    return genes
end

find_genes (generic function with 1 method)

In [9]:
# chromosome

# One chromosome, stored as its entries in order. A chromosome whose first
# entry is a `Telomere` is treated as linear by `genome_to_adj_list`; any other
# chromosome is treated as circular.
mutable struct Chromosome
    genes::Vector{AbstractGene}
end

# Print every entry of the chromosome, in order, with no separator.
function Base.show(chrom::Chromosome)
    foreach(show, chrom.genes)
end

# Parse one chromosome string (e.g. ".a1B2." or "a1b2") into a `Chromosome`.
#
# Arguments
# - `genes_str`:  text of a single chromosome; any string type is accepted.
# - `id_counter`: next free gene id; advanced once per gene when `target` is true.
# - `id_to_str`:  id => gene label table; filled when `target` is true.
# - `str_to_id`:  gene label => id table; filled when `target` is true and only
#                 read otherwise.
# - `target`:     true when parsing the target genome (which assigns the ids),
#                 false when parsing a source genome (which looks them up).
#
# A gene token is its letters plus a number; all-uppercase letters mark a
# reversed gene. The label stored in the tables is the lowercase letters
# followed by the number.
#
# Throws
# - `ArgumentError` when a telomere appears anywhere but the two ends, when only
#   one end carries a telomere, or when the target genome repeats a gene.
# - `KeyError` when a source gene is not present in `str_to_id`.
function Chromosome(genes_str::AbstractString, id_counter::Ref{Int64}, id_to_str::Dict{Int, String}, str_to_id::Dict{String, Int64}, target::Bool)
    tokens = find_genes(genes_str)
    n = length(tokens)

    # A linear chromosome needs a telomere at BOTH ends. With only one, the
    # adjacency builder would either drop the last gene or hit a telomere where
    # it expects a gene, so reject it here before any table is modified.
    if n >= 2 && (tokens[1] == ".") != (tokens[n] == ".")
        throw(ArgumentError("Linear chromosome must start and end with telomeres."))
    end

    content = AbstractGene[]

    for (i, token) in pairs(tokens)
        if token == "."
            if i != 1 && i != n
                throw(ArgumentError("Linear chromosome must start and end with telomeres."))
            end
            push!(content, Telomere())
            continue
        end

        letters = filter(isletter, token)
        rev = letters == uppercase(letters)
        dna = lowercase(letters)
        dup = parse(Int, filter(isdigit, token))
        label = dna * string(dup)

        if target
            # Check for duplicates first so a failed parse leaves both tables
            # in agreement with each other.
            haskey(str_to_id, label) && throw(ArgumentError("Duplicate gene."))
            id = id_counter[]
            id_to_str[id] = label
            str_to_id[label] = id
            id_counter[] += 1
        else  # source string: reuse the id assigned by the target genome
            id = str_to_id[label]
        end

        push!(content, Gene(id, dna, dup, rev))
    end

    return Chromosome(content)
end

Chromosome

In [10]:
# Genome

# A genome: its chromosomes plus the two lookup tables that translate between
# numeric gene ids and gene labels.
mutable struct Genome
    data::Vector{Chromosome}  # chromosomes in input order
    id_to_str::Dict{Int, String}  # gene id => gene label
    str_to_id::Dict{String, Int}  # gene label => gene id
end

# Position-wise comparison of two chromosome entries: telomeres match
# telomeres; genes match when both id and orientation agree.
_same_gene(::Telomere, ::Telomere) = true
_same_gene(x::Gene, y::Gene) = x.id == y.id && x.reverse == y.reverse
_same_gene(::AbstractGene, ::AbstractGene) = false

# Two chromosomes match when they have the same entries in the same order.
function _same_chromosome(x::Chromosome, y::Chromosome)
    length(x.genes) == length(y.genes) || return false
    return all(_same_gene(g, h) for (g, h) in zip(x.genes, y.genes))
end

# Two genomes are equal when their lookup tables agree and their chromosomes
# match one by one, in order.
#
# The chromosomes are compared entry by entry here because `Chromosome` is a
# mutable struct without its own `==`, so comparing the `data` vectors directly
# would only test object identity. No rotation or reflection of chromosomes is
# taken into account.
function Base.:(==)(a::Genome, b::Genome)
    (a.id_to_str == b.id_to_str && a.str_to_id == b.str_to_id) || return false
    length(a.data) == length(b.data) || return false
    return all(_same_chromosome(x, y) for (x, y) in zip(a.data, b.data))
end

# Print every chromosome of the genome, in order, with no separator.
function Base.show(genome::Genome)
    foreach(show, genome.data)
end
    

In [11]:
# str -> genome 

# Build a `Genome` from a comma-separated list of chromosome strings.
# The id counter, both lookup tables and the `target` flag are handed to the
# `Chromosome` constructor for every piece; the same two tables are then stored
# in the returned genome.
function string_to_genome(s, id_counter, id_to_str, str_to_id, target)
    chrom_list = Chromosome[
        Chromosome(piece, id_counter, id_to_str, str_to_id, target)
        for piece in split(s, ",")
    ]
    return Genome(chrom_list, id_to_str, str_to_id)
end


# Text for one chromosome entry: "." for a telomere, otherwise the gene label
# (uppercase letters when reversed) as produced by `return_ge_str`.
_gene_label(::Telomere) = "."
_gene_label(gene::Gene) = return_ge_str(gene)

# Serialise a genome to text.
#
# The entries of each chromosome are converted to their labels and joined with
# an apostrophe; every chromosome is followed by a comma. Entries are converted
# explicitly because a `String` cannot be concatenated with a gene object. An
# empty chromosome contributes just its comma.
#
# NOTE(review): the last chromosome is followed by a comma too — confirm that
# callers expect the trailing separator.
function genome_to_string(genome::Genome)
    io = IOBuffer()
    for c in genome.data
        join(io, (_gene_label(g) for g in c.genes), "'")
        print(io, ",")
    end
    return String(take!(io))
end

genome_to_string (generic function with 1 method)

In [12]:
# gene end (used in adj)

# One extremity of a gene (or a telomere), the building block of an `Adjacency`.
@with_kw struct GeneEnd
    gene::AbstractGene  # the gene this end belongs to, or a `Telomere`
    head::Bool=true  # true for the head end (shown as ":h"), false for the tail (":t")
end

# Convenience constructor for a telomere end. `head` is set to false, but the
# flag carries no meaning for a telomere.
GeneEnd(T::Telomere) = GeneEnd(T, false)

# Two gene ends are equal when both are telomeres, or when neither is and they
# name the same gene id and the same extremity (head/tail).
function Base.:(==)(a::GeneEnd, b::GeneEnd)
    a_is_telomere = a.gene == Telomere()
    b_is_telomere = b.gene == Telomere()

    # As soon as a telomere is involved, equality needs both to be telomeres.
    if a_is_telomere || b_is_telomere
        return a_is_telomere && b_is_telomere
    end

    return a.gene.id == b.gene.id && a.head == b.head
end

# Return (not print) the text form of a gene end: "." for a telomere, otherwise
# the lowercase gene label followed by ":h" (head) or ":t" (tail).
function Base.show(ge::GeneEnd)
    ge.gene == Telomere() && return "."

    suffix = ge.head == true ? ":h" : ":t"
    return ge.gene.dna * string(ge.gene.dup) * suffix
end


In [13]:
# adjacency
# A pair of gene ends that sit next to each other in a genome. `==` on
# adjacencies ignores which end is stored as `left` and which as `right`.
mutable struct Adjacency 
    left::GeneEnd
    right::GeneEnd
end

# Adjacencies are unordered pairs: they are equal when their ends match either
# in the same order or swapped.
function Base.:(==)(a::Adjacency, b::Adjacency)
    same_order = a.left == b.left && a.right == b.right
    return same_order || (a.left == b.right && a.right == b.left)
end

# Print the adjacency as "(left,right)", using the text form of each gene end.
function Base.show(adj::Adjacency)
    print(string("(", show(adj.left), ",", show(adj.right), ")"))
end


# logging 

# Print the adjacency as "(left,right)" in colour: blue when `blue` is true,
# green otherwise. Used for logging.
function Base.show(adj::Adjacency, blue::Bool)
    text = string("(", show(adj.left), ",", show(adj.right), ")")
    printstyled(text; color = blue ? :blue : :green)
end

In [14]:
# showing/printing list of adjs

# Print every adjacency of the list, in order, with no separator.
function Base.show(adj_list::Vector{Adjacency})
    foreach(show, adj_list)
end


# logging

# Print the list, rendering every adjacency equal to `changed_adj1` or
# `changed_adj2` in blue and all others in the plain form. Used for logging.
function Base.show(adj_list::Vector{Adjacency}, changed_adj1::Adjacency, changed_adj2::Adjacency)
    for adj in adj_list
        highlighted = adj == changed_adj1 || adj == changed_adj2
        highlighted ? show(adj, true) : show(adj)
    end
end

# Print the list, rendering every adjacency equal to `changed_adj` in blue and
# all others in the plain form. Used for logging.
function Base.show(adj_list::Vector{Adjacency}, changed_adj::Adjacency)
    for adj in adj_list
        adj == changed_adj ? show(adj, true) : show(adj)
    end
end

In [15]:
# Append the adjacencies of one chromosome to `adj_list` and return the list.
#
# - linear:   `genes` starts and ends with a telomere; the walk begins and ends
#             on a telomere end and visits the genes in between.
# - circular: the walk starts at the far end of the first gene and closes the
#             cycle on that gene's near end.
#
# For every gene visited, the end facing the previous entry is the head when
# the gene is reversed and the tail otherwise; the opposite end is carried
# forward as the left side of the next adjacency. `chrom` is accepted but not
# used.
function genome_to_adj_list_helper(linear::Bool, chrom::Chromosome, genes::Vector{AbstractGene}, adj_list::Vector{Adjacency})
    if linear
        open_end = GeneEnd(Telomere())
        closing_end = GeneEnd(Telomere())
        last_idx = length(genes) - 1  # the trailing telomere is not a gene
    else
        anchor = genes[1]
        open_end = GeneEnd(anchor, !anchor.reverse)
        closing_end = GeneEnd(anchor, anchor.reverse)
        last_idx = length(genes)
    end

    for i in 2:last_idx
        current = genes[i]
        flipped = current.reverse

        push!(adj_list, Adjacency(open_end, GeneEnd(current, flipped)))
        open_end = GeneEnd(current, !flipped)
    end

    # close the chromosome: telomere for linear, first gene for circular
    return push!(adj_list, Adjacency(open_end, closing_end))
end

# Turn a genome into a flat list of adjacencies, chromosome by chromosome.
# A chromosome whose first entry is a telomere is handled as linear, any other
# as circular.
function genome_to_adj_list(genome::Genome)
    adj_list = Adjacency[]

    for chrom in genome.data
        entries = chrom.genes
        is_linear = entries[1] == Telomere()
        genome_to_adj_list_helper(is_linear, chrom, entries, adj_list)
    end

    return adj_list
end

genome_to_adj_list (generic function with 1 method)

In [3]:
# Return the label of the gene in `common_ge` as it appears in `adj`:
# uppercase when the matching end of `adj` is a head, lowercase when it is a
# tail. The left end of `adj` is used when its gene equals the common gene,
# otherwise the right end.
function common_gene_helper(common_ge::GeneEnd, adj::Adjacency)
    shared = common_ge.gene
    matching_end = shared == adj.left.gene ? adj.left : adj.right
    label = shared.dna * string(shared.dup)
    return matching_end.head == true ? uppercase(label) : lowercase(label)
end

# Pick the end of `adj1` to report as shared with `adj2` and return
# `(label, gene_end)`, where `label` comes from `common_gene_helper`.
#
# When one side of `adj1` is a telomere the other side is chosen. Otherwise the
# left end is chosen if its gene occurs in `adj2`, and the right end if not.
function common_gene(adj1::Adjacency, adj2::Adjacency)
    left, right = adj1.left, adj1.right

    common_ge = if left.gene == Telomere()
        right
    elseif right.gene == Telomere()
        left
    elseif left.gene == adj2.left.gene || left.gene == adj2.right.gene
        left
    else
        right
    end

    return common_gene_helper(common_ge, adj1), common_ge
end

# Rebuild a genome string from a list of adjacencies by walking the list in
# order. Chromosomes are separated by ","; a telomere is written as "." and a
# gene as its label, with the case chosen by `common_gene_helper` (uppercase
# for a head end, lowercase for a tail end).
#
# Relies on `other_adjacency_end`, which is not defined in this part of the
# file — presumably it returns the end of an adjacency that is not the one
# passed in; verify against its definition.
#
# NOTE(review): several spots below look unfinished; each is flagged where it
# occurs. Confirm the intended behaviour before relying on the output.
function adjlist_to_genomestr(adj_list::Vector{Adjacency})
    genome_str = ""

    chrom = ""  # text of the chromosome currently being assembled
    circ = false  # set when a circular chromosome is started; NOTE(review): never reset to false
    first_gene_circ = nothing  # result of other_adjacency_end for the first adjacency of a circular chromosome
    for i in range(1,length(adj_list))
        # single adj contains one chrom (empty w telomeres or one gene) 
        if adj_list[i].left.gene == adj_list[i].right.gene 
            if adj_list[i].left.gene  == Telomere()
                # telomere-telomere adjacency: an empty linear chromosome
                chrom =  "..,"
                genome_str *= chrom 
                chrom = ""
                continue
            else 
                # NOTE(review): the uppercase/lowercase results below are not
                # assigned to anything, so `chrom` is unchanged when it is
                # appended a few lines further down.
                if adj_list[i].left.head == true 
                    gene = adj_list[i].left.gene
                    uppercase(gene.dna*string(gene.dup))
                else 
                    gene = adj_list[i].left.gene
                    lowercase(gene.dna*string(gene.dup))
                end 
                genome_str *= chrom * ","
                chrom = ""
                continue 
            end 
        end 

        # reached end 
        # The last adjacency closes the chromosome in progress; no "," is
        # appended in either branch.
        if i == length(adj_list) 
            if  (adj_list[i].left == GeneEnd(Telomere()) || adj_list[i].right == GeneEnd(Telomere()))
                chrom *= "."
                genome_str *= chrom 
                break
            else # circular 
                # adj_list[i].left == first_gene_circ || adj_list[i].right == first_gene_circ
                chrom *= common_gene_helper(first_gene_circ, adj_list[i]) 
                genome_str *= chrom 
                break
            end 
        end 

        # label and gene end chosen by common_gene for this adjacency and the next
        cg_char, cg_ge = common_gene(adj_list[i], adj_list[i+1]) 
        
        if isempty(chrom) 
            if adj_list[i].left.gene != Telomere() &&  adj_list[i].right.gene != Telomere() 
                # starting new circ chrom 
                first_gene_circ = other_adjacency_end(cg_ge, adj_list[i])
                circ = true 
            else # new linear chrom 
                chrom *= "."
            end 
            chrom *= cg_char
        else # adding to existing chrom 
            if circ 
                chrom *= cg_char
                if other_adjacency_end(cg_ge, adj_list[i]) == first_gene_circ 
                    # end the circ chrom 
                    # NOTE(review): `first_gene_circ` is used as a GeneEnd
                    # elsewhere (it is passed to common_gene_helper), so `*`
                    # with a String here looks like it cannot work — confirm.
                    chrom *= first_gene_circ * ","
                    genome_str *= chrom 
                    chrom = ""
                end 
            else 
                if other_adjacency_end(cg_ge, adj_list[i]) == GeneEnd(Telomere()) 
                    # the linear chromosome ends here; `cg_char` is not used in this case
                    chrom *= "." * ","
                    genome_str *= chrom 
                    chrom = ""
                else 
                    chrom *= cg_char
                end 
            end 
        end 
    end 
    
    # Drops the final character. NOTE(review): this removes a trailing ","
    # after the single-adjacency branches, but the "reached end" branches append
    # no ",", so there the last character of the chromosome text is dropped.
    return genome_str[1:end-1]
end 

LoadError: UndefVarError: `GeneEnd` not defined

In [None]:
# adjacency graph 

# Result holder for a source/target genome comparison. The graph itself is not
# stored; only the operation count computed by the constructor is kept.
mutable struct AdjacencyGraph
    num_dcj_ops::Int  # value returned by find_dcj_dist_ops
end

# Compare `src` with `target` and wrap the resulting operation count in an
# `AdjacencyGraph`. The adjacency graph is not built explicitly: both genomes
# are turned into adjacency lists and gene-id => location tables, which are
# handed to `find_dcj_dist_ops`.
#
# Arguments
# - `src`, `target`: genomes to compare; they must share the same id => label table.
# - `mode`: verbosity. `>= 1` prints both adjacency lists and the final count;
#           `>= 2` additionally prints both location tables.
#
# Throws `ArgumentError` when the two genomes' `id_to_str` tables differ. The
# check runs before anything is processed or printed, so mismatched input
# fails immediately instead of after partial output.
function AdjacencyGraph(src::Genome, target::Genome, mode::Int)
    if src.id_to_str != target.id_to_str
        throw(ArgumentError("Source and target genomes should have the same ID-->str dict"))
    end

    # process genomes (not explicitly creating the adj graph)
    src_adj_list = genome_to_adj_list(src)
    target_adj_list = genome_to_adj_list(target)

    if mode >= 1
        show(src_adj_list)
        print(" --> ")
        show(target_adj_list)
        println()
    end

    src_geneid_to_location = process_adj_list(src_adj_list)
    target_geneid_to_location = process_adj_list(target_adj_list)

    if mode >= 2
        println()
        print("SRC GENE & ID --> LOCATION DICT\n")
        show(src_geneid_to_location, src.id_to_str)
        print("\n", "TARGET GENE & ID --> LOCATION DICT\n")
        # each table is rendered with its own genome's id => label dictionary
        show(target_geneid_to_location, target.id_to_str)
        print("\n*************\n\n")
    end

    num_dcj_ops = find_dcj_dist_ops(src_adj_list, target_adj_list, src_geneid_to_location, target_geneid_to_location, mode, target.id_to_str)

    if mode >= 1
        print("\n#ops = ", num_dcj_ops)
        print("\n----------------------------------------\n")
    end

    return AdjacencyGraph(num_dcj_ops)
end

LoadError: UndefVarError: `Genome` not defined