# Calculating DCJ distance between two unsigned, circular and/or linear genomes with no duplicates

Credit:
- https://github.com/mlliou112/py-dcj (python version)
- ChatGPT for converting to Julia 

In [24]:
using Base.Iterators
using Parameters

abstract type AbstractMarker end

In [25]:
"""
    Telomere(; reverse=false)

Representation for a telomere in the DCJ model.

# Fields
- `reverse::Bool`: orientation flag, defaults to `false`.
"""
@with_kw struct Telomere <: AbstractMarker
    reverse::Bool=false
end

# Any two telomeres are interchangeable, whatever their `reverse` flag;
# a telomere is never equal to a value of another type.
Base.isequal(::Telomere, b) = b isa Telomere

# `Set` and `Dict` require `isequal(a, b)` to imply `hash(a) == hash(b)`, so every
# telomere hashes to the same value regardless of `reverse`.
Base.hash(::Telomere, h::UInt) = hash(Telomere, h)

# Standard display hook: makes `print`, `string` and interpolation render a telomere
# as "." (these all go through `show(io, x)`).
Base.show(io::IO, ::Telomere) = print(io, ".")

# One-argument form kept for backward compatibility: existing callers compare its
# return value against ".", so it returns the text instead of printing it.
function Base.show(t::Telomere)
    return "."
end

# x = Telomere()
# show(x)

In [26]:
"""
    Marker(; uid, dna="", reverse=false)

Representation for a DNA fragment (a gene) in the DCJ model.

# Fields
- `uid::Int` (required): unique id.
- `dna::String`: the DNA fragment this marker represents.
- `reverse::Bool`: whether the DNA fragment is reversed.
"""
@with_kw struct Marker <: AbstractMarker
    uid::Int
    dna::String = ""
    reverse::Bool = false
end

# Two markers denote the same gene when their uids match; `dna` and `reverse`
# are ignored on purpose (a reversed gene is still the same gene).
Base.isequal(a::Marker, b::Marker) = a.uid == b.uid

# Hash on the uid only so that `isequal` markers land in the same `Set`/`Dict`
# bucket; the default hash would also mix in `dna` and `reverse`.
Base.hash(m::Marker, h::UInt) = hash(m.uid, hash(Marker, h))

# Render a marker as its uid, prefixed with "-" when reversed.
# Writing to `io` makes `print`, `string` and interpolation use this rendering;
# the one-argument `show(m)` still prints to stdout through Base's
# `show(x) = show(stdout, x)` fallback.
function Base.show(io::IO, m::Marker)
    m.reverse && print(io, "-")
    print(io, m.uid)
    return nothing
end

# y = Marker(1, "d", true)
# show(y)

In [27]:
using Test 

# A default-constructed telomere must have the concrete type `Telomere`.
function test_telomere_type()
    @test Telomere() isa Telomere
end

# Orientation must not affect marker identity: same uid, opposite `reverse`.
function test_forward_reverse_marker()
    reversed = Marker(0, "d", true)
    forward = Marker(0, "d", false)
    @test isequal(reversed, forward)
end

# The one-argument `show` of a telomere yields the text ".".
function test_telomere_repr()
    rendered = show(Telomere())
    @test rendered == "."
end

# Markers with different uids are distinct even when dna and orientation agree.
function test_reverse_equality()
    first_marker = Marker(0, "D", true)
    second_marker = Marker(1, "D", true)
    @test !isequal(first_marker, second_marker)
end

# Run every marker-related check inside one named test set.
@testset "Markers Tests" begin
    for check in (
        test_telomere_type,
        test_forward_reverse_marker,
        test_telomere_repr,
        test_reverse_equality,
    )
        check()
    end
end

[0m[1mTest Summary: | [22m[32m[1mPass  [22m[39m[36m[1mTotal  [22m[39m[0m[1mTime[22m
Markers Tests | [32m   4  [39m[36m    4  [39m[0m1.1s


Test.DefaultTestSet("Markers Tests", Any[], 4, false, false, true, 1.717725485875e9, 1.717725486934e9, false, "In[27]")

LoadError: UndefVarError: `MarkerEnd` not defined

In [29]:
"""
    MarkerEnd(; marker, head=true)

One extremity of a marker, as used by adjacencies.

# Fields
- `marker::AbstractMarker`: the marker this end belongs to.
- `head::Bool`: `true` for the head end, `false` for the tail end.
"""
@with_kw struct MarkerEnd
    marker::AbstractMarker
    head::Bool = true
end

"""
    Adjacency(left_end_marker, right_end_marker; label=Marker[])
    Adjacency(left_end_marker, right_end_marker, label)

Adjacency data structure: two marker ends plus the markers lying between them.

# Fields
- `left_end_marker::MarkerEnd`: left extremity of the adjacency.
- `right_end_marker::MarkerEnd`: right extremity of the adjacency.
- `label::Vector{Marker}`: markers recorded between the two ends (may be empty).

`label` may be given by keyword or as a third positional argument; any vector of
markers is accepted and converted to `Vector{Marker}`.
"""
mutable struct Adjacency # TODO markerEnd or marker? i think markerend
    left_end_marker::MarkerEnd
    right_end_marker::MarkerEnd
    label::Vector{Marker}

    # `new` takes exactly one value per field and converts `label` to Vector{Marker}.
    function Adjacency(
        left_end_marker::MarkerEnd,
        right_end_marker::MarkerEnd;
        label::AbstractVector{<:AbstractMarker}=Marker[],
    )
        return new(left_end_marker, right_end_marker, label)
    end

    # Positional-label form, for callers that pass all three values in order.
    function Adjacency(
        left_end_marker::MarkerEnd,
        right_end_marker::MarkerEnd,
        label::AbstractVector{<:AbstractMarker},
    )
        return new(left_end_marker, right_end_marker, label)
    end
end

# Identity key of a marker for hashing: the uid for genes, the concrete type otherwise
# (all telomeres are interchangeable). Mirrors the `isequal` rules of the marker types.
_marker_key(m::Marker) = m.uid
_marker_key(m::AbstractMarker) = typeof(m)

# Two marker ends match when they are the same extremity (head/tail) of the same
# marker. Markers are compared with `isequal`, so orientation and dna text are
# ignored, exactly as for markers themselves.
function Base.isequal(a::MarkerEnd, b::MarkerEnd)
    return a.head == b.head && isequal(a.marker, b.marker)
end

# `MarkerEnd` is used as a `Dict` key, so the hash must agree with `isequal` above.
function Base.hash(me::MarkerEnd, h::UInt)
    return hash(me.head, hash(_marker_key(me.marker), hash(MarkerEnd, h)))
end

# Render a marker end as its marker's text, followed by "*" for a tail end.
# Writing to `io` makes `print`/`string` use this rendering; the one-argument
# `show(me)` still prints to stdout through Base's `show(x) = show(stdout, x)`.
function Base.show(io::IO, me::MarkerEnd)
    print(io, me.marker)
    me.head || print(io, "*")
    return nothing
end

"""
    other_adjacency_end(me, adj)

Return the end of `adj` opposite to `me`: the right end when `me` equals (`==`) the
left end, otherwise the left end.
"""
function other_adjacency_end(me::MarkerEnd, adj::Adjacency)
    return adj.left_end_marker == me ? adj.right_end_marker : adj.left_end_marker
end

other_adjacency_end (generic function with 1 method)

In [30]:
 """
The abstraction of a chromosome in the DCJ model: a list of telomeres and markers.

# Fields
- `markers::Vector{AbstractMarker}`: the chromosome content, in order.
- `marker_set::Set{String}`: names of the markers seen while building the chromosome.
"""
mutable struct Chromosome
    markers::Vector{AbstractMarker}
    marker_set::Set{String}
end

"""
    ChromosomeStatic()

Shared registry used while parsing chromosome strings.

# Fields
- `string_d::Dict{String, Int}`: gene name to uid.
- `uid_counter::Int`: next uid to hand out; starts at 0.
"""
mutable struct ChromosomeStatic
    string_d::Dict{String, Int}  # gene to ID
    uid_counter::Int
    ChromosomeStatic() = new(Dict{String, Int}(), 0)
end

# Single registry instance used by the chromosome parsing code.
const chromosome_static = ChromosomeStatic()

"""
    Chromosome(markers::AbstractVector{<:AbstractMarker})

Wrap an existing list of markers as a chromosome. The marker-name set is left empty.
"""
function Chromosome(markers::AbstractVector{<:AbstractMarker})
    # `convert` returns the same array when it is already a Vector{AbstractMarker}.
    content = convert(Vector{AbstractMarker}, markers)
    return Chromosome(content, Set{String}())
end

"""
    Chromosome(markers::AbstractString)

Parse a chromosome from a string with one character per marker.

- `.` is a telomere; it may only appear as the first and last character, and a
  chromosome that has one at either end must have one at both.
- Any other character is a gene. An uppercase character marks the gene as reversed;
  upper and lower case of the same letter share one uid in `chromosome_static`.

# Throws
- `ArgumentError` for an empty string, unbalanced or misplaced telomeres, or a gene
  that appears more than once.
"""
function Chromosome(markers::AbstractString)
    symbols = collect(markers)
    isempty(symbols) && throw(ArgumentError("Chromosome string must not be empty."))
    if (first(symbols) == '.') != (last(symbols) == '.')
        throw(ArgumentError("Linear Chromosome must start and end with telomeres."))
    end

    content = AbstractMarker[]
    marker_set = Set{String}()
    last_index = length(symbols)
    for (i, s) in enumerate(symbols)
        # Telomeres carry no gene identity, so they never consume a uid.
        if s == '.'
            if i != 1 && i != last_index
                throw(ArgumentError("Telomere cannot appear in middle of chromosome."))
            end
            push!(content, Telomere())
            continue
        end

        # Case only encodes orientation, so the registry key is case-folded.
        key = lowercase(string(s))
        if key in marker_set
            throw(ArgumentError("Duplicated markers are not allowed. ($s)"))
        end
        push!(marker_set, key)

        # Reuse the uid of a gene seen in an earlier chromosome, or allocate a new one.
        uid = get!(chromosome_static.string_d, key) do
            fresh = chromosome_static.uid_counter
            chromosome_static.uid_counter += 1
            fresh
        end
        push!(content, Marker(uid, string(s), isuppercase(s)))
    end
    return Chromosome(content, marker_set)
end

"""
    clear_chromosome_static()

Reset the shared registry: install a fresh, empty gene-to-uid table and restart the
uid counter at 0.
"""
function clear_chromosome_static()
    fresh_table = Dict{String, Int}()
    chromosome_static.string_d = fresh_table
    chromosome_static.uid_counter = 0
end

LoadError: invalid redefinition of constant Main.Chromosome

In [None]:
# Scratch cell: two markers with the same uid (0) and dna ("d") but opposite `reverse` flags.
Marker(0, "d", true), Marker(0, "d", false)

Work in progress below...


In [None]:


# Chromosome

"""
    Chromosome(markers::AbstractVector{<:AbstractMarker})
    Chromosome(markers::AbstractString)

Draft chromosome container.

# Fields
- `marker_set::Set{String}`: starts empty in both constructors.
- `data::Vector{AbstractMarker}`: the chromosome content, in order.
"""
mutable struct Chromosome
    marker_set::Set{String}
    data::Vector{AbstractMarker}
    # `<:` lets a Vector{Marker} (or any other marker vector) through; `new` converts it.
    Chromosome(markers::AbstractVector{<:AbstractMarker}) = new(Set{String}(), markers)
    # NOTE(review): the string is not parsed here; this yields an empty chromosome.
    Chromosome(markers::AbstractString) = new(Set{String}(), AbstractMarker[])
end

# Genome

"""
    Genome(chromosomes::Chromosome...)
    Genome(chromosomes::AbstractVector{<:Chromosome})

A genome: an ordered collection of chromosomes, stored in `data`.
"""
mutable struct Genome
    data::Vector{Chromosome}
    # Varargs arrive as a Tuple, which cannot be converted to a Vector implicitly.
    Genome(chromosomes::Chromosome...) = new(collect(Chromosome, chromosomes))
    Genome(chromosomes::AbstractVector{<:Chromosome}) = new(chromosomes)
end



# AdjacencyGraph

"""
    AdjacencyGraph

Result of comparing two genomes.

# Fields
- `commonMarkers::Set{AbstractMarker}`: markers present in both genomes.
- `adjA::Vector{Adjacency}`, `adjB::Vector{Adjacency}`: adjacencies of each genome.
- `cycles::Int`: number of cycles counted.
- `ab_paths::Int`: number of AB paths counted.
- `a_runs::Int`, `b_runs::Int`: number of runs counted on each side.
- `run_potential::Int`: run potential.
- `indel_potential::Int`: indel potential.
"""
mutable struct AdjacencyGraph
    commonMarkers::Set{AbstractMarker}
    adjA::Vector{Adjacency}
    adjB::Vector{Adjacency}
    cycles::Int
    ab_paths::Int
    a_runs::Int
    b_runs::Int
    run_potential::Int
    indel_potential::Int
end

"""
    AdjacencyGraph(A::Genome, B::Genome)

Build the adjacency graph of genomes `A` and `B` and count its components.

Steps:
1. Collect the markers common to both genomes (telomeres excluded).
2. Walk every chromosome and emit one `Adjacency` per pair of consecutive common
   markers (or telomeres); markers that are not common are collected into the
   adjacency's `label`.
3. Traverse the graph by alternating between the adjacencies of `A` and `B`,
   counting cycles, AB paths and labelled runs.

Marker ends are matched across genomes with `isequal`/`hash`, so those must treat the
same gene in different orientations as equal.
"""
function AdjacencyGraph(A::Genome, B::Genome)
    # Chromosomes are not iterable themselves; flatten over their `data` vectors.
    marker_set_a = Set{AbstractMarker}(Iterators.flatten(c.data for c in A.data))
    marker_set_b = Set{AbstractMarker}(Iterators.flatten(c.data for c in B.data))
    commonMarkers = intersect(marker_set_a, marker_set_b)
    delete!(commonMarkers, Telomere())  # no-op when no telomere is present

    adjA = Adjacency[]
    adjB = Adjacency[]
    adjacencies = (adjA, adjB)
    # Marker end -> index of the adjacency containing it. Telomere ends are left out
    # on purpose: a failed lookup (`nothing`) is what terminates a path.
    reference_A = Dict{MarkerEnd, Int}()
    reference_B = Dict{MarkerEnd, Int}()
    references = (reference_A, reference_B)
    telomere_end = MarkerEnd(Telomere(), true)

    for (i, genome) in enumerate((A, B))
        adjacency_length = 0
        for chromosome in genome.data
            markers = chromosome.data
            isempty(markers) && continue
            n = length(markers)
            # Marker following position `idx`, wrapping to the first marker at the end.
            following(idx) = idx >= n ? markers[1] : markers[idx + 1]

            index = 1
            current_marker = markers[index]
            # NOTE(review): for a circular chromosome this assumes the first marker
            # is a common marker — confirm against the reference implementation.
            adjacency_length += length(intersect(commonMarkers, Set{AbstractMarker}(markers)))
            if markers[1] isa Telomere
                adjacency_length += 1
            end
            while length(adjacencies[i]) < adjacency_length
                label = Marker[]
                left_end = if current_marker isa Telomere
                    telomere_end
                else
                    MarkerEnd(current_marker, !current_marker.reverse)
                end
                next_marker = following(index)
                # Skip markers unique to this genome, remembering them as the label.
                while next_marker ∉ commonMarkers
                    next_marker isa Telomere && break
                    push!(label, next_marker)
                    index += 1
                    next_marker = following(index)
                end
                right_end = if next_marker isa Telomere
                    telomere_end
                else
                    MarkerEnd(next_marker, next_marker.reverse)
                end
                push!(adjacencies[i], Adjacency(left_end, right_end; label=label))
                position = length(adjacencies[i])
                if !(left_end.marker isa Telomere)
                    references[i][left_end] = position
                end
                if !(right_end.marker isa Telomere)
                    references[i][right_end] = position
                end
                current_marker = next_marker
                index += 1
            end
        end
    end

    to_visit_a_index = Set(1:length(adjA))
    visited_a_index = Set{Int}()
    visited_b_index = Set{Int}()
    cycles = 0
    ab_paths = 0
    a_runs = 0
    on_a_run = false
    b_runs = 0
    on_b_run = false

    while !isempty(to_visit_a_index)
        current_adj_index = pop!(to_visit_a_index)
        push!(visited_a_index, current_adj_index)
        start_adj = adjA[current_adj_index]
        if !isempty(start_adj.label)
            a_runs += 1
            on_a_run = true
        end
        # Which side each of the two walks (left end, right end) terminates on.
        paths_end_on_a = [true, true]
        for (i, start_marker) in enumerate((start_adj.left_end_marker, start_adj.right_end_marker))
            current_marker = start_marker
            a_side = true
            next_adj_index = get(reference_B, current_marker, nothing)
            if next_adj_index !== nothing && next_adj_index in visited_b_index
                continue
            end
            while next_adj_index !== nothing
                current_adj_index = next_adj_index
                a_side = !a_side
                current_adj = (a_side ? adjA : adjB)[current_adj_index]
                reference_side = a_side ? reference_B : reference_A
                if a_side
                    push!(visited_a_index, current_adj_index)
                    delete!(to_visit_a_index, current_adj_index)
                else
                    push!(visited_b_index, current_adj_index)
                end
                if !isempty(current_adj.label)
                    if !a_side && on_a_run
                        on_b_run = true
                        on_a_run = false
                        b_runs += 1
                    elseif !a_side && !on_b_run
                        on_b_run = true
                        b_runs += 1
                    elseif a_side && on_b_run
                        on_a_run = true
                        on_b_run = false
                        a_runs += 1
                    elseif a_side && !on_a_run
                        on_a_run = true
                        a_runs += 1
                    end
                end
                # Cross the adjacency: leave through the end we did not arrive at.
                # `isequal` keeps this consistent with the dictionary lookups.
                current_marker = if isequal(current_marker, current_adj.left_end_marker)
                    current_adj.right_end_marker
                else
                    current_adj.left_end_marker
                end
                next_adj_index = get(reference_side, current_marker, nothing)
                if next_adj_index === nothing
                    paths_end_on_a[i] = a_side
                    break
                elseif (a_side && next_adj_index in visited_b_index) ||
                       (!a_side && next_adj_index in visited_a_index)
                    cycles += 1
                    break
                end
            end
        end
        if paths_end_on_a[1] != paths_end_on_a[2]
            ab_paths += 1
        end
    end

    run_potential = a_runs + b_runs
    indel_potential = run_potential > 0 ? (run_potential + 1) ÷ 2 + ((run_potential ÷ 2) % 2) : 0

    return AdjacencyGraph(commonMarkers, adjA, adjB, cycles, ab_paths, a_runs, b_runs, run_potential, indel_potential)
end



LoadError: UndefVarError: `AbstractMarker` not defined

In [46]:
"""
    calculate_distance(A::String, B::String)

Entry point for the DCJ distance of two genome strings. At present it only validates
the inputs and returns the result of `check_conditions(A, B)`; the distance
computation itself is still commented out.
"""
function calculate_distance(A::String, B::String)
    conditions_ok = check_conditions(A, B)

    # Planned computation (not enabled yet):
    # Chromosome.clear()
    # genome_a = transform_genome(A)
    # genome_b = transform_genome(B)
    # ag = AdjacencyGraph(genome_a, genome_b)
    # dcj_distance = length(ag.commonMarkers) - ag.cycles - ag.ab_paths / 2
    return conditions_ok
end

# function transform_genome(genome)
#     return typeof(genome) == Genome ? genome : Genome([Chromosome(c) for c in genome]...)
# end

# Chromosome.clear() = (Chromosome.string_d = Dict(), Chromosome.uid_counter = 0)



# Smoke run on two small linear genomes; the recorded output of this cell is `true`.
println(calculate_distance(".abcd.", ".aCBd."))



true


In [42]:
# Funcs 

"""
    check_conditions(A, B) -> Bool

Check that two genome strings are valid input for the distance computation.

Conditions:
- only letters and dots (`.` is a telomere)
- no duplicate genes within a genome; genes are compared case-insensitively, so
  `a` and `A` count as the same gene
- an even number of telomeres in each genome (a chromosome cannot be opened
  without being closed)
- both genomes contain the same set of genes

Returns `true` when every condition holds.

# Throws
- `ArgumentError` describing the first violated condition.
"""
function check_conditions(A::AbstractString, B::AbstractString)
    genes_a, telomeres_a = _scan_genome(A, "A")
    genes_b, telomeres_b = _scan_genome(B, "B")

    if isodd(telomeres_a) || isodd(telomeres_b)
        throw(ArgumentError("Error: Telomere error"))
    elseif genes_a != genes_b
        throw(ArgumentError("Error: Sets of genes in the two given genomes don't match"))
    end
    return true
end

# Return the case-folded gene set and the telomere count of one genome string.
# Iterates characters rather than byte indices, so non-ASCII letters are handled.
function _scan_genome(genome::AbstractString, name::AbstractString)
    genes = Set{Char}()
    telomeres = 0
    for c in genome
        if c == '.'
            telomeres += 1
            continue
        end
        gene = lowercase(c)  # or uppercase, just keep consistent
        if !isletter(c) || gene in genes
            throw(ArgumentError("Error: Genome $name has duplicate genes or non-letter genes"))
        end
        push!(genes, gene)
    end
    return genes, telomeres
end

# correct_conditions(".abdc.", "abCd")

check_conditions