In [2]:
"""
diameter(n) = max number of DCJ operations to get from one genome of size n to another (assuming no duplicates and genomes are balanced)

diameter(n) = n 
    a farthest genome from a single circular chromosome genome (where DCJ distance = diameter)is a linear genome where each gene is a telomere 
    e.g., abcdefg --> .a.,.b.,.c.,.d.,.e.,.f.,.g.


this code estimates diameter(n) for n = 1:26 and the source genome that yields the largest diameter 
    estimates by sampling the exponentially large number of possible source genomes (generates y random source genomes) 
    then calculating the dcj distance 
    keeping track of largest distance and the genome that yields that largest distance
"""

"diameter(n) = max number of DCJ operations to get from one genome of size n to another (assuming no duplicates and genomes are balanced)\n\ndiameter(n) = n \n    a farthest genome from a single circular chromosome genome (where DCJ distance = diameter)is a linear genome whe"[93m[1m ⋯ 158 bytes ⋯ [22m[39m" the largest diameter \n    estimates by sampling the exponentially large number of possible source genomes (generates y random source genomes) \n    then calculating the dcj distance \n    keeping track of largest distance and the genome that yields that largest distance\n"

In [3]:
using Random
using Plots
using GLM
using DataFrames
using Statistics

using NBInclude
@nbinclude("dcj_algo.ipynb")

In [4]:
"""
    first_n_letters(n::Int)

Return a `Vector{Char}` containing the first `n` lowercase letters of the
alphabet (`'a'`, `'b'`, ...).

The letters are sliced out of the 26-letter range `'a':'z'`, so asking for
more than 26 letters raises a `BoundsError`.
"""
function first_n_letters(n::Int)
    alphabet = collect('a':'z')
    return alphabet[1:n]
end

first_n_letters (generic function with 1 method)

In [5]:
# Pretty-print the matrix of farthest source genomes to stdout.
# After a blank line and a heading, every row `y` gets a yellow "y=<row>"
# header followed by one line per column `m` of the form
# "<m> <source genome> --> <target genome>", where the target genome is the
# first `m` letters of the alphabet.
# NOTE(review): this adds a one-argument method to `Base.show` for a Base type
# (`Matrix{String}`), which is type piracy; a dedicated printing function
# would be safer.
function Base.show(max_dist_genomes::Matrix{String})
    n_rows, n_cols = size(max_dist_genomes)

    println()
    println("max distance genomes :: ")
    for row in 1:n_rows
        printstyled("y=$(row)\n"; color = :yellow)

        for col in 1:n_cols
            target = join(first_n_letters(col))
            source = max_dist_genomes[row, col]
            println("$(col) $(source) --> $(target)")
        end
    end
end


# Print the matrix of estimated diameters to stdout: a heading line, then
# one line per row consisting of a yellow "y = <row> " label followed by that
# row's values.
# NOTE(review): this adds a one-argument method to `Base.show` for a Base type
# (`Matrix{Float64}`), which is type piracy; a dedicated printing function
# would be safer.
function Base.show(diameters::Matrix{Float64})
    println("diameters :: ")
    for row in axes(diameters, 1)
        printstyled("y = $(row) "; color = :yellow)
        println(diameters[row, :])
    end
end

In [23]:
"""
    generate_genome_str(n::Int)

Generate a random genome string over the first `n` letters of the alphabet,
using every letter (gene) exactly once.

The genome is built in three random steps:

1. each gene is independently upper-cased with probability 1/2
   (capitalization encodes gene orientation);
2. the genes are shuffled;
3. the shuffled genes are cut into consecutive chromosomes of random length,
   and each chromosome is independently chosen to be linear or circular.

Chromosomes are separated by `,`. A linear chromosome is wrapped in `.`
(e.g. `.ca.`); a circular chromosome is written without delimiters
(e.g. `ghbfj`). For `n == 0` the empty string is returned.

# Throws
- `ArgumentError` if `n > 26` (the alphabet only has 26 letters) or `n < 0`.
"""
function generate_genome_str(n::Int)
    if n > 26
        throw(ArgumentError("Trying to generate a genome with no duplicates of size >26 (Alphabet has 26 letters)"))
    end
    if n < 0
        throw(ArgumentError("Genome size must be non-negative, got $n"))
    end

    # first n letters
    letters = first_n_letters(n)

    # Randomize capitalization (gene orientation) of EVERY gene.
    # `eachindex` visits all positions; iterating over `length(letters)`
    # would visit only the single index `n`.
    for i in eachindex(letters)
        if rand(Bool)
            letters[i] = uppercase(letters[i])
        end
    end

    shuffle!(letters)

    # Randomly choose the length of each chromosome and whether it is
    # linear or circular, consuming the shuffled genes left to right.
    chromosomes = String[]
    remaining = n
    start_idx = 1
    while remaining != 0
        chrom_length = rand(1:remaining)
        remaining -= chrom_length
        linear = rand(Bool)

        stop_idx = start_idx + chrom_length - 1
        chrom = join(view(letters, start_idx:stop_idx))
        push!(chromosomes, linear ? "." * chrom * "." : chrom)

        start_idx += chrom_length
    end

    return join(chromosomes, ",")
end

# Smoke test: sample one random genome over the first 10 letters.
generate_genome_str(10)

"ghbfj,.ca.,.id.,e"

In [7]:
"""
    calculate_diameter_given_num_src_genomes(n, target, num_src_permutations_to_generate)

Estimate the diameter for genomes of size `n` by sampling
`num_src_permutations_to_generate` random source genomes (via
`generate_genome_str(n)`) and measuring each one against `target` with
`calculate_distance(src, target, "none")`.

Returns the tuple `(max_dist, max_dist_permutation)`: the largest distance
observed and the source genome that produced it. Only a strictly larger
distance replaces the current best, so ties keep the earliest genome. If no
sampled distance exceeds 0, the result is `0` together with the first genome
drawn.

NOTE(review): `calculate_distance` is not defined in this notebook
(presumably it comes from the included `dcj_algo.ipynb`); the meaning of its
`"none"` argument should be confirmed there.
"""
function calculate_diameter_given_num_src_genomes(n::Int, target::String, num_src_permutations_to_generate::Int)
    candidate = generate_genome_str(n)
    best_dist = 0
    best_src = candidate

    for _ in 1:num_src_permutations_to_generate
        dist = calculate_distance(candidate, target, "none")
        if dist > best_dist
            best_dist, best_src = dist, candidate
        end

        # draw the candidate that the next iteration will evaluate
        candidate = generate_genome_str(n)
    end

    return best_dist, best_src
end

"""
    calculate_diameters_given_num_of_src_genomes(n, x, diameters, max_dist_genomes)

Fill row `x` of `diameters` and `max_dist_genomes` in place.

For every genome size `m` in `1:n`, the target genome is the first `m`
letters of the alphabet, and the diameter is estimated from `x` randomly
sampled source genomes. The estimate is stored in `diameters[x, m]` and the
source genome that achieved it in `max_dist_genomes[x, m]`.

Returns `nothing`; both matrices are mutated.
"""
function calculate_diameters_given_num_of_src_genomes(n::Int, x::Int, diameters::Matrix{Float64}, max_dist_genomes::Matrix{String})
    for genome_size in 1:n
        target = join(first_n_letters(genome_size))
        estimate = calculate_diameter_given_num_src_genomes(genome_size, target, x)

        diameters[x, genome_size] = estimate[1]
        max_dist_genomes[x, genome_size] = estimate[2]
    end
    return nothing
end


calculate_diameters_given_num_of_src_genomes (generic function with 1 method)

In [8]:
"""
    calculate_diameters(y::Int, n::Int)

Estimate diameters for genome sizes `1:n`, once for every sample count in
`1:y`.

Returns `(diameters, max_dist_genomes)`, both of size `y × n`:
`diameters[a, b]` is the estimated diameter for genomes of length `b` when
`a` random source genomes are sampled, and `max_dist_genomes[a, b]` is the
source genome that produced that estimate.
"""
function calculate_diameters(y::Int, n::Int)
    diameters = zeros(y, n)
    max_dist_genomes = fill("", y, n)

    # each sample count fills its own row of both matrices
    foreach(1:y) do x
        calculate_diameters_given_num_of_src_genomes(n, x, diameters, max_dist_genomes)
    end

    return diameters, max_dist_genomes
end

calculate_diameters (generic function with 1 method)

In [9]:
# n = 5
# x = 10 # num_src_permutations to generate 

# # note: matrix[a, b] = diameter of a genome of length 'b" with "a" randomly generated src permutations 
# diameters, max_dist_genomes = calculate_diameters(x, n)

# show(diameters)
# # show(max_dist_genomes)


In [10]:
## VISUALS -- buggy

# using Plots


"""
    random_color(previous_colors)

Draw random `RGB` colors (each channel from `rand()`) until one is found
whose `colordiff` to every color in `previous_colors` exceeds 0.4, and return
it. When `previous_colors` is empty the first draw is returned.

NOTE(review): `previous_colors` is not modified here, so the caller has to
record the returned color itself. The loop does not terminate if no color can
satisfy the threshold.
"""
function random_color(previous_colors)
    while true
        candidate = RGB(rand(), rand(), rand())

        # accept only if the candidate is far enough from all earlier colors
        # (vacuously true when there are none)
        if all(colordiff(prev, candidate) > 0.4 for prev in previous_colors)
            return candidate
        end
    end
end

# plot()
# previous_colors = Vector{RGBA{Float64}}() 

# x = range(0, size(max_dist_genomes, 1), size(max_dist_genomes, 1))

# # for i in 1:size(max_dist_genomes, 2)
# #     y = diameters[:, i]
# #     labeli = "Plot " * string(i)
# #     scatter!(x, y, label=labeli, color=random_color(previous_colors), markersize=4)
    
# # end 

# i = 10
# y = diameters[:, i]
# labeli = "n = " * string(i)
# scatter!(x, y, label=labeli, color=random_color(previous_colors), markersize=4)

# xlabel!("X randomly sampled/generated source genomes")
# ylabel!("estimated diameter")
# title!("Estimated Diameter after X Samples")

# plot!()

random_color (generic function with 1 method)

In [11]:
## regression --- buggy
# y = diameters[:, 1]
# df = DataFrame(x=x, y=y)
# logistic_model = glm(@formula(y ~ x), df, Binomial(), LogitLink())
# y_pred_sorted = predict(logistic_model, DataFrame(x=x))

# scatter(x, y, label="1", color=random_color(previous_colors), markersize=4)
# plot!(x, y_pred_sorted, label="Logistic Regression Curve", color=:red)
# plot!()