In [1]:
using NBInclude
@nbinclude("randommap.ipynb")

randommap (generic function with 1 method)

In [2]:
"""
    check_src_tar_balanced(src::String, target::String)

Verify that `src` and `target` contain the same characters with the same
multiplicities, ignoring the `'.'` and `','` characters.

Returns `nothing` when the two strings are balanced.

# Throws
- `ArgumentError` if the per-character counts of `src` and `target` differ.
"""
function check_src_tar_balanced(src::String, target::String)
    ignored = ('.', ',')

    # Count how often each character other than '.' and ',' occurs in `genome`.
    tally = function (genome)
        counts = Dict{Char, Int}()
        for gene in genome
            gene in ignored && continue
            counts[gene] = get(counts, gene, 0) + 1
        end
        return counts
    end

    tally(src) == tally(target) && return nothing
    throw(ArgumentError("src and target from generate_genomes_with_xdup function aren't balanced"))
end

check_src_tar_balanced (generic function with 1 method)

In [3]:
"""
    generate_target_with_xdup(n::Int, x::Int)

Build a target genome from the genes returned by `first_n_letters(n)` and insert
`x` randomly chosen duplicates of those genes at random positions.

Returns the genome joined into a single `String`. The intended range noted by
the author is `x >= 1`.
"""
function generate_target_with_xdup(n::Int, x::Int)
    og_genes = first_n_letters(n)
    genome = copy(og_genes)

    for _ in 1:x
        dup_gene = rand(og_genes)
        # length(genome) + 1 is a valid position: it appends the duplicate at the end.
        insert_idx = rand(1:(length(genome) + 1))
        insert!(genome, insert_idx, dup_gene)
    end

    return join(genome)
end

# target = generate_target_with_xdup(5, 2)
# print(target)

"""
    scramble_target(target::String)

Produce a source genome by shuffling the characters of `target` and cutting the
shuffled sequence into consecutive chromosomes of random length.

Each chromosome is randomly either wrapped in `.` on both sides or left bare, and
chromosomes are separated by `,`. The result is passed through
`check_src_tar_balanced` before being returned, so an `ArgumentError` propagates
if the source and target do not hold the same characters.
"""
function scramble_target(target::String)
    shuffled = shuffle(collect(target))

    remaining = length(target)
    start_idx = 1
    pieces = String[]

    while remaining != 0
        len = rand(1:remaining)
        remaining -= len
        stop_idx = start_idx + len - 1
        body = join(shuffled[start_idx:stop_idx])

        # Coin flip: dot-wrapped chromosome or bare chromosome.
        piece = rand(Bool) ? "." * body * "." : body
        push!(pieces, piece)

        start_idx = stop_idx + 1
    end

    source = join(pieces, ",")
    check_src_tar_balanced(source, target)
    return source
end


# scramble_target(target)

scramble_target (generic function with 1 method)

In [10]:
"""
    test_randommaps(n::Int, x::Int, num_maps::Int, iterations::Int)

Generate one target with `generate_target_with_xdup(n, x)`, then for `iterations`
rounds scramble it into a source and call
`randommap(src, target, num_maps, "none")`, keeping the round with the smallest
reported minimum distance and the round with the largest reported maximum distance.

Prints a one-line summary of `n` and `x`, then returns the 8-tuple
`(dedup_tar_of_min, min_dist, min_src, min_map,
  dedup_tar_of_max, max_dist, max_src, max_map)`.

The running minimum starts at `Inf` and the running maximum at `0`; both are only
replaced on a strict improvement, so ties keep the earlier round.
"""
function test_randommaps(n::Int, x::Int, num_maps::Int, iterations::Int)
    target = generate_target_with_xdup(n, x)

    lowest, lowest_src, lowest_map, lowest_dedup = Inf, "", Vector{Int}(), ""
    highest, highest_src, highest_map, highest_dedup = 0, "", Vector{Int}(), ""

    for _ in 1:iterations
        candidate = scramble_target(target)
        dedup_tar, round_min, round_min_src, round_min_map,
        round_max, round_max_src, round_max_map =
            randommap(candidate, target, num_maps, "none")

        if round_min < lowest
            lowest, lowest_src = round_min, round_min_src
            lowest_map, lowest_dedup = round_min_map, dedup_tar
        end

        if round_max > highest
            highest, highest_src = round_max, round_max_src
            highest_map, highest_dedup = round_max_map, dedup_tar
        end
    end

    println(n, " gene(s), ", x, " duplicate(s)\n")
    return (
        lowest_dedup, lowest, lowest_src, lowest_map,
        highest_dedup, highest, highest_src, highest_map,
    )
end

# Experiment parameters passed to test_randommaps.
n, x = 3, 1
num_maps = 0  # == max
num_iterations = 5

min_tar, min_dist, min_src, min_map, max_tar, max_dist, max_src, max_map =
    test_randommaps(n, x, num_maps, num_iterations)

# One summary line for the minimum-distance case, then one for the maximum.
for (dedup_tar, source, dist, mapping) in (
    (min_tar, min_src, min_dist, min_map),
    (max_tar, max_src, max_dist, max_map),
)
    println(dedup_tar, " --> ", source, " || DCJ Dist=", dist, " || map=", mapping)
end

[36mSRC cc,ab --> TARGET abcc[39m
[deduplicated] ch,ab --> abch 1
[deduplicated] hc,ab --> abch 1

[36mSRC cb,ac --> TARGET abcc[39m
[deduplicated] eb,ac --> abce 3
[deduplicated] cb,ae --> abce 1

[36mSRC .cb.,c,.a. --> TARGET abcc[39m
[deduplicated] .cb.,e,.a. --> abce 4
[deduplicated] .eb.,c,.a. --> abce 4

[36mSRC abc,c --> TARGET abcc[39m
[deduplicated] abk,c --> abck 1
[deduplicated] abc,k --> abck 1

[36mSRC .cca.,.b. --> TARGET abcc[39m
[deduplicated] .cqa.,.b. --> abcq 2
[deduplicated] .qca.,.b. --> abcq 4

3 gene(s), 1 duplicate(s)

abch --> ch,ab || DCJ Dist=1 || map=[1]
abce --> .cb.,e,.a. || DCJ Dist=4 || map=[1]


In [5]:
# n = 3
# x = 1
# num_maps = 10

# dcj_dist_list, target, min_src, min_map, max_src, max_map = test_randommap(n, x, num_maps)


In [6]:

### visualization

# NOTE(review): this cell reads `target` and `dcj_dist_list`, which must be defined
# by an earlier cell. The cell that would produce them is currently commented out,
# and the recorded output of this cell is an UndefVarError for `target`.
max_value = length(target)
println("\n**\nmax dcj distance = ", max_value, "\n")

# SOCS of distribution
# shape -- skewed right usually
# outliers - none
# center - depends
# spread -- narrow
# Bound to dist_min/dist_max rather than min/max so Base.min and Base.max are not
# shadowed in this session.
dist_min, dist_max = extrema(dcj_dist_list)
avg = mean(dcj_dist_list)

println("min=", dist_min)
println("max=", dist_max)
println("avg=", avg, "\n")
println("range=", dist_max - dist_min)

# x-ticks from 0 to max_value with a step of 1
xticks = 0:1:max_value

# Create a histogram
histogram(
    dcj_dist_list;
    title="Distribution",
    label="",
    xlabel="DCJ Distance",
    ylabel="Frequency",
    xlims=(0, max_value+2),
    xticks=(xticks, xticks),
)

LoadError: UndefVarError: `target` not defined

In [None]:
n = 3
x = 3
println(n, " gene(s), ", x, " duplicate(s)")
# NOTE(review): generate_genomes_with_xdup is not defined in the visible part of
# this notebook — confirm it is still provided elsewhere.
src, target = generate_genomes_with_xdup(n, x)

num_maps = 10

# NOTE(review): test_randommaps in this notebook destructures seven values from
# randommap — confirm that the value bound here is the list of distances.
dcj_dist_list = randommap(src, target, num_maps, "none")

### visualization

max_value = length(target)+2
println("\n**\nmax dcj distance = ", max_value, "\n")

# SOCS of distribution
# shape -- skewed right usually
# outliers - none
# center - depends
# spread -- narrow
# Bound to dist_min/dist_max rather than min/max so Base.min and Base.max are not
# shadowed in this session.
dist_min, dist_max = extrema(dcj_dist_list)
avg = mean(dcj_dist_list)

println("min=", dist_min)
println("max=", dist_max)
println("avg=", avg, "\n")
println("range=", dist_max - dist_min)

# x-ticks from 0 to max_value with a step of 1
xticks = 0:1:max_value

# Create a histogram
histogram(
    dcj_dist_list;
    title="Distribution",
    label="",
    xlabel="DCJ Distance",
    ylabel="Frequency",
    xlims=(0, max_value),
    xticks=(xticks, xticks),
)



In [None]:
n = 3
x = 3
println(n, " gene(s), ", x, " duplicate(s)")
# NOTE(review): generate_genomes_with_xdup is not defined in the visible part of
# this notebook — confirm it is still provided elsewhere.
src, target = generate_genomes_with_xdup(n, x)

num_maps = 50

# NOTE(review): test_randommaps in this notebook destructures seven values from
# randommap — confirm that the value bound here is the list of distances.
dcj_dist_list = randommap(src, target, num_maps, "info")

### visualization

max_value = length(target)+2
println("\n**\nmax dcj distance = ", max_value, "\n")

# SOCS of distribution
# shape -- skewed right usually
# outliers - none
# center - depends
# spread -- narrow
# Bound to dist_min/dist_max rather than min/max so Base.min and Base.max are not
# shadowed in this session.
dist_min, dist_max = extrema(dcj_dist_list)
avg = mean(dcj_dist_list)

println("min=", dist_min)
println("max=", dist_max)
println("avg=", avg, "\n")
println("range=", dist_max - dist_min)

# x-ticks from 0 to max_value with a step of 1
xticks = 0:1:max_value

# Create a histogram
histogram(
    dcj_dist_list;
    title="Distribution",
    label="",
    xlabel="DCJ Distance",
    ylabel="Frequency",
    xlims=(0, max_value),
    xticks=(xticks, xticks),
)



In [None]:
n = 4
x = 3
println(n, " gene(s), ", x, " duplicate(s)")
# NOTE(review): generate_genomes_with_xdup is not defined in the visible part of
# this notebook — confirm it is still provided elsewhere.
src, target = generate_genomes_with_xdup(n, x)

num_maps = 50

# NOTE(review): test_randommaps in this notebook destructures seven values from
# randommap — confirm that the value bound here is the list of distances.
dcj_dist_list = randommap(src, target, num_maps, "info")

### visualization

max_value = length(target)+2
println("\n**\nmax dcj distance = ", max_value, "\n")

# SOCS of distribution
# shape -- skewed right usually
# outliers - none
# center - depends
# spread -- narrow
# Bound to dist_min/dist_max rather than min/max so Base.min and Base.max are not
# shadowed in this session.
dist_min, dist_max = extrema(dcj_dist_list)
avg = mean(dcj_dist_list)

println("min=", dist_min)
println("max=", dist_max)
println("avg=", avg, "\n")
println("range=", dist_max - dist_min)

# x-ticks from 0 to max_value with a step of 1
xticks = 0:1:max_value

# Create a histogram
histogram(
    dcj_dist_list;
    title="Distribution",
    label="",
    xlabel="DCJ Distance",
    ylabel="Frequency",
    xlims=(0, max_value),
    xticks=(xticks, xticks),
)



In [None]:
n = 4
x = 4
println(n, " gene(s), ", x, " duplicate(s)")
# NOTE(review): generate_genomes_with_xdup is not defined in the visible part of
# this notebook — confirm it is still provided elsewhere.
src, target = generate_genomes_with_xdup(n, x)

num_maps = 50

# NOTE(review): five values are destructured here, while test_randommaps in this
# notebook destructures seven from randommap — confirm which shape is current.
dcj_dist_list, min_src, min_map, max_src, max_map = randommap(src, target, num_maps, "info")

### visualization

max_value = length(target)+2
println("\n**\nmax dcj distance = ", max_value, "\n")

# SOCS of distribution
# shape -- skewed right usually
# outliers - none
# center - depends
# spread -- narrow

# Bound to dist_min/dist_max rather than min/max so Base.min and Base.max are not
# shadowed in this session.
dist_min, dist_max = extrema(dcj_dist_list)
avg = mean(dcj_dist_list)

println("min=", dist_min)
println("max=", dist_max)
println("avg=", avg, "\n")
println("range=", dist_max - dist_min)

# x-ticks from 0 to max_value with a step of 1
xticks = 0:1:max_value

# Create a histogram
histogram(
    dcj_dist_list;
    title="Distribution",
    label="",
    xlabel="DCJ Distance",
    ylabel="Frequency",
    xlims=(0, max_value),
    xticks=(xticks, xticks),
)



In [None]:
n = 5
x = 2

# NOTE(review): `repeats` and `max_range` are assigned but not read anywhere in
# this cell — presumably placeholders for a repeated-run experiment; confirm.
repeats = 10 # for every n,x pair
max_range = 0

println(n, " gene(s), ", x, " duplicate(s)")
# NOTE(review): generate_genomes_with_xdup is not defined in the visible part of
# this notebook — confirm it is still provided elsewhere.
src, target = generate_genomes_with_xdup(n, x)

num_maps = 50

# NOTE(review): test_randommaps in this notebook destructures seven values from
# randommap — confirm that the value bound here is the list of distances.
dcj_dist_list = randommap(src, target, num_maps, "info")

### visualization

max_value = length(target)+2
println("\n**\nmax dcj distance = ", max_value, "\n")

# SOCS of distribution
# shape -- skewed right usually
# outliers - none
# center - depends
# spread -- narrow
# Bound to dist_min/dist_max rather than min/max so Base.min and Base.max are not
# shadowed in this session.
dist_min, dist_max = extrema(dcj_dist_list)
avg = mean(dcj_dist_list)

println("min=", dist_min)
println("max=", dist_max)
println("avg=", avg, "\n")
println("range=", dist_max - dist_min)

# x-ticks from 0 to max_value with a step of 1
xticks = 0:1:max_value

# Create a histogram
histogram(
    dcj_dist_list;
    title="Distribution",
    label="",
    xlabel="DCJ Distance",
    ylabel="Frequency",
    xlims=(0, max_value),
    xticks=(xticks, xticks),
)

