# generate REP database 


In [16]:
"""
    get_geneset(genome::String)

Return the set of distinct gene names that occur in `genome`.

`genome` is split into chromosomes on `,` and every chromosome is split into
genes on `'`. Because `split` yields one empty field for empty input, an empty
`genome` produces a set holding only the empty string.

# Returns
A `Set{String}` with one entry per distinct gene name.
"""
function get_geneset(genome::String)
    gene_set = Set{String}()

    for chrom in split(genome, ","), gene in split(chrom, "'")
        # `split` yields `SubString`s; convert explicitly and insert in place so
        # the accumulator keeps its declared element type and is never rebuilt
        push!(gene_set, String(gene))
    end

    return gene_set
end

"""
    check_balanced(g1::String, g2::String)

Verify that the genomes `g1` and `g2` use the same gene names.

The comparison is made on the sets returned by `get_geneset`, so only the
presence of each gene name is compared, not how many times it occurs.

# Throws
- `ArgumentError` if the two gene sets differ.
"""
function check_balanced(g1::String, g2::String)
    if get_geneset(g1) == get_geneset(g2)
        return nothing
    end

    throw(ArgumentError("ERROR: generated a repeat map"))
end


check_balanced (generic function with 1 method)

In [17]:
# returns the nth (1-based) letter of the infinite alphabet
# a b c ... x y z aa ab ac ... ax ay az ba bb bc ... bx by bz ca cb cc ...

"""
    nth_letter(n::Int)

Return the `n`-th name (1-based) in the sequence
`a, b, ..., z, aa, ab, ..., az, ba, ...`.

For `n <= 0` the loop never runs and the empty string is returned.
"""
function nth_letter(n::Int)
    letters = Char[]
    remaining = n

    while remaining > 0
        # Shift to a zero-based value so that 26 maps to 'z' rather than
        # rolling over into a second letter
        remaining, offset = divrem(remaining - 1, 26)
        # Digits come out least-significant first, so prepend each one
        pushfirst!(letters, 'a' + offset)
    end

    return String(letters)
end

"""
    create_tar(str_length::Int, alphabet::Set{String}, tar::Vector{String})

Append `str_length` randomly drawn gene names to `tar` and return `tar`.

Each gene is `nth_letter(k)` for a `k` drawn uniformly, with repetition, from
`1:str_length`. Every drawn gene is also inserted into `alphabet`, so after the
call `alphabet` additionally contains each distinct gene that was appended.
Both `alphabet` and `tar` are modified in place.
"""
function create_tar(str_length::Int, alphabet::Set{String}, tar::Vector{String})
    index_range = 1:str_length

    for _ in index_range
        letter = nth_letter(rand(index_range))
        push!(alphabet, letter)
        push!(tar, letter)
    end

    return tar
end

"""
    create_src(tar::Vector{String})

Build a random source genome string from the genes in `tar`.

The genes are drawn without replacement in random order and cut into
chromosomes of random length. Genes inside a chromosome are joined with `'`
and chromosomes are joined with `,`. Every element of `tar` appears exactly
once in the result. `tar` itself is not modified.

# Returns
The genome as a `String`; the empty string when `tar` is empty.
"""
function create_src(tar::Vector{String})
    # Strings are immutable, so a shallow copy is enough to protect `tar`
    genes = copy(tar)
    chroms = String[]

    while !isempty(genes)
        # A chromosome takes between one and all of the remaining genes
        chrom_length = rand(1:length(genes))
        chrom = String[]

        for _ in 1:chrom_length
            g_idx = rand(1:length(genes))
            push!(chrom, genes[g_idx])
            deleteat!(genes, g_idx)
        end

        # `join` places separators only between items, so no trailing separator
        # has to be sliced off by byte index (which breaks on non-ASCII genes)
        push!(chroms, join(chrom, "'"))
    end

    return join(chroms, ",")
end


"""
    generate_srctarstr_pair(str_length::Int)

Generate one random `(source, target)` pair of genome strings.

The target is a single chromosome of `str_length` genes produced by
`create_tar`; the source is built from those same genes by `create_src`.
`check_balanced` is run on the pair before it is returned.

# Returns
A tuple `(src_str, tar_str)`.
"""
function generate_srctarstr_pair(str_length::Int)
    # TARGET: one chromosome of randomly drawn genes
    target_genes = create_tar(str_length, Set{String}(), String[])

    # SOURCE: built from the target's genes
    src_str = create_src(target_genes)

    # The target is a single chromosome, so its genes are simply joined
    tar_str = join(target_genes, "'")

    check_balanced(tar_str, src_str)

    return src_str, tar_str
end

# src, tar = generate_srctarstr_pair(50)
# print(src, "\n", tar)

generate_srctarstr_pair (generic function with 1 method)

In [19]:
"""
    generate_one_rep_set(size::Int; npairs::Int=500)

Generate `npairs` random source/target pairs and serialize them as one string.

Each pair comes from `generate_srctarstr_pair(size)` and is written as
`source;target`; consecutive pairs are separated by `|`.

# Keywords
- `npairs`: number of pairs to generate (default `500`). A value of zero or
  less yields the empty string.
"""
function generate_one_rep_set(size::Int; npairs::Int=500)
    entries = String[]

    for _ in 1:npairs
        src, tar = generate_srctarstr_pair(size)
        push!(entries, src * ";" * tar)
    end

    # Joining once avoids re-copying the growing result on every iteration
    return join(entries, "|")
end

"""
    generate_rep(; dir="db", sizes=50:50:500)

Write one database file per entry of `sizes`.

For every size `i`, the string returned by `generate_one_rep_set(i)` is written
to the file `rep_db<i>.txt` inside `dir`, replacing any existing file of that
name.

# Keywords
- `dir`: output directory (default `"db"`); it is created if it is missing.
- `sizes`: iterable of sizes to generate (default `50:50:500`).
"""
function generate_rep(; dir::AbstractString="db", sizes=50:50:500)
    # Opening a file for writing fails when its directory is missing
    mkpath(dir)

    for i in sizes
        # Generate first so a failure here does not leave a truncated file
        str = generate_one_rep_set(i)
        filename = joinpath(dir, "rep_db" * string(i) * ".txt")

        open(filename, "w") do file
            write(file, str)
        end
    end

    return nothing
end

# generate_rep()