In [115]:
using Random
using DataStructures

In [116]:
# returns dup char --> num instances dictionary, num of duplicate chars  
##### returns 2 x N array (N = num dup char) of duplicate char & num instances, num of duplicate chars

function find_dups_in_str(str::String)
    alphabet = Set{Char}() 
    duplicates = OrderedDict{Char, Int}()  # duplicate char --> num instances 

    for char in str
        char = lowercase(char)   
        if char in alphabet 
            if char in keys(duplicates)
                duplicates[char] += 1
            else 
                duplicates[char] = 2 
            end 
        else 
            push!(alphabet, char)
        end 
    end 

    return duplicates, length(duplicates) 
end 

find_dups_in_str (generic function with 1 method)

In [117]:
function generate_map(num_dups::Int, dup_to_num_instances::OrderedDict{Char, Int})
    map = Array{Int}(undef, num_dups)
    
    idx = 1
    # for each position of the map (each gene w >1 instances)
    for (dup_char, num_instances) in pairs(dup_to_num_instances)
        # an integer value is selcted uniformly from the interval [0, occ(α, S)! -1] 
        mapidx_for_dup_char = rand(0:factorial(num_instances)-1)
        
        map[idx] = mapidx_for_dup_char
        idx += 1
    end 
    
    return map
end 

generate_map (generic function with 1 method)

In [118]:
# DP implementation of factorial
function Base.factorial(n::Int, factorials::Vector{Int})
    if n == 1 || n == 0 
        return 1
    end 

    if 1 <= n <= length(factorials) 
        return factorials[n]
    else 
        fact =  n * factorial(n - 1, factorials)
        push!(factorials, fact)
        return fact
    end 
end

# returns kth lexicogrpahical order of the first n ints 
# k ε [1,n!]
function nth_lex_permutation(n::Int, k::Int)
    # Adjust n to 0-based indexing for convenience
    k -= 1
    factorials = Vector{Int}()
    
    numbers = collect(1:n)
    permutation = Vector{Int}()
    
    for i in 0:(n - 1)
        fact = factorial(n - 1 - i, factorials)
        index = div(k, fact)
        k %= fact

        push!(permutation, numbers[index + 1])
        splice!(numbers, index + 1)
    end
    
    return permutation
end

# n = 3  # num duplicates 
# k = 2  #  kth lexicogrpahical order of the first n ints [1:n!]

# perm = nth_lexicographical_permutation(n, k)
# println(perm)

nth_lex_permutation (generic function with 1 method)

In [119]:
function deduplicate_genome(map::Array{Int}, dup_genome::String, dupchar_to_multiplicity::OrderedDict{Char, Int})
    idx = 1
    for (dupchar, mult) in pairs(dupchar_to_multiplicity)
        print(dupchar, mult, idx) 
        # print(map[idx])
        dup_perm = nth_lex_permutation(mult, map[idx])
        
        print(dup_perm)

        idx += 1
    end 

    # for k in dupchar_to_multiplicity 
    #     permutation = nth_lex_permutation
    # end 
    # return dedup_genome 
end 


P = "abAac"
P_dup_to_num_instances, P_num_dups = find_dups_in_str(P)
P_map = generate_map(P_num_dups, P_dup_to_num_instances)
print(P_map)

deduplicate_genome(P_map, P, P_dup_to_num_instances)

[3]a31[2, 1, 3]

In [120]:
# mapping S --> T, generates r random maps (RM)

function randommap(S::String, P::String, num_maps::Int)
    println("SRC ", S, " --> TARGET ", P, "\n")

    S_dupchar_to_multiplicity, S_num_dups = find_dups_in_str(S)
    P_dupchar_to_multiplicity, P_num_dups = find_dups_in_str(P)
    
    ## TARGET 
    printstyled("target", color=:magenta)
    println("\ndup genes --> multiplicity")
    println(P_dupchar_to_multiplicity, "\n")

    # arbitrary map p for target string 
    P_map = generate_map(P_num_dups, P_dupchar_to_multiplicity)
    println("target map")
    print(P_map, "\n")
    

    ## SOURCE
    printstyled("\nsource", color=:magenta)
    println("\ndup genes --> multiplicity")
    println(S_dupchar_to_multiplicity, "\n")

    S_M = Array{Array{Int}}(undef, num_maps)  # TODO does copy elements or point to them? 

    # r RM of src str S are generated and stored in a set S_M 
    println("source maps")
    for i in 1:num_maps
        s_map = generate_map(S_num_dups, S_dupchar_to_multiplicity)
        println(s_map)
        push!(S_M, s_map)
    end 

    # use maps to deduplicate genomes 
    # dedup_P = deduplicate_genome(P_map, P, P_dup_to_num_instances)

    # lehmer code to convert string S and P --> deduplicated S and P using the generated maps  
    # nth_lex_permutation(n, k)
    
        # for each elem in each map convert int n --> nth permutation of n ints in lexicographical order 

   
        
    # intiially the target string is mapped into permutation Pp using an arbitrary map p 
    
    # for each map m in M 
        # distance between permutation Sm and Pp is computed using estimator algo 

    #return shortest map iwth shortest dist between permutations 
end 


src = "aabc"
target = "abAc"
num_maps = 3

randommap(src, target, num_maps)

SRC aabc --> TARGET abAc

[35mtarget[39m
dup genes --> multiplicity
OrderedDict('a' => 2)

target map
[1]

[35msource[39m
dup genes --> multiplicity
OrderedDict('a' => 2)

source maps
[1]
[0]
[1]
