# The shortest chain which contains all hexamers from all 5 bases (A,T,C,G,U).

## How to generate the reads?
Create a slots (`np.array`) for reads.\
If the given hexamer does not exists in any reads, then append it somewhere with the following rules:
- Append the hexamer to the read where the overlap is the biggest. 
    - If more than one exists append the shortest read.
- If there is no overlap anywhere, then 
    - append the hexamert to a new read (if it is possible) OR
    - append to the shorthest existing read

### Todo
- how can the `Too many frequently occurring hexamers.` problem solved?

## Todo later
- check the G-C frequency for all reads

In [1]:
# packages
import numpy as np
import itertools

In [2]:
# functions
def combine_strings(string):
    ''' Add strings to each other. '''
    return ''.join(string)


def generate_k_mer_list(possible_bases, k_mer):
    ''' All possible repeated variations of k-mers. '''
    itr_prod = itertools.product(possible_bases, repeat=k_mer)
    k_mer_list = []
    for i in range(len(possible_bases)**k_mer):
        k_mer_list.append(combine_strings(next(itr_prod)))
    return np.array(k_mer_list)


def shortest_chain(k_mers): ## --> ChatGPT code, promt: given the following array; create the shortest string which contains all element of the given array
    '''
    With the help of ChatGPT, the shortest string that contains all possible hexamers,
    extending it only as much as necessary so that if a given k-mer already exists in the chain, it is not added again.
    '''
    result = k_mers[0]
    for i in range(1, len(k_mers)):
        overlap = len(k_mers[i]) - 1
        # If the k-mer already exists in the word chain, do not append it to the result.
        if result.count(k_mers[i]) == 0: # added to chatGPT code
            while overlap >= 0: # Backtracking until the k-mer matches the end of the chain.
                if result.endswith(k_mers[i][:overlap]):
                    break
                overlap -= 1
            result += k_mers[i][overlap:]
    return result


def kmers_sorted_by_frequency(chains, k_mers):
    ''' The frequency of possible k-mers in a given sequence in descending order. '''
    numbers = {}
    for element in k_mers:
        n=0
        for chain in chains:
            n += chain.count(element)
            numbers[element] = n
    sort_numbers = np.array(sorted(numbers.items(), key=lambda x:x[1], reverse=True))
    return sort_numbers


def run_chech(chains, k_mers, print_count=True, num_of_skipped_bases=0):
    ''' Counting identical frequencies --> how many k-mers appear n times in a sequence. '''
    sort_numbers = kmers_sorted_by_frequency(chains, k_mers)
    count = []
    for i in range(1, int(sort_numbers[0,1])+1):
        count.append([i, sum(np.array(sort_numbers[:,1], dtype=int) == i)])
    # missing elemets 
    count.append([0, sum(np.array(sort_numbers[:,1], dtype=int) == 0)])
    if print_count:
        print('\\begin{run_chech()}')
        print(f'  Length of the chain: {len(combine_strings(chains)) + num_of_skipped_bases}')
        print('  frequency - the number of k-mers:')
        for i in range(len(count)):
            # print if element type is not missing
            if count[i][1] !=0:
                print(f'  {count[i][0]} - {count[i][1]}')
        print('\end{run_chech()}')
    else:
        return count  
    return None


def count_bases(shortest_chain, print_count=True):
    ''' Count and print the number of bases. '''
    bases = ['A', 'T', 'C', 'G', 'U']
    count = {b: shortest_chain.count(b) for b in bases}
    # Order the dictionary by count in descending order
    count = dict(sorted(count.items(), key=lambda item: item[1], reverse=True))
    if print_count:
        print('\\begin{count_bases()}')
        for base, cnt in count.items():
            print(f'  {base}: {cnt}')
        print('\end{count_bases()}')
        return None
    return count


def calculate_GC_ratio(reads, tolerance=0.03, print_out_of_tolerance=False):
    ''' Calculate the GC ration. '''
    G_ratios, C_ratios = [], []
    out_of_tolerance = []
    for read in reads:
        num_of_G = read.count('G')
        num_of_C = read.count('C')
        sum_GC = num_of_G + num_of_C
        G_ratio = num_of_G/sum_GC
        G_ratios.append(G_ratio) 
        C_ratios.append(1 - G_ratio) 
        if G_ratio >= 0.5+tolerance or G_ratio <= 0.5-tolerance:
            out_of_tolerance.append(True)
        else:
            out_of_tolerance.append(False)
    if print_out_of_tolerance:
        print(f'{sum(out_of_tolerance)}/{len(reads)} (={(sum(out_of_tolerance)/len(reads)):.2f}) are out of tolerance.')
    return G_ratios, C_ratios, out_of_tolerance 

In [3]:
hexamers = generate_k_mer_list('ACTGU', 6)

print(f'Length without overlap: {len(combine_strings(hexamers))}')
shortest_hexamer_chain = shortest_chain(hexamers)
print(f'Length of the shortest string: {len(shortest_hexamer_chain)}')
print(f'It became {round(100 - len(shortest_hexamer_chain)/len(combine_strings(hexamers))*100, 3)}% shorter compared to the concatenated chain.')

Length without overlap: 93750
Length of the shortest string: 15635
It became 83.323% shorter compared to the concatenated chain.


#### Randomizations is needed better for G-C ratio.

In [4]:
# random seed for reproduction
np.random.seed(137)
np.random.shuffle(hexamers)

---
---
---

Calculate the minimum number of slots.
- Each slot must have 240 bases or fewer.
- The maximum length of the combined chain must be less than 20,000 bases.

$\xrightarrow{} 20000/240\simeq 83$ **or more** slots needed

In [5]:
def fill_k_mers_into_reads(reads, k_mers, min_overlap, num_of_while_loops):
    
    # an empty list for the skipped k-mers
    skipped_k_mers = []
    
    for i, k_mer in enumerate(k_mers):
            
        # if the k-mer does not exists in any read, then put append it using the rules.
        if not any(k_mer in read for read in reads):
                        
            overlaps_in_reads = []
            # Iterate trough all reads
            for j in range(len(reads)):   
                # the number of possible overlapping bases
                overlap = len(k_mer) - 1
                # backtracking until the k-mer matches to the end of read j REWRITE
                while overlap >= 0: 
                        # if the read end with the same character(s) as the hexamer begin.
                        if reads[j].endswith(k_mer[:overlap]):
                            # save the number of overlapping characters
                            overlaps_in_reads.append(overlap)
                            # break the while loop
                            break
                        # there is no overlap with n character --> try with n-1
                        overlap -= 1
        
            # if all overlaps are less then or equal to the minimal overlap 
            if np.all(np.array(overlaps_in_reads) <= min_overlap):
                # skip the k-mer
                skipped_k_mers.append(k_mer)                                  
            # else there is/are overlap(s) somewhere 
            else:          
                # find the indices of maximal overlaps
                max_overlaps_indices = np.where(overlaps_in_reads == np.max(overlaps_in_reads))[0]
                # find too long reads 
                too_long_reads = np.where((np.vectorize(len)(reads) + max(overlaps_in_reads)) >= 239)[0]    
                # use only those indices what are not too long
                selected_reads = np.setdiff1d(max_overlaps_indices, too_long_reads)
                
                # if more than one minimal overlap exists
                if len(selected_reads) > 1:                   
                    # get the length of selected reads
                    length_of_selected_reads = np.vectorize(len)(reads[selected_reads])
                    # find the indices of the minimal read lenght
                    min_read_len_idx = np.where(length_of_selected_reads == np.min(length_of_selected_reads))[0]
                    # random seed for reproduction
                    np.random.seed(137)
                    # append the hexamer (w/o overlap) to one of the shortest read                  
                    reads[selected_reads[np.random.choice(min_read_len_idx)]] += k_mer[max(overlaps_in_reads):]
                    
                # else one minimal overlap exists
                elif len(selected_reads) == 1:
                    #append the hexamer (w/o overlap)
                    reads[selected_reads[0]] += k_mer[max(overlaps_in_reads):]
                else:
                    # skip the k-mer
                    skipped_k_mers.append(k_mer)
     
        # print 
        print(f'{((i+1)/len(k_mers)*100):.0f}% finished, {num_of_while_loops} while loop(s) remaining.\t', end='\r')
              
    return reads, skipped_k_mers

In [6]:
def generate_reads_v3(k_mers, num_of_reads, min_overlap=None, max_while_counter=5):
    ''' Generate reads '''
    # the result will be stored in 'reads' var. as a numpy array
    reads = np.empty(num_of_reads, dtype='U300')
    
    
    # fill all reads with an initial k-mer
    reads[:num_of_reads] = k_mers[:num_of_reads]
    
    skipped_k_mers = k_mers[num_of_reads:].copy()


    if min_overlap == None:
        min_overlap = len(k_mers[0]) // 2
        print(f'minimum overlap set to {min_overlap}')

    while_counter = 0
    
    #k_mers_copy = k_mers.copy()
    while max_while_counter >= while_counter:
        k_mers_len_before = len(skipped_k_mers)
        reads, skipped_k_mers = fill_k_mers_into_reads(reads, skipped_k_mers, min_overlap, 
                                                       max_while_counter-while_counter)
        k_mers_len_after = len(skipped_k_mers)
        
        if len(skipped_k_mers) == 0:
            break
        if k_mers_len_before == k_mers_len_after:
            break 
        
        while_counter+=1
        
        
        
    print(f'Number of skipped k-mers: {len(skipped_k_mers)}  {20*" "}')
    if any(np.vectorize(len)(reads) > 240):
        print('something went wrong: there is a too long read.')

    return reads, skipped_k_mers

In [20]:
reads, skipped_hexamers = generate_reads_v3(hexamers, num_of_reads=100, min_overlap=2, max_while_counter=7)

Number of skipped k-mers: 150                      


In [21]:
run_chech(reads, hexamers, num_of_skipped_bases=6*len(skipped_hexamers))

\begin{run_chech()}
  Length of the chain: 23864
  frequency - the number of k-mers:
  1 - 9630
  2 - 4809
  3 - 939
  4 - 90
  5 - 7
  0 - 150
\end{run_chech()}


In [22]:
np.vectorize(len)(reads)

array([219, 222, 221, 218, 236, 226, 236, 225, 232, 236, 233, 235, 234,
       235, 226, 233, 221, 237, 227, 237, 219, 219, 229, 219, 216, 231,
       237, 237, 236, 224, 235, 226, 237, 237, 238, 216, 233, 237, 227,
       228, 217, 222, 236, 238, 236, 237, 237, 228, 233, 237, 229, 237,
       236, 218, 220, 231, 214, 221, 237, 224, 229, 232, 237, 234, 225,
       236, 213, 236, 220, 236, 213, 238, 238, 234, 235, 229, 229, 238,
       226, 230, 227, 237, 222, 236, 234, 236, 223, 233, 219, 223, 216,
       230, 216, 238, 233, 234, 237, 236, 230, 238])

In [23]:
G_ratios, C_ratios, out_of_tolerance = calculate_GC_ratio(reads, print_out_of_tolerance=True)

58/100 (=0.58) are out of tolerance.
