# The shortest chain which contains all hexamers from all 5 bases (A,T,C,G,U).

Possible bases
- Adenine
- Thymine
- Cytosine
- Guanine
- Uracil

In [1]:
# packages
import numpy as np
import itertools

In [2]:
# functions
def combine_strings(string):
    ''' Add strings to each other. '''
    return ''.join(string)


def generate_k_mer_list(possible_bases, k_mer):
    ''' All possible repeated variations of k-mers. '''
    itr_prod = itertools.product(possible_bases, repeat=k_mer)
    k_mer_list = []
    for i in range(len(possible_bases)**k_mer):
        k_mer_list.append(combine_strings(next(itr_prod)))
    return np.array(k_mer_list)


def shortest_chain(k_mers): ## --> ChatGPT code, promt: given the following array; create the shortest string which contains all element of the given array
    '''
    With the help of ChatGPT, the shortest string that contains all possible hexamers,
    extending it only as much as necessary so that if a given k-mer already exists in the chain, it is not added again.
    '''
    result = k_mers[0]
    for i in range(1, len(k_mers)):
        overlap = len(k_mers[i]) - 1
        # If the k-mer already exists in the word chain, do not append it to the result.
        if k_mers[i] not in result: # added to chatGPT code

            while overlap >= 0: # Backtracking until the k-mer matches the end of the chain.
                if result.endswith(k_mers[i][:overlap]):
                    break
                overlap -= 1
            result += k_mers[i][overlap:]
    return result


def kmers_sorted_by_frequency(chain, k_mers):
    ''' The frequency of possible k-mers in a given sequence in descending order. '''
    numbers = {}
    for element in k_mers:
        n = chain.count(element)
        numbers[element] = n
    sort_numbers = np.array(sorted(numbers.items(), key=lambda x:x[1], reverse=True))
    return sort_numbers


def run_chech(chains, k_mers, print_count=True, num_of_skipped_bases=0):
    ''' Counting identical frequencies --> how many k-mers appear n times in a sequence. '''
    sort_numbers = kmers_sorted_by_frequency(chains, k_mers)
    count = []
    for i in range(1, int(sort_numbers[0,1])+1):
        count.append([i, sum(np.array(sort_numbers[:,1], dtype=int) == i)])
    # missing elemets 
    count.append([0, sum(np.array(sort_numbers[:,1], dtype=int) == 0)])
    if print_count:
        print('\\begin{run_chech()}')
        print(f'  Length of the chain: {len(combine_strings(chains)) + num_of_skipped_bases}')
        print('  frequency - the number of k-mers:')
        for i in range(len(count)):
            # print if element type is not missing
            if count[i][1] !=0:
                print(f'  {count[i][0]} - {count[i][1]}')
        print('\end{run_chech()}')
    else:
        return count  
    return None


def count_bases(shortest_chain_list, print_count=True):
    ''' Count and print the number of bases. '''
    bases = ['A', 'T', 'C', 'G', 'U']
    count = {b: shortest_chain_list.count(b) for b in bases}
    # Order the dictionary by count in descending order
    count = dict(sorted(count.items(), key=lambda item: item[1], reverse=True))
    if print_count:
        print('\\begin{count_bases()}')
        for base, cnt in count.items():
            print(f'  {base}: {cnt}')
        print('\end{count_bases()}')
        return None
    return count



def delete_k_mers(k_mer_list, k_mers_to_del=None):
    ''' Delete given k-mers from a generated k-mer list. '''
    mask = np.isin(k_mer_list, k_mers_to_del, invert=True)
    k_mers_modified = k_mer_list[mask]
    shortest_modified_k_mer_chain = shortest_chain(k_mers_modified)

    run_chech(shortest_modified_k_mer_chain, k_mers_modified)

    s = kmers_sorted_by_frequency(shortest_modified_k_mer_chain, k_mers_modified)
    return s[np.array(s[:,1], dtype=int) == 2]


def split_to_sequences(long_string, seq_length=240):
    return np.array([long_string[i:i+seq_length] for i in range(0, len(long_string), seq_length)])

---
## Hexamers

Permutation with Repetition:

There are $5^6= 15625$ possible hexamers (see:  $\texttt{hexamers}$ variable)

$ 6 \cdot 5^6= 93750$ base length chain which contains all hexamer **without overlapping**

In [3]:
print(f'There are 5**6={5**6} possible hexamers.')
print(f'Length of the chain without overlap 6*5**6={6*5**6}.')

There are 5**6=15625 possible hexamers.
Length of the chain without overlap 6*5**6=93750.


In [4]:
hexamers = generate_k_mer_list('GCTAU', 6)

print(f'Length without overlap: {len(combine_strings(hexamers))}')
shortest_hexamer_chain = shortest_chain(hexamers)
print(f'Length of the shortest string: {len(shortest_hexamer_chain)}')
print(f'It became {round(100 - len(shortest_hexamer_chain)/len(combine_strings(hexamers))*100, 3)}% shorter compared to the concatenated chain.')

Length without overlap: 93750
Length of the shortest string: 15635
It became 83.323% shorter compared to the concatenated chain.


In [5]:
run_chech(shortest_hexamer_chain, hexamers)

\begin{run_chech()}
  Length of the chain: 15635
  frequency - the number of k-mers:
  1 - 15620
  2 - 5
\end{run_chech()}


In [6]:
count_bases(shortest_hexamer_chain)

\begin{count_bases()}
  G: 3130
  U: 3130
  A: 3125
  T: 3125
  C: 3125
\end{count_bases()}


In [7]:
### With random order --> worse solution
hexamers2 = hexamers.copy()
np.random.seed(42)
np.random.shuffle(hexamers2)
print(f'Length without overlap: {len(combine_strings(hexamers2))}')
shortest_hexamer_chain2 = shortest_chain(hexamers2)
print(f'Length of the shortest string: {len(shortest_hexamer_chain2)}')
print(f'It became {round(100 - len(shortest_hexamer_chain2)/len(combine_strings(hexamers2))*100, 3)}% shorter compared to the concatenated chain.')


Length without overlap: 93750
Length of the shortest string: 31759
It became 66.124% shorter compared to the concatenated chain.


In [8]:
run_chech(shortest_hexamer_chain2, hexamers2)


\begin{run_chech()}
  Length of the chain: 31759
  frequency - the number of k-mers:
  1 - 5786
  2 - 5398
  3 - 3034
  4 - 1042
  5 - 304
  6 - 50
  7 - 8
  8 - 3
\end{run_chech()}


In [9]:
count_bases(shortest_hexamer_chain2)

\begin{count_bases()}
  C: 6421
  U: 6355
  T: 6352
  A: 6323
  G: 6308
\end{count_bases()}


In [10]:
if False: # Write shortest chain to txt file.
    with open("shortest_hexamer_chain.txt", "w") as file:
        file.write(shortest_hexamer_chain)

---
---
---

In [11]:
sequences = split_to_sequences(shortest_hexamer_chain)
len(sequences)

66

---
\* Solution 2) A part of the 240-chain is the useful hexamers, while the other part consists of G and C bases to restore the G-C ratio.
For example: 200 characters are the useful hexamer, and the rest are C, to ensure the correct ratio.

// not enough good


In [12]:
shorter_seq = sequences[0][:200] + 40*'C'

In [13]:
hex_count_example =  count_bases(shorter_seq, print_count=False)

In [14]:
round(hex_count_example['G'] / (hex_count_example['G'] + hex_count_example['C']), 3) , round(hex_count_example['C'] / (hex_count_example['G'] + hex_count_example['C']), 3)

(0.648, 0.352)

In [15]:
shorter_seq

'GGGGGGCGGGGGTGGGGGAGGGGGUGGGGCCGGGGCTGGGGCAGGGGCUGGGGTCGGGGTTGGGGTAGGGGTUGGGGACGGGGATGGGGAAGGGGAUGGGGUCGGGGUTGGGGUAGGGGUUGGGCGCGGGCGTGGGCGAGGGCGUGGGCCCGGGCCTGGGCCAGGGCCUGGGCTCGGGCTTGGGCTAGGGCTUGGGCACGCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC'