# A legrövidebb lánc, mely tartalmazza az 5 bázisból álló hexa-/heptamerek mindegyikét.

Lehetséges bázisok:
- Adenin
- Timin
- Citozin
- Guanin
- Uracil

In [1]:
# packages
import itertools
from collections import Counter

import numpy as np

In [2]:
# functions
def combine_strings(string):
    ''' Concatenate all items of the given iterable of strings, without any separator. '''
    separator = ''
    return separator.join(string)


def generate_k_mer_list(possible_bases, k_mer):
    '''
    Generate every k-mer (variation with repetition) over the given bases.

    Parameters
    ----------
    possible_bases : iterable of str
        The alphabet to build the k-mers from, e.g. 'ACGTU'.
    k_mer : int
        Length k of each generated k-mer.

    Returns
    -------
    numpy.ndarray
        All k-mers as strings, in the order produced by itertools.product.
    '''
    # Iterate the product directly instead of calling next() a pre-computed
    # number of times; this also removes the need for len(possible_bases).
    return np.array([
        ''.join(bases)
        for bases in itertools.product(possible_bases, repeat=k_mer)
    ])


def shortest_chain(k_mers):
    '''
    Greedily merge the given k-mers, in order, into one chain that contains all of them.

    Originally written with the help of ChatGPT (prompt: "given the following array;
    create the shortest string which contains all element of the given array");
    the check that skips k-mers already present in the chain was added afterwards.

    Each k-mer that is not yet a substring of the chain is appended using the
    longest prefix of it (at most len(k_mer) - 1 characters) that matches the
    current end of the chain, so only the non-overlapping tail is added.

    NOTE: this is a greedy heuristic that depends on the input order. The result
    always contains every k-mer, but it is not guaranteed to be the shortest
    possible such string.

    Parameters
    ----------
    k_mers : sequence of str
        The strings to merge (list or numpy array); they may differ in length.

    Returns
    -------
    str
        The merged chain, or an empty string if `k_mers` is empty.
    '''
    # len() instead of truthiness: a numpy array has no unambiguous truth value.
    if len(k_mers) == 0:
        return ''
    result = str(k_mers[0])
    for k_mer in k_mers[1:]:
        # If the k-mer already exists in the chain, do not append it again.
        if k_mer in result:
            continue
        # Try the longest possible overlap first and shrink it step by step.
        # The loop always ends in a break: an empty prefix (overlap == 0)
        # matches the end of any string.
        for overlap in range(len(k_mer) - 1, -1, -1):
            if result.endswith(k_mer[:overlap]):
                break
        result += k_mer[overlap:]
    return result


def sorted_by_numbers(string, k_mers):
    '''
    Count how often each k-mer occurs in a sequence, sorted by descending frequency.

    Occurrences are counted with a sliding window, so overlapping matches are
    included (e.g. 'AAA' occurs twice in 'AAAA'). `str.count` would report only
    non-overlapping matches and under-count such self-overlapping k-mers.

    Parameters
    ----------
    string : str
        The sequence to search in.
    k_mers : iterable of str
        The k-mers to count; duplicates are reported once.

    Returns
    -------
    numpy.ndarray
        One row [k_mer, count] per distinct k-mer, most frequent first; k-mers
        with equal counts keep their input order. Because every row mixes a
        string and an integer, numpy stores both columns as strings.
    '''
    # One sliding-window tally per distinct k-mer length: the sequence is
    # scanned once per length instead of once per k-mer.
    window_counts = {}
    numbers = {}
    for element in k_mers:
        size = len(element)
        if size not in window_counts:
            window_counts[size] = Counter(
                string[start:start + size]
                for start in range(len(string) - size + 1)
            )
        # Counter returns 0 for windows that never occurred.
        numbers[element] = window_counts[size][element]
    sort_numbers = np.array(sorted(numbers.items(), key=lambda item: item[1], reverse=True))
    return sort_numbers


def run_chech(chain, k_mers, print_count=True):
    '''
    Count identical frequencies --> how many k-mers appear n times in a sequence.

    Only frequencies from 1 up to the highest observed one are reported;
    k-mers that do not occur at all are not listed.

    Parameters
    ----------
    chain : str
        The sequence to check.
    k_mers : iterable of str
        The k-mers whose occurrences in `chain` are counted.
    print_count : bool
        If True, print a report and return None; if False, print nothing
        and return the counts.

    Returns
    -------
    list of [int, int] or None
        Pairs [number of k-mers, frequency], ordered by ascending frequency,
        when `print_count` is False; otherwise None.
    '''
    sort_numbers = sorted_by_numbers(chain, k_mers)
    # The count column is stored as strings; convert it once, not per iteration.
    frequencies = np.array(sort_numbers[:, 1], dtype=int)
    # Rows are sorted by descending count, so the first entry is the maximum.
    count = [
        [int(np.count_nonzero(frequencies == i)), i]
        for i in range(1, int(frequencies[0]) + 1)
    ]
    if not print_count:
        return count
    print('\\begin{run_chech()}')
    print(f'  Length of the chain: {len(chain)}')
    print('  Frequency - number of k-mers:')
    for n_k_mers, frequency in count:
        print(f'  {frequency} - {n_k_mers}')
    # Escaped backslash: '\e' is an invalid escape sequence that newer Python
    # versions warn about. The printed text is unchanged.
    print('\\end{run_chech()}')
    return None


def count_bases(shortest_chain, print_count=True):
    '''
    Count the bases A, C, G, T and U in a chain.

    Parameters
    ----------
    shortest_chain : str
        The sequence whose bases are counted. (Inside this function the
        parameter name shadows the `shortest_chain` function.)
    print_count : bool
        If True, print the counts and return None; if False, print nothing
        and return the counts, consistent with `run_chech`.

    Returns
    -------
    list of int or None
        Counts in the order A, C, G, T, U when `print_count` is False;
        otherwise None.
    '''
    bases = ['A', 'C', 'G', 'T', 'U']
    count = [shortest_chain.count(b) for b in bases]
    if not print_count:
        # Silent mode: no begin/end markers around an empty report.
        return count
    print('\\begin{count_bases()}')
    for b, n in zip(bases, count):
        print(f'  {b}: {n}')
    # Escaped backslash: '\e' is an invalid escape sequence that newer Python
    # versions warn about. The printed text is unchanged.
    print('\\end{count_bases()}')
    return None


def delete_k_mers(k_mer_list, k_mers_to_del=None):
    '''
    Remove given k-mers from a k-mer list, rebuild the chain and report repeated k-mers.

    Prints the `run_chech` report for the chain built from the remaining
    k-mers, followed by a one-line heading.

    Parameters
    ----------
    k_mer_list : array-like of str
        The full list of k-mers.
    k_mers_to_del : str, array-like of str or None
        The k-mer(s) to leave out; None removes nothing.

    Returns
    -------
    numpy.ndarray
        The [k_mer, count] rows of the remaining k-mers that occur more than
        once in the rebuilt chain.
    '''
    # Boolean-mask indexing below needs an array, so accept plain lists too.
    k_mer_list = np.asarray(k_mer_list)
    if k_mers_to_del is None:
        # Nothing to delete: do not rely on how np.isin treats a None object.
        k_mers_modified = k_mer_list
    else:
        mask = np.isin(k_mer_list, k_mers_to_del, invert=True)
        k_mers_modified = k_mer_list[mask]
    shortest_modified_k_mer_chain = shortest_chain(k_mers_modified)

    run_chech(shortest_modified_k_mer_chain, k_mers_modified)
    print('return the k-mers whose number is greater than one:')
    s = sorted_by_numbers(shortest_modified_k_mer_chain, k_mers_modified)
    # The count column is stored as strings, hence the conversion before comparing.
    return s[np.array(s[:, 1], dtype=int) > 1]


In [3]:
# Testing the `shortest_chain` function on strings of different lengths,
# including duplicates and strings already contained in earlier ones.
shortest_chain(['AAAATAT', 'TAT', 'AT', 'G', 'TGA', 'ATC', 'TCU', 'UC', 'ATC', 'AAA', 'AAAAA'])

'AAAATATGATCUCAAAAA'

---
## Hexamer A,C,T,G,U bázisokból

Ismétléses variáció

$5^6= 15625$ darab hexamer (lásd: $\texttt{hexamers}$ variable)

$6 \cdot 5^6 = 93750$ bázispár hosszú szakasz tartalmaz minden lehetséges hexamert **átfedés nélkül**

In [4]:
# Number of distinct hexamers over 5 bases and the length of their plain concatenation.
n_hexamers = 5**6
print(f'There are 5**6={n_hexamers} possible hexamers.')
print(f'Length of the chain without overlap 6*5**6={6 * n_hexamers}.')

There are 5**6=15625 possible hexamers.
Length of the chain without overlap 6*5**6=93750.


In [5]:
# Build all hexamers, merge them into one chain and compare its length
# with the plain concatenation of all hexamers.
hexamers = generate_k_mer_list('ACGTU', 6)
# Computed once; the value is needed for both the report and the percentage.
concatenated_hexamer_length = len(combine_strings(hexamers))
print(f'Length without overlap: {concatenated_hexamer_length}')
shortest_hexamer_chain = shortest_chain(hexamers)
print(f'Length of the shortest string: {len(shortest_hexamer_chain)}')
print(f'It became {round(100 - len(shortest_hexamer_chain)/concatenated_hexamer_length*100, 3)}% shorter compared to the concatenated chain.')

Length without overlap: 93750
Length of the shortest string: 15635
It became 83.323% shorter compared to the concatenated chain.


In [6]:
# Report how many hexamers occur once, twice, ... in the merged hexamer chain.
run_chech(shortest_hexamer_chain, hexamers)

\begin{run_chech()}
  Length of the chain: 15635
  Frequency - number of k-mers:
  1 - 15620
  2 - 5
\end{run_chech()}


In [7]:
# Print how often each of the bases A, C, G, T, U occurs in the hexamer chain.
count_bases(shortest_hexamer_chain)

\begin{count_bases()}
  A: 3130
  C: 3125
  G: 3125
  T: 3125
  U: 3130
\end{count_bases()}


In [8]:
# Optional export, disabled by default: change the condition to True to write
# the hexamer chain to "shortest_hexamer_chain.txt" (path is relative to the
# current working directory).
if False: # Write shortest chain to txt file.
    with open("shortest_hexamer_chain.txt", "w") as file:
        file.write(shortest_hexamer_chain)

---
### Lánc adott hexamerek nélkül

In [9]:
# Rebuild the chain without one hexamer; here a single k-mer is passed as a plain string.
hexamers_to_delete = 'AAAAAU'
# Prints the frequency report and returns the hexamers that occur more than once.
delete_k_mers(hexamers, hexamers_to_delete)

\begin{run_chech()}
  Length of the chain: 15670
  Frequency - number of k-mers:
  1 - 15588
  2 - 33
  3 - 3
\end{run_chech()}
return the k-mers whose number is greater than one:


array([['TTTTUU', '3'],
       ['TTTUUU', '3'],
       ['TTUUUU', '3'],
       ['AAAAAT', '2'],
       ['AAAACU', '2'],
       ['AAAATT', '2'],
       ['AAAAUA', '2'],
       ['AAACUT', '2'],
       ['AAATTT', '2'],
       ['AAAUAA', '2'],
       ['AACUTT', '2'],
       ['AATTTT', '2'],
       ['AAUAAA', '2'],
       ['ACUTTT', '2'],
       ['ATTTTT', '2'],
       ['AUAAAA', '2'],
       ['CUTTTT', '2'],
       ['TAAAAC', '2'],
       ['TTTTTU', '2'],
       ['TTTUUT', '2'],
       ['TTUUTT', '2'],
       ['TTUUUT', '2'],
       ['TUUTTT', '2'],
       ['TUUUTT', '2'],
       ['TUUUUT', '2'],
       ['TUUUUU', '2'],
       ['UTAAAA', '2'],
       ['UTTTTU', '2'],
       ['UTTTUU', '2'],
       ['UTTUUU', '2'],
       ['UUTAAA', '2'],
       ['UUTTTU', '2'],
       ['UUTTUU', '2'],
       ['UUUTAA', '2'],
       ['UUUTTU', '2'],
       ['UUUUTA', '2']], dtype='<U11')

---

## Heptamer A,C,T,G,U bázisokból

Ismétléses variáció

$5^7 = 78125$ darab heptamer (lásd: $\texttt{heptamers}$ variable)

$7 \cdot 5^7 = 546875$ bázispár hosszú szakasz tartalmaz minden lehetséges heptamert **átfedés nélkül**

In [10]:
# Number of distinct heptamers over 5 bases and the length of their plain concatenation.
n_heptamers = 5**7
print(f'There are 5**7={n_heptamers} possible heptamers.')
print(f'Length of the chain without overlap 7*5**7={7 * n_heptamers}.')

There are 5**7=78125 possible heptamers.
Length of the chain without overlap 7*5**7=546875.


In [11]:
# Build all heptamers, merge them into one chain and compare its length
# with the plain concatenation of all heptamers.
heptamers = generate_k_mer_list('ACGTU', 7)
# Computed once; the value is needed for both the report and the percentage.
concatenated_heptamer_length = len(combine_strings(heptamers))
print(f'Length without overlap: {concatenated_heptamer_length}')
shortest_heptamer_chain = shortest_chain(heptamers)
print(f'Length of the shortest string: {len(shortest_heptamer_chain)}')
print(f'It became {round(100 - len(shortest_heptamer_chain)/concatenated_heptamer_length*100, 3)}% shorter compared to the concatenated chain.')

Length without overlap: 546875
Length of the shortest string: 78137
It became 85.712% shorter compared to the concatenated chain.


In [12]:
# Report how many heptamers occur once, twice, ... in the merged heptamer chain.
run_chech(shortest_heptamer_chain, heptamers)

\begin{run_chech()}
  Length of the chain: 78137
  Frequency - number of k-mers:
  1 - 78119
  2 - 6
\end{run_chech()}


In [13]:
# Print how often each of the bases A, C, G, T, U occurs in the heptamer chain.
count_bases(shortest_heptamer_chain)

\begin{count_bases()}
  A: 15631
  C: 15625
  G: 15625
  T: 15625
  U: 15631
\end{count_bases()}


In [14]:
# Optional export, disabled by default: change the condition to True to write
# the heptamer chain to "shortest_heptamer_chain.txt" (path is relative to the
# current working directory).
if False: # Write shortest chain to txt file.
    with open("shortest_heptamer_chain.txt", "w") as file:
        file.write(shortest_heptamer_chain)

---
### Lánc adott heptamerek nélkül

In [15]:
# Leave out the five homopolymer heptamers (AAAAAAA, TTTTTTT, ...) and rebuild the chain.
heptamers_to_delete = [base * 7 for base in 'ATCGU']
delete_k_mers(heptamers, heptamers_to_delete)

\begin{run_chech()}
  Length of the chain: 78126
  Frequency - number of k-mers:
  1 - 78120
\end{run_chech()}
return the k-mers whose number is greater than one:


array([], shape=(0, 2), dtype='<U11')