# The shortest chain that contains every hexamer over the 5 bases (A, T, C, G, U).

In [1]:
# packages
import numpy as np
import itertools

In [2]:
# functions
def combine_strings(string):
    ''' Concatenate an iterable of strings into one string, with no separator. '''
    joined = ''.join(string)
    return joined


def generate_k_mer_list(possible_bases, k_mer):
    '''
    All possible k-mers (variations with repetition) over the given bases.

    possible_bases: iterable of single-character strings (e.g. 'ATCGU').
    k_mer: length of each generated word.

    Returns a numpy array of strings in the order produced by
    itertools.product, i.e. the rightmost position varies fastest.
    '''
    # Iterate the product directly instead of pre-computing its size and
    # pulling items one by one with next().
    return np.array([''.join(bases)
                     for bases in itertools.product(possible_bases, repeat=k_mer)])


# --> ChatGPT code, prompt: given the following array; create the shortest
#     string which contains all elements of the given array
def shortest_chain(k_mers):
    '''
    Build (with the help of ChatGPT) a short string that contains every k-mer.

    The k-mers are processed in the given order. A k-mer that already occurs
    anywhere in the chain is skipped; otherwise only the part that does not
    overlap with the current end of the chain is appended.

    This is a greedy, order-dependent heuristic: the result contains all
    k-mers but is not guaranteed to be the shortest possible string.

    k_mers: sequence of strings (list or numpy array).
    Returns the chain as a string; an empty input gives an empty string.
    '''
    if len(k_mers) == 0:
        return ''
    result = k_mers[0]
    for k_mer in k_mers[1:]:
        # If the k-mer already exists in the chain, do not append it again
        # (added to the ChatGPT code).
        if k_mer in result:
            continue
        # Backtrack from the longest possible proper overlap until the start
        # of the k-mer matches the end of the chain (0 = no overlap).
        overlap = len(k_mer) - 1
        while overlap > 0 and not result.endswith(k_mer[:overlap]):
            overlap -= 1
        result += k_mer[overlap:]
    return result


def kmers_sorted_by_frequency(chains, k_mers):
    '''
    The frequency of the given k-mers in the given sequences, in descending order.

    chains: sequence of strings to search in.
    k_mers: the k-mers to count.

    Returns a numpy array built from (k-mer, frequency) pairs sorted by
    frequency, highest first; ties keep the order of k_mers. Every k-mer is
    listed, including those that never occur (frequency 0), also when
    chains is empty.

    NOTE: str.count counts non-overlapping occurrences, so a k-mer that
    overlaps with itself (e.g. 'AAAAAA' in 'AAAAAAA') is counted once.
    '''
    # Total per k-mer over all chains; the entry is created even if there is
    # no chain at all, so missing k-mers are reported with 0.
    numbers = {element: sum(chain.count(element) for chain in chains)
               for element in k_mers}
    sort_numbers = np.array(sorted(numbers.items(), key=lambda x: x[1], reverse=True))
    return sort_numbers


def run_chech(chains, k_mers, print_count=True, num_of_skipped_bases=0):
    '''
    Counting identical frequencies --> how many k-mers appear n times in the sequences.

    chains: sequence of strings to check.
    k_mers: the k-mers whose occurrences are counted.
    print_count: if True, print a report and return None;
                 if False, print nothing and return the counts.
    num_of_skipped_bases: added to the total length shown in the report.

    Returns (only if print_count is False) a list of [frequency, number of
    k-mers] pairs for frequency 1..max, followed by the pair for frequency 0
    (k-mers missing from every chain).
    '''
    sort_numbers = kmers_sorted_by_frequency(chains, k_mers)
    # Convert the frequency column to integers once; an empty table
    # (no k-mers) is handled as "no frequencies".
    if sort_numbers.size:
        frequencies = np.array(sort_numbers[:, 1], dtype=int)
    else:
        frequencies = np.array([], dtype=int)
    max_frequency = int(frequencies.max()) if frequencies.size else 0

    count = [[i, int(np.sum(frequencies == i))] for i in range(1, max_frequency + 1)]
    # missing elements
    count.append([0, int(np.sum(frequencies == 0))])

    if not print_count:
        return count

    print('\\begin{run_chech()}')
    print(f'  Length of the chain: {len(combine_strings(chains)) + num_of_skipped_bases}')
    print('  frequency - the number of k-mers:')
    for frequency, num_of_k_mers in count:
        # print only the frequencies that actually occur
        if num_of_k_mers != 0:
            print(f'  {frequency} - {num_of_k_mers}')
    # Backslash escaped explicitly: '\e' is an invalid escape sequence.
    print('\\end{run_chech()}')
    return None


def count_bases(shortest_chain, print_count=True, bases=('A', 'T', 'C', 'G', 'U')):
    '''
    Count and print the number of bases.

    shortest_chain: the string in which the bases are counted.
    print_count: if True, print the counts and return None;
                 if False, print nothing and return the counts.
    bases: the characters to count (default: A, T, C, G, U).

    Returns (only if print_count is False) a dict {base: count} ordered by
    count in descending order; ties keep the order of bases.
    '''
    count = {b: shortest_chain.count(b) for b in bases}
    # Order the dictionary by count in descending order
    count = dict(sorted(count.items(), key=lambda item: item[1], reverse=True))
    if print_count:
        print('\\begin{count_bases()}')
        for base, cnt in count.items():
            print(f'  {base}: {cnt}')
        # Backslash escaped explicitly: '\e' is an invalid escape sequence.
        print('\\end{count_bases()}')
        return None
    return count


def calculate_GC_ratio(reads, tolerance=0.03, print_out_of_tolerance=False):
    '''
    Calculate the ratio of G and C within the G+C content of each read.

    For every read: G_ratio = #G / (#G + #C) and C_ratio = 1 - G_ratio.
    A read is out of tolerance if its G_ratio is not strictly between
    0.5 - tolerance and 0.5 + tolerance.

    A read without any G or C has no defined ratio: its ratios are NaN and
    it is reported as out of tolerance.

    reads: sequence of strings.
    tolerance: allowed deviation of G_ratio from 0.5.
    print_out_of_tolerance: if True, print how many reads are out of tolerance.

    Returns (G_ratios, C_ratios, out_of_tolerance), three lists with one
    element per read.
    '''
    G_ratios, C_ratios = [], []
    out_of_tolerance = []
    for read in reads:
        num_of_G = read.count('G')
        sum_GC = num_of_G + read.count('C')
        if sum_GC == 0:
            # No G and no C: avoid the division by zero.
            G_ratios.append(float('nan'))
            C_ratios.append(float('nan'))
            out_of_tolerance.append(True)
            continue
        G_ratio = num_of_G / sum_GC
        G_ratios.append(G_ratio)
        C_ratios.append(1 - G_ratio)
        out_of_tolerance.append(G_ratio >= 0.5 + tolerance or G_ratio <= 0.5 - tolerance)
    if print_out_of_tolerance:
        num_out = sum(out_of_tolerance)
        # With no reads at all the fraction is reported as 0 instead of dividing by zero.
        fraction = num_out / len(reads) if len(reads) else 0.0
        print(f'{num_out}/{len(reads)} (={fraction:.2f}) are out of tolerance.')
    return G_ratios, C_ratios, out_of_tolerance

In [3]:
hexamers = generate_k_mer_list('ATCGU', 6)

# Baseline: total length when all hexamers are simply written one after another.
concatenated_length = len(combine_strings(hexamers))
print(f'Length without overlap: {concatenated_length}')

# Overlap-aware chain and its length relative to the baseline.
shortest_hexamer_chain = shortest_chain(hexamers)
chain_length = len(shortest_hexamer_chain)
print(f'Length of the shortest string: {chain_length}')
reduction = round(100 - chain_length/concatenated_length*100, 3)
print(f'It became {reduction}% shorter compared to the concatenated chain.')

Length without overlap: 93750
Length of the shortest string: 15635
It became 83.323% shorter compared to the concatenated chain.


---
---
---

In [4]:
def generate_reads_v1(k_mers, max_read_len=240):
    '''
    Chain the k-mers into reads that are at most max_read_len long.

    The k-mers are processed in the given order. A k-mer that already occurs
    inside any read is skipped. Otherwise the part that does not overlap with
    the end of the last read is appended to it; if that would make the read
    longer than max_read_len, a new read is started with the complete k-mer.

    k_mers: sequence of strings (list or numpy array).
    max_read_len: maximum length of a read. A single k-mer longer than this
                  still gets its own read.

    Returns a numpy array of the reads (empty array for empty input).
    '''
    if len(k_mers) == 0:
        return np.array([], dtype=str)

    # the result will be stored in 'reads' var.
    reads = [k_mers[0]]

    for k_mer in k_mers[1:]:
        # If the k-mer already exists in a read, do not append it to the result.
        if any(k_mer in read for read in reads):
            continue
        # The number of possible overlapping bases, taken from the current
        # k-mer so that k-mers of different lengths are handled as well.
        overlap = len(k_mer) - 1
        # Backtrack until the start of the k-mer matches the end of the last
        # read: no overlap with n characters --> try with n-1 (0 = no overlap).
        while overlap > 0 and not reads[-1].endswith(k_mer[:overlap]):
            overlap -= 1
        suffix = k_mer[overlap:]
        # check if the non-overlapping part still fits into the last read
        if len(reads[-1]) + len(suffix) <= max_read_len:
            reads[-1] += suffix
        else:
            # create a new read
            reads.append(k_mer)

    return np.array(reads)

In [5]:
# Chain the hexamers into reads of at most 240 bases.
reads = generate_reads_v1(hexamers, max_read_len=240)
# show every tenth read
reads[::10]

array(['AAAAAATAAAAACAAAAAGAAAAAUAAAATTAAAATCAAAATGAAAATUAAAACTAAAACCAAAACGAAAACUAAAAGTAAAAGCAAAAGGAAAAGUAAAAUTAAAAUCAAAAUGAAAAUUAAATATAAATACAAATAGAAATAUAAATTTAAATTCAAATTGAAATTUAAATCTAAATCCAAATCGAAATCUAAATGTAAATGCAAATGGAAATGUAAATUTAAATUCAAATUG',
       'AAUAAUATTAAUATCAAUATGAAUATUAAUACTAAUACCAAUACGAAUACUAAUAGTAAUAGCAAUAGGAAUAGUAAUAUTAAUAUCAAUAUGAAUAUUAAUTATAAUTACAAUTAGAAUTAUAAUTTTAAUTTCAAUTTGAAUTTUAAUTCTAAUTCCAAUTCGAAUTCUAAUTGTAAUTGCAAUTGGAAUTGUAAUTUTAAUTUCAAUTUGAAUTUUAAUCATAAUCACAAUCAG',
       'ATGCTUATGCCTATGCCCATGCCGATGCCUATGCGTATGCGCATGCGGATGCGUATGCUTATGCUCATGCUGATGCUUATGGACATGGAGATGGAUATGGTTATGGTCATGGTGATGGTUATGGCTATGGCCATGGCGATGGCUATGGGTATGGGCATGGGGATGGGUATGGUTATGGUCATGGUGATGGUUATGUACATGUAGATGUAUATGUTTATGUTCATGUTGATGUTUATGUCT',
       'ACGCCTACGCCCACGCCGACGCCUACGCGTACGCGCACGCGGACGCGUACGCUTACGCUCACGCUGACGCUUACGGAGACGGAUACGGTTACGGTCACGGTGACGGTUACGGCTACGGCCACGGCGACGGCUACGGGTACGGGCACGGGGACGGGUACGGUTACGGUCACGGUGACGGUUACGUAGACGUAUACGUTTACGUTCACGUTGACGUTUACGUCTACGUCCACGUCGACGUCU',
    

In [6]:
# length of every read, printed 20 per line
for position, read in enumerate(reads, start=1):
    print(len(read), end=', ')
    if position % 20 == 0:
        print()

235, 240, 237, 240, 239, 237, 240, 237, 240, 239, 237, 240, 236, 240, 236, 240, 237, 240, 239, 237, 
240, 237, 240, 239, 237, 240, 239, 237, 240, 237, 240, 236, 238, 236, 237, 240, 237, 240, 240, 237, 
240, 239, 237, 237, 240, 235, 240, 237, 240, 239, 237, 237, 236, 240, 239, 236, 240, 237, 238, 237, 
237, 240, 237, 237, 240, 238, 236, 237, 238, 235, 129, 

In [7]:
# Report the total read length and how many hexamers occur n times in the reads.
run_chech(reads, hexamers)

\begin{run_chech()}
  Length of the chain: 16798
  frequency - the number of k-mers:
  1 - 14869
  2 - 713
  3 - 38
  4 - 5
\end{run_chech()}


In [8]:
# Print how many reads have a G/(G+C) ratio outside the default tolerance; the returned lists are not needed here.
_ = calculate_GC_ratio(reads, print_out_of_tolerance=True)

51/71 (=0.72) are out of tolerance.


### --> Too many hexamers occur more than once.
The G/C ratio problem remains unsolved here.