Priprema podataka
- dobivanje parova nizova iz poravnanja višestrukih sekvenci (engl. Multiple sequence alignment)
- korištenje podataka ([HIV Sequence Alignments](https://www.hiv.lanl.gov/content/sequence/NEWALIGN/align.html)) za inicijalizaciju matrica u Baum-Welch algoritmu


In [1]:
import itertools
import numpy as np
import random

from collections import Counter
from collections import defaultdict 
from tqdm import tqdm

In [2]:

def read_fasta(file_path):
    """Read a FASTA file and return its sequences that contain only A, C, G, T and '-'.

    The file is read twice: a first pass counts the lines so that the tqdm
    progress bar can show a total, and a second pass parses the records.

    Args:
        file_path: Path of the FASTA file to read.

    Returns:
        A list of sequence strings in file order. A record is dropped when its
        sequence contains any character outside "-GCTA". Lines that appear
        before the first '>' header are ignored, and so are records whose
        header text (everything after '>') is empty.
    """
    dozvoljeni_znakovi = frozenset("-GCTA")
    sequences = []

    def _spremi(header, dijelovi):
        # Store the finished record only if it had a header and every
        # character of its sequence is allowed.
        if not header:
            return
        sekvenca = "".join(dijelovi)
        if set(sekvenca) <= dozvoljeni_znakovi:
            sequences.append(sekvenca)

    # Count lines inside a context manager so the handle is closed again
    # instead of being left to the garbage collector.
    with open(file_path, 'r') as file:
        total_lines = sum(1 for _ in file)

    current_header = ""
    current_parts = []  # collected pieces of the current sequence, joined once per record

    with open(file_path, 'r') as file:
        for line in tqdm(file, total=total_lines, desc="Reading FASTA file"):
            line = line.strip()
            if line.startswith('>'):
                # A new header closes the previous record.
                _spremi(current_header, current_parts)
                current_header = line[1:]
                current_parts = []
            else:
                current_parts.append(line)

    # The last record has no following header to close it.
    _spremi(current_header, current_parts)

    return sequences


# Because of limited resources only the first 500 accepted sequences are kept;
# `sequences` and `sequences_all` refer to the same list.
fasta_file_path = 'HIV1_ALL_2021_genome_DNA.fasta'
sequences = sequences_all = read_fasta(fasta_file_path)[:500]


Reading FASTA file: 100%|██████████| 1515900/1515900 [00:07<00:00, 208936.15it/s]


In [3]:
# Report how many sequences were loaded and how often each sequence length occurs.
print(f"Ukupan broj sekvenci: {len(sequences)}")
print(f"Duljina sekvenci: {Counter([len(i) for i in sequences])}")

Ukupan broj sekvenci: 500
Duljina sekvenci: Counter({14937: 500})


In [4]:
# Build every ordered pair of the aligned sequences (both (x, y) and (y, x)).
pairs = list(itertools.permutations(sequences_all, 2))
len(pairs)

249500

In [5]:
# Keep a random subset of 100 pairs for the counting step below.
# NOTE(review): no random.seed(...) call is visible here, so the subset
# presumably changes between runs - confirm whether reproducibility is needed.
pairs = random.sample(pairs, k=100)

In [6]:
def process_pair(pair):
    """Clean up one pairwise alignment taken from a multiple sequence alignment.

    Two passes are applied to ``pair[0]`` and ``pair[1]``:

    1. Columns in which both sequences have a gap ('-') are dropped.
    2. Independently for each sequence, a gap is deleted when the *other*
       sequence has a gap in the column directly before or directly after it.

    Args:
        pair: Indexable holding the two aligned sequences at positions 0 and 1.

    Returns:
        Tuple ``(first, second)`` of the processed sequences.

    Note:
        Step 2 deletes characters from each sequence separately, so the two
        returned strings are not guaranteed to have the same length, e.g.
        ``process_pair(("-A-", "G-T"))`` returns ``("A", "GT")``.
    """
    # Pass 1: keep only the columns where at least one side has a residue.
    stupci = [(a, b) for a, b in zip(pair[0], pair[1]) if not (a == '-' and b == '-')]
    gornji = [a for a, _ in stupci]
    donji = [b for _, b in stupci]

    def _bez_susjednih_praznina(vlastiti, drugi_niz):
        # Pass 2 for one sequence: skip a gap whose left or right neighbour
        # column is a gap in the other sequence.
        zadnji = len(drugi_niz) - 1
        zadrzani = []
        for i, znak in enumerate(vlastiti):
            if znak == '-':
                praznina_prije = i > 0 and drugi_niz[i - 1] == '-'
                praznina_poslije = i < zadnji and drugi_niz[i + 1] == '-'
                if praznina_prije or praznina_poslije:
                    continue
            zadrzani.append(znak)
        return ''.join(zadrzani)

    return _bez_susjednih_praznina(gornji, donji), _bez_susjednih_praznina(donji, gornji)

def get_pair_values(pair):
    """Label every column of a pairwise alignment with a state name.

    Args:
        pair: Indexable holding the two aligned sequences at positions 0 and 1.

    Returns:
        A list with one entry per zipped column: 'M' when both sequences have
        a residue, 'Ix' when only the second has a gap, and 'Iy' whenever the
        first has a gap.
    """
    oznake = []
    for gornji, donji in zip(pair[0], pair[1]):
        if gornji == '-':
            oznake.append('Iy')
        elif donji == '-':
            oznake.append('Ix')
        else:
            oznake.append('M')
    return oznake
    
def izracunaj_matricu_pocetnih_vrijednosti(elementi):
    """Turn first-state counts into the initial probability vector [M, Ix, Iy].

    Args:
        elementi: Mapping from state name to count. States that are missing
            from the mapping count as 0; the mapping itself is not modified.

    Returns:
        List of three floats in the order M, Ix, Iy. Each value is the state's
        count divided by the sum of all counts in ``elementi``; a probability
        that would be 0 is replaced by 1e-8. The list is not renormalised
        after that replacement.

    Raises:
        ZeroDivisionError: If the counts sum to zero (e.g. an empty mapping).
    """
    eps = 0.00000001
    # Sum once; .get() avoids inserting keys into a defaultdict and lets a
    # plain dict without all three states work too.
    ukupno = sum(elementi.values())
    return [max(elementi.get(stanje, 0) / ukupno, eps) for stanje in ("M", "Ix", "Iy")]

def normaliziraj_matricu(matrica):
    """Row-normalise a 2-D count matrix and replace zero entries with 1e-8.

    Args:
        matrica: 2-D array-like of counts (one row per state).

    Returns:
        A new float array of the same shape in which every row with a
        non-zero sum is divided by that sum, and every entry equal to 0 is
        replaced by 1e-8. A row whose sum is 0 is left at 0 before the
        replacement, so it comes out filled with 1e-8 instead of NaN.
    """
    eps = 0.00000001
    matrica = np.asarray(matrica, dtype=float)
    zbrojevi = np.sum(matrica, axis=1, keepdims=True)
    # Divide only where the row sum is non-zero; 0/0 would otherwise give NaN,
    # which the `== 0` test below cannot catch.
    normalizirana = np.divide(
        matrica, zbrojevi, out=np.zeros_like(matrica), where=zbrojevi != 0
    )
    return np.where(normalizirana == 0, eps, normalizirana)


In [7]:
# Count, over all sampled pairs, the first state of each alignment, the
# state-to-state transitions and the symbols emitted in each state. Each
# pair's emitted symbols are also written to parovi.txt, one list per line.
stanja_set = ['M', 'Ix', 'Iy']
emisija = sorted(["A", "AA", "AC", "AG", "AT", "C", "CA", "CC", "CG", "CT", "G", "GA", "GC", "GG", "GT", "T", "TA", "TC", "TG", "TT"])
prvi_element_brojac = defaultdict(int)

matrica_prijelaza = np.zeros((len(stanja_set), len(stanja_set)))
matrica_emisija = np.zeros((len(stanja_set), len(emisija)))

# Row/column positions looked up once instead of calling list.index() for
# every column of every pair.
indeks_stanja = {stanje: i for i, stanje in enumerate(stanja_set)}
indeks_emisije = {simbol: i for i, simbol in enumerate(emisija)}

# To avoid holding all sequence permutations in memory, iterate over
# itertools.permutations(sequences, 2) directly (and drop the `pairs` variable).
with open("parovi.txt", "w") as file:
    for pair in tqdm(pairs):

        # NOTE(review): this used to be process_pair(pair[:1000]). `pair` is a
        # 2-tuple, so that slice returned the same tuple and the full-length
        # sequences were processed; the call below keeps that behaviour. If
        # truncating each sequence to 1000 columns was intended, it has to be
        # done per sequence - confirm.
        processed_pair = process_pair(pair)
        pair_values = get_pair_values(processed_pair)
        x, y = processed_pair

        prvi_element_brojac[pair_values[0]] += 1

        # Transitions between consecutive column states.
        for trenutno_stanje, sljedece_stanje in zip(pair_values, pair_values[1:]):
            matrica_prijelaza[indeks_stanja[trenutno_stanje], indeks_stanja[sljedece_stanje]] += 1

        # Emissions: a match emits both residues, an insertion emits the one
        # residue that is present.
        svi_parovi_simbola = []
        for simbol_stanja, znak_x, znak_y in zip(pair_values, x, y):
            if simbol_stanja == 'M':
                par_simbola = znak_x + znak_y
            elif simbol_stanja == 'Ix':
                par_simbola = znak_x
            else:
                par_simbola = znak_y

            svi_parovi_simbola.append(par_simbola)
            matrica_emisija[indeks_stanja[simbol_stanja], indeks_emisije[par_simbola]] += 1

        file.write(str(svi_parovi_simbola))
        file.write("\n")

100%|██████████| 100/100 [00:02<00:00, 36.26it/s]


In [8]:
# Initial state probabilities [M, Ix, Iy] from the counts in prvi_element_brojac.
pi = izracunaj_matricu_pocetnih_vrijednosti(prvi_element_brojac)
pi

[0.04, 0.51, 0.45]

In [9]:
#pi2 = [0.07368336673346694, 0.4631583166332665, 0.4631583166332665]
#pi2

In [10]:
# Transition matrix: matrica_prijelaza passed through normaliziraj_matricu.
A = normaliziraj_matricu(matrica_prijelaza)
A

array([[9.97793003e-01, 1.08662900e-03, 1.12036801e-03],
       [3.71980109e-02, 9.62801989e-01, 1.00000000e-08],
       [3.18895923e-02, 1.00000000e-08, 9.68110408e-01]])

In [11]:
#A2 = np.array([[9.97843253e-01, 1.07837332e-03, 1.07837332e-03],
#       [4.12591903e-02, 9.58740196e-01, 6.13427300e-07],
#       [4.12591903e-02, 6.13427300e-07, 9.58740196e-01]])
#A2

In [12]:
# Emission matrix: matrica_emisija passed through normaliziraj_matricu.
E = normaliziraj_matricu(matrica_emisija)
E

array([[1.00000000e-08, 3.24012155e-01, 8.73947703e-03, 2.61032543e-02,
        6.31378352e-03, 1.00000000e-08, 8.72900641e-03, 1.47195968e-01,
        3.46693844e-03, 1.51451693e-02, 1.00000000e-08, 2.63882878e-02,
        3.43319977e-03, 2.04755290e-01, 3.80897867e-03, 1.00000000e-08,
        6.20791393e-03, 1.52091564e-02, 3.92764569e-03, 1.96563775e-01],
       [2.88339720e-01, 1.00000000e-08, 1.00000000e-08, 1.00000000e-08,
        1.00000000e-08, 2.16821772e-01, 1.00000000e-08, 1.00000000e-08,
        1.00000000e-08, 1.00000000e-08, 2.75553296e-01, 1.00000000e-08,
        1.00000000e-08, 1.00000000e-08, 1.00000000e-08, 2.19285212e-01,
        1.00000000e-08, 1.00000000e-08, 1.00000000e-08, 1.00000000e-08],
       [2.91718212e-01, 1.00000000e-08, 1.00000000e-08, 1.00000000e-08,
        1.00000000e-08, 2.14316761e-01, 1.00000000e-08, 1.00000000e-08,
        1.00000000e-08, 1.00000000e-08, 2.77307834e-01, 1.00000000e-08,
        1.00000000e-08, 1.00000000e-08, 1.00000000e-08, 2.1665

In [13]:
#E2 = np.array([[1.00000000e-08, 3.25734397e-01, 8.15964895e-03, 2.39453468e-02,
#        5.72147873e-03, 1.00000000e-08, 8.15964895e-03, 1.50013314e-01,
#        3.06291168e-03, 1.39968051e-02, 1.00000000e-08, 2.39453468e-02,
#        3.06291168e-03, 2.08718026e-01, 3.67871353e-03, 1.00000000e-08,
#        5.72147873e-03, 1.39968051e-02, 3.67871353e-03, 1.98404453e-01],
#       [2.90796102e-01, 1.00000000e-08, 1.00000000e-08, 1.00000000e-08,
#        1.00000000e-08, 2.18200828e-01, 1.00000000e-08, 1.00000000e-08,
#        1.00000000e-08, 1.00000000e-08, 2.74978179e-01, 1.00000000e-08,
#        1.00000000e-08, 1.00000000e-08, 1.00000000e-08, 2.16024890e-01,
#        1.00000000e-08, 1.00000000e-08, 1.00000000e-08, 1.00000000e-08],
#       [2.90796102e-01, 1.00000000e-08, 1.00000000e-08, 1.00000000e-08,
#        1.00000000e-08, 2.18200828e-01, 1.00000000e-08, 1.00000000e-08,
#        1.00000000e-08, 1.00000000e-08, 2.74978179e-01, 1.00000000e-08,
#        1.00000000e-08, 1.00000000e-08, 1.00000000e-08, 2.16024890e-01,
#        1.00000000e-08, 1.00000000e-08, 1.00000000e-08, 1.00000000e-08]])
#E2

In [14]:
# This whole cell is a single string literal, so none of the code inside it
# runs. The text it holds would write the state and symbol names and the A, E
# and pi values to datoteka.txt.
""" if __name__ == "__main__":
    with open("datoteka.txt", "w") as file:
        file.write("M\n")
        file.write("20\n")
        file.write("N\n")
        file.write("3\n")
        file.write("STANJA\n")
        for stanje in stanja_set:
            file.write(stanje + " ")
        file.write("\n")
        file.write("SIMBOLI\n")
        for simbol in emisija:
            file.write(simbol + " ")
        file.write("\n")
        
        file.write("<A>\n")
        for red in A:
            file.write(" ".join(map(str, red)) + "\n")

        file.write("<E>\n")        
        for red in E:
            file.write(" ".join(map(str, red)) + "\n")
            
        file.write("<pi>\n")        
        for prob in pi:
            file.write(str(prob) + " ")
             """

In [None]:
## Preparing sequences for alignment
def process_pair2(pair):
    """Return the two sequences of ``pair`` with every gap character removed.

    Args:
        pair: Indexable holding the two sequences at positions 0 and 1.

    Returns:
        Tuple ``(first, second)`` where each string is the corresponding input
        without any '-' characters.
    """
    def _bez_praznina(niz):
        return ''.join(znak for znak in niz if znak != '-')

    return _bez_praznina(pair[0]), _bez_praznina(pair[1])

In [None]:
## We need 50 pairs: take the accepted sequences at positions 1500-1599, strip
## the gaps from every ordered pair, and draw 50 of those pairs at random.
sequences2 = read_fasta(fasta_file_path)[1500:1600]
pairs2 = list(itertools.permutations(sequences2, 2))
parovi_za_poravnanje = list(map(process_pair2, tqdm(pairs2)))
random50_sekvenci = random.sample(parovi_za_poravnanje, 50)

In [None]:

if __name__ == "__main__":
    # One line per sampled pair, formatted as "[first, second]".
    with open('sekvence_za_poravnanje.txt', 'w') as datoteka:
        datoteka.writelines(
            f"[{', '.join(map(str, element))}]\n" for element in random50_sekvenci
        )

In [None]:
# Length of every gap-free sequence across all prepared pairs.
duljine_stringova = list(map(len, itertools.chain.from_iterable(parovi_za_poravnanje)))

# Longest sequence - less than 10000 - important for the C++ code.
najdulja_sekvenca = max(duljine_stringova)
najdulja_sekvenca