Priprema podataka
- dobivanje parova nizova iz poravnanja višestrukih sekvenci (engl. Multiple sequence alignment)
- korištenje podataka ([HIV Sequence Alignments](https://www.hiv.lanl.gov/content/sequence/NEWALIGN/align.html)) za inicijalizaciju matrica u Baum-Welch algoritmu


In [142]:
import itertools
import numpy as np

from collections import Counter
from collections import defaultdict 
from tqdm import tqdm

In [143]:
def is_valid_dna_string(input_string):
    """Return True when the string uses only the accepted alignment alphabet.

    The input is lower-cased first and every resulting character must be one
    of the nucleotides A/C/T/G, the gap '-' or the header marker '>'.
    An empty string is accepted.
    """
    dopusteni_znakovi = '>ACTG-actg'
    return set(input_string.lower()).issubset(dopusteni_znakovi)

def read_fasta(file_path):
    """Read a FASTA file and return the sequences that pass validation.

    A record starts at a line beginning with '>' and runs until the next such
    line; its sequence lines are stripped and concatenated. A record is kept
    only when its header text is non-empty and is_valid_dna_string accepts
    the concatenated sequence.

    Args:
        file_path: path of the FASTA file to read.

    Returns:
        List of sequence strings, in file order (headers are not returned).
    """
    sequences = []
    current_parts = []
    current_header = ""

    # The line count is only needed for the progress bar total; the context
    # manager makes sure this extra handle is closed again.
    with open(file_path, 'r') as file:
        total_lines = sum(1 for _ in file)

    with open(file_path, 'r') as file:
        for line in tqdm(file, total=total_lines, desc="Reading FASTA file"):
            line = line.strip()
            if line.startswith('>'):
                # A new header closes the previous record.
                current_sequence = "".join(current_parts)
                if current_header and is_valid_dna_string(current_sequence):
                    sequences.append(current_sequence)
                current_header = line[1:]
                current_parts = []
            else:
                current_parts.append(line)

    # The last record has no following header, so it is closed here; it goes
    # through the same validity check as every other record.
    current_sequence = "".join(current_parts)
    if current_header and is_valid_dna_string(current_sequence):
        sequences.append(current_sequence)

    return sequences


fasta_file_path = 'HIV1_ALL_2021_genome_DNA.fasta'

sequences = read_fasta(fasta_file_path)[:10]  # limited resources: keep only the first 10 sequences (an earlier note said 500)

Reading FASTA file: 100%|██████████| 1515900/1515900 [00:04<00:00, 312826.30it/s]


In [144]:
# Report how many sequences were kept and how their lengths are distributed.
print(f"Ukupan broj sekvenci: {len(sequences)}")
print(f"Duljina sekvenci: {Counter(map(len, sequences))}")

Ukupan broj sekvenci: 10
Duljina sekvenci: Counter({14937: 10})


In [145]:
# build every ordered pair of the aligned sequences (both (x, y) and (y, x))
pairs = list(itertools.permutations(sequences, 2))
len(pairs)

90

In [146]:
def process_pair(pair):
    """Clean up one pairwise alignment taken from a multiple alignment.

    Step 1 drops every column in which both sequences have a gap.
    Step 2 drops, from each sequence separately, a gap character whose
    left or right neighbouring column holds a gap in the other sequence.

    Args:
        pair: indexable with the two aligned sequences at positions 0 and 1.

    Returns:
        Tuple (first, second) of the cleaned strings.

    NOTE(review): step 2 filters the two sequences independently, so the
    results can differ in length, e.g. ("A-A", "-G-") gives ("AA", "G").
    Confirm that this is intended.
    """
    stupci = [(a, b) for a, b in zip(pair[0], pair[1]) if not (a == '-' and b == '-')]
    prvi = ''.join(a for a, _ in stupci)
    drugi = ''.join(b for _, b in stupci)

    def _bez_susjednih_praznina(vlastiti, drugi_niz):
        # Keep a character unless it is a gap that sits next to a gap of the
        # other sequence (one column to the left or to the right).
        zadnji = len(drugi_niz) - 1
        zadrzani = []
        for i, znak in enumerate(vlastiti):
            if znak == '-':
                praznina_lijevo = i > 0 and drugi_niz[i - 1] == '-'
                praznina_desno = i < zadnji and drugi_niz[i + 1] == '-'
                if praznina_lijevo or praznina_desno:
                    continue
            zadrzani.append(znak)
        return ''.join(zadrzani)

    novi_prvi = _bez_susjednih_praznina(prvi, drugi)
    novi_drugi = _bez_susjednih_praznina(drugi, prvi)

    return novi_prvi, novi_drugi

def get_pair_values(pair):
    """Label every column of a pairwise alignment with a state name.

    'M'  - neither sequence has a gap in the column,
    'Ix' - only the second sequence has a gap,
    'Iy' - the first sequence has a gap (whatever the second one holds).

    Args:
        pair: indexable with the two aligned sequences at positions 0 and 1.

    Returns:
        List of state names, one per column; the shorter sequence sets the
        number of columns.
    """
    oznake = []
    for a, b in zip(pair[0], pair[1]):
        if a == '-':
            oznake.append('Iy')
        elif b == '-':
            oznake.append('Ix')
        else:
            oznake.append('M')
    return oznake
    
def izracunaj_matricu_pocetnih_vrijednosti(elementi):
    """Turn counts of the first alignment state into initial probabilities.

    Args:
        elementi: mapping from state name ("M", "Ix", "Iy") to a count.
            A missing state is treated as a count of 0.

    Returns:
        List [p_M, p_Ix, p_Iy] of floats. Each value is the count divided by
        the sum of all counts, raised to 1e-08 when it would be smaller, so
        no probability is exactly 0.

    Raises:
        ZeroDivisionError: when the counts sum to 0.
    """
    # eps must be a float: comparing a float with the string produced by
    # format() raises TypeError inside max().
    eps = 1e-08
    ukupno = sum(elementi.values())
    return [
        max(elementi.get(stanje, 0) / ukupno, eps)
        for stanje in ("M", "Ix", "Iy")
    ]

def normaliziraj_matricu(matrica):
    """Row-normalise a count matrix and replace exact zeros with 1e-08.

    Args:
        matrica: 2-D array-like of non-negative counts.

    Returns:
        Float ndarray of the same shape. Every row is divided by its own sum
        and each entry that is exactly 0 afterwards becomes 1e-08. A row
        whose sum is 0 is not divided (that would give NaN); all of its
        entries end up as 1e-08.
    """
    # eps must be numeric: a string here makes np.where unable to return a
    # float matrix.
    eps = 1e-08
    matrica = np.asarray(matrica, dtype=float)
    zbrojevi = np.sum(matrica, axis=1, keepdims=True)
    # Divide only the rows with a non-zero sum; the others keep the zeros
    # of the pre-filled output buffer.
    normalizirana = np.divide(
        matrica,
        zbrojevi,
        out=np.zeros_like(matrica),
        where=zbrojevi != 0,
    )
    return np.where(normalizirana == 0, eps, normalizirana)

In [147]:
# display the second sequence of the pair at index 7 (still contains alignment gaps)
pairs[7][1]

'----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CTCTAGCAGTGGCG------------------------CCCGA----------------------------------------------------------------

In [148]:
# Hidden states and emission alphabet: single nucleotides and every ordered
# nucleotide pair, sorted.
stanja_set = ['M', 'Ix', 'Iy']
emisija = sorted(["A", "AA", "AC", "AG", "AT", "C", "CA", "CC", "CG", "CT", "G", "GA", "GC", "GG", "GT", "T", "TA", "TC", "TG", "TT"])
prvi_element_brojac = defaultdict(int)

# Raw counts; they are normalised into probabilities in later cells.
matrica_prijelaza = np.zeros((len(stanja_set), len(stanja_set)))
matrica_emisija = np.zeros((len(stanja_set), len(emisija)))

# Name -> row/column index tables, built once so the inner loops do not scan
# the lists with .index() for every symbol.
indeks_stanja = {stanje: i for i, stanje in enumerate(stanja_set)}
indeks_emisije = {simbol: i for i, simbol in enumerate(emisija)}


# To avoid keeping all permutations in memory, iterate
# itertools.permutations(sequences, 2) directly and drop the `pairs` variable.
for pair in tqdm(pairs):

    processed_pair = process_pair(pair)
    pair_values = get_pair_values(processed_pair)

    # An alignment with no columns contributes nothing (and has no first state).
    if not pair_values:
        continue

    prvi_element_brojac[pair_values[0]] += 1

    # Count each transition between consecutive column states.
    for trenutno_stanje, sljedece_stanje in zip(pair_values, pair_values[1:]):
        matrica_prijelaza[indeks_stanja[trenutno_stanje], indeks_stanja[sljedece_stanje]] += 1

    # Count what each column emits: both symbols when neither is a gap,
    # otherwise the symbol of the sequence that is not a gap.
    x, y = processed_pair
    for simbol_stanja, a, b in zip(pair_values, x, y):
        if a != '-' and b != '-':
            par_simbola = a + b
        elif a == '-':
            par_simbola = b
        else:
            par_simbola = a

        stupac = indeks_emisije.get(par_simbola)
        if stupac is None:
            # symbol outside the alphabet (e.g. 'Y' or another code): skip it
            continue
        matrica_emisija[indeks_stanja[simbol_stanja], stupac] += 1
    
    



100%|██████████| 90/90 [00:08<00:00, 10.26it/s]


In [149]:
# initial state probabilities, from the counts of the first column state of each processed pair
pi = izracunaj_matricu_pocetnih_vrijednosti(prvi_element_brojac)
pi

[0.13333333333333333, 0.43333333333333335, 0.43333333333333335]

In [150]:
#pi2 = [0.07368336673346694, 0.4631583166332665, 0.4631583166332665]
#pi2

In [151]:
# transition matrix: the transition counts normalised per row (rows/columns follow stanja_set)
A = normaliziraj_matricu(matrica_prijelaza)
A

array([[9.97719354e-01, 1.14032286e-03, 1.14032286e-03],
       [3.29081359e-02, 9.67091864e-01, 1.00000000e-08],
       [3.29081359e-02, 1.00000000e-08, 9.67091864e-01]])

In [152]:
#A2 = np.array([[9.97843253e-01, 1.07837332e-03, 1.07837332e-03],
#       [4.12591903e-02, 9.58740196e-01, 6.13427300e-07],
#       [4.12591903e-02, 6.13427300e-07, 9.58740196e-01]])
#A2

In [153]:
# emission matrix: the emission counts normalised per row (rows follow stanja_set, columns follow emisija)
E = normaliziraj_matricu(matrica_emisija)
E

array([[1.00000000e-08, 3.25486147e-01, 7.20088946e-03, 2.26228769e-02,
        5.17780601e-03, 1.00000000e-08, 7.20088946e-03, 1.53685008e-01,
        2.75976317e-03, 1.25260314e-02, 1.00000000e-08, 2.26228769e-02,
        2.75976317e-03, 2.12621862e-01, 3.52491958e-03, 1.00000000e-08,
        5.17780601e-03, 1.25260314e-02, 3.52491958e-03, 2.00582410e-01],
       [2.78134589e-01, 1.00000000e-08, 1.00000000e-08, 1.00000000e-08,
        1.00000000e-08, 2.15478484e-01, 1.00000000e-08, 1.00000000e-08,
        1.00000000e-08, 1.00000000e-08, 2.78312995e-01, 1.00000000e-08,
        1.00000000e-08, 1.00000000e-08, 1.00000000e-08, 2.28073931e-01,
        1.00000000e-08, 1.00000000e-08, 1.00000000e-08, 1.00000000e-08],
       [2.78134589e-01, 1.00000000e-08, 1.00000000e-08, 1.00000000e-08,
        1.00000000e-08, 2.15478484e-01, 1.00000000e-08, 1.00000000e-08,
        1.00000000e-08, 1.00000000e-08, 2.78312995e-01, 1.00000000e-08,
        1.00000000e-08, 1.00000000e-08, 1.00000000e-08, 2.2807

In [154]:
#E2 = np.array([[1.00000000e-08, 3.25734397e-01, 8.15964895e-03, 2.39453468e-02,
#        5.72147873e-03, 1.00000000e-08, 8.15964895e-03, 1.50013314e-01,
#        3.06291168e-03, 1.39968051e-02, 1.00000000e-08, 2.39453468e-02,
#        3.06291168e-03, 2.08718026e-01, 3.67871353e-03, 1.00000000e-08,
#        5.72147873e-03, 1.39968051e-02, 3.67871353e-03, 1.98404453e-01],
#       [2.90796102e-01, 1.00000000e-08, 1.00000000e-08, 1.00000000e-08,
#        1.00000000e-08, 2.18200828e-01, 1.00000000e-08, 1.00000000e-08,
#        1.00000000e-08, 1.00000000e-08, 2.74978179e-01, 1.00000000e-08,
#        1.00000000e-08, 1.00000000e-08, 1.00000000e-08, 2.16024890e-01,
#        1.00000000e-08, 1.00000000e-08, 1.00000000e-08, 1.00000000e-08],
#       [2.90796102e-01, 1.00000000e-08, 1.00000000e-08, 1.00000000e-08,
#        1.00000000e-08, 2.18200828e-01, 1.00000000e-08, 1.00000000e-08,
#        1.00000000e-08, 1.00000000e-08, 2.74978179e-01, 1.00000000e-08,
#        1.00000000e-08, 1.00000000e-08, 1.00000000e-08, 2.16024890e-01,
#        1.00000000e-08, 1.00000000e-08, 1.00000000e-08, 1.00000000e-08]])
#E2

In [155]:

# Write the initial model to a text file: the symbol count (M) and state
# count (N), the state and symbol names, then the matrices A and E (one row
# per line) and the vector pi.
with open("datoteka.txt", "w", encoding="utf-8") as file:
    # The counts come from the lists themselves so the header cannot drift
    # from the names written below it.
    file.write("M\n")
    file.write(f"{len(emisija)}\n")
    file.write("N\n")
    file.write(f"{len(stanja_set)}\n")
    file.write("STANJA\n")
    # Every name and value is followed by a single space, also the last one.
    file.write("".join(stanje + " " for stanje in stanja_set))
    file.write("\nSIMBOLI\n")
    file.write("".join(par + " " for par in emisija))
    file.write("\nA\n")
    file.writelines(" ".join(map(str, red)) + "\n" for red in A)
    file.write("E\n")
    file.writelines(" ".join(map(str, red)) + "\n" for red in E)
    file.write("pi\n")
    file.write("".join(str(value) + " " for value in pi))



#### matrica emisija E

In [156]:
#emitirati se mogu parovi AA,AC,AG,AT,CC,CA,CG,CT,GG,GA,GC,GT,TT,TA,TC,TG,A,C,G,T,-(ne znam trebamo li ga kad obrišemo -- slučaj)
#stanja M,Ix,Iy

In [157]:
# emission alphabet: the four single nucleotides and all 16 ordered nucleotide pairs, sorted
emisija = sorted(["A", "AA", "AC", "AG", "AT", "C", "CA", "CC", "CG", "CT", "G", "GA", "GC", "GG", "GT", "T", "TA", "TC", "TG", "TT"])

In [159]:
# all-zero emission count matrix: one row per state, one column per emission symbol
matrica_emisija = np.zeros((len(stanja_set), len(emisija)))

In [None]:
# NOTE(review): if this runs right after matrica_emisija is re-created as zeros,
# every row sum is 0 and the division yields NaN; the output shown below
# presumably comes from a state in which counts had been filled in - confirm.
matrica_emisija = matrica_emisija/np.sum(matrica_emisija, axis=1, keepdims=True)  # divide each row by its sum

matrica_emisija

array([[0.        , 0.        , 0.33333333, 0.33333333, 0.33333333,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        1.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ]])

In [None]:
# replace zeros with epsilon
# NOTE(review): `eps` is not defined at module level in the visible cells; the
# output below suggests it was 1e-05 - confirm where it comes from.
matrica_emisija = np.where(matrica_emisija == 0, eps, matrica_emisija)
matrica_emisija

array([[1.00000000e-05, 1.00000000e-05, 3.33333333e-01, 3.33333333e-01,
        3.33333333e-01, 1.00000000e-05, 1.00000000e-05, 1.00000000e-05,
        1.00000000e-05, 1.00000000e-05, 1.00000000e-05, 1.00000000e-05,
        1.00000000e-05, 1.00000000e-05, 1.00000000e-05, 1.00000000e-05,
        1.00000000e-05, 1.00000000e-05, 1.00000000e-05, 1.00000000e-05],
       [1.00000000e-05, 1.00000000e-05, 1.00000000e-05, 1.00000000e-05,
        1.00000000e-05, 1.00000000e+00, 1.00000000e-05, 1.00000000e-05,
        1.00000000e-05, 1.00000000e-05, 1.00000000e-05, 1.00000000e-05,
        1.00000000e-05, 1.00000000e-05, 1.00000000e-05, 1.00000000e-05,
        1.00000000e-05, 1.00000000e-05, 1.00000000e-05, 1.00000000e-05],
       [1.00000000e+00, 1.00000000e-05, 1.00000000e-05, 1.00000000e-05,
        1.00000000e-05, 1.00000000e-05, 1.00000000e-05, 1.00000000e-05,
        1.00000000e-05, 1.00000000e-05, 1.00000000e-05, 1.00000000e-05,
        1.00000000e-05, 1.00000000e-05, 1.00000000e-05, 1.0000