In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from Bio.SeqIO import parse

In [15]:
# Assuming you have:
# transition_matrix - 64x64 numpy array
# initial_vector - 1x64 numpy array (a 64x1 column vector must be transposed first)

def simulate_markov(transition_matrix, initial_vector, num_iterations, verbose=True):
    """Propagate a state vector through a Markov chain one step at a time.

    Each step replaces the state with ``np.dot(state, transition_matrix)``,
    i.e. the state is treated as a row vector multiplied on the left.

    Parameters
    ----------
    transition_matrix : array-like
        Transition matrix applied at every step (the notebook uses 64x64).
    initial_vector : array-like
        Starting state; must be shape-compatible with ``transition_matrix``
        for ``np.dot``.
    num_iterations : int
        Number of steps to apply. With 0 (or a negative value) the loop body
        never runs and ``initial_vector`` is returned unchanged.
    verbose : bool, default True
        When true, print the first five entries of the state after every
        step (the original behaviour). Pass ``False`` to keep long runs from
        flooding the cell output.

    Returns
    -------
    The state after ``num_iterations`` steps.
    """
    current_state = initial_vector
    for _ in range(num_iterations):
        current_state = np.dot(current_state, transition_matrix)
        if verbose:
            # Only a 5-entry preview, to keep each progress line short.
            print(f"Current state after iteration: {current_state[:5]}")
    return current_state


def simulate_markov_power(transition_matrix, initial_vector, num_iterations):
    """Advance the chain ``num_iterations`` steps in one shot.

    Raises ``transition_matrix`` to the ``num_iterations``-th power with
    ``np.linalg.matrix_power`` and returns ``np.dot(initial_vector, power)``.
    """
    return np.dot(
        initial_vector,
        np.linalg.matrix_power(transition_matrix, num_iterations),
    )

In [None]:
# Read the CDS records of the full proteome
# Calculate the transition matrix from the spectrum


In [23]:
# Walk every CDS record, skip the two polyprotein entries, and for the rest
# print the gene label, the final codon, and the codon count.
for rec in parse('./data/refseq_data/cds.fna', 'fasta'):
    description = rec.description
    is_polyprotein = any(
        tag in description
        for tag in (' ORF1ab polyprotein ', ' ORF1a polyprotein ')
    )
    if is_polyprotein:
        print(' pass', description)
        continue

    seq = str(rec.seq)
    # Consecutive 3-character slices; the last slice is shorter if the
    # sequence length is not a multiple of 3.
    codons = [seq[start:start + 3] for start in range(0, len(rec), 3)]
    # NOTE(review): re-joining the slices reproduces `seq` by construction
    # (as long as len(rec) == len(seq)), so this does not verify that the
    # length is a multiple of 3 — confirm whether that check was intended.
    assert seq == ''.join(codons)
    # Drop the leading record id and everything from ' [organism=' onwards.
    gene = ' '.join(description.split()[1:]).split(' [organism=')[0]

    print(gene, codons[-1], len(codons))

3C-like proteinase [polyprotein=ORF1ab polyprotein] CAA 306
nsp6 [polyprotein=ORF1ab polyprotein] CAG 290
nsp7 [polyprotein=ORF1ab polyprotein] CAA 83
nsp8 [polyprotein=ORF1ab polyprotein] CAG 198
nsp9 [polyprotein=ORF1ab polyprotein] CAA 113
nsp10 [polyprotein=ORF1ab polyprotein] CAG 139
RNA-dependent RNA polymerase [polyprotein=ORF1ab polyprotein] CAG 932
nsp11 [polyprotein=ORF1a polyprotein] GTG 13
helicase [polyprotein=ORF1ab polyprotein] CAA 601
3'-to-5' exonuclease [polyprotein=ORF1ab polyprotein] CAG 527
endoRNAse [polyprotein=ORF1ab polyprotein] CAA 346
2'-O-ribose methyltransferase [polyprotein=ORF1ab polyprotein] AAC 298
surface glycoprotein TAA 1274
ORF3a protein TAA 276
envelope protein TAA 76
membrane glycoprotein TAA 223
 pass NC_045512.2:266-13468,13468-21555 ORF1ab polyprotein [organism=Severe acute respiratory syndrome coronavirus 2] [isolate=Wuhan-Hu-1]
 pass NC_045512.2:266-13483 ORF1a polyprotein [organism=Severe acute respiratory syndrome coronavirus 2] [isolate=Wu

In [21]:
# Scratch check of the gene-label expression used in the FASTA loop:
# drop the first whitespace-separated token of the description and cut
# everything from ' [organism=' onwards.
# NOTE(review): `rec` is not defined in this cell — it relies on a value left
# bound in the kernel by another cell (hidden state); confirm before re-running.
' '.join(rec.description.split()[1:]).split(' [organism=')[0]

'nsp4 [polyprotein=ORF1ab polyprotein]'

In [17]:
# Seeded generator so Restart & Run All reproduces the same matrix and vector.
SEED = 0
rng = np.random.default_rng(SEED)

# Number of chain states (presumably one per codon — confirm).
N_STATES = 64

# Create a random N_STATES x N_STATES transition matrix (rows sum to 1)
transition_matrix = rng.random((N_STATES, N_STATES))
transition_matrix = transition_matrix / transition_matrix.sum(axis=1, keepdims=True)
assert np.allclose(transition_matrix.sum(axis=1), 1.0)

# Create a random initial probability vector (sums to 1)
initial_vector = rng.random(N_STATES)
initial_vector = initial_vector / initial_vector.sum()
assert np.isclose(initial_vector.sum(), 1.0)

# Run 100 iterations
result = simulate_markov(transition_matrix, initial_vector, 100)

Current state after iteration: [0.01415406 0.01631691 0.01646024 0.014585   0.01564774]
Current state after iteration: [0.0147629  0.01641658 0.01719909 0.01479115 0.01601034]
Current state after iteration: [0.01477424 0.0164538  0.01712154 0.01480823 0.01602121]
Current state after iteration: [0.01477343 0.0164477  0.01712531 0.01480268 0.01602277]
Current state after iteration: [0.01477353 0.01644763 0.0171254  0.01480216 0.01602277]
Current state after iteration: [0.01477355 0.0164476  0.01712541 0.01480213 0.01602277]
Current state after iteration: [0.01477355 0.0164476  0.01712541 0.01480212 0.01602277]
Current state after iteration: [0.01477355 0.0164476  0.01712541 0.01480212 0.01602277]
Current state after iteration: [0.01477355 0.0164476  0.01712541 0.01480212 0.01602277]
Current state after iteration: [0.01477355 0.0164476  0.01712541 0.01480212 0.01602277]
Current state after iteration: [0.01477355 0.0164476  0.01712541 0.01480212 0.01602277]
Current state after iteration: [

In [None]:
# Advance the chain with the matrix-power variant and print the full state.
# NOTE(review): num_iterations is 1 here (a single transition step), although
# this cell was previously labelled "Run 100 iterations" — confirm which is
# intended.
result = simulate_markov_power(transition_matrix, initial_vector, 1)
print(result)

[0.01766681 0.01493436 0.01730691 0.01456401 0.01806449 0.01593828
 0.01585003 0.01682262 0.01629439 0.01531913 0.01395805 0.01568287
 0.01583625 0.01624106 0.01671092 0.01744115 0.01606366 0.01801877
 0.01523116 0.01648605 0.01442255 0.01409631 0.01544474 0.01680914
 0.01659947 0.01715618 0.01413501 0.01592202 0.01434678 0.01508118
 0.01434343 0.0145671  0.01596697 0.01424543 0.01572329 0.01401728
 0.01419798 0.01600666 0.01484067 0.01482251 0.01491952 0.01592255
 0.01600862 0.01566378 0.0160215  0.01598121 0.01529994 0.01415771
 0.01446273 0.0156333  0.01578723 0.01529678 0.01637806 0.01558807
 0.01628312 0.01632156 0.01558149 0.01426011 0.01594996 0.01589215
 0.01672819 0.01529541 0.01433922 0.01505212]
