In [1]:
import numpy as np
import pandas as pd
from hmmlearn.hmm import MultinomialHMM
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation

In [2]:
import urllib.request

# Remote annotation file to fetch; it is stored locally under its own basename.
# NOTE(review): the later cells visible in this notebook parse
# 'GCF_000009045.1_ASM904v1_genomic.gbff' and never read this download --
# confirm whether this cell is still needed.
url = 'https://ftp.ncbi.nlm.nih.gov/genomes/genbank/plant/Arabidopsis_thaliana/latest_assembly_versions/GCA_000001735.2_TAIR10.1/GCA_000001735.2_TAIR10.1_genomic.gff.gz'

print('Beginning file download with urllib2...')

# Last expression of the cell: displays urlretrieve's (filename, headers) result.
urllib.request.urlretrieve(url, url.rsplit('/', 1)[-1])

Beginning file download with urllib2...


('GCA_000001735.2_TAIR10.1_genomic.gff.gz',
 <http.client.HTTPMessage at 0x7f9058e45490>)

Helper functions

In [2]:
def get_list_of_codons(dna_seq):
    """Split ``dna_seq`` into consecutive, non-overlapping chunks of three.

    Chunks are taken by slicing from position 0. If the length is not a
    multiple of three, the final chunk holds the remaining one or two items.
    An empty input yields an empty list.
    """
    return [dna_seq[start:start + 3] for start in range(0, len(dna_seq), 3)]


assert get_list_of_codons('ATGCCCGGGAAATTTTAG') == ['ATG', 'CCC', 'GGG', 'AAA', 'TTT', 'TAG']

In [4]:
# Accumulators filled from every CDS feature that carries a 'translation'
# qualifier in the GenBank file.
u_aas = set()            # every amino-acid symbol seen (plus '*' for the stop)
u_codons = set()         # every codon seen
all_codons = []          # one list of codons per CDS, in file order
initial_states = []      # first codon of each CDS
emissions = {}           # codon -> amino acid it is most often paired with
emission_counts = {}     # codon -> {amino acid: number of times paired}

for record in SeqIO.parse('GCF_000009045.1_ASM904v1_genomic.gbff', "genbank"):
    for feature in record.features:
        if feature.type != 'CDS' or 'translation' not in feature.qualifiers:
            continue
        # '*' is appended so the final codon of the CDS has a symbol to pair with.
        protein = feature.qualifiers['translation'][0] + '*'
        codons = get_list_of_codons(str(feature.extract(record.seq)))
        # Skip features that cannot be paired position by position: an empty
        # CDS has no first codon, and a CDS with more codons than residues
        # used to raise IndexError in the pairing loop below.
        if not codons or len(codons) > len(protein):
            continue
        all_codons.append(codons)
        initial_states.append(codons[0])
        # In-place updates avoid rebuilding the sets for every CDS.
        u_aas.update(protein)
        u_codons.update(codons)
        for cdn, aa in zip(codons, protein):
            aa_counts = emission_counts.setdefault(cdn, {})
            aa_counts[aa] = aa_counts.get(aa, 0) + 1

# Majority vote per codon. A plain "last pairing wins" assignment makes the
# table depend on file order whenever one codon is paired with different
# symbols in different positions.
for cdn, aa_counts in emission_counts.items():
    emissions[cdn] = max(aa_counts, key=aa_counts.get)

# Sorted so the integer encodings are identical in every kernel session;
# iteration order of a set of strings is not stable across interpreter runs.
lu_aas = sorted(u_aas)
lu_codons = sorted(u_codons)

BSU_00010
BSU_00020
BSU_00030
BSU_00040
BSU_00050
BSU_00060
BSU_00070
BSU_00080
BSU_00090
BSU_00100
BSU_00110
BSU_00120
BSU_00130
BSU_00140
BSU_00150
BSU_00160
BSU_00170
BSU_00180
BSU_00190
BSU_00200
BSU_00210
BSU_00220
BSU_00230
BSU_00240
BSU_00250
BSU_00260
BSU_00270
BSU_00280
BSU_00290
BSU_00300
BSU_00310
BSU_00320
BSU_00330
BSU_00340
BSU_00350
BSU_00360
BSU_00370
BSU_00380
BSU_00390
BSU_00400
BSU_00410
BSU_00420
BSU_00430
BSU_00440
BSU_00450
BSU_00460
BSU_00470
BSU_00480
BSU_00490
BSU_00500
BSU_00510
BSU_00520
BSU_00530
BSU_00540
BSU_00550
BSU_00560
BSU_00570
BSU_00580
BSU_00590
BSU_00600
BSU_00610
BSU_00620
BSU_00630
BSU_00640
BSU_00650
BSU_00660
BSU_00670
BSU_00680
BSU_00690
BSU_00700
BSU_00710
BSU_00720
BSU_00730
BSU_00740
BSU_00750
BSU_00760
BSU_00770
BSU_00780
BSU_00790
BSU_00800
BSU_00810
BSU_00820
BSU_00830
BSU_00840
BSU_00850
BSU_00860
BSU_00870
BSU_00880
BSU_00890
BSU_00900
BSU_00910
BSU_00920
BSU_00930
BSU_00940
BSU_00950
BSU_00960
BSU_00970
BSU_00980
BSU_00990
BSU_01000


In [9]:
# Spot-check the codon -> amino-acid table for two codons.
emissions['AAA'], emissions['ATG']

('K', 'M')

In [None]:
def encode_seq(seq_obj, seqtype='dna'):
    """Encode a sequence as an array of integer symbol indices.

    Parameters
    ----------
    seq_obj : sequence
        The sequence to encode. With ``seqtype='dna'`` it is split into
        codons by ``get_list_of_codons``; otherwise it is iterated item by item.
    seqtype : str, default 'dna'
        ``'dna'`` encodes codons against ``lu_codons``; any other value
        encodes single symbols against ``lu_aas``.

    Returns
    -------
    numpy.ndarray
        1-D integer array holding each symbol's position in the alphabet.

    Raises
    ------
    ValueError
        If a symbol (including a trailing partial codon) is not in the alphabet.
    """
    if seqtype == 'dna':
        alphabet = lu_codons
        symbols = get_list_of_codons(seq_obj)
    else:
        # No codon split is needed (or computed) for non-DNA input.
        alphabet = lu_aas
        symbols = list(seq_obj)

    # One pass over the alphabet instead of a list.index scan per symbol.
    # setdefault keeps the first position of a symbol, matching list.index.
    position_of = {}
    for position, symbol in enumerate(alphabet):
        position_of.setdefault(symbol, position)

    codes = []
    for symbol in symbols:
        # str() lets slices of sequence objects match the plain-string alphabet.
        key = str(symbol)
        if key not in position_of:
            raise ValueError(
                f"symbol {key!r} is not in the {seqtype!r} alphabet"
            )
        codes.append(position_of[key])
    # Explicit integer dtype so an empty input still yields an integer array.
    return np.array(codes, dtype=int)

test_aa = 'MENILD'
test_nuc = 'AAAAAAATAAGATAG'

# Protein input: the first and last residues must encode to their positions in lu_aas.
assert encode_seq(test_aa, seqtype='prot')[0] == lu_aas.index(test_aa[0])
assert encode_seq(test_aa, seqtype='prot')[-1] == lu_aas.index(test_aa[-1])

# DNA input: the first and last codons must encode to their positions in lu_codons.
assert encode_seq(test_nuc, seqtype='dna')[0] == lu_codons.index(test_nuc[0:3])
assert encode_seq(test_nuc, seqtype='dna')[-1] == lu_codons.index(test_nuc[-3:])

def decode_seq(num_array, seqtype='dna'):
    """Turn an iterable of integer indices back into a sequence string.

    With ``seqtype='dna'`` each index selects an entry of ``lu_codons``; with
    any other value it selects an entry of ``lu_aas``. The selected symbols
    are concatenated in order and returned as one string.
    """
    alphabet = lu_codons if seqtype == 'dna' else lu_aas
    return ''.join(alphabet[code] for code in num_array)

# Round trip: decoding an encoded sequence must give back the original text,
# for the codon alphabet and for the amino-acid alphabet.
assert decode_seq(encode_seq(test_nuc)) == test_nuc
assert decode_seq(encode_seq(test_aa, seqtype='prot'), seqtype='prot') == test_aa

In [None]:
# Emission matrix: one row per codon, one column per amino-acid symbol.
# Each row gets a single 1 in the column of the symbol stored for that codon
# in `emissions`; every other entry stays 0.
emission_prob = np.zeros((len(lu_codons), len(lu_aas)))
for row, cdn in enumerate(lu_codons):
    emission_prob[row, lu_aas.index(emissions[cdn])] = 1

In [None]:
# Start distribution: the fraction of CDSs that begin with each codon.
initial_probabilities = dict.fromkeys(lu_codons, 0)
for start_codon in initial_states:
    initial_probabilities[start_codon] += 1
for cdn in initial_probabilities:
    initial_probabilities[cdn] /= len(initial_states)
# Same values as a vector, ordered like lu_codons.
initial_probs_np = np.array([initial_probabilities[cdn] for cdn in lu_codons])

In [None]:
# Codon-to-codon transition counts. Every cell starts at 1, so each possible
# transition keeps a non-zero probability even if it was never observed.
transition_counts = np.ones((len(lu_codons), len(lu_codons)))

# Resolve codon -> matrix index once. Calling list.index inside the loop
# rescans lu_codons twice for every adjacent codon pair in every gene.
codon_to_idx = {cdn: idx for idx, cdn in enumerate(lu_codons)}
for gene in all_codons:
    for prev_cdn, next_cdn in zip(gene, gene[1:]):
        transition_counts[codon_to_idx[prev_cdn], codon_to_idx[next_cdn]] += 1

# Row-normalise with broadcasting: each row is divided by its own total,
# without building a dense diagonal matrix and multiplying by it.
transition_totals = transition_counts.sum(axis=1)
transition_probs = transition_counts / transition_totals[:, np.newaxis]
transition_probs_df = pd.DataFrame(transition_probs, index=lu_codons, columns=lu_codons)
transition_probs_df.head()

Build model

In [None]:
# HMM with one hidden state per codon. Its parameters are not fitted here:
# the start, transition and emission tables computed in the earlier cells
# are assigned directly to the model's attributes.
# NOTE(review): newer hmmlearn releases provide a separate CategoricalHMM for
# one-symbol-per-step observations -- confirm MultinomialHMM is the right
# class for the installed version.
hmm = MultinomialHMM(
    n_components=len(lu_codons),
    startprob_prior=initial_probs_np,
    transmat_prior=transition_probs,
    verbose=False,
    init_params='',
)
hmm.startprob_ = initial_probs_np
hmm.transmat_ = transition_probs
hmm.emissionprob_ = emission_prob
# Number of distinct observable symbols (amino acids plus '*').
hmm.n_features = len(lu_aas)

In [None]:
# Draw 10 steps from the model with a fixed seed. The first returned array is
# used below as amino-acid indices and the second as codon indices.
sample_aa, sample_nuc = hmm.sample(n_samples=10, random_state=21)

In [None]:
# The first sampled codon must emit the first sampled amino acid with probability 1.
assert emission_prob[sample_nuc[0], sample_aa[0]] == 1
# Show the sampled protein and the codon sequence behind it as text.
decode_seq(sample_aa.reshape(-1), seqtype='prot'), decode_seq(sample_nuc.reshape(-1), seqtype='dna')

Testing

In [None]:
# Use the first translated CDS in the file as the test case: `protein` is its
# annotated translation plus '*', and `dna` is the matching nucleotide sequence.
protein = ''
dna = ''
for record in SeqIO.parse('GCF_000009045.1_ASM904v1_genomic.gbff', "genbank"):
    for feature in record.features:
        if feature.type == 'CDS' and 'translation' in feature.qualifiers:
            protein = Seq(feature.qualifiers['translation'][0] + '*')
            dna = feature.extract(record.seq)
            break
    else:
        # This record has no translated CDS; try the next record.
        continue
    # The inner loop ended with `break`, so a CDS was found. Stop scanning
    # records too; otherwise a later record's first CDS would overwrite it.
    break

In [None]:
# Display the protein chosen as the test case.
protein

In [None]:
# Encode the test protein as indices into lu_aas.
prot_e = encode_seq(protein, seqtype='prot')
# Later cells pass this to the model as a single column: prot_e.reshape(-1, 1).

In [None]:
# Ask the model for a codon (hidden-state) index per residue of the encoded
# protein, then turn those indices back into a DNA string.
# NOTE(review): hmmlearn's predict presumably uses the model's default decoder
# (Viterbi) -- confirm that this is what the "mle" name is meant to describe.
mle_dna_indices = hmm.predict(prot_e.reshape(-1, 1))
mle_dna = decode_seq(mle_dna_indices)

In [None]:
# Translate the predicted DNA so it can be compared by eye with `protein`.
Seq(mle_dna).translate()

In [None]:
# from Bio.HMM.Utilities import pretty_print_prediction

# pretty_protein = ''.join([x + '  ' for x in protein])
# pretty_print_prediction(pretty_protein, 
#                         dna, 
#                         mle_dna)

In [None]:
# Per-position probabilities over the hidden states for the encoded protein.
state_probabilities = hmm.predict_proba(prot_e.reshape(-1, 1))
# Show the row for index 4, i.e. the fifth position of the protein.
state_probabilities[4,]