In [1]:
import urllib.request
import gzip
import shutil
import os

import numpy as np
import pandas as pd
from hmmlearn.hmm import MultinomialHMM
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation

In [2]:
url = 'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/009/045/GCF_000009045.1_ASM904v1/GCF_000009045.1_ASM904v1_genomic.gbff.gz'
gz_fn = 'GCF_000009045.1_ASM904v1_genomic.gbff.gz'
gbk_fn = gz_fn.replace('.gbff.gz', '.gbk')
if not os.path.isfile(gz_fn):
    print('Beginning file download with urllib2...')
    urllib.request.urlretrieve(url, gz_fn)

if not os.path.isfile(gbk_fn):
    with gzip.open(gz_fn, 'rb') as f_in, open(gbk_fn, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

Beginning file download with urllib2...


Helper functions

In [3]:
def get_list_of_codons(dna_seq):
    codons = []
    for i in range(0, len(dna_seq), 3):
        codons.append(dna_seq[i:i+3])
    return codons
assert get_list_of_codons('ATGCCCGGGAAATTTTAG') == ['ATG', 'CCC', 'GGG', 'AAA', 'TTT', 'TAG']

def check_len_and_ambiguity(seq):
    assert isinstance(seq, str)
    ambig_nucs = ['R', 'Y', 'S', 'W', 'K', 'M', 'B', 'D', 'H', 'V', 'N']
    unambiguous = not any([anuc in seq for anuc in ambig_nucs])
    multiple_3 = len(seq) % 3 == 0
    return unambiguous and multiple_3
assert check_len_and_ambiguity('ATGACCTAG')
assert not check_len_and_ambiguity('ATGACCTA')
assert not check_len_and_ambiguity('ATGACCTAY')
assert not check_len_and_ambiguity('ATGACCAY')

In [4]:
u_aas = set()
u_codons = set()
all_codons = []
initial_states = []
emissions = {}
for record in SeqIO.parse(gbk_fn, "genbank"):
    for feature in record.features:
        if feature.type == 'CDS' and \
           'translation' in feature.qualifiers and \
           check_len_and_ambiguity(str(record.seq)):
            protein = feature.qualifiers['translation'][0] + '*'
            print(protein)
            aas = {aa for aa in protein}
            codon = get_list_of_codons(str(feature.extract(record.seq)))
            if len(protein) == len(codon):
                all_codons.append(codon)
                initial_states.append(codon[0])
                u_aas = u_aas.union(aas)
                u_codons = u_codons.union(set(codon))
                for i, cdn in enumerate(codon):
                    emissions[cdn] = protein[i]
        if len(initial_states) > 10:
            break
lu_aas = list(u_aas)
lu_codons = list(u_codons)

MENILDLWNQALAQIEKKLSKPSFETWMKSTKAHSLQGDTLTITAPNEFARDWLESRYLHLIADTIYELTGEELSIKFVIPQNQDVEDFMPKPQVKKAVKEDTSDFPQNMLNPKYTFDTFVIGSGNRFAHAASLAVAEAPAKAYNPLFIYGGVGLGKTHLMHAIGHYVIDHNPSAKVVYLSSEKFTNEFINSIRDNKAVDFRNRYRNVDVLLIDDIQFLAGKEQTQEEFFHTFNTLHEESKQIVISSDRPPKEIPTLEDRLRSRFEWGLITDITPPDLETRIAILRKKAKAEGLDIPNEVMLYIANQIDSNIRELEGALIRVVAYSSLINKDINADLAAEALKDIIPSSKPKVITIKEIQRVVGQQFNIKLEDFKAKKRTKSVAFPRQIAMYLSREMTDSSLPKIGEEFGGRDHTTVIHAHEKISKLLADDEQLQQHVKEIKEQLK*
MKFTIQKDRLVESVQDVLKAVSSRTTIPILTGIKIVASDDGVSFTGSDSDISIESFIPKEEGDKEIVTIEQPGSIVLQARFFSEIVKKLPMATVEIEVQNQYLTIIRSGKAEFNLNGLDADEYPHLPQIEEHHAIQIPTDLLKNLIRQTVFAVSTSETRPILTGVNWKVEQSELLCTATDSHRLALRKAKLDIPEDRSYNVVIPGKSLTELSKILDDNQELVDIVITETQVLFKAKNVLFFSRLLDGNYPDTTSLIPQDSKTEIIVNTKEFLQAIDRASLLAREGRNNVVKLSAKPAESIEISSNSPEIGKVVEAIVADQIEGEELNISFSPKYMLDALKVLEGAEIRVSFTGAMRPFLIRTPNDETIVQLILPVRTY*
MANPISIDTEMITLGQFLKLADVIQSGGMAKWFLSEHEVLVNDEPDNRRGRKLYVGDVVEIEGFGSFQVVN*
MYIQNLELTSYRNYDHAELQFENKVNVIIGENAQGKTNLMEAIYVLSMAKSHRTSNDKELIRWDKDYAKIEGRVMKQNGAIPMQLVISKKGKKGKVNHI

In [5]:
lu_codons

['GTC',
 'TTC',
 'AAA',
 'GCA',
 'CCG',
 'ATC',
 'ATA',
 'CGT',
 'TAG',
 'TAT',
 'TGC',
 'GAT',
 'TCC',
 'TAC',
 'CAA',
 'CGC',
 'TCA',
 'TAA',
 'AAT',
 'GTT',
 'AGC',
 'CTT',
 'CTC',
 'TGG',
 'CTA',
 'GGA',
 'TCT',
 'TGT',
 'TTT',
 'CCA',
 'CAT',
 'AGT',
 'ACA',
 'AAC',
 'CCC',
 'GGG',
 'AGG',
 'CAG',
 'TCG',
 'CGA',
 'GTG',
 'CCT',
 'TGA',
 'GAG',
 'ACT',
 'CTG',
 'ATT',
 'TTG',
 'CGG',
 'GCT',
 'GAC',
 'GGC',
 'TTA',
 'ATG',
 'ACG',
 'GGT',
 'AAG',
 'ACC',
 'CAC',
 'GTA',
 'GCG',
 'AGA',
 'GAA',
 'GCC']

In [6]:
emissions['AAA'], emissions['ATG']

('K', 'M')

In [7]:
def encode_seq(seq_obj, seqtype='dna'):
    encdr = lu_codons
    symbols = get_list_of_codons(seq_obj)
    if seqtype != 'dna':
        encdr = lu_aas
        symbols = [c for c in seq_obj]
    outseq = np.array([encdr.index(s) for s in symbols])
    return outseq

test_aa = 'MENILD'
test_nuc = 'AAAAAAATAAGATAG'
assert encode_seq(test_aa, seqtype='prot')[0] == lu_aas.index(test_aa[0]) and \
       encode_seq(test_aa, seqtype='prot')[-1] == lu_aas.index(test_aa[-1])
assert encode_seq(test_nuc, seqtype='dna')[0] == lu_codons.index(test_nuc[0:3]) and \
       encode_seq(test_nuc, seqtype='dna')[-1] == lu_codons.index(test_nuc[-3:])

def decode_seq(num_array, seqtype='dna'):
    encdr = lu_codons
    if seqtype != 'dna':
        encdr = lu_aas
    outseq = [encdr[s] for s in num_array]
    return ''.join(outseq)

assert decode_seq(encode_seq(test_nuc)) == test_nuc
assert decode_seq(encode_seq(test_aa, seqtype='prot'), seqtype='prot') == test_aa

In [8]:
emission_prob = np.zeros((len(lu_codons), len(lu_aas)))
for i, codon in enumerate(lu_codons):
    aa = emissions[codon]
    j = lu_aas.index(aa)
    emission_prob[i, j] = 1

In [9]:
initial_probabilities = {k:0 for k in lu_codons}
for i in initial_states:
    initial_probabilities[i] += 1
for k in initial_probabilities.keys():
    initial_probabilities[k] = initial_probabilities[k] / len(initial_states)
initial_probs_np = np.array([initial_probabilities[x] for x in lu_codons])

In [10]:
transition_counts = np.ones((len(lu_codons), len(lu_codons)))
for gene in all_codons:
    for i in range(0, len(gene)-1):
        codon0 = lu_codons.index(gene[i])
        codon1 = lu_codons.index(gene[i+1])
        transition_counts[codon0, codon1] += 1
transition_totals = transition_counts.sum(axis=1)
transition_probs = np.dot(np.diag(1/transition_totals), transition_counts)
transition_probs_df = pd.DataFrame(transition_probs, index=lu_codons, columns=lu_codons)
transition_probs_df.head()

Unnamed: 0,GTC,TTC,AAA,GCA,CCG,ATC,ATA,CGT,TAG,TAT,...,ACG,GGT,AAG,ACC,CAC,GTA,GCG,AGA,GAA,GCC
GTC,0.023256,0.007752,0.054264,0.007752,0.015504,0.062016,0.007752,0.007752,0.007752,0.007752,...,0.031008,0.015504,0.007752,0.007752,0.007752,0.023256,0.007752,0.023256,0.007752,0.007752
TTC,0.008696,0.034783,0.026087,0.008696,0.034783,0.017391,0.017391,0.017391,0.008696,0.017391,...,0.017391,0.017391,0.008696,0.008696,0.008696,0.008696,0.017391,0.017391,0.017391,0.008696
AAA,0.012658,0.015823,0.056962,0.037975,0.022152,0.031646,0.012658,0.012658,0.006329,0.009494,...,0.012658,0.022152,0.022152,0.003165,0.012658,0.015823,0.025316,0.006329,0.06962,0.015823
GCA,0.006329,0.006329,0.063291,0.012658,0.006329,0.006329,0.006329,0.006329,0.006329,0.012658,...,0.012658,0.012658,0.025316,0.006329,0.006329,0.018987,0.006329,0.031646,0.037975,0.025316
CCG,0.00885,0.00885,0.035398,0.00885,0.017699,0.00885,0.00885,0.026549,0.00885,0.00885,...,0.00885,0.00885,0.00885,0.017699,0.035398,0.017699,0.00885,0.00885,0.017699,0.00885


Build model

In [11]:
hmm = MultinomialHMM(n_components=len(lu_codons), 
                     startprob_prior=initial_probs_np, 
                     transmat_prior=transition_probs, 
                     verbose=False,  
                     init_params='')
hmm.transmat_ = transition_probs
hmm.emissionprob_ = emission_prob
hmm.startprob_ = initial_probs_np
hmm.n_features = len(lu_aas)

In [12]:
sample_aa, sample_nuc = hmm.sample(n_samples=10, random_state=21)

In [13]:
assert emission_prob[sample_nuc[0], sample_aa[0]] == 1
decode_seq(sample_aa.reshape(-1), seqtype='prot'), decode_seq(sample_nuc.reshape(-1), seqtype='dna')

('LLDNRPYHLL', 'TTGCTGGATAATCGCCCGTATCATTTATTA')

Testing

In [15]:
protein = ''
dna = ''
for record in SeqIO.parse(gbk_fn, "genbank"):
    for feature in record.features:
        if feature.type == 'CDS' and 'translation' in feature.qualifiers:
            protein = Seq(feature.qualifiers['translation'][0] + '*')
            dna = feature.extract(record.seq)
            break

In [16]:
protein

Seq('MENILDLWNQALAQIEKKLSKPSFETWMKSTKAHSLQGDTLTITAPNEFARDWL...LK*')

In [17]:
prot_e = encode_seq(protein, seqtype='prot')
# protein, prot_e.reshape(-1, 1)

In [18]:
mle_dna_indices = hmm.predict(prot_e.reshape(-1, 1))
mle_dna = decode_seq(mle_dna_indices)

In [19]:
Seq(mle_dna).translate()

Seq('MENILDLWNQALAQIEKKLSKPSFETWMKSTKAHSLQGDTLTITAPNEFARDWL...LK*')

In [22]:
from Bio.HMM.Utilities import pretty_print_prediction

pretty_protein = ''.join([x + '  ' for x in protein])
pretty_print_prediction(pretty_protein, 
                        dna, 
                        mle_dna)

Emissions       M  E  N  I  L  D  L  W  N  Q  A  L  A  Q  I  E  K  K  L  S 
Real State      ATGGAAAATATATTAGACCTGTGGAACCAAGCCCTTGCTCAAATCGAAAAAAAGTTGAG
Predicted State ATGGAAAATATCTTGGATCTGTGGAATCAAGCGCTAGCTCAAATTGAAAAAAAATTATC

Emissions        K  P  S  F  E  T  W  M  K  S  T  K  A  H  S  L  Q  G  D  T
Real State      CAAACCGAGTTTTGAGACTTGGATGAAGTCAACCAAAGCCCACTCACTGCAAGGCGATA
Predicted State AAAACCGAGTTTTGAGACTTGGATGAAGTCAACAAAAGCGCACTCTCTTCAAGGCGATA

Emissions         L  T  I  T  A  P  N  E  F  A  R  D  W  L  E  S  R  Y  L  
Real State      CATTAACAATCACGGCTCCCAATGAATTTGCCAGAGACTGGCTGGAGTCCAGATACTTG
Predicted State CATTAACGATTACAGCTCCCAATGAATTTGCCCGTGACTGGCTGGAGTCCAGATACCTT

Emissions       H  L  I  A  D  T  I  Y  E  L  T  G  E  E  L  S  I  K  F  V 
Real State      CATCTGATTGCAGATACTATATATGAATTAACCGGGGAAGAATTGAGCATTAAGTTTGT
Predicted State CATTTAATTGCAGATACAATTTATGAATTAACAGGTGAAGAACTCAGCATTAAATTTGT

Emissions        I  P  Q  N  Q  D  V  E  D  F  M  P  K  P  Q  V  K  K  A  V
Real Sta

In [21]:
state_probabilities = hmm.predict_proba(prot_e.reshape(-1, 1))
state_probabilities[4,]

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.26710532, 0.12009873, 0.        , 0.07194147,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.18877108, 0.        , 0.21970591, 0.        , 0.        ,
       0.        , 0.        , 0.13237748, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        ])

In [None]:
# Check how well the HMM reproduces the native encoding for all the proteins