This is the notebook for exploring hierarchical HMM.

In [1]:
from hmm import *

In [2]:
header_names = ["id", "sequence", "structure"]
proteins_train = pd.read_csv("../projects/03/data/proteins_train.tsv", sep="\t", header=None)
proteins_train.columns = header_names
proteins_test = pd.read_csv("../projects/03/data/proteins_test.tsv", sep="\t", header=None)
proteins_test.columns = header_names

In [3]:
proteins_train.head()

Unnamed: 0,id,sequence,structure
0,>101M:A,MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...,CCCCHHHHHHHHHHHHHHGGGHHHHHHHHHHHHHHHCGGGGGGCTT...
1,>102L:A,MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAAKSE...,CCHHHHHHHHHCCEEEEEECTTSCEEEETTEEEESSSCTTTHHHHH...
2,>102M:A,MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...,CCCCHHHHHHHHHHHHHHGGGHHHHHHHHHHHHHHHCGGGGGGCTT...
3,>103L:A,MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNSLDAAK...,CCHHHHHHHHHCCEEEEEECTTSCEEEETTEECCCCCCCCCHHHHH...
4,>103M:A,MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...,CCCCHHHHHHHHHHHHHHGGGHHHHHHHHHHHHHHHCGGGGGGCTT...


In [4]:
aa_states = proteins_train["sequence"].apply(lambda x: list(x)).explode().unique()
aa_states.sort()
aa_states

array(['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P',
       'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y'], dtype=object)

In [5]:
ss_states = proteins_train["structure"].apply(lambda x: list(x)).explode().unique()
ss_states.sort()
ss_states

array(['B', 'C', 'E', 'G', 'H', 'I', 'S', 'T'], dtype=object)

In [6]:
res = MLE(proteins_train, aa_states, ss_states)

In [7]:
vit = viterbi(
    np.log(res["E"]), np.log(res["Tr"]), np.log(res['I']),proteins_test["sequence"], is_log=True
)

  result = func(self.values, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [8]:
# turn vit to dataframe
vit_df = pd.DataFrame(vit, columns=["predicted_structure", "likelihood"])

In [9]:
vit_df.head()

Unnamed: 0,predicted_structure,likelihood
0,CCHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH...,-468.593868
1,CCHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH...,-468.593868
2,CHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH...,-453.524341
3,CHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH...,-453.524341
4,CHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH...,-453.524341


In [18]:
def hamming_distance(s1, s2):
    return sum(c1 != c2 for c1, c2 in zip(s1, s2))

In [23]:
# compute accuracy
vit_df["hamming"] = vit_df["predicted_structure"].apply(
    lambda x: (len(x) - hamming_distance(proteins_test["structure"].iloc[0], x)) / len(x)
)
vit_df["hamming"].mean()

0.4664140569662479

In [11]:
res2 = MLE_3state(proteins_train, aa_states)

In [12]:
res2

{'I': H    0.0
 E    0.0
 C    1.0
 dtype: float64,
 'E':           A         C         D         E         F         G         H  \
 H  0.123484  0.012885  0.056717  0.085998  0.036332  0.037582  0.022645   
 E  0.058049  0.023489  0.027980  0.040025  0.052470  0.048093  0.027523   
 C  0.061565  0.015103  0.076355  0.051894  0.029126  0.125292  0.024205   
 
           I         K         L  ...         P         Q         R         S  \
 H  0.052775  0.071718  0.107890  ...  0.023431  0.040771  0.061846  0.043159   
 E  0.093068  0.057677  0.105113  ...  0.017824  0.031356  0.038337  0.051984   
 C  0.035511  0.067908  0.060783  ...  0.072402  0.032724  0.047016  0.076540   
 
           T         U         V         W         X         Y  
 H  0.043496  0.000016  0.067471  0.014135  0.000000  0.031380  
 E  0.077790  0.000000  0.126513  0.019741  0.000000  0.055217  
 C  0.064893  0.000000  0.042622  0.010382  0.000583  0.027263  
 
 [3 rows x 22 columns],
 'Tr':           H       

In [24]:
vit_3 = viterbi(
    np.log(res2["E"]), np.log(res2["Tr"]), np.log(res2['I']),proteins_test["sequence"], is_log=True
)
vit_df_3 = pd.DataFrame(vit_3, columns=["predicted_structure", "likelihood"])

  result = func(self.values, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [27]:
vit_df["predicted_structure"][0]

'CCHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH'

In [26]:
vit_df_3["predicted_structure"][0]

'CCHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHCCCCCC'

In [35]:
proteins_test["structure"].iloc[0]


'CCCEEECCHHHHHHHHHHHHHHCCCCCCCCEEEEEECCCHHHHHHHHHHHCCCCEEEEEEEEEEECCEEEEEEEECCCCCCCCEEEEEEEECCCHHHHHHHHHCCCCEEEEEEECCCCHHHCCCCCEECCCCCEEECHHHEEEEECCCCCCC'

In [36]:
proteins_test["structure_3state"].iloc[0]

'CCCEEECCHHHHHHHHHHHHHHCCCCCCCCEEEEEECCCHHHHHHHHHHHCCCCEEEEEEEEEEECCEEEEEEEECCCCCCCCEEEEEEEECCCHHHHHHHHHCCCCEEEEEEECCCCHHHCCCCCEECCCCCEEECHHHEEEEECCCCCCC'

In [29]:
proteins_test["structure_3state"] = proteins_test["structure"].apply(
    lambda x: "".join([map8to3[i] for i in x])
)

In [33]:
# compute accuracy
vit_df_3["hamming"] = vit_df_3["predicted_structure"].apply(
    lambda x: (len(x) - hamming_distance(proteins_test["structure_3state"].iloc[0], x)) / len(x)
)
vit_df_3["hamming"].mean()

0.48836129189551797