In [18]:
import nltk
from numpy import array, ones, zeros, multiply
import numpy as np
import sys

In [2]:
import cPickle as pickle

# Load the pickled training corpus. The file is opened in a `with` block so
# the handle is closed as soon as unpickling finishes.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
with open("../data/train10.pkl", "rb") as train_file:
    data = pickle.load(train_file)

In [66]:
class HMM(object):
    """Second-order (trigram) hidden Markov model estimated from count tables.

    Hidden states are the "corrections" (``omega_Y``) and emissions are the
    observed "letters" (``omega_X``). A padding state ``"*"`` is appended to
    the state list and used as left context for the first two positions.

    Attributes:
        N: number of real states (the ``"*"`` padding state is not counted).
        M: number of possible observations.
        omega_Y: state names followed by ``"*"`` (length ``N + 1``).
        omega_X: observation names (length ``M``).
        Y_index / X_index: name -> array index for states / observations.
        transition_proba: array of shape ``(N+1, N+1, N)``; entry
            ``[y_2, y_1, y]`` is the estimate of P(y | y_2, y_1).
        observation_proba: array of shape ``(M, N)``; entry ``[k, i]`` is the
            estimate of P(observation k | state i).
        smoothing_obs: additive smoothing applied to emission counts.
    """

    def __init__(self, state_list, observation_list,
                 transition_proba=None,
                 observation_proba=None,
                 initial_state_proba=None, smoothing_obs=0.01):
        """Create the model and allocate (or adopt) its probability tables.

        Args:
            state_list: sequence of state names.
            observation_list: sequence of observation names.
            transition_proba: optional pre-built transition array; a zero
                array of shape ``(N+1, N+1, N)`` is allocated when None.
            observation_proba: optional pre-built emission array; a zero
                array of shape ``(M, N)`` is allocated when None.
            initial_state_proba: accepted for interface compatibility but not
                used; the initial distribution is stored in the ``"*"`` rows
                of ``transition_proba`` (see ``init_estimation``).
            smoothing_obs: additive smoothing constant for emission counts.
        """
        print("HMM creating with: ")
        self.N = len(state_list)        # number of states
        self.M = len(observation_list)  # number of possible emissions
        print(str(self.N) + " states")
        print(str(self.M) + " observations")

        # Work on copies: appending "*" must not alter the caller's list, and
        # copying also accepts any iterable of names (e.g. a dict key view).
        self.omega_Y = list(state_list) + ["*"]
        self.omega_X = list(observation_list)

        if transition_proba is None:
            self.transition_proba = zeros((self.N + 1, self.N + 1, self.N), float)
        else:
            self.transition_proba = transition_proba
        if observation_proba is None:
            self.observation_proba = zeros((self.M, self.N), float)
        else:
            self.observation_proba = observation_proba

        self.make_indexes()  # build the name -> index mappings
        self.smoothing_obs = smoothing_obs

    def make_indexes(self):
        """Create the reverse tables that map state/observation names to
        their index in the probability arrays."""
        self.Y_index = {}
        for position, state in enumerate(self.omega_Y):
            self.Y_index[state] = position

        self.X_index = {}
        for position, observation in enumerate(self.omega_X):
            self.X_index[observation] = position

    def get_observationIndices(self, observations):
        """Return the indices of ``observations`` as a numpy int array.

        Equivalent to ``[self.X_index[o] for o in observations]`` except that
        out-of-vocabulary observations do not raise: their slot keeps the
        initial value 0.
        NOTE(review): index 0 is also a genuine observation index, so unknown
        observations are indistinguishable from ``omega_X[0]``.
        """
        indices = zeros(len(observations), int)
        for position, observation in enumerate(observations):
            if observation in self.X_index:
                indices[position] = self.X_index[observation]
        return indices

    def data2indices(self, sent):
        """Convert one word of the corpus to index lists.

        Args:
            sent: sequence of ``(letter, correction)`` couples.

        Returns:
            ``(letterids, correctionids)``: two lists of indices, one entry
            per couple.

        Raises:
            KeyError: if a letter or a correction is not in the vocabulary.
        """
        letterids = list()
        correctionids = list()
        for couple in sent:
            ltr = couple[0]
            crt = couple[1]
            letterids.append(self.X_index[ltr])
            correctionids.append(self.Y_index[crt])
        return letterids, correctionids

    def observation_estimation(self, pair_counts):
        """Estimate the emission table from ``(letter, correction)`` counts.

        Counts are written into a fresh ``(M, N)`` matrix, ``smoothing_obs``
        is added to every cell, and each column (one per state) is normalised
        to sum to 1. Starting from a fresh matrix makes repeated calls give
        the same result instead of stacking smoothing on top of previously
        normalised values.

        Pairs whose letter is not in the vocabulary are skipped.

        Args:
            pair_counts: dict mapping ``(letter, correction)`` to its count.

        Raises:
            KeyError: if a correction is not a known state.
        """
        counts = zeros((self.M, self.N), float)
        for pair in pair_counts:
            letter = pair[0]
            correction = pair[1]
            if letter not in self.X_index:
                # Unknown letter: there is no row to credit, so ignore it
                # rather than reuse an index left over from another pair.
                continue
            k = self.X_index[letter]
            i = self.Y_index[correction]
            counts[k, i] = pair_counts[pair]
        counts = counts + self.smoothing_obs
        self.observation_proba = counts / counts.sum(axis=0).reshape(1, self.N)

    def transition_estimation(self, c_bitag, c_tritag):
        """Fill the transition table with relative trigram frequencies.

        For every trigram ``(t2, t1, t)`` in ``c_tritag`` the entry
        ``transition_proba[t2, t1, t]`` is set to
        ``c_tritag[(t2, t1, t)] / c_bitag[(t2, t1)]``.

        Raises:
            KeyError: if a tag is unknown or the context bigram of a trigram
                is missing from ``c_bitag``.
        """
        for tritag in c_tritag:
            # getting indices
            y_2 = self.Y_index[tritag[0]]
            y_1 = self.Y_index[tritag[1]]
            y = self.Y_index[tritag[2]]
            bigram = (tritag[0], tritag[1])
            self.transition_proba[y_2, y_1, y] = (
                float(c_tritag[tritag]) / float(c_bitag[bigram]))

    def init_estimation(self, c_inits, c_inits_bitag):
        """Fill the ``"*"`` rows of the transition table (word beginnings).

        * ``transition_proba[*, *, y]`` is the share of ``y`` among the
          first-position counts in ``c_inits``.
        * ``transition_proba[*, y_1, y]`` is
          ``c_inits_bitag[(y_1, y)] / c_inits[y_1]``.

        Args:
            c_inits: dict mapping a first-position tag to its count.
            c_inits_bitag: dict mapping ``(first tag, second tag)`` to its
                count.
        """
        pad = self.Y_index["*"]
        somme = float(sum(c_inits.values()))
        for correction in c_inits:
            i = self.Y_index[correction]
            self.transition_proba[pad, pad, i] = float(c_inits[correction]) / somme

        for pair in c_inits_bitag:
            y_1 = self.Y_index[pair[0]]
            y = self.Y_index[pair[1]]
            self.transition_proba[pad, y_1, y] = (
                float(c_inits_bitag[pair]) / float(c_inits[pair[0]]))

    def supervised_training(self, pair_counts, c_bitag, c_tritag, c_inits, c_inits_bitag):
        """Train the HMM's parameters. This function wraps everything:
        emission, transition and initial estimation, in that order."""
        self.observation_estimation(pair_counts)
        self.transition_estimation(c_bitag, c_tritag)
        self.init_estimation(c_inits, c_inits_bitag)

In [139]:
def make_counts(corpus):
    """Build the count tables used to train the HMM.

    ``corpus`` is a sequence of words; each word is a sequence of
    ``(letter, correction)`` couples. Every table is a plain dictionary.

    Returns, in this order:
    * c_letter: letter counts
    * c_correction: correction counts
    * c_pairs: counts of the couples themselves (the couple is the key)
    * c_bitag: counts of (previous correction, current correction), taken
      only at positions that are neither the first nor the last of a word
    * c_tritag: counts of three consecutive corrections
    * c_inits: counts of the correction in first position, for words with
      more than one couple
    * c_inits_bitag: counts of (first correction, second correction), for
      words with more than one couple
    """
    def bump(table, key):
        # Add one occurrence of `key`, creating the entry on first sight.
        table[key] = table.get(key, 0) + 1

    c_letter, c_correction, c_pairs = {}, {}, {}
    c_bitag, c_tritag = {}, {}
    c_inits, c_inits_bitag = {}, {}

    for word in corpus:
        length = len(word)
        for pos in range(length):
            couple = word[pos]
            letter, correction = couple[0], couple[1]

            bump(c_letter, letter)
            bump(c_correction, correction)
            bump(c_pairs, couple)

            # Tag bigram ending here, skipped at both ends of the word.
            if 0 < pos < length - 1:
                bump(c_bitag, (word[pos - 1][1], correction))

            # Tag trigram ending here.
            if pos >= 2:
                bump(c_tritag, (word[pos - 2][1], word[pos - 1][1], correction))

            # Word-initial statistics need a second couple to pair with.
            if pos == 0 and length > 1:
                bump(c_inits, correction)
                bump(c_inits_bitag, (correction, word[1][1]))

    return c_letter, c_correction, c_pairs, c_bitag, c_tritag, c_inits, c_inits_bitag

In [140]:
# Build every count table from the loaded corpus in one pass.
(c_letter, c_correction, c_pairs,
 c_bitag, c_tritag,
 c_inits, c_inits_bitag) = make_counts(data)

In [141]:
# States are the corrections seen in the corpus, observations the letters.
# The probability tables are left to their defaults and allocated by HMM.
hmm = HMM(state_list=list(c_correction), observation_list=list(c_letter))

HMM creating with: 
26 states
26 observations


In [132]:
# Estimate the emission table, then print its per-state column totals.
hmm.observation_estimation(c_pairs)
print(hmm.observation_proba.sum(axis=0))

[ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.]


In [142]:
# Fill the trigram transition table from the bigram/trigram tag counts, then
# fill its "*" (word-start) rows from the first-position counts.
hmm.transition_estimation(c_bitag, c_tritag)
hmm.init_estimation(c_inits, c_inits_bitag)

In [147]:
# Total probability mass of each (y_2, y_1) context over the real states.
# The range comes from hmm.N rather than a hard-coded 26, so the check follows
# the model size.
print([hmm.transition_proba[i, j, :].sum()
       for i in range(hmm.N) for j in range(hmm.N)])

[0.0, 1.0, 1.0, 1.0, 0.99999999999999989, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.99999999999999989, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.99999999999999989, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.99999999999999989, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.99999999999999989, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.99999999999999989, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0000000000000002, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.99999999999999989, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.99999999999999989, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.99999999999999989, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 

In [143]:


# Inspect the next-tag distribution for the context ("c", "y"). The tags are
# looked up by name through Y_index instead of by raw positions, which depend
# on the ordering of the dictionary the state list was built from.
tag_before, tag_last = "c", "y"
next_tag_proba = hmm.transition_proba[hmm.Y_index[tag_before],
                                      hmm.Y_index[tag_last], :]
print("%s %s %s %s" % (tag_before, tag_last, tag_before, next_tag_proba))

c y c [ 0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.]


In [97]:
# Hand-copied probability row (26 rounded values); print its total.
liste = [
    0.04424779, 0.1039823, 0.00221239, 0.03318584, 0.0619469,
    0.01327434, 0.11283186, 0., 0., 0.,
    0., 0., 0.00221239, 0.00442478, 0.10840708,
    0., 0., 0.22345133, 0.05088496, 0.00221239,
    0.06858407, 0., 0.1659292, 0., 0., 0.,
]

print(sum(liste))

0.99778762


In [136]:
# Debug pass over all trigrams: re-derive every transition estimate and, for
# the trigrams whose two-tag context equals `bitag`, print the context, the
# trigram count, the context count and their ratio.
bitag = ('a', 'c')
for tritag in c_tritag:
    bigram = (tritag[0], tritag[1])
    # getting indices
    y_2, y_1, y = (hmm.Y_index[tritag[0]],
                   hmm.Y_index[tritag[1]],
                   hmm.Y_index[tritag[2]])
    if bigram == bitag:
        print(bigram)
        print(c_tritag[tritag])
        print(c_bitag[bigram])
        print(float(c_tritag[tritag]) / float(c_bitag[bigram]))
    hmm.transition_proba[y_2, y_1, y] = float(c_tritag[tritag]) / float(c_bitag[bigram])

('a', 'c')
4
407
0.00982800982801
('a', 'c')
15
407
0.036855036855
('a', 'c')
49
407
0.120393120393
('a', 'c')
11
407
0.027027027027
('a', 'c')
33
407
0.0810810810811
('a', 'c')
167
407
0.410319410319
('a', 'c')
3
407
0.00737100737101
('a', 'c')
37
407
0.0909090909091
('a', 'c')
4
407
0.00982800982801
('a', 'c')
74
407
0.181818181818
('a', 'c')
2
407
0.004914004914
('a', 'c')
8
407
0.019656019656


In [128]:
# Debug pass over all trigrams: re-derive every transition estimate and, for
# the trigrams whose two-tag context equals `bitag`, print the trigram, the
# context, the trigram count, the context count and their ratio.
bitag = ('c', 'y')
for tritag in c_tritag:
    bigram = (tritag[0], tritag[1])
    # getting indices
    y_2, y_1, y = (hmm.Y_index[tritag[0]],
                   hmm.Y_index[tritag[1]],
                   hmm.Y_index[tritag[2]])
    if bigram == bitag:
        print(tritag)
        print(bigram)
        print(c_tritag[tritag])
        print(c_bitag[bigram])
        print(float(c_tritag[tritag]) / float(c_bitag[bigram]))
    hmm.transition_proba[y_2, y_1, y] = float(c_tritag[tritag]) / float(c_bitag[bigram])

('c', 'y', 'c')
('c', 'y')
3
24
0.125
