In [1]:
import nltk
import numpy
from numpy import array, ones, zeros, multiply
import sys
from ipywidgets import FloatProgress
from IPython.display import display
import cPickle as pickle  

# Directory that holds the pickled datasets. Kept in one place so the notebook
# can be pointed at another location by editing a single line.
DATA_DIR = '/Users/xchen/Documents/AIC/TC4/projet/typos-data/'


def load_pickle(filename):
    """Return the object pickled in DATA_DIR + filename.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the object has been read (the previous inline ``open`` calls were never
    closed).
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(DATA_DIR + filename, 'rb') as handle:
        return pickle.load(handle)


train_10 = load_pickle('train10.pkl')
train_20 = load_pickle('train20.pkl')
test_10 = load_pickle('test10.pkl')
test_20 = load_pickle('test20.pkl')

# Number of sequences in each split, then one sample sequence. The sample is
# a list of 2-tuples; the rest of the notebook treats element 0 as the
# observed symbol and element 1 as the reference symbol.
print("%d %d %d %d" % (len(train_10), len(train_20), len(test_10), len(test_20)))
print(test_10[2])


29057 27184 1501 3374
[('i', 'i'), ('s', 's')]


In [2]:
class HMM:
    """First-order hidden Markov model with count-based (supervised) training
    and Viterbi decoding.

    Matrix conventions used by every method:
        transition_proba[i, j]  = Pr(Y_(t+1)=q_i | Y_t=q_j)   shape (N, N)
        observation_proba[k, i] = Pr(X_t=v_k | Y_t=q_i)       shape (M, N)
        initial_state_proba[i]  = Pr(Y_0=q_i)                 shape (N,)
    so the columns of the two matrices are the conditional distributions.
    """

    # Observation index used for out-of-vocabulary symbols.
    # NOTE(review): index 0 is simply the first entry of observation_list, so
    # OOV symbols share a row with that entry; put an explicit unknown token
    # first in observation_list if OOVs are expected.
    UNK_INDEX = 0

    def __init__(self, state_list, observation_list,
                 transition_proba=None,
                 observation_proba=None,
                 initial_state_proba=None, smoothing_obs=0.01):
        """Builds a new Hidden Markov Model.

        state_list is the sequence of state symbols [q_0...q_(N-1)]
        observation_list is the sequence of observation symbols [v_0...v_(M-1)]
        transition_proba is the transition probability matrix
            [a_ij] a_ij = Pr(Y_(t+1)=q_i|Y_t=q_j)
        observation_proba is the observation probability matrix
            [b_ki] b_ki = Pr(X_t=v_k|Y_t=q_i)
        initial_state_proba is the initial state distribution
            [pi_i] pi_i = Pr(Y_0=q_i)
        smoothing_obs is the constant added to every observation count
            before normalisation in observation_estimation.

        Any distribution left as None is initialised to zeros.
        """
        print("HMM creating with: ")
        # Copy into lists so that any iterable (e.g. a dict view) can be
        # indexed by position in make_indexes.
        self.omega_Y = list(state_list)        # state symbols
        self.omega_X = list(observation_list)  # observation symbols
        self.N = len(self.omega_Y)  # number of states
        self.M = len(self.omega_X)  # number of observation symbols
        print(str(self.N)+" states")
        print(str(self.M)+" observations")
        # Init. of the 3 distributions: transition, observation, initial state
        if transition_proba is None:
            self.transition_proba = zeros((self.N, self.N), float)
        else:
            self.transition_proba = transition_proba
        if observation_proba is None:
            self.observation_proba = zeros((self.M, self.N), float)
        else:
            self.observation_proba = observation_proba
        if initial_state_proba is None:
            self.initial_state_proba = zeros((self.N,), float)
        else:
            self.initial_state_proba = initial_state_proba
        # Everything is stored in numpy arrays, so symbols are handled as
        # integer indices; keep the symbol -> index mapping alongside.
        self.make_indexes()
        self.smoothing_obs = smoothing_obs

    def make_indexes(self):
        """Creates the reverse tables that map state/observation symbols
        to their index in the probability arrays."""
        self.Y_index = {}
        for i, state in enumerate(self.omega_Y):
            self.Y_index[state] = i
        self.X_index = {}
        for i, symbol in enumerate(self.omega_X):
            self.X_index[symbol] = i

    def _normalize_columns(self, matrix):
        """Return matrix with every column divided by its sum.

        Columns whose sum is zero are returned unchanged (all zeros) instead
        of being turned into NaN by a 0/0 division.
        """
        totals = matrix.sum(axis=0)
        totals[totals == 0] = 1.0
        return matrix / totals.reshape(1, -1)

    def get_observationIndices(self, observations):
        """Return an int array with the index of each observation symbol.

        Symbols missing from the vocabulary are mapped to UNK_INDEX.
        """
        return numpy.array(
            [self.X_index.get(o, self.UNK_INDEX) for o in observations], int)

    def data2indices(self, sent):
        """Convert one sequence of (observation, state) couples to indices.

        Returns (wordids, tagids), two lists of equal length. Unknown
        observations are mapped to UNK_INDEX so both lists stay aligned
        position by position; an unknown state raises KeyError.
        """
        wordids = list()
        tagids = list()
        for couple in sent:
            wordids.append(self.X_index.get(couple[0], self.UNK_INDEX))
            tagids.append(self.Y_index[couple[1]])
        return wordids, tagids

    def observation_estimation(self, pair_counts):
        """Build the observation distribution from counts.

        pair_counts maps (observation, state) -> count.
        observation_proba is the observation probability matrix
            [b_ki],  b_ki = Pr(X_t=v_k|Y_t=q_i)

        The matrix is re-estimated from scratch on every call. Counts of
        unknown observations are accumulated in row UNK_INDEX, smoothing_obs
        is added to every cell, then each column is normalised.
        """
        counts = zeros((self.M, self.N), float)
        for pair in pair_counts:
            k = self.X_index.get(pair[0], self.UNK_INDEX)
            i = self.Y_index[pair[1]]
            # accumulate: several unknown observations may share row UNK_INDEX
            counts[k, i] += pair_counts[pair]
        counts = counts + self.smoothing_obs
        self.observation_proba = self._normalize_columns(counts)

    def transition_estimation(self, trans_counts):
        """Build the transition distribution from counts.

        trans_counts maps (previous_state, next_state) -> count.
        transition_proba is the transition matrix with:
            [a_ij] a[i,j] = Pr(Y_(t+1)=q_i|Y_t=q_j)

        The matrix is re-estimated from scratch on every call. A state never
        seen as a predecessor keeps an all-zero column.
        """
        counts = zeros((self.N, self.N), float)
        for trans in trans_counts:
            prev_idx = self.Y_index[trans[0]]
            next_idx = self.Y_index[trans[1]]
            # row = destination state, column = source state
            counts[next_idx, prev_idx] += trans_counts[trans]
        self.transition_proba = self._normalize_columns(counts)

    def init_estimation(self, init_counts):
        """Build the initial state distribution from counts.

        init_counts maps state -> number of sequences starting in that state.
        If the counts sum to zero the distribution is left as all zeros.
        """
        counts = zeros((self.N,), float)
        for init in init_counts:
            counts[self.Y_index[init]] += init_counts[init]
        total = counts.sum()
        if total > 0:
            counts = counts / total
        self.initial_state_proba = counts

    def supervised_training(self, pair_counts, trans_counts, init_counts):
        """Train the HMM's parameters. This function wraps the three
        estimation steps (observation, transition, initial state)."""
        self.observation_estimation(pair_counts)
        self.transition_estimation(trans_counts)
        self.init_estimation(init_counts)

    def viterbi(self, obs):
        """Return the most probable state index sequence for obs.

        obs is a sequence of observation indices. The result is a list with
        one state index per observation; an empty obs gives an empty list.

        Probabilities are multiplied directly (no log space), so very long
        sequences may underflow.
        """
        T = len(obs)
        if T == 0:
            return []
        A = self.transition_proba
        B = self.observation_proba
        N = self.N

        # psi[t, j] = best predecessor of state j at time t (row 0 unused)
        psi = zeros((T, N), int)
        delta = B[obs[0]] * self.initial_state_proba
        rows = numpy.arange(N)
        for t in range(1, T):
            # scores[j, i] = delta[i] * A[j, i]: reach state j from state i
            scores = A * delta
            best_prev = scores.argmax(axis=1)
            psi[t] = best_prev
            delta = scores[rows, best_prev] * B[obs[t]]

        # Backtrack from the best final state through the stored predecessors.
        state = delta.argmax()
        i_star = [state]
        for t in range(T - 1, 0, -1):
            state = psi[t, state]
            i_star.append(state)
        i_star.reverse()
        return i_star

In [3]:
def make_counts(data):
    """Count the events needed to train the HMM from annotated sequences.

    data is an iterable of sequences; each sequence is an indexable list of
    items whose element 0 is the observed symbol and element 1 the reference
    symbol (the item itself must be hashable, it is used as a dict key).
    Empty sequences are skipped.

    Returns five dicts, in this order:
        c_letter_t   -- observed symbol -> number of occurrences
        c_letter_c   -- reference symbol -> number of occurrences
        c_pairs      -- item (observed, reference) -> number of occurrences
        c_transition -- (reference at i, reference at i+1) -> count
        c_init       -- reference symbol of the first item -> count
    """
    c_letter_t = {}
    c_letter_c = {}
    c_pairs = {}
    c_transition = {}
    c_init = {}
    for d in data:
        length = len(d)
        if length == 0:
            # no first item to read: nothing to count for this sequence
            continue
        first_state = d[0][1]
        c_init[first_state] = c_init.get(first_state, 0) + 1
        for i in range(length):
            pair = d[i]
            c_letter_t[pair[0]] = c_letter_t.get(pair[0], 0) + 1
            c_letter_c[pair[1]] = c_letter_c.get(pair[1], 0) + 1
            c_pairs[pair] = c_pairs.get(pair, 0) + 1
            if i + 1 < length:
                trans = (pair[1], d[i + 1][1])
                c_transition[trans] = c_transition.get(trans, 0) + 1

    return c_letter_t, c_letter_c, c_pairs, c_transition, c_init

In [10]:
# Collect the five count tables for train_10 and report the size of each one.
(c_letter_t, c_letter_c, c_pairs,
 c_transition, c_init) = make_counts(train_10)
print('%s %d' % ('number of observations:', len(c_letter_t)))
print('%s %d' % ('number of states:', len(c_letter_c)))
print('%s %d' % ('number of obs/state pairs:', len(c_pairs)))
print('%s %d' % ('number of transitions:', len(c_transition)))
print('%s %d' % ('number of initial state:', len(c_init)))

number of observations: 26
number of states: 26
number of obs/state pairs: 127
number of transitions: 403
number of initial state: 25


In [5]:
# Baseline: fraction of positions where element 0 of a test pair already
# equals element 1, i.e. the score obtained without running any model.
type_ids = list()
correct_ids = list()
f = FloatProgress(min=0, max=len(test_10))
display(f)
for origin in test_10:
    # One tick per test sequence, matching max=len(test_10); the bar is not
    # advanced again in the scoring loop below.
    f.value += 1
    for pair in origin:
        type_ids.append(pair[0])
        correct_ids.append(pair[1])

tot = 0.0
correct = 0.0
for hyp, ref in zip(type_ids, correct_ids):
    if hyp == ref:
        correct += 1
    tot += 1

if tot > 0:
    print("OK : "+str(correct)+" / "+str(tot)+ " -> "+ str(correct*100/tot))
else:
    # avoid a ZeroDivisionError when the test set holds no symbols
    print("OK : 0.0 / 0.0 -> n/a")

OK : 6575.0 / 7320.0 -> 89.8224043716


In [7]:
# Count tables for train_10, then their sizes on one line.
(c_words_train, c_tags_train, c_pairs_train,
 c_transition_train, c_init_train) = make_counts(train_10)
print('%d %d %d %d %d' % (len(c_words_train), len(c_tags_train),
                          len(c_pairs_train), len(c_transition_train),
                          len(c_init_train)))

# The three distributions are left at their None defaults and estimated from
# the counts by supervised_training.
hmm_train = HMM(state_list=c_tags_train.keys(),
                observation_list=c_words_train.keys())
hmm_train.supervised_training(c_pairs_train, c_transition_train, c_init_train)

# Sanity checks: column sums of both matrices, total initial mass, and the
# leading dimension of each array.
print(hmm_train.observation_proba.sum(axis=0))
print(hmm_train.transition_proba.sum(axis=0))
print(sum(hmm_train.initial_state_proba))
print(len(hmm_train.observation_proba))
print(len(hmm_train.transition_proba))
print(len(hmm_train.initial_state_proba))

#hmm_test.init_estimation(c_init_test)

26 26 127 403 25
HMM creating with: 
26 states
26 observations
[ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.]
[ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.]
1.0
26
26
26


In [7]:
# Show the estimated initial-state distribution of the trained model.
print(hmm_train.initial_state_proba)

[  1.06445951e-01   4.16422893e-02   4.91791995e-02   3.02853013e-02
   2.93560932e-02   1.68634064e-02   3.56196441e-02   8.09443508e-02
   3.60670406e-02   2.71879409e-03   1.85841622e-03   4.03000998e-02
   2.32302027e-02   7.88450287e-02   2.60522421e-02   1.17011391e-03
   5.28272017e-02   7.47496300e-02   2.60866573e-02   9.91155315e-03
   1.74071652e-01   5.44447121e-02   4.68045566e-03   2.54671852e-03
   1.03245345e-04   0.00000000e+00]


In [8]:
# Decode every test sequence with the trained HMM and compare the predicted
# state indices with the reference ones, position by position.
tot_1 = 0.0
correct_1 = 0.0
# confusion_1[hyp, ref] counts how often state `hyp` was predicted where the
# reference state was `ref`.
confusion_1 = zeros([hmm_train.N, hmm_train.N])
f = FloatProgress(min=0, max=len(test_10))
display(f)
for test in test_10:
    f.value += 1
    wordids_test, tagids_test = hmm_train.data2indices(test)
    tagids_pre = hmm_train.viterbi(wordids_test)
    for i, hyp in enumerate(tagids_pre):
        ref = tagids_test[i]
        if hyp == ref:
            correct_1 += 1
        confusion_1[hyp, ref] += 1
        tot_1 += 1
print("OK : "+str(correct_1)+" / "+str(tot_1)+ " -> "+ str(correct_1*100/tot_1))

OK : 6822.0 / 7320.0 -> 93.1967213115
