In [None]:
import markhov
import em
import pandas_functions
import numpy as np
import pandas as pd
from florisplotlib import *
%pylab inline

# Demonstrating EM

Here we demonstrate the use and behaviour of the various functions in our expectation maximisation algorithm. The algorithm learns rule probabilities for a two-part grammar made up of a finite state automaton that generates strings of operations and a bigram grammar of legal transitions for the operation Merge.

### Get set up

Some functions for log-transforming the grammar

Make some transitions

In [None]:
# Bigram grammar of legal transitions for Merge:
# each key maps to the list of symbols that may follow it.
# '[' appears only as a left-hand symbol -- presumably the start-of-string
# marker; confirm against markhov.
trans = {
    'a': ['a', 'b'],
    'b': ['b', 'a'],
    '[': ['a', 'b'],
}


In [None]:
# Starting probabilities for the bigram transitions: every legal
# continuation of a symbol gets probability 0.5.
trans_probs = {
    'a': {'a': 0.5, 'b': 0.5},
    'b': {'b': 0.5, 'a': 0.5},
    '[': {'a': 0.5, 'b': 0.5},
}

# Rebind the name to whatever markhov.fsa_log returns for this table
# (the log-transformed grammar, going by the function name -- confirm in markhov).
trans_probs = markhov.fsa_log(trans_probs)

Operations FSA

In [None]:
# Operations FSA: each state maps to a list of (next_state, operation) arcs.
ops = {
    # From the start state we have to Merge.
    'S': [('NotCL', 'mg')],

    # NotCL: the last "special" operation was *not* Clear -- either no special
    # operation has happened yet, or the last one was Copy.
    # From here we can do everything, including end.
    'NotCL': [
        ('NotCL', 'mg'),
        ('NotCL', 'copy'),
        ('CLEAR_S', 'clear'),  # go here to clear the buffer
        ('F', 'end'),          # go here to end
    ],

    # CLEAR_S: we have just cleared. The buffer is empty, so we can only Merge.
    'CLEAR_S': [('CLEAR', 'mg')],

    # CLEAR: the last special operation was Clear, so we can Copy or Merge.
    'CLEAR': [
        ('CLEAR', 'mg'),
        ('NotCL', 'copy'),  # after a Copy the last special op is Copy: back to NotCL
    ],

    # Final state: no outgoing arcs.
    'F': [],
}

# Variant without Copy (and without Clear): keep merging from S, or end.
ops_nc = {
    'S': [('S', 'mg'), ('F', 'end')],
    'F': [],
}

In [None]:
# Build the starting probabilities for the operations FSA with em.initialise.
# NOTE(review): the initialisation scheme (uniform? log-space?) is defined in em -- confirm there.
ops_probs=em.initialise(ops)

Corpora

In [None]:
# Small corpus: strings of one, two and three a's (tokens separated by spaces).
aaa = [
    'a',
    'a a',
    'a a a',
]
# Larger corpus: everything in `aaa` plus the four- and five-a strings.
aaaa = aaa + [
    'a a a a',
    'a a a a a',
]

### Parse a sentence

In [None]:
# Example sentence to parse: the last entry of `aaa`, i.e. 'a a a'.
s=aaa[-1]

In [None]:
# Parse the example sentence using the bigram transitions and the operations FSA.
# The result is an indexable collection of parses (it is indexed in the next cell).
parses=markhov.parse(s,trans,ops)

In [None]:
# Show a printable rendering of every parse found for the example sentence.
for parse in parses:
    print(markhov.parse2string(parse))

### Parse the corpus

In [None]:
# Parse the whole `aaa` corpus with the same transitions and operations FSA.
parsed_corpus=em.parse_corpus(aaa,trans,ops)

In [None]:
# Print the string rendering of the parsed corpus produced by em.parsed_corpus2string.
print(em.parsed_corpus2string(parsed_corpus))

Make a parallel list of the relative probabilities of the parses

In [None]:
# Relative probabilities of the parses in the parsed corpus, computed from the
# current operation probabilities and transition probabilities.
# NOTE(review): presumably normalised per sentence and parallel to parsed_corpus -- confirm in em.get_p_parses.
parse_probs=em.get_p_parses(parsed_corpus,ops_probs,trans_probs)

In [None]:
# Show the raw parse-probability structure.
print(parse_probs)

In [None]:
# Expected counts for the bigram (Merge) grammar, weighted by parse_probs.
# The last argument ('bc' / 'uc') is a mode string interpreted inside em --
# presumably "bigram counts" / "unigram counts"; confirm there.
exp_bigrams = em.expected_transition_counts(parsed_corpus,parse_probs,trans,'bc')
exp_unigrams = em.expected_state_counts(parsed_corpus,parse_probs,trans,'uc')

In [None]:
# Print, for every unigram, its expected count in each sentence.
# The inner loop variable is deliberately not called `s`: that name holds the
# example sentence used by the parse cell and must not be overwritten here.
for unigram in exp_unigrams:
    print ('\nExpected count of %s in each sentence:'%unigram)
    for sentence in exp_unigrams[unigram]:
        print (' %s: %.4f'%(sentence,exp_unigrams[unigram][sentence]))

In [None]:
# Print, for every bigram (first, second), its expected count in each sentence.
# The innermost loop variable is deliberately not called `s`: that name holds
# the example sentence used by the parse cell and must not be overwritten here.
for first in exp_bigrams:
    for second in exp_bigrams[first]:
        print ('\nExpected count of %s %s in each sentence:'%(first,second))
        for sentence in exp_bigrams[first][second]:
            print (' %s: %.4f'%(sentence,exp_bigrams[first][second][sentence]))

In [None]:
# Same expected-count computation, this time for the operations FSA.
# The last argument ('tc' / 'sc') is a mode string interpreted inside em --
# presumably "transition counts" / "state counts"; confirm there.
exp_tr = em.expected_transition_counts(parsed_corpus,parse_probs,ops,'tc')
exp_states = em.expected_state_counts(parsed_corpus,parse_probs,ops,'sc')

In [None]:
# Print, for every FSA state, its expected count in each sentence.
# The inner loop variable is deliberately not called `s`: that name holds the
# example sentence used by the parse cell and must not be overwritten here.
for state in exp_states:
    print ('\nExpected count of %s in each sentence:'%state)
    for sentence in exp_states[state]:
        print (' %s: %.4f'%(sentence,exp_states[state][sentence]))

In [None]:
# Print, for every FSA arc, its expected count in each sentence.
# Arcs are keyed as exp_tr[source][(target, operation)] and printed as
# (source operation target).
# The innermost loop variable is deliberately not called `s`: that name holds
# the example sentence used by the parse cell and must not be overwritten here.
for source in exp_tr:
    for (target,operation) in exp_tr[source]:
        print ('\nExpected count of (%s %s %s) in each sentence:'%(source,operation,target))
        for sentence in exp_tr[source][(target,operation)]:
            print (' %s: %.4f'%(sentence,exp_tr[source][(target,operation)][sentence]))

In [None]:
# Re-estimate the operations FSA from the expected state and transition counts.
new_ops = em.update(exp_states,exp_tr,ops)

In [None]:
# Print the re-estimated operations FSA as plain (non-log) probabilities.
print (markhov.fsa2string(new_ops,False)) # False prints the non-log probs

In [None]:
# Sanity check on the updated operations FSA.
# NOTE(review): what check_fsa returns or raises on failure is defined in em -- confirm there.
em.check_fsa(new_ops) # is this a valid probability distribution?

In [None]:
# Smooth the updated operations FSA; 0.01 is presumably the smoothing constant
# (the same value is named `sc` further down) -- confirm against em.smooth.
# NOTE(review): the result is not assigned, so this cell only displays the return
# value; whether new_ops is also modified in place depends on em.smooth.
em.smooth(new_ops,0.01)

In [None]:
# Re-estimate the bigram transition probabilities from the expected unigram and bigram counts.
new_trans = em.update(exp_unigrams,exp_bigrams,trans)

In [None]:
# Print the re-estimated transitions. No second argument is passed here, so
# fsa2string uses its default -- presumably log probabilities; confirm in markhov.
print (markhov.fsa2string(new_trans))

In [None]:
# Same sanity check as for the operations FSA, applied to the updated transitions.
em.check_fsa(new_trans) # check this is a valid probability distribution

In [None]:
# Pick up edits made to the em module without restarting the kernel.
# `reload` is a builtin only on Python 2; on Python 3 it lives in importlib,
# so import it when available and otherwise fall back to the builtin.
try:
    from importlib import reload
except ImportError:  # Python 2: the builtin reload is already in scope
    pass
reload(em)

In [None]:
# Smooth the updated transition probabilities; 0.01 is presumably the smoothing
# constant -- confirm against em.smooth.
# NOTE(review): the result is not assigned, so this cell only displays the return value.
em.smooth(new_trans,0.01)

## Expectation Maximisation time!

In [None]:
# Also used below as the number of points per curve when plotting the history.
n = 10 # number of EM iterations to do

In [None]:
# Run n iterations of EM on the `aaa` corpus.
# `history` is indexable, and its entries carry 'fsa' and 'trans_probs' keys (used below);
# `corpus` is the second return value -- presumably the parsed corpus, confirm in em.em.
history,corpus=em.em(aaa,trans,ops,n) # run EM


In [None]:
# Optional debugging dump, disabled by default: uncomment to print the operations
# FSA and the transition probabilities stored at every EM iteration in `history`.
#for i,step in enumerate(history):
#    print ('\n--------\n--------\nIteration %i'%i)
#    print ('FSA:')
#    print (markhov.fsa2string(step['fsa']))
#    print ('\n------\nTrans probs:')
#    print (markhov.trans2string(step['trans_probs']))

Display the results

In [None]:
# Tabulate the operation-rule probabilities across the EM history and display the table.
# The plotting cell below reads the columns " rule" and "p.iterationNNN" from it.
df_ops=pandas_functions.ops_table(history,ops)
df_ops

In [None]:
# Save the table as ops_oct_22_2016_aaa.csv in the current working directory.
df_ops.to_csv('ops_%s.csv'%('oct_22_2016_aaa')) # print to file

In [None]:
# Plot how the probability of each operation rule evolves over the EM
# iterations: one line per row of df_ops, one colour per line.
n_rules = len(df_ops)  # number of rows; DataFrame.shape is (rows, columns)
colors = get_colors(n_rules)
# Enumerate the rows by position so the colour lookup does not depend on the
# index labels, and keep the iteration counter (`it`) distinct from the row
# counter. `_idx` is used instead of `_`, which IPython reserves for the last output.
for pos,(_idx,row) in enumerate(df_ops.iterrows()):
    plot(range(n),[ row["p.iteration%03d"%it] for it in range(n) ],'-',color=colors[pos],label=row[" rule"])
legend()
xlim(-.5,n+15)  # extra room on the right -- presumably to keep the legend off the curves
xlabel("Iteration")
ylabel("Rule probability (log)");

Same for the transitions

In [None]:
# Tabulate the bigram transition probabilities across the EM history and display the table.
df_bis=pandas_functions.trans_probs_table(history,trans)
df_bis

In [None]:
# Save the table as trans_oct_22_2016_aaa.csv in the current working directory.
df_bis.to_csv('trans_%s.csv'%('oct_22_2016_aaa'))

### Calculate the log likelihood of the corpus given the trained grammar

This is the log likelihood (LL) of the corpus at the end of training:

In [None]:
# Score the parsed corpus with the grammar stored in the last history entry
# (its 'trans_probs' and 'fsa').
em.ll_corpus(parsed_corpus,history[-1]['trans_probs'],history[-1]['fsa'])

Make a dataframe of the LL of the corpus throughout training:

In [None]:
# Build and display the per-iteration table; the plot below reads its
# "iteration" and "likelihood" columns.
df_lls=pandas_functions.ll_corpus_table(history)
df_lls

In [None]:
# Plot the "likelihood" column against the "iteration" column (markers joined by lines).
plot(df_lls["iteration"],df_lls["likelihood"],'o-')
xlabel("Iteration")
ylabel("Log Likelihood")

Look at the changing relative probabilities of the parses

In [None]:
# Tabulate the relative parse probabilities across the EM history and display the table.
df_parses=pandas_functions.p_parses_table(parsed_corpus,history)
df_parses

In [None]:
# Pick up edits made to the em and pandas_functions modules without restarting
# the kernel. `reload` is a builtin only on Python 2; on Python 3 it lives in
# importlib, so import it when available and otherwise fall back to the builtin.
try:
    from importlib import reload
except ImportError:  # Python 2: the builtin reload is already in scope
    pass
reload(em)
reload(pandas_functions)

In [None]:
# Passed to em.em_train in the next cell.
sc=0.01 # smoothing constant

In [None]:
# Train with EM for n iterations using smoothing constant sc, then unpack the
# test log likelihood, the parsed train and test corpora, and the training history.
# Presumably `aaa` is the training corpus and `aaaa` the test corpus -- confirm in em.em_train.
# NOTE(review): this rebinds `history`, replacing the history returned by em.em above;
# re-running earlier cells after this one will see the new value.
ll_test,parsed_train,parsed_test,history = em.em_train(aaa,aaaa,trans,ops,n,sc)

In [None]:
# Tabulate the history returned by em.em_train and display the table.
df_test = pandas_functions.ll_table(history)
df_test

In [None]:
# Corpus of b-strings needed by the call below. It is defined here because the
# only other definition sits in a later cell, which would make this cell fail
# with a NameError on a fresh kernel (Restart & Run All).
bbbb = ['b','b b','b b b','b b b b','b b b b b']

# Compare the grammar with Copy (ops) against the no-Copy grammar (ops_nc).
# The result is indexed below as compare[0] (Copy) and compare[1] (No-Copy),
# each holding a test log likelihood at [0] and a history at [1].
# NOTE(review): the final 10 is presumably the number of EM iterations (same as n) -- confirm in em.compare.
compare = em.compare(aaaa,bbbb,trans,ops,ops_nc,10)

In [None]:
# Report the test-corpus log likelihood under each grammar:
# compare[0] is the Copy grammar, compare[1] the No-Copy grammar.
print ('LL test corpus, Copy grammar:\t%.3f'%compare[0][0])
print ('LL test corpus, No-Copy grammar:%.3f'%compare[1][0])

In [None]:
# Tabulate the second element of each comparison result (the per-grammar history).
df_copy = pandas_functions.ll_table(compare[0][1])
df_no_copy =pandas_functions.ll_table(compare[1][1])

In [None]:
# Display the table for the Copy grammar.
df_copy

In [None]:
# Display the table for the No-Copy grammar.
df_no_copy

In [None]:
# Strings of one to five b's (the b-counterpart of `aaaa`).
bbbb=['b','b b','b b b','b b b b','b b b b b']

# Strings that mix a's and b's.
asbs = ['a b','b a','a a b', 'b b a','a b a b a b']

# Combined corpus: all the a-strings followed by all the b-strings.
ab=aaaa+bbbb

# Show the combined corpus, one sentence per line. The loop variable is not
# called `s`, so the example sentence used by the parse cell is left intact.
for sentence in ab:
    print (sentence)

In [None]:
# Settings for the windowed run on the combined a/b corpus, with both the Copy
# grammar (ops) and the no-Copy grammar (ops_nc).
n=10     # number of EM iterations (as above)
w = 2    # NOTE(review): presumably the window size -- confirm against em.windows
sc=0.01  # smoothing constant (as above)
window = em.windows(ab,trans,ops,ops_nc,n,w,sc)

In [None]:
# Pick up edits made to the em module without restarting the kernel.
# `reload` is a builtin only on Python 2; on Python 3 it lives in importlib,
# so import it when available and otherwise fall back to the builtin.
try:
    from importlib import reload
except ImportError:  # Python 2: the builtin reload is already in scope
    pass
reload(em)

In [None]:
# Inspect one 'fsa' entry deep inside the nested result of em.windows.
# TODO(review): the meaning of each index level is defined by em.windows -- document it here.
window[0][1][0][1][0]['fsa']