# Running EM on the birdsong corpus

## Getting set up

In [None]:
import markhov
import em
import numpy as np
%pylab inline

Read in the birdsong corpus

In [None]:
# Load the corpus: one entry per line of the text file, trailing newline removed.
# The `with` block closes the file even if reading raises.
with open('../corpus/cath8.txt', 'r') as f:
    corpus = [line.rstrip('\n') for line in f]


Create a transition grammar for the corpus

In [None]:
# trans maps each symbol to the list of distinct symbols seen immediately
# after it in the corpus, in order of first appearance. '[' is the
# start-of-line marker prepended to every corpus entry.
trans = {'[': []}
for entry in corpus:
    tokens = ['['] + entry.split(' ')
    # walk over adjacent (previous, next) token pairs
    for prev, nxt in zip(tokens, tokens[1:]):
        followers = trans.setdefault(prev, [])
        if nxt not in followers:
            followers.append(nxt)


Operations FSA

In [None]:
# Operations FSA: ops[state][next_state] is the list of operations
# that label the arc from `state` to `next_state`.
ops = {
    # Start state: the first move has to be a Merge.
    'S': {
        'NotCL': ['mg'],
    },
    # The last "special" operation was *not* Clear: either none has been done
    # yet or the last one was Copy. Everything is allowed here, including ending.
    'NotCL': {
        'NotCL': ['mg', 'copy'],
        'CLEAR_S': ['clear'],  # clearing the buffer moves to CLEAR_S
        'F': ['end'],          # ending moves to the final state
    },
    # We have just cleared: the buffer is empty, so only Merge is possible.
    'CLEAR_S': {
        'CLEAR': ['mg'],
    },
    # The last special op was Clear: we can Merge (stay here) or Copy.
    'CLEAR': {
        'CLEAR': ['mg'],
        'NotCL': ['copy'],  # after a Copy the last special op is Copy, so go to NotCL
    },
    # Final state: no outgoing arcs.
    'F': {},
}


## Expectation Maximisation time!

In [None]:
# How many EM iterations to run; later cells also index the training history with range(n).
n = 10

In [None]:
# Run EM. Later cells index the result per iteration and read the
# 'fsa' and 'trans_probs' entries of each element.
history = em.em_rabbit(corpus, trans, ops, n)

Display the results

In [None]:
import pandas as pd

Make the rules into a simpler list

In [None]:
# Flatten the operations FSA into (label, (state, next_state, operation)) pairs,
# where the label reads 'state->operation next_state'.
rules = []
for lhs, arcs in ops.items():
    for rhs, operations in arcs.items():
        rules.extend(
            ('%s->%s %s' % (lhs, e, rhs), (lhs, rhs, e))
            for e in operations
        )

Put them and their updating probabilities into a table

In [None]:
# One row per FSA rule, with one "p.iteration<i>" column per EM iteration
# holding the value stored for that rule in history[i]['fsa']
# (presumably its probability -- confirm against em.em_rabbit).
tab = []
for rule, (lhs, rhs, e) in rules:
    row = {'rule': rule}
    for i in range(n):
        row["p.iteration%i" % i] = history[i]['fsa'][lhs][rhs][e]
    tab.append(row)

Use pandas to make the table into a dataframe

In [None]:
# Build the operations table from the list of row dicts.
df = pd.DataFrame(data=tab)

In [None]:
# Display the operations table (a bare last expression is rendered by the notebook).
df

In [None]:
# Write the operations table to a CSV file. The run tag names the corpus that
# was actually loaded (cath8), matching the tag used for the transitions output.
df.to_csv('ops_%s.csv' % ('oct_22_2016_cath8'))

Same for the transitions

In [None]:
# Flatten the transition grammar into (label, (symbol, next_symbol)) pairs,
# where the label reads 'symbol,next_symbol'.
bigrams = [
    ('%s,%s' % (prev, nxt), (prev, nxt))
    for prev, followers in trans.items()
    for nxt in followers
]

In [None]:
# One row per bigram, with one "p.iteration<i>" column per EM iteration
# holding the value stored for that bigram in history[i]['trans_probs'].
tab = []
for bi, (lhs, rhs) in bigrams:
    row = {'rule': bi}
    for i in range(n):
        row["p.iteration%i" % i] = history[i]['trans_probs'][lhs][rhs]
    tab.append(row)

In [None]:
# Build the transitions table from the list of row dicts.
df_bis = pd.DataFrame(data=tab)

In [None]:
# Display the transitions table (a bare last expression is rendered by the notebook).
df_bis

In [None]:
# Write the transitions table to a CSV file tagged with the run date and corpus name.
df_bis.to_csv('trans_%s.csv' % ('oct_22_2016_cath8'))

### Calculate the log likelihood of the corpus given the trained grammar

This is the log likelihood (LL) of the corpus at the end of training:

In [None]:
# Parse the corpus once with the transition grammar and the operations FSA;
# the result is reused by the likelihood computations in later cells.
parsed_corpus = em.parse_corpus(corpus, trans, ops)

In [None]:
# Log likelihood of the parsed corpus under the last entry of the training history.
final_model = history[-1]
markhov.ll_corpus(parsed_corpus, final_model['trans_probs'], final_model['fsa'])

Make a dataframe of the LL of the corpus throughout training:

In [None]:
# One row per EM iteration: the iteration index and the corpus log likelihood
# computed with that iteration's transition probabilities and FSA.
tab = []
for i in range(n):
    model = history[i]
    ll = markhov.ll_corpus(parsed_corpus, model['trans_probs'], model['fsa'])
    tab.append({'iteration': i, 'likelihood': ll})

In [None]:
# Build the per-iteration log-likelihood table from the list of row dicts.
df_lls = pd.DataFrame(data=tab)

In [None]:
# Display the log-likelihood table (a bare last expression is rendered by the notebook).
df_lls

In [None]:
# Plot the corpus log likelihood against the EM iteration number.
iterations = df_lls["iteration"]
log_likelihoods = df_lls["likelihood"]
plot(iterations, log_likelihoods, 'o-')
xlabel("Iteration")
ylabel("Log Likelihood")

In [None]:
# Re-import the local `em` module so that edits made to it on disk take effect
# without restarting the kernel.
try:
    from importlib import reload  # Python 3.4+: reload() is no longer a builtin
except ImportError:
    pass  # Python 2: fall back to the builtin reload()
reload(em)