# Running EM on the birdsong corpus

## Getting set up

In [None]:
import markhov
import em
import numpy as np
import pandas as pd
from pandas_functions import *
%pylab inline
import seaborn as sns
from florisplotlib import *

In [None]:
# Reload the local helper modules so that edits made to them on disk take effect
# without restarting the kernel.
# NOTE(review): bare `reload` is a builtin only on Python 2; on Python 3 it lives in
# importlib -- confirm which interpreter this notebook targets.
reload(em)
import pandas_functions
reload(pandas_functions)
# Repeat the star import so the notebook-level names point at the reloaded definitions.
from pandas_functions import *

Read in the birdsong corpus

In [None]:
# Load the corpus as a list of lines with the trailing newline removed.
# The `with` block guarantees the file is closed even if reading raises,
# which the previous open()/readlines()/close() sequence did not.
with open('../corpus/cath8.txt', 'r') as f:
    corpus = [line.rstrip('\n') for line in f]


Create a transition grammar for the corpus

In [None]:
# Build the transition grammar: every symbol maps to the list of distinct symbols
# seen immediately after it, in order of first appearance.
# '[' is prepended to every line, so its entry lists the possible first symbols.
trans = {'[': []}
for sentence in corpus:
    symbols = ['['] + sentence.split(' ')
    for prev, nxt in zip(symbols, symbols[1:]):
        successors = trans.setdefault(prev, [])
        if nxt not in successors:
            successors.append(nxt)


Operations FSAs

In [None]:
# Operations FSA for the copy grammar.
# Each state maps to a list of (next_state, operation) arcs; the operations used
# are 'mg' (merge), 'copy', 'clear' and 'end'.
ops_c = {'S':[('NotCL','mg')], # from start we have to merge
       # NotCL: the last "special" operation was *not* clear -- either none has been
       # done yet or the last one was copy. Everything is allowed from here,
       # including end.
       'NotCL':[('NotCL','mg'),('NotCL','copy'),
               ('CLEAR_S','clear'), # go here to clear the buffer
               ('F','end') # go here to end
           ],
       'CLEAR_S':[('CLEAR','mg')], # we have just cleared: the buffer is empty, so only merge is possible
       'CLEAR':[('CLEAR','mg'), # the last special op was clear, so we can copy or merge
                ('NotCL','copy') # after a copy the last special op is copy, so go back to NotCL
            ],
       'F':[] # final state: no outgoing arcs
   }

# Operations FSA for the no-copy grammar: only merge and end are available.
ops_nc = {'S':[('S','mg'),('F','end')], # from start we can merge (staying in S) or end
       'F':[] # final state: no outgoing arcs
   }


## Expectation Maximisation time!

In [None]:
n = 10 # number of EM iterations to do

In [None]:
# `ops` is not bound by any earlier cell, so on a fresh kernel this cell (and the
# later ones that read `ops`) raised NameError. Bind it explicitly to the grammar
# being trained; swap in ops_nc to train the no-copy grammar instead.
ops = ops_c
# history is indexed by iteration below; each entry is read via its 'fsa' and
# 'trans_probs' keys.
history=em.em(corpus,trans,ops,n) # run EM

### Display the results

Make the rules into a simpler list

In [None]:
# Flatten the operations FSA into one (label, (lhs, rhs, e)) pair per arc,
# where label is the printable form 'lhs->e rhs'.
rules = [
    ('%s->%s %s' % (lhs, e, rhs), (lhs, rhs, e))
    for lhs in ops
    for (rhs, e) in ops[lhs]
]

Put them and their updating probabilities into a table

In [None]:
# One record per rule: its label plus its value in history[...]['fsa'] for each
# of the n EM iterations.
tab = []
for label, (src, dst, op) in rules:
    record = {' rule': label}  # key keeps its leading space; the plot cell reads it back
    record.update(
        ("p.iteration%03d" % step, history[step]['fsa'][src][(dst, op)])
        for step in range(n)
    )
    tab.append(record)

Use pandas to make the table into a dataframe

In [None]:
df = pd.DataFrame(tab)

In [None]:
df

In [None]:
df.to_csv('ops_%s.csv'%('oct_23_2016_cath8')) # print to file

In [None]:
# Plot every rule's probability across the EM iterations, one coloured line per rule.
n_rules = df.shape[0]  # df has one row per rule
colors = get_colors(n_rules)
iterations = range(n)
for row_idx, row in df.iterrows():
    probs = [row["p.iteration%03d" % step] for step in iterations]
    plot(iterations, probs, '-', color=colors[row_idx], label=row[" rule"])
legend()
xlim(-.5, n + 15)  # extends well past the last iteration -- presumably room for the legend
xlabel("Iteration")
ylabel("Rule probability (log)")

Same for the transitions

In [None]:
# Flatten the transition grammar into one (label, (lhs, rhs)) pair per bigram,
# where label is the printable form 'lhs,rhs'.
bigrams = [
    ('%s,%s' % (lhs, rhs), (lhs, rhs))
    for lhs in trans
    for rhs in trans[lhs]
]

In [None]:
# One record per bigram: its label plus its value in history[...]['trans_probs']
# for each of the n EM iterations.
tab = []
for label, (prev, nxt) in bigrams:
    record = {' bigram': label}
    record.update(
        ("p.iteration%03d" % step, history[step]['trans_probs'][prev][nxt])
        for step in range(n)
    )
    tab.append(record)

In [None]:
df_bis = pd.DataFrame(tab)

In [None]:
df_bis

In [None]:
df_bis.to_csv('trans_%s.csv'%('oct_23_2016_cath8'))

### Calculate the log likelihood of the corpus given the trained grammar

This is the log likelihood (LL) of the corpus at the end of training:

In [None]:
# parse the corpus
parsed_corpus=em.parse_corpus(corpus,trans,ops)

In [None]:
em.ll_corpus(parsed_corpus,history[-1]['trans_probs'],history[-1]['fsa'])

Make a dataframe of the LL of the corpus throughout training:

In [None]:
# One record per EM iteration: the iteration index and the value em.ll_corpus
# returns for the parsed corpus under that iteration's parameters.
tab = [
    {
        'iteration': step,
        'likelihood': em.ll_corpus(
            parsed_corpus,
            history[step]['trans_probs'],
            history[step]['fsa'],
        ),
    }
    for step in range(n)
]

In [None]:
df_lls=pd.DataFrame(tab)

In [None]:
df_lls

In [None]:
plot(df_lls["iteration"],df_lls["likelihood"],'o-')
xlabel("Iteration")
ylabel("Log Likelihood")

Compare the grammars

In [None]:
# Compare the copy grammar (ops_c) and the no-copy grammar (ops_nc) via em.compare.
# NOTE(review): 123 is a hard-coded split index -- presumably corpus[:123] is the
# training portion and corpus[123:] the held-out portion; confirm against em.compare.
compare = em.compare(corpus[:123],corpus[123:],trans,ops_c,ops_nc,n)

In [None]:
# Turn the two comparison results into tables with ll_table.
# Going by the variable names, compare[0][1] belongs to the copy grammar and
# compare[1][1] to the no-copy grammar -- TODO confirm the layout em.compare returns.
df_copy = ll_table(compare[0][1])
df_no_copy =ll_table(compare[1][1])

In [None]:
df_copy

In [None]:
df_no_copy

In [None]:
# Parameters for the windowed comparison of the two grammars.
n=10 # number of EM iterations (rebinds the n set earlier, to the same value)
w = 3 # presumably the number of windows (the plots below use 3 panels) -- TODO confirm
sc=0.01 # NOTE(review): meaning not visible from this notebook; see em.windows
# NOTE: the name `window` is rebound as a loop variable by the plotting cells below.
window = em.windows(corpus,trans,ops_c,ops_nc,n,w,sc)

In [None]:
r=3 # run windows three times
# Same arguments as the em.windows call plus the repeat count r; the result is what
# ll_window and rule_probs_table are given further down.
ws=em.iter_windows(corpus,trans,ops_c,ops_nc,n,w,r,sc)

In [None]:
df_window=ll_window(ws,True)

In [None]:
df_window

In [None]:
# Training-corpus log-likelihood per EM iteration: one panel per training window,
# one line per (run, grammar) combination, coloured by grammar.
f,ax = subplots(1,3,figsize=(20,10),sharey=False)
colors = {'copy':"red","no copy":"blue"}
for names,data in df_window.groupby(['run','training window','grammar']):
    run,window,gram = names
    panel = ax[window]  # the training-window value selects the panel
    panel.plot(data["iteration"],data["train LL"],'o-',label=names,color=colors[gram])
    panel.set_title("LL training corpus as a function of iteration")
    # Label through the Axes object: bare xlabel()/ylabel() only affect pyplot's
    # current axes, which left all but one panel without axis labels.
    panel.set_xlabel("Iteration")
    panel.set_ylabel("Log-Likelihood")
# For the same reason bare legend() annotated a single panel; give each panel its own.
for panel in ax:
    panel.legend()


In [None]:
# Test-corpus log-likelihood per EM iteration: one panel per training window,
# one line per (run, grammar) combination, coloured by grammar.
f,ax = subplots(1,3,figsize=(20,10),sharey=True)
colors = {'copy':"red","no copy":"blue"}
for names,data in df_window.groupby(['run','training window','grammar']):
    run,window,gram = names
    panel = ax[window]  # the training-window value selects the panel
    panel.plot(data["iteration"],data["test LL"],'o-',label=names,color=colors[gram])
    panel.set_title("LL testing corpus as a function of iteration")
    # Label through the Axes object: bare xlabel()/ylabel() only affect pyplot's
    # current axes, which left all but one panel without axis labels.
    panel.set_xlabel("Iteration")
    panel.set_ylabel("Log-Likelihood")
# For the same reason bare legend() annotated a single panel; give each panel its own.
for panel in ax:
    panel.legend()


In [None]:
# Copy grammar only: for every (run, training window) plot the LL on iteration n
# against the LL on iteration 0, once for the test corpus and once for the
# training corpus. One panel per training window.
f,ax = subplots(1,3,figsize=(20,10),sharey=True)
selections = df_window[ (df_window["grammar"]=="copy") & ((df_window["iteration"]==0) | (df_window["iteration"]==n)) ]
colors = {'train':'purple','test':'green'}
for names,data in selections.groupby(['run','training window']):
    run,window = names
    panel = ax[window]  # the training-window value selects the panel
    for corp in ["test","train"]:
        it0 = data[ data["iteration"]==0 ]["%s LL"%corp].iloc[0]
        itn = data[ data["iteration"]==n ]["%s LL"%corp].iloc[0]
        panel.plot(it0,itn,'o-',color=colors[corp],label=corp)
    # The previous title was copied from the test-LL figure and did not describe
    # this plot; it now states what the axes show.
    panel.set_title("Copy grammar: LL on iteration n vs. LL on iteration 0")
    # Label through the Axes object: bare xlabel()/ylabel() only affect pyplot's
    # current axes, which left all but one panel without axis labels.
    panel.set_xlabel("LL on iteration 0")
    panel.set_ylabel("LL on iteration n")
# For the same reason bare legend() annotated a single panel; give each panel its own.
for panel in ax:
    panel.legend()


In [None]:
df_rules = rule_probs_table(ws,trans,ops_c,ops_nc,True)

In [None]:
df_rules

In [None]:
# Keep the rows of df_rules for run 0, training window 0, iteration n whose 'prob'
# is below log(0.01).
# NOTE(review): the comparison against np.log(0.01) suggests 'prob' holds
# log-probabilities -- confirm in rule_probs_table.
selections = df_rules[ (df_rules['prob'] < np.log(0.01)) & (df_rules['iteration']==n) & (df_rules['run']==0) & (df_rules['training window']==0)] 

In [None]:
len(selections)

In [None]:
selections


In [None]:
# Scratch cell. Indexing x[3] on a three-element list raised IndexError (valid
# indices are 0..2) and stopped a "Run All" at this point; read the last element
# with a negative index instead.
x = [1,2,3]
x[-1]


In [None]:
# Plot training log-likelihood against iteration for every row of df_window.
# NOTE(review): the title says "Copy grammar, window 1", but df_window is not
# filtered here, while earlier cells group it by run / training window / grammar.
# Confirm whether a filter on grammar and training window is missing.
plot(df_window["iteration"],df_window["train LL"],'o-')
xlabel("Iteration")
ylabel("Log Likelihood of training corpus")
title("Copy grammar, window 1")