# Preprocess Ainu Folklore Corpus

We manually downloaded the HTML files from [A Glossed Audio Corpus of Ainu Folklore](http://ainucorpus.ninjal.ac.jp/corpus/en/).  
We assume that the source HTML files are stored in the `html` directory.
```
data
└── ainu-en-ja
    └── html
        ├── K7708241UP.html
        ├── K7708242UP.html
        ├── K7803231UP.html
        ├── K7803232UP.html
        ├── K7803233KY.html
        ├── K7803233UP.html
        ├── K7807152KY.html
        ├── K7908051UP.html
        ├── K8010291UP.html
        └── K8109193UP.html
```

Document-level metadata

In [1]:
# Corpus documents, one tuple per story:
#   (doc_id, en_title, ja_title, checkdigits)
# `checkdigits` is the number the processing cells assert against: the last
# record id returned by extract() and the sentence count returned by construct().
docs = [
    ("K7803231UP", "The Young Lad Raised by the Cat God", u"猫の神様に育てられた少年", 254),
    ("K7708241UP", "Pananpe Escapes from the Demon's Hands", u"パナンペ鬼の手から逃れる", 126),
    ("K7708242UP", "The Girl Who Gave the Bad Red Dog Poison", u"悪い赤犬に毒を飲ませた少女", 308),
    ("K7803232UP", "The Poor Man Who Dug Up the Village Chief Wife’s Grave", 
     u"村長の奥さんの墓を掘り返した貧乏人", 174),
    ("K7803233UP", "The Grapevines which Warded Off the Topattumi-night Raiders", 
     u"ぶどうづるの輪がトパットゥミを退けてくれた話", 419),
    ("K7803233KY", "The Woman Who Became kemkacikappo Bird", u"ケﾑカチカッポになった女", 181),
    ("K7807152KY", "The Goddess of Fire Fought with the Demon God from the End of the Earth", 
     u"火の女神が地の果ての魔神と戦った", 125),
    ("K7908051UP", "The Bridge of Mist", u"霞の架け橋", 377),
    ("K8010291UP", "The Rich Man from Cenpak", u"チェンパｸのニｼパの話", 535),
    ("K8109193UP", "Godly Elder Sister Gets Rid of Bad Bear Father", 
     u"巫力の強い姉が、悪い熊親父を退治した", 169)
]

In [2]:
def find_sentence(soup, sent):
    """Collect the word-level columns of one sentence record.

    For each of the classes ``kana``, ``ainu``, ``morpheme``, ``gloss_en``
    and ``gloss_jp`` the text of every matching ``<td>`` below ``sent`` is
    gathered; the five columns are then zipped, so each resulting tuple is
    ``(kana, ainu, morpheme, gloss_en, gloss_jp)``.

    ``soup`` is not used by this function.
    """
    columns = []
    for css_class in ("kana", "ainu", "morpheme", "gloss_en", "gloss_jp"):
        cells = sent.find_all("td", [css_class])
        columns.append([cell.text for cell in cells])
    return zip(*columns)

In [3]:
def find_translation(soup, sent):
    """Collect the free-translation texts of one sentence record.

    Returns ``zip(en, ja)`` where ``ja`` holds the text of the ``<div>``
    elements found with the class filter ``["ft_j", "translation-text"]``
    and ``en`` those found with ``["ft_e", "translation-text"]``.

    ``soup`` is not used by this function.

    NOTE(review): each filter lists two class names. If Beautiful Soup treats
    such a list as "any of these classes", both columns contain every
    translation div; the cells consuming this result read English from
    ``[1][0]`` and Japanese from ``[0][0]``. Verify before changing either side.
    """
    japanese = []
    for node in sent.find_all("div", ["ft_j", "translation-text"]):
        japanese.append(node.text)

    english = []
    for node in sent.find_all("div", ["ft_e", "translation-text"]):
        english.append(node.text)

    return zip(english, japanese)

In [4]:
import os
from codecs import open
from bs4 import BeautifulSoup as bs

def extract(doc, sentences, translations):
    """Parse one saved corpus page and add its records to the given mappings.

    Args:
        doc: document id; the page is read from ``html/<doc>.txt`` as UTF-8.
        sentences: mapping updated in place; ``(doc, record_id)`` is mapped to
            the result of ``find_sentence`` unless the key already exists.
        translations: mapping updated in place in the same way with the
            result of ``find_translation``.

    Returns:
        ``(sentences, translations, record_id)`` where ``record_id`` is the
        id of the last record visited, or 0 when the page contains no
        matching record (previously this case raised UnboundLocalError).
    """
    # NOTE(review): the accompanying notes list ``*.html`` files in this
    # directory, while the path below uses a ``.txt`` suffix -- confirm.
    with open(os.path.join("html", "%s.txt" % doc), 'r', encoding="utf-8") as f:
        read_data = f.read()

    soup = bs(read_data, 'html.parser')

    # every sentence record carries an id starting with this grid prefix
    records = soup.find_all(id=lambda x: x and x.startswith("gridview-1029-record-"))

    record_id = 0
    for record in records:
        span = record.find('span')
        # the record number is the last dot-separated field of the span id
        record_id = int(span['id'].split('.')[-1])
        key = (doc, record_id)
        if key not in sentences:
            sentences[key] = find_sentence(soup, record)
        if key not in translations:
            translations[key] = find_translation(soup, record)

    return sentences, translations, record_id

In [5]:
from collections import OrderedDict

sentences = OrderedDict()
translations = OrderedDict()
for d in docs:
    print 'processing', d[0], '...',
    sentences, translations, record_id = extract(d[0], sentences, translations)
    assert d[-1] == record_id
    print record_id, 'sequences found.'
print 'done!'

processing K7803231UP ... 254 sequences found.
processing K7708241UP ... 126 sequences found.
processing K7708242UP ... 308 sequences found.
processing K7803232UP ... 174 sequences found.
processing K7803233UP ... 419 sequences found.
processing K7803233KY ... 181 sequences found.
processing K7807152KY ... 125 sequences found.
processing K7908051UP ... 377 sequences found.
processing K8010291UP ... 535 sequences found.
processing K8109193UP ... 169 sequences found.
done!


Save extractions

In [93]:
import cPickle
from codecs import open

# Persist both extraction results so later cells can reload them.
for pickle_path, payload in (('sentences.cPickle', sentences),
                             ('translations.cPickle', translations)):
    with open(pickle_path, 'wb') as f:
        cPickle.dump(payload, f)

In [3]:
import cPickle
# Reload the pickled extraction results as `sents` / `trans`.
# Only unpickle files you produced yourself: pickle can execute arbitrary code.
with open('sentences.cPickle', 'rb') as sentences_file:
    sents = cPickle.load(sentences_file)

with open('translations.cPickle', 'rb') as translations_file:
    trans = cPickle.load(translations_file)

In [4]:
import re

# Ordered (pattern, replacement) rules applied by sanitize().  Order matters:
# whole bracketed annotations are dropped before stray bracket characters are
# stripped, and quotes at the string edges are converted before apostrophes
# inside the text are normalised.
_SANITIZE_RULES = (
    # Non-greedy on purpose: a greedy ".+" would also delete the text between
    # two separate annotations, e.g. "a (b) c (d) e" -> "a  e".
    (r'\(.+?\)', ''),
    (r'\[.+?\]', ''),
    (r'[|_\[\]()]', ''),
    (r'<y>', ''),                                # phonological alternations, insertions
    (r'={2,}', '='),
    (u'\u201c', '"'),                            # left double quotation mark
    (u'\u201d', '"'),                            # right double quotation mark
    (u"^[`'\u2019]|[`'\u2019]$", '"'),           # single quote at either end
    (u"['\u2019]s ", "'s "),                     # apostrophe
    (u"['\u2019]m ", "'m "),                     # apostrophe
    (u"['\u2019]ve ", "'ve "),                   # apostrophe
    (u"['\u2019]ll ", "'ll "),                   # apostrophe
    (u"n['\u2019]t ", "n't "),                   # apostrophe
    (r"\*\d{1,2}", ""),                          # footnote mark
    (u'\u2026', '...'),                          # horizontal ellipsis
)


def sanitize(ainu):
    """Return ``ainu`` with annotation marks removed and punctuation normalised.

    The text is stripped, then the rules in ``_SANITIZE_RULES`` are applied in
    order: ``(...)`` and ``[...]`` annotations are removed group by group,
    leftover ``| _ [ ] ( )`` characters and ``<y>`` are deleted, runs of ``=``
    collapse to one, curly double quotes become ``"``, a single quote or
    backtick at either end becomes ``"``, curly apostrophes in English
    contractions become ``'``, footnote marks such as ``*12`` are dropped and
    the ellipsis character becomes ``...``.

    Trailing punctuation is kept; a rule that stripped it was disabled in the
    original code.

    The patterns use ``u''`` literals with escapes instead of ``ur''``, which
    produces the same strings on Python 2 and still parses on Python 3.
    """
    text = ainu.strip()
    for pattern, replacement in _SANITIZE_RULES:
        text = re.sub(pattern, replacement, text)
    return text

We use [neologdn](https://github.com/ikegami-yukino/neologdn) to normalize Japanese texts.

In [5]:
import neologdn
def sanitize_ja(ja):
    ja = ja.strip()
    ja = re.sub(u"（？）", "(?)", ja)
    ja = re.sub(u"（", "(", ja)
    ja = re.sub(u"）", ")", ja)
    ja = re.sub(ur"\[", "(", ja)
    ja = re.sub(ur"\]", ")", ja)
    ja = re.sub(ur"\(", " (", ja)
    ja = re.sub(ur"\)", ") ", ja)
    ja = re.sub(u"．", ".", ja)
    ja = re.sub(u"--", "...", ja)
    ja = re.sub(u"…", "...", ja)
    ja = re.sub(ur"^[`'’]|[`'’]$", '"', ja)
    ja = re.sub(u"『", u"「", ja)
    ja = re.sub(u"』", u"」", ja)
    ja = re.sub(r"\*", "", ja)
    ja = neologdn.normalize(ja)
    return ja.strip()

In [6]:
def tokenize(kana):
    """Separate punctuation in ``kana`` from the neighbouring text.

    ``?``, ``,`` and ``.`` are surrounded by spaces, ``"`` becomes
    `` &quot; ``, runs of whitespace collapse to a single space, and dots
    that ended up separated by single spaces (three of them, or two) are
    joined into ``...``.  The result is stripped.
    """
    steps = (
        (r"\?", " ? "),
        (r",{1}", " , "),
        (r'"', ' &quot; '),
        (r"\.{1}", " . "),
        (r"\s{2,}", " "),
        (r'(\.\s\.\s\.)', '...'),
        (r'(\.\s\.)', '...'),
    )
    text = kana
    for pattern, replacement in steps:
        text = re.sub(pattern, replacement, text)
    return text.strip()

Save original texts and translations

In [15]:
from codecs import open
# Write one tokenized sentence per line; the two documents listed below form
# the dev split, everything else goes to the train files.
with open('train.tok.ainu', 'w', encoding='utf-8') as ainu_train, \
        open('train.tok.kana', 'w', encoding='utf-8') as kana_train, \
        open('dev.tok.ainu', 'w', encoding='utf-8') as ainu_dev, \
        open('dev.tok.kana', 'w', encoding='utf-8') as kana_dev:
    for k, v in sentences.iteritems():
        kana = ' '.join([tokenize(sanitize(t[0])) for t in v])
        ainu = ' '.join([tokenize(sanitize(t[1])) for t in v])
        held_out = k[0] in ["K8010291UP", "K8109193UP"]
        kana_out = kana_dev if held_out else kana_train
        ainu_out = ainu_dev if held_out else ainu_train
        kana_out.write(kana + '\n')
        ainu_out.write(ainu + '\n')

In [18]:
# Write the sanitized English / Japanese translations, one sentence per line,
# using the same train/dev document split as the Ainu files.
with open('train.en', 'w', encoding='utf-8') as en_train, \
        open('train.ja', 'w', encoding='utf-8') as ja_train, \
        open('dev.en', 'w', encoding='utf-8') as en_dev, \
        open('dev.ja', 'w', encoding='utf-8') as ja_dev:
    for k, v in translations.iteritems():
        held_out = k[0] in ["K8010291UP", "K8109193UP"]
        en_out = en_dev if held_out else en_train
        ja_out = ja_dev if held_out else ja_train
        # English is read from the second pair, Japanese from the first
        en_out.write(sanitize(v[1][0]) + '\n')
        ja_out.write(sanitize_ja(v[0][0]) + '\n')

To tokenize, we use external scripts:  
**eng:** `tokenizer.pl` provided as a component of moses [here](https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl)  
**jpn:** [MeCab](http://taku910.github.io/mecab/)'s `-Owakati` option (see `scripts/ja_tokenizer.py` for details)  

In [19]:
# Locate the external tokenizer scripts two directories above the working dir.
script_dir = os.path.join(os.getcwd(), '..', '..', 'scripts')
en_tokenizer = os.path.join(script_dir, 'tokenizer.pl')
ja_tokenizer = os.path.join(script_dir, 'ja_tokenizer.py')

# Tokenize each split through the shell: English first, then Japanese.
for d in ['train', 'dev']:
    commands = [
        '%s -l en <%s.en >%s.tok.en' % (en_tokenizer, d, d),
        '%s <%s.ja >%s.tok.ja' % (ja_tokenizer, d, d),
    ]
    for command in commands:
        os.system(command)

Format them in xml

In [7]:
def doc_meta(soup, doc_id, en_title, ja_title):
    """Build the document-level ``<metadata>`` element.

    The element holds six ``<meta>`` children, in this order: the three
    language descriptions (Ainu, English, Japanese), the source entry with
    the document id and genre, and the English and Japanese titles.

    Args:
        soup: object whose ``new_tag`` method creates the elements.
        doc_id: document id; a trailing ``UP`` selects the genre "Uepeker",
            a trailing ``KY`` selects "Kamuyyukar".
        en_title: English title, used as the text of its ``<meta>`` element.
        ja_title: Japanese title, used likewise.

    Returns:
        The populated ``<metadata>`` tag.

    Raises:
        ValueError: if ``doc_id`` ends with neither ``UP`` nor ``KY``
            (previously this surfaced as an UnboundLocalError on ``genre``).
    """
    if doc_id.endswith("UP"):
        genre = "Uepeker"
    elif doc_id.endswith("KY"):
        genre = "Kamuyyukar"
    else:
        raise ValueError("cannot infer genre from document id %r" % (doc_id,))

    def new_meta(attrs, text=None):
        # one <meta> element; attributes are set in the order given
        meta = soup.new_tag("meta")
        for name, value in attrs:
            meta[name] = value
        if text is not None:
            meta.append(text)
        return meta

    metadata = soup.new_tag("metadata")

    # ainu
    metadata.append(new_meta([
        ('type', 'language'),
        ('name', 'ainu'),
        ('scripts', 'latin,kana'),
        ('iso-639-3', 'ain'),
        ('tiers', 'phrases words morphemes'),
    ]))

    # eng
    metadata.append(new_meta([
        ('type', 'language'),
        ('name', 'english'),
        ('iso-639-3', 'eng'),
        ('tiers', 'glosses translations'),
    ]))

    # jpn
    metadata.append(new_meta([
        ('type', 'language'),
        ('name', 'japanese'),
        ('iso-639-3', 'jpn'),
        ('tiers', 'glosses translations'),
    ]))

    # doc_id
    metadata.append(new_meta([
        ('type', 'source'),
        ('id', doc_id),
        ('genre', genre),
    ]))

    # titles
    metadata.append(new_meta([('type', 'title'), ('language', 'eng')], en_title))
    metadata.append(new_meta([('type', 'title'), ('language', 'jpn')], ja_title))

    return metadata

In [8]:
def sent(soup, k, v):
    """Create the ``<igt>`` element of one sentence with its three phrase tiers.

    Args:
        soup: object whose ``new_tag`` method creates the elements.
        k: ``(doc_id, record_number)``; the number is zero-padded to three
            digits in every generated id.
        v: sequence of word tuples; index 0 is the kana form and index 1 the
            Latin-script Ainu form of each word.

    Returns:
        The ``<igt>`` tag containing, in order, the ``raw`` tier (forms as
        found), the ``cleaned`` tier (``sanitize``) and the ``normalized``
        tier (``tokenize(sanitize(...))``).  Each tier holds a Latin item
        followed by a kana item, the words joined by single spaces.

    The original code appended the normalized tier to ``igt`` twice (once
    before its kana item was added); it is now appended exactly once.
    """
    number = '%0.3d' % k[1]

    igt = soup.new_tag("igt")
    igt['id'] = k[0] + '.' + number

    def phrase_tier(prefix, state, clean):
        # one <tier type="phrases"> holding the Latin and the kana rendering
        tier = soup.new_tag("tier")
        tier['type'] = 'phrases'
        tier['id'] = '%s.%s' % (prefix, number)
        tier['state'] = state
        for script, name, column in (('latin', 'ainu', 1), ('kana', 'kana', 0)):
            item = soup.new_tag("item")
            item['id'] = '%s_%s.%s' % (prefix, name, number)
            item['script'] = script
            item.append(' '.join([clean(t[column]) for t in v]))
            tier.append(item)
        return tier

    # raw sentence
    igt.append(phrase_tier('r', 'raw', lambda text: text))
    # cleaned sentence
    igt.append(phrase_tier('c', 'cleaned', sanitize))
    # tokenized sentence
    igt.append(phrase_tier('n', 'normalized', lambda text: tokenize(sanitize(text))))
    return igt


In [9]:
def notes(soup, orig, align):
    """Build a ``<meta>`` note describing how an original form was annotated.

    Args:
        soup: object whose ``new_tag`` method creates the element.
        orig: list of original forms; the last of the first two entries is
            the one inspected.
        align: list of item ids, stored comma-joined in ``alignment``.

    Returns:
        The ``<meta>`` tag.  Its ``type`` is chosen from the inspected form:
        a trailing ``*N`` footnote mark, a parenthesised part without ``?``
        (the part itself becomes the text), an underscore, ``(?)``,
        ``<...>`` or ``[...]``.  When nothing matches, no ``type`` is set.
        Except for the first two cases the text is the forms joined by spaces.
    """
    meta = soup.new_tag("meta")
    meta['alignment'] = ','.join(align)

    target = orig[1] if len(orig) > 1 else orig[0]

    if re.match(r'.+\*\d{1,2}$', target):
        meta['type'] = 'footnote'
        meta.append('')
        return meta

    alternation = re.search(r'\(([^\?]+)\)', target)
    if alternation:
        meta['type'] = 'phonological alternations' # ()
        meta.append(alternation.group(1))
        return meta

    # first matching marker wins; the order is significant
    markers = (
        (r'_', 'phonological alternations'),     # underbar
        (r'\(\?\)', 'unclear interpretations'),  # (?)
        (r'<.+>', 'inserted sounds'),            # <>
        (r'\[.+\]', 'complementary'),            # []
    )
    for pattern, label in markers:
        if re.search(pattern, target):
            meta['type'] = label
            break
    meta.append(' '.join(orig))
    return meta
        

def ainu_words(soup, k, v):
    """Build the Latin-script ``words`` tier of one sentence.

    Every word (index 1 of each tuple in ``v``) is cleaned with
    ``tokenize(sanitize(...))`` and split on whitespace; each resulting token
    becomes an ``<item>`` whose ``segmentation`` is a character range into
    the normalized phrase item ``n_ainu.<nnn>``.  A token that differs from
    the original word produces a note (see ``notes``), unless the cleaned
    word starts with ``&quot;`` or ends in ``.``, ``,``, ``?`` or ``&quot;``.
    The notes are inserted as the first child of the tier when any exist.
    """
    tier_id = 'ainu.%0.3d' % k[1]
    words_id = 'w_' + tier_id
    phrase_id = 'n_' + tier_id

    tier = soup.new_tag("tier")
    tier['type'] = 'words'
    tier['id'] = words_id
    tier['script'] = 'latin'
    tier['segmentation'] = phrase_id

    metadata = soup.new_tag("metadata")
    annotated = False

    offset = 0  # start of the current token within the normalized phrase
    for i, w in enumerate(v):
        raw = w[1]
        clean = tokenize(sanitize(raw))
        tokens = clean.split()
        for j, token in enumerate(tokens):
            align = words_id + '.%d' % (i + 1)
            if len(tokens) > 1:
                align += '.%d' % (j + 1)

            if token != raw and not re.match(r'^\&quot;.+|.+(\.|,|\?|\&quot;)$', clean):
                metadata.append(notes(soup, [raw], [align]))
                annotated = True

            end = offset + len(token)
            item = soup.new_tag("item")
            item['id'] = align
            item['segmentation'] = phrase_id + "[%d:%d]" % (offset, end)
            item.append(token)
            tier.append(item)
            offset = end + 1  # skip the separating space

    if annotated:
        tier.insert(0, metadata)
    return tier


    
def kana_words(soup, k, v):
    """Build the kana ``words`` tier of one sentence.

    Works like the Latin-script tier but on index 0 of each tuple in ``v``,
    with ids based on ``kana.<nnn>``.  When a note is created for a word that
    is not the first one, the preceding word's original form and id are put
    in front of the current ones before calling ``notes``.
    """
    tier_id = 'kana.%0.3d' % k[1]
    words_id = 'w_' + tier_id
    phrase_id = 'n_' + tier_id

    tier = soup.new_tag("tier")
    tier['type'] = 'words'
    tier['id'] = words_id
    tier['script'] = 'kana'
    tier['segmentation'] = phrase_id

    metadata = soup.new_tag("metadata")
    annotated = False

    offset = 0  # start of the current token within the normalized phrase
    for i, w in enumerate(v):
        raw = w[0]
        clean = tokenize(sanitize(raw))
        tokens = clean.split()

        for j, token in enumerate(tokens):
            align = words_id + '.%d' % (i + 1)
            if len(tokens) > 1:
                align += '.%d' % (j + 1)

            if token != raw and not re.match(r'^\&quot;.+|.+(\.|,|\?|\&quot;)$', clean):
                if i > 0:
                    # include the previous word: its form and its item id
                    originals = [v[i - 1][0], raw]
                    aligns = [words_id + '.%d' % (i), align]
                else:
                    originals = [raw]
                    aligns = [align]
                metadata.append(notes(soup, originals, aligns))
                annotated = True

            end = offset + len(token)
            item = soup.new_tag("item")
            item['id'] = align
            item['segmentation'] = phrase_id + "[%d:%d]" % (offset, end)
            item.append(token)
            tier.append(item)
            offset = end + 1  # skip the separating space

    if annotated:
        tier.insert(0, metadata)
    return tier

In [10]:
def morpheme(soup, k, v):
    """Build the ``morphemes`` tier of one sentence.

    The morpheme column (index 2 of each tuple in ``v``) is split on ``-``;
    every piece becomes an ``<item>`` whose ``segmentation`` is a character
    range into the word item ``w_ainu.<nnn>.<word>``.  The range counter is
    reset for each word and advances by the piece length only.
    """
    suffix = '.%0.3d' % k[1]
    morph_id = 'm' + suffix
    words_id = 'w_ainu' + suffix

    tier = soup.new_tag("tier")
    tier['type'] = 'morphemes'
    tier['id'] = morph_id
    tier['segmentation'] = words_id

    metadata = soup.new_tag("metadata")
    annotated = False

    for i, w in enumerate(v):
        word = w[2]
        pieces = word.split('-')

        start = 0
        for j, piece in enumerate(pieces):
            align = morph_id + '.%d' % (i + 1)
            if len(pieces) > 1:
                align += '.%d' % (j + 1)

            # A piece differs from the word only when the word contains '-',
            # and such words are excluded below, so this never fires.
            if piece != word:
                excluded = re.match(r'.+(\.|,|\?)$', word) or re.search(r'-', word)
                if not excluded:
                    metadata.append(notes(soup, [word], [align]))
                    annotated = True

            stop = start + len(piece)
            item = soup.new_tag("item")
            item['id'] = align
            item['segmentation'] = words_id + ".%d[%d:%d]" % (i + 1, start, stop)
            item.append(piece)
            tier.append(item)
            start = stop

    if annotated:
        tier.insert(0, metadata)
    return tier

In [11]:
def gloss(soup, k, v, lang):
    tier_id = '.%0.3d' % k[1]
    tier = soup.new_tag("tier")
    tier['type'] = 'glosses'
    tier['id'] =  lang + '_g' + tier_id
    tier['language'] = lang
    tier['alignment'] = 'm' + tier_id
    
    metadata = soup.new_tag("metadata")
    flag = False
    
    if lang == 'eng':
        idx = 3
    elif lang == 'jpn':
        idx = 4
    else:
        raise Exception('language definition not found.')
    
    
    for i, w in enumerate(v):
        morph = re.sub(ur'(\u2010|\uff0d|\u30fc)', '-', w[2])
        gloss = w[idx]
        if gloss in [u'\u2010', u'\uff0d', u'\u30fc']:
            gloss = re.sub(ur'(\u2010|\uff0d|\u30fc)', u'\u2015', gloss)
        if re.search(ur'in\u2010law', gloss):
            gloss = re.sub(ur'(\uff0d|\u30fc)', '-', gloss)
        else:
            gloss = re.sub(ur'(\u2010|\uff0d|\u30fc)', '-', gloss)
        
        
        
        glosses = gloss.split('-')
        morphs = morph.split('-')
        
        # check consistency
        if len(morphs) != len(glosses):
            print '!', k, i, '\t', morph,'\t', len(morphs), '\t', gloss, '\t', len(glosses)
            #raise ValueError('glosses not consistent with mophemes')
            continue
            
        c = 0
        for j, (m, g) in enumerate(zip(morphs, glosses)):
            align = lang + '_g' + tier_id + '.%d' % (i+1)
            if len(glosses) > 1:
                align += '.%d' % (j+1)
            if len(v) > i and m != v[i][idx]:
                if not (re.match(r'.+(\.|,|\?)$', m) or re.search(r'-', m)):
                    meta = notes(soup, v[i][idx], align)
                    metadata.append(meta)
                    flag = True
            
            gloss_elm = g.split('.')
            en = []
            pos = []
            for b, gl in enumerate(gloss_elm):
                if lang == 'jpn':
                    en_g = w[3]
                    if en_g in [u'\u2010', u'\uff0d', u'\u30fc']:
                        en_g = re.sub(ur'(\u2010|\uff0d|\u30fc)', u'\u2015', en_g)
                    if re.search(ur'in\u2010law', en_g):
                        en_g = re.sub(ur'(\uff0d|\u30fc)', '-', en_g)
                    else:
                        en_g = re.sub(ur'(\u2010|\uff0d|\u30fc)', '-', en_g)
                    try:
                        en_gloss = en_g.split('-')[j].split('.')[b]
                    except IndexError:
                        print '$', k, i, '\t', en_g,'\t', len(en_g.split('-')),\
                                    '\t', gloss, '\t', len(glosses)
                        #raise ValueError('jpn glosses not consistent with eng glosses')
                        continue
                else:
                    en_gloss = gl
                if en_gloss.isupper() or (en_gloss.isdigit() and int(en_gloss) <= 4):
                    pos.append(gl)
                else:
                    en.append(gl)
            
            if len(en) > 0:
                item = soup.new_tag("item")
                item['id'] = align + '_' + lang
                item['alignment'] = 'm_ainu' + align[5:]
                item.append(' '.join(en))
                tier.append(item)
            
            if len(pos) > 0:
                item = soup.new_tag("item")
                item['id'] = align + '_tag'
                item['alignment'] = 'm_ainu' + align[5:]
                item.append('.'.join(pos))
                tier.append(item)
            
            c += len(m)
        

    return tier

In [12]:
def translation(soup, k, t):
    """Build the ``translations`` tier of one sentence.

    The tier is aligned with the raw phrase tier ``r.<nnn>`` and holds two
    items: the English text ``t[1][0]`` cleaned with ``sanitize`` and the
    Japanese text ``t[0][0]`` cleaned with ``sanitize_ja``, in that order.
    """
    suffix = '.%0.3d' % k[1]

    tier = soup.new_tag("tier")
    tier['type'] = 'translations'
    tier['id'] = 't' + suffix
    tier['alignment'] = 'r' + suffix

    renderings = (
        ('eng', sanitize, t[1][0]),     # english
        ('jpn', sanitize_ja, t[0][0]),  # japanese
    )
    for language, clean, raw in renderings:
        item = soup.new_tag("item")
        item['language'] = language
        item.append(clean(raw))
        tier.append(item)
    return tier

In [13]:
def construct(doc_id, sentences, translations, en_title, ja_title):
    """Assemble the XML corpus of one document.

    Sentences are looked up as ``(doc_id, 1)``, ``(doc_id, 2)``, ... until the
    first missing key.  Each one becomes an ``<igt>`` element holding the
    phrase tiers, both word tiers, the morpheme tier, the English and
    Japanese gloss tiers and the translation tier.

    Args:
        doc_id: document id used in the sentence keys and the metadata.
        sentences: mapping ``(doc_id, n)`` -> word tuples.
        translations: mapping ``(doc_id, n)`` -> translation pairs.
        en_title: English title for the document metadata.
        ja_title: Japanese title for the document metadata.

    Returns:
        ``(soup, count)`` where ``count`` is the number of consecutive
        sentence keys found, including any sentence that was skipped.

    A sentence whose gloss tiers raise ValueError is left out of the corpus.
    The counter is advanced before any work is done, so such a skip moves on
    to the next sentence; the original ``continue`` retried the same key
    forever.
    """
    # corpus
    soup = bs("", "xml")
    corpus = soup.new_tag("xigt-corpus")

    # doc meta
    corpus.append(doc_meta(soup, doc_id, en_title, ja_title))

    i = 0
    while (doc_id, i + 1) in sentences:
        i += 1
        k = (doc_id, i)
        v = sentences[k]

        # sentence meta
        igt = sent(soup, k, v)

        # words
        igt.append(ainu_words(soup, k, v))
        igt.append(kana_words(soup, k, v))

        # morphemes
        igt.append(morpheme(soup, k, v))

        # glosses
        try:
            en_gloss = gloss(soup, k, v, 'eng')
            ja_gloss = gloss(soup, k, v, 'jpn')
        except ValueError:
            continue
        igt.append(en_gloss)
        igt.append(ja_gloss)

        # translations
        igt.append(translation(soup, k, translations[k]))

        corpus.append(igt)

    soup.append(corpus)
    return soup, i

We detected the following inconsistencies, even after sanitization.

In [27]:
# Build every document once; gloss() prints the inconsistencies it meets.
for d in docs:
    doc_id, en_title, ja_title = d[:3]
    soup, _ = construct(doc_id, sentences, translations, en_title, ja_title)

! ('K7803231UP', 168) 7 	o-yan 	2 	at.APPL-land-INTR.SG 	3
! ('K7803231UP', 168) 7 	o-yan 	2 	～に-陸-自形成.単 	3
! ('K7803231UP', 252) 7 	ko-hosipi 	2 	～に帰る 	1
! ('K7708242UP', 110) 2 	katun 	1 	shape-belong.to 	2
! ('K7708242UP', 110) 2 	katun 	1 	姿/形-～に属している 	2
! ('K7708242UP', 111) 2 	katun 	1 	shape-belong.to 	2
! ('K7708242UP', 111) 2 	katun 	1 	姿/形-～に属している 	2
! ('K7708242UP', 151) 2 	katun 	1 	shape-belong.to 	2
! ('K7708242UP', 151) 2 	katun 	1 	姿/形-～に属している 	2
! ('K7708242UP', 152) 2 	katun 	1 	shape-belong.to 	2
! ('K7708242UP', 152) 2 	katun 	1 	姿/形-～に属している 	2
! ('K7708242UP', 165) 6 	e-u-ka-opiwki 	4 	～について-互い―上-助ける? 	3
! ('K7708242UP', 170) 4 	- 	2 	― 	1
! ('K7708242UP', 170) 4 	- 	2 	― 	1
! ('K7708242UP', 172) 10 	- 	2 	― 	1
! ('K7708242UP', 172) 10 	- 	2 	― 	1
! ('K7708242UP', 231) 5 	katun 	1 	shape-belong.to 	2
! ('K7708242UP', 231) 5 	katun 	1 	姿/形-～に属している 	2
! ('K7708242UP', 287) 3 	- 	2 	― 	1
! ('K7708242UP', 287) 3 	- 	2 	― 	1
! ('K7803232UP', 32) 5 	- 	2 	― 	1
! ('K78032

We manually corrected the inconsistencies above, and then saved them.

In [17]:
import os
from codecs import open
from bs4 import BeautifulSoup as bs

for d in docs:
    print 'processing', d[0], '...',
    with open(os.path.join('xml', '%s.xml' % d[0]), 'w', encoding='utf-8') as f:
        soup, i = construct(d[0], sents, trans, d[1], d[2])
        assert d[-1] == i
        print i, 'sequences saved.'
        f.write(re.sub('&amp;', '&', soup.prettify()))

 processing K7803231UP ... 254 sequences saved.
processing K7708241UP ... 126 sequences saved.
processing K7708242UP ... 308 sequences saved.
processing K7803232UP ... 174 sequences saved.
processing K7803233UP ... 419 sequences saved.
processing K7803233KY ... 181 sequences saved.
processing K7807152KY ... 125 sequences saved.
processing K7908051UP ... 377 sequences saved.
processing K8010291UP ... 535 sequences saved.
processing K8109193UP ... 169 sequences saved.


Call from xigt

In [116]:
from xigt.codecs import xigtxml
# Load one generated file with xigt and pick four tiers of its first entry.
with open(os.path.join('xml', 'K7803231UP.xml')) as f:
    xc = xigtxml.load(f)
    first_igt = xc[0]
    words_tier = first_igt['w_ainu.001']
    morph_tier = first_igt['m.001']
    gloss_tier = first_igt['eng_g.001']
    jpn_tier = first_igt['jpn_g.001']

In [117]:
print ' '.join([w.value().strip() for w in words_tier])
for m, g, j in zip(morph_tier, gloss_tier, jpn_tier):
    print '%15s %15s %15s' % (m.value().strip(), g.value().strip(), j.value().strip())

ne ene iki wa okay pe a= ne ru an hi ka a= erampewtek no ,
             ne            what              なに
            ene       like this           このように
              i           APASS              もの
             ki              do            ～をする
             wa             and              して
           okay           exist              ある
             pe              PL               複
             a=    thing/person              もの
             ne            4.A=           4.他主=
             ru             COP            ～である
             an         INFR.EV              こと
             hi           exist              ある
             ka              SG               単
             a=            NMLZ               の
     erampewtek            even               も
             no            4.A=           4.他主=


## Statistics

In [39]:
from collections import Counter
x = []
y = 0
print '%10s %5s %5s %5s' % ('', 'sent', 'token', 'vocab')
print '-'*28
for d in docs:
    i = 1
    vocab = []
    while sents.has_key((d[0], i)):
        v = sents[(d[0], i)]
        vocab.extend([w[1] for w in v])
        i += 1
    x.extend(vocab)
    y += d[-1]
    print '%10s %5d %5d %5d' % (d[0], d[-1], len(vocab), len(Counter(vocab)))
print '-'*28
print '%10s %5d %5d %5d' % ('total', y, len(x), len(Counter(x)))

            sent token vocab
----------------------------
K7803231UP   254  2249   537
K7708241UP   126   874   234
K7708242UP   308  2828   560
K7803232UP   174  1556   391
K7803233UP   419  3704   695
K7803233KY   181   803   239
K7807152KY   125   477   173
K7908051UP   377  3531   608
K8010291UP   535  4945   829
K8109193UP   169  1827   468
----------------------------
     total  2668 22794  2141


The number of distinct words that occur exactly once (or more than 10 times) in the whole corpus

In [40]:
sum(1 for count in Counter(x).values() if count == 1)

1053

In [41]:
len([count for count in Counter(x).values() if count > 10])

283

In [43]:
# Show U+2010, one of the hyphen variants that gloss() replaces with '-'.
print u'\u2010'

‐
