In [1]:
from collections import Counter
from contextlib import ExitStack
from tabulate import tabulate
from IPython.display import display, Markdown
from ilonimi import Vocabulary, Normalizer, Tokenizer

In [2]:
# Vocabulary object; its word list is used later to restrict the counts.
vocab = Vocabulary()
# Callable applied to each raw line before tokenization.
normalizer = Normalizer()
# Callable applied to each normalized line; all three conversion switches are on.
tokenizer = Tokenizer(convert_unk=True, convert_number=True, convert_proper=True)

# Corpus files, read in this order.
path_list = [
    '../tokipona-corpus-collection/100tokipona/100tokipona.txt',
    '../tokipona-corpus-collection/tokipona1000/tokipona1000.txt',
    '../tokipona-corpus-collection/tatoeba/tatoeba.txt',
]

def preproc(sent):
    """Strip surrounding whitespace from one line, then normalize and tokenize it.

    Returns whatever the module-level ``tokenizer`` returns for the
    normalized line.
    """
    return tokenizer(normalizer(sent.strip()))
    
with ExitStack() as stack:
    # Every file registered on the stack is closed when this block exits.
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    # NOTE(review): assumes the corpus files are UTF-8 -- confirm.
    sents = [
        preproc(sent)
        for path
        in path_list
        for sent
        in stack.enter_context(open(path, encoding='utf-8'))]

In [3]:
# Set form of the vocabulary for fast membership tests.
word_set = set(vocab.word_list)

# (word, count) pairs for in-vocabulary tokens, most frequent first.
freq = Counter(
    token
    for line in sents
    for token in line.split()
    if token in word_set
).most_common()

# Total number of counted (in-vocabulary) tokens.
num_words = sum(count for _, count in freq)

In [4]:
def show_table(accum = False):
    """Render the word-frequency list as a multi-column Markdown table.

    The module-level ``freq`` list of (word, count) pairs is laid out
    column-major: it fills the first word/value column pair from top to
    bottom, then the next pair, for up to ``repeat`` pairs per row.

    Args:
        accum: If False, show each word's share of ``num_words`` as a
            percentage. If True, show the running cumulative share.
    """
    repeat = 5  # width: number of (word, value) column pairs per row
    # Height: rows needed to fit every entry. Ceiling division is required:
    # floor division drops the remainder, which pushes the last entries into
    # a non-existent sixth column pair (IndexError) or gives height == 0
    # (ZeroDivisionError) when there are fewer than ``repeat`` words.
    height = -(-len(freq) // repeat)
    table = [[''] * (2 * repeat) for _ in range(height)]

    num_accum = 0
    for i, (w, f) in enumerate(freq):
        num_accum += f
        row = i % height
        col = i // height * 2
        shown = num_accum if accum else f
        table[row][col] = w
        table[row][col + 1] = '{:.3f}%'.format(shown / num_words * 100)

    if accum:
        headers = ['単語', '累積頻度']
    else:
        headers = ['単語', '頻度']

    table = tabulate(table, headers = headers * repeat, tablefmt ='github')
    display(Markdown(table))

In [5]:
# Per-word share of all counted tokens.
show_table(accum=False)

| 単語   | 頻度   | 単語    | 頻度   | 単語   | 頻度   | 単語   | 頻度   | 単語            | 頻度   |
|--------|--------|---------|--------|--------|--------|--------|--------|-----------------|--------|
| li     | 8.849% | pilin   | 0.970% | sin    | 0.402% | moli   | 0.226% | waso            | 0.104% |
| e      | 6.667% | ma      | 0.940% | awen   | 0.391% | seli   | 0.226% | ko              | 0.102% |
| mi     | 4.968% | jo      | 0.889% | wan    | 0.369% | kulupu | 0.220% | monsi           | 0.099% |
| jan    | 4.448% | lili    | 0.854% | soweli | 0.364% | olin   | 0.216% | pan             | 0.093% |
| ni     | 3.993% | ike     | 0.761% | nasa   | 0.357% | pimeja | 0.207% | walo            | 0.092% |
| tawa   | 3.381% | pana    | 0.745% | tu     | 0.357% | insa   | 0.206% | loje            | 0.090% |
| la     | 3.016% | kepeken | 0.729% | mije   | 0.355% | sijelo | 0.205% | nena            | 0.089% |
| sina   | 2.938% | telo    | 0.697% | ante   | 0.348% | poki   | 0.201% | pipi            | 0.086% |
| lon    | 2.922% | pini    | 0.677% | mani   | 0.344% | esun   | 0.198% | selo            | 0.084% |
| ala    | 2.857% | suli    | 0.643% | sike   | 0.338% | kute   | 0.195% | mun             | 0.082% |
| ona    | 2.811% | sama    | 0.598% | lipu   | 0.337% | linja  | 0.195% | palisa          | 0.079% |
| pi     | 2.795% | moku    | 0.577% | pakala | 0.333% | ali    | 0.192% | alasa           | 0.075% |
| tenpo  | 1.958% | ale     | 0.568% | kin    | 0.317% | supa   | 0.191% | jelo            | 0.064% |
| kama   | 1.854% | suno    | 0.563% | mama   | 0.310% | anpa   | 0.187% | laso            | 0.061% |
| mute   | 1.829% | lawa    | 0.544% | poka   | 0.303% | noka   | 0.171% | kala            | 0.057% |
| toki   | 1.801% | anu     | 0.527% | kon    | 0.289% | suwi   | 0.166% | unpa            | 0.039% |
| pona   | 1.782% | ijo     | 0.525% | luka   | 0.285% | a      | 0.145% | mu              | 0.038% |
| seme   | 1.617% | ilo     | 0.506% | en     | 0.284% | kiwen  | 0.143% | oko             | 0.031% |
| wile   | 1.555% | wawa    | 0.474% | len    | 0.273% | uta    | 0.141% | monsuta         | 0.024% |
| sona   | 1.352% | taso    | 0.466% | kalama | 0.258% | lete   | 0.138% | kipisi          | 0.020% |
| tomo   | 1.323% | musi    | 0.458% | open   | 0.253% | sinpin | 0.131% | namako          | 0.011% |
| ken    | 1.281% | nasin   | 0.447% | lape   | 0.251% | akesi  | 0.130% | pu              | 0.006% |
| o      | 1.162% | meli    | 0.433% | nimi   | 0.246% | kili   | 0.126% | mulapisu        | 0.002% |
| pali   | 1.043% | sitelen | 0.422% | utala  | 0.234% | lupa   | 0.117% | tonsi           | 0.002% |
| lukin  | 1.008% | sewi    | 0.409% | nanpa  | 0.233% | kule   | 0.114% | kijetesantakalu | 0.002% |
| tan    | 0.972% | weka    | 0.404% | kasi   | 0.227% | jaki   | 0.107% | epiku           | 0.001% |

In [6]:
# Running cumulative share, most frequent word first.
show_table(accum=True)

| 単語   | 累積頻度   | 単語    | 累積頻度   | 単語   | 累積頻度   | 単語   | 累積頻度   | 単語            | 累積頻度   |
|--------|------------|---------|------------|--------|------------|--------|------------|-----------------|------------|
| li     | 8.849%     | pilin   | 71.152%    | sin    | 86.412%    | moli   | 94.297%    | waso            | 98.670%    |
| e      | 15.517%    | ma      | 72.092%    | awen   | 86.803%    | seli   | 94.523%    | ko              | 98.772%    |
| mi     | 20.484%    | jo      | 72.981%    | wan    | 87.172%    | kulupu | 94.743%    | monsi           | 98.872%    |
| jan    | 24.932%    | lili    | 73.835%    | soweli | 87.536%    | olin   | 94.958%    | pan             | 98.965%    |
| ni     | 28.926%    | ike     | 74.596%    | nasa   | 87.894%    | pimeja | 95.166%    | walo            | 99.057%    |
| tawa   | 32.307%    | pana    | 75.341%    | tu     | 88.251%    | insa   | 95.372%    | loje            | 99.148%    |
| la     | 35.323%    | kepeken | 76.070%    | mije   | 88.606%    | sijelo | 95.577%    | nena            | 99.236%    |
| sina   | 38.260%    | telo    | 76.767%    | ante   | 88.954%    | poki   | 95.778%    | pipi            | 99.322%    |
| lon    | 41.183%    | pini    | 77.444%    | mani   | 89.298%    | esun   | 95.976%    | selo            | 99.406%    |
| ala    | 44.040%    | suli    | 78.087%    | sike   | 89.636%    | kute   | 96.171%    | mun             | 99.487%    |
| ona    | 46.850%    | sama    | 78.685%    | lipu   | 89.973%    | linja  | 96.366%    | palisa          | 99.566%    |
| pi     | 49.646%    | moku    | 79.263%    | pakala | 90.306%    | ali    | 96.558%    | alasa           | 99.641%    |
| tenpo  | 51.603%    | ale     | 79.831%    | kin    | 90.623%    | supa   | 96.750%    | jelo            | 99.705%    |
| kama   | 53.457%    | suno    | 80.394%    | mama   | 90.933%    | anpa   | 96.937%    | laso            | 99.766%    |
| mute   | 55.286%    | lawa    | 80.938%    | poka   | 91.236%    | noka   | 97.108%    | kala            | 99.823%    |
| toki   | 57.087%    | anu     | 81.466%    | kon    | 91.525%    | suwi   | 97.274%    | unpa            | 99.863%    |
| pona   | 58.869%    | ijo     | 81.991%    | luka   | 91.810%    | a      | 97.419%    | mu              | 99.901%    |
| seme   | 60.486%    | ilo     | 82.497%    | en     | 92.094%    | kiwen  | 97.562%    | oko             | 99.932%    |
| wile   | 62.041%    | wawa    | 82.971%    | len    | 92.367%    | uta    | 97.703%    | monsuta         | 99.955%    |
| sona   | 63.393%    | taso    | 83.438%    | kalama | 92.626%    | lete   | 97.841%    | kipisi          | 99.975%    |
| tomo   | 64.716%    | musi    | 83.895%    | open   | 92.879%    | sinpin | 97.972%    | namako          | 99.987%    |
| ken    | 65.997%    | nasin   | 84.342%    | lape   | 93.129%    | akesi  | 98.102%    | pu              | 99.993%    |
| o      | 67.159%    | meli    | 84.775%    | nimi   | 93.376%    | kili   | 98.228%    | mulapisu        | 99.995%    |
| pali   | 68.202%    | sitelen | 85.197%    | utala  | 93.610%    | lupa   | 98.346%    | tonsi           | 99.997%    |
| lukin  | 69.209%    | sewi    | 85.606%    | nanpa  | 93.843%    | kule   | 98.459%    | kijetesantakalu | 99.999%    |
| tan    | 70.182%    | weka    | 86.011%    | kasi   | 94.070%    | jaki   | 98.566%    | epiku           | 100.000%   |

In [7]:
# Count adjacent word pairs within each sentence, keeping only pairs whose
# words are all in the vocabulary; result is sorted most frequent first.
bigrams = Counter(
    pair
    for words in (line.split() for line in sents)
    for pair in zip(words, words[1:])
    if word_set.issuperset(pair)
).most_common()

In [8]:
# Words that disqualify an n-gram from the listings below.
ngset = {
    'ona', 'e', 'ni', 'li', 'la', 'mi', 'sina',
    'pi', 'o', 'ken', 'wile', 'ala', 'lon', 'tawa',
}
# Bigrams seen more than 500 times that contain none of the excluded words.
[entry
 for entry in bigrams
 if entry[1] > 500 and ngset.isdisjoint(entry[0])]

[(('anu', 'seme'), 1854),
 (('tenpo', 'suno'), 1383),
 (('tenpo', 'pini'), 1137),
 (('kama', 'jo'), 1043),
 (('ma', 'tomo'), 1015),
 (('tan', 'seme'), 1009),
 (('pona', 'mute'), 864),
 (('kama', 'sona'), 855),
 (('jan', 'lili'), 825),
 (('jan', 'seme'), 586),
 (('tenpo', 'kama'), 505)]

In [9]:
# Count runs of three adjacent words within each sentence, keeping only runs
# whose words are all in the vocabulary; result is sorted most frequent first.
trigrams = Counter(
    triple
    for words in (line.split() for line in sents)
    for triple in zip(words, words[1:], words[2:])
    if word_set.issuperset(triple)
).most_common()

In [10]:
# Trigrams seen more than 30 times that contain none of the words in ngset.
[entry
 for entry in trigrams
 if entry[1] > 30 and ngset.isdisjoint(entry[0])]

[(('tenpo', 'suno', 'kama'), 241),
 (('tenpo', 'suno', 'pini'), 205),
 (('kepeken', 'ilo', 'toki'), 178),
 (('ma', 'tomo', 'lawa'), 121),
 (('jan', 'lawa', 'ma'), 106),
 (('toki', 'kepeken', 'toki'), 105),
 (('kalama', 'musi', 'uta'), 100),
 (('tenpo', 'suno', 'ale'), 100),
 (('pilin', 'ike', 'tan'), 97),
 (('tenpo', 'suli', 'nanpa'), 89),
 (('pilin', 'pona', 'mute'), 81),
 (('pilin', 'akesi', 'tan'), 77),
 (('tenpo', 'pimeja', 'pini'), 76),
 (('nanpa', 'luka', 'tu'), 72),
 (('kama', 'tan', 'ma'), 64),
 (('sike', 'suno', 'mute'), 64),
 (('nanpa', 'luka', 'luka'), 63),
 (('luka', 'tu', 'wan'), 60),
 (('kama', 'tan', 'sewi'), 59),
 (('mute', 'anu', 'seme'), 58),
 (('pona', 'lukin', 'mute'), 57),
 (('suli', 'nanpa', 'luka'), 57),
 (('tan', 'ma', 'tomo'), 54),
 (('tenpo', 'kama', 'lili'), 54),
 (('pona', 'anu', 'seme'), 51),
 (('luka', 'luka', 'tu'), 51),
 (('luka', 'luka', 'luka'), 49),
 (('kepeken', 'supa', 'monsi'), 49),
 (('pilin', 'pona', 'tan'), 48),
 (('telo', 'kasi', 'seli'), 42),
