In [1]:
from collections import defaultdict
import json
import pandas as pd
import numpy as np

In [2]:
# Location of the word list CSV that the rest of the notebook parses.
PU_CSV_URL = 'https://raw.githubusercontent.com/janpona/pu/master/pu.csv'

# NOTE: later cells access the ' Alternative' and ' Definition' columns with a
# leading space in the column name, so the header is used exactly as parsed.
pu = pd.read_csv(PU_CSV_URL)
pu

Unnamed: 0,Word,Alternative,Definition
0,a,kin,"PARTICLE (emphasis, emotion, or confirmation)"
1,akesi,,"NOUN non-cute animal; reptile, amphibian"
2,ala,,"ADJECTIVE no, not, zero"
3,alasa,,"VERB to hunt, forage"
4,ale,ali,"ADJECTIVE all; abundant, countless, bountiful,..."
...,...,...,...
115,wan,,"ADJECTIVE unique, united NUMBER one"
116,waso,,"NOUN bird, flying creature, winged animal"
117,wawa,,"ADJECTIVE strong, powerful; confident, sure; e..."
118,weka,,"ADJECTIVE absent, away, ignored"


In [3]:
def _is_pos_tag(word):
    """Return True if `word` looks like a part-of-speech tag such as NOUN or PRE-VERB."""
    # str.isupper() alone is too permissive: it is True for any token whose
    # cased characters are all uppercase, so English glosses such as "I,"
    # (as in "NOUN I, me, we, us") or a lone "I" would be mistaken for tags.
    return (
        len(word) > 1
        and word.isupper()
        and all(char.isalpha() or char == '-' for char in word)
    )


def split_definition(definition):
    """Split a definition string into lists of glosses keyed by part of speech.

    The string is expected to consist of one or more sections, each introduced
    by an upper-case part-of-speech tag, e.g.
    ``'ADJECTIVE unique, united NUMBER one'``.

    Parameters
    ----------
    definition : str
        Raw definition text.

    Returns
    -------
    dict
        Maps the lower-cased tag to a list of glosses. Glosses are split on
        ``', '`` after turning ``';'`` into ``','``. The ``'particle'`` section
        is kept as a single string with its parentheses removed instead of
        being split.

    Raises
    ------
    ValueError
        If text appears before the first part-of-speech tag.
    """
    # Special case kept from the original notebook: this exact definition has
    # no tag of its own (presumably an omission in the source data), so it is
    # treated as an adjective.
    if definition == 'new, fresh; additional, another, extra':
        definition = 'ADJECTIVE ' + definition

    definitions = defaultdict(list)
    current_definition = None
    for word in definition.split():
        if _is_pos_tag(word):
            # Start (or resume) collecting words for this part of speech.
            current_definition = definitions[word.lower()]
            continue
        if current_definition is None:
            raise ValueError(
                'definition does not start with a part-of-speech tag: %r' % definition
            )
        current_definition.append(word)

    return {
        pos: (
            ' '.join(pos_words).replace(';', ',').split(', ')
            if pos != 'particle' else
            [' '.join(pos_words).replace('(', '').replace(')', '')]
        )
        for pos, pos_words in definitions.items()
    }

# Try the parser on the definition in the row labelled 0.
split_definition(pu.loc[0, ' Definition'])

{'particle': ['emphasis, emotion, or confirmation']}

In [4]:
def dict_form(pu_row):
    """Convert one row of the `pu` table into a plain dictionary.

    Reads the 'Word', ' Alternative' and ' Definition' fields of `pu_row`.

    Returns a dict with two keys:
      * 'forms': the word, followed by its alternative form when that field
        holds a string (a non-string value, presumably NaN for an empty cell,
        is skipped);
      * 'definitions': the result of `split_definition` on the definition text.
    """
    forms = [pu_row['Word']]
    alternative = pu_row[' Alternative']
    if isinstance(alternative, str):
        forms.append(alternative)
    return {
        'forms': forms,
        'definitions': split_definition(pu_row[' Definition']),
    }

# Try the conversion on the first row of the table.
first_pu_row = pu.iloc[0]
dict_form(first_pu_row)

{'forms': ['a', 'kin'],
 'definitions': {'particle': ['emphasis, emotion, or confirmation']}}

In [5]:
# Convert every row of the table; the row index label is not needed.
pu_words = [dict_form(pu_row) for _, pu_row in pu.iterrows()]
pu_words[0]

{'forms': ['a', 'kin'],
 'definitions': {'particle': ['emphasis, emotion, or confirmation']}}

In [6]:
# Additional words that are not in the pu table: the top-rated ones from
# https://www.reddit.com/r/tokipona/comments/g9ne0s
# Each entry uses the same {'forms': [...], 'definitions': {pos: [...]}} shape
# that dict_form produces.
extra_words = [
    {
        'forms': ['monsuta'],
        'definitions': {'noun': ['monster']},
    },
    {
        'forms': ['kipisi'],
        'definitions': {'verb': ['to split', 'cut', 'fragment']},
    },
    {
        'forms': ['kijetesantakalu'],
        'definitions': {'noun': ['racoon']},
    },
    {
        'forms': ['tonsi'],
        'definitions': {'noun': ['transgender person', 'non-binary person']},
    },
]

In [7]:
# Merge the parsed pu entries with the hand-written extras and save the
# combined list as pretty-printed JSON with sorted keys.
words = pu_words + extra_words
with open('words.json', 'w') as words_file:
    words_file.write(json.dumps(words, indent=4, sort_keys=True))