In [1]:
import json
from glob import glob
import gzip
import os
import re
import shutil
import string

import pandas as pd

In [7]:
# Directory that holds the dictionary dump files read by this notebook.
path_to_dump = os.path.expanduser("~/defmod/datasets/norwegian/ordbok-dump-2023-05-09.json")

# Load the "nno" dump (presumably Nynorsk, going by the language code) into `data`.
nno_dump_path = os.path.join(path_to_dump, "ordbok-nno-dump-2023-05-09.json")
with open(nno_dump_path, mode="r", encoding="utf8") as nno_file:
    data = json.load(nno_file)

# Number of top-level entries in the loaded dump.
print(len(data))

95436


In [8]:
# Load the "nob" dump (Bokmål) from the same dump directory into `data_bokmål`.
nob_dump_path = os.path.join(path_to_dump, "ordbok-nob-dump-2023-05-09.json")
with open(nob_dump_path, mode="r", encoding="utf8") as nob_file:
    data_bokmål = json.load(nob_file)

# Number of top-level entries in the loaded dump.
print(len(data_bokmål))

80124


In [9]:
# Peek at the top-level keys of a single Bokmål article (the one at index 100).
data_bokmål[100].keys()

dict_keys(['body', 'lemmas', 'suggest', 'to_index', 'submitted', 'article_id'])

In [10]:
# Show the "lemmas" field of the same sample article: each lemma carries a list of
# paradigms ("paradigm_info"), and each paradigm a list of tagged inflected forms.
data_bokmål[100]['lemmas']

[{'id': 904,
  'hgno': 0,
  'lemma': 'acetyl',
  'junction': '',
  'split_inf': None,
  'final_lexeme': 'acetyl',
  'neg_junction': None,
  'paradigm_info': [{'to': None,
    'from': '1996-01-01',
    'tags': ['NOUN', 'Neuter'],
    'inflection': [{'tags': ['Sing', 'Ind'], 'word_form': 'acetyl'},
     {'tags': ['Sing', 'Def'], 'word_form': 'acetylet'},
     {'tags': ['Plur', 'Ind'], 'word_form': 'acetyl'},
     {'tags': ['Plur', 'Def'], 'word_form': 'acetyla'}],
    'paradigm_id': 669,
    'standardisation': 'STANDARD',
    'inflection_group': 'NOUN_regular'},
   {'to': None,
    'from': '1996-01-01',
    'tags': ['NOUN', 'Neuter'],
    'inflection': [{'tags': ['Sing', 'Ind'], 'word_form': 'acetyl'},
     {'tags': ['Sing', 'Def'], 'word_form': 'acetylet'},
     {'tags': ['Plur', 'Ind'], 'word_form': 'acetyl'},
     {'tags': ['Plur', 'Def'], 'word_form': 'acetylene'}],
    'paradigm_id': 679,
    'standardisation': 'STANDARD',
    'inflection_group': 'NOUN_regular'}],
  'initial_lexeme'

In [12]:
# Lookup table for the Bokmål dump: lemma of an article's first "lemmas" entry ->
# position of that article in data_bokmål. Articles without lemmas are skipped.
# If a lemma is already indexed, the later article is stored under lemma + "1"
# instead; any further article with the same lemma overwrites that "1" entry.
lemma2idx = {}

for article_idx, article in enumerate(data_bokmål):
    if not article.get("lemmas"):
        continue
    headword = article['lemmas'][0]["lemma"]
    key = headword + "1" if headword in lemma2idx else headword
    lemma2idx[key] = article_idx

In [13]:
# Lookup table for the dump loaded into `data`: lemma of an article's first
# "lemmas" entry -> position of that article in `data`. Articles without lemmas
# are skipped. If a lemma is already indexed, the later article is stored under
# lemma + "1" instead; any further article with the same lemma overwrites that
# "1" entry.
nn_lemma2idx = {}

for article_idx, article in enumerate(data):
    if not article.get("lemmas"):
        continue
    headword = article['lemmas'][0]["lemma"]
    key = headword + "1" if headword in nn_lemma2idx else headword
    nn_lemma2idx[key] = article_idx

In [14]:
# Matches any of the letters ø, æ, å in lower or upper case. find_forms uses it to
# decide whether ASCII-transliterated variants of a word form should be generated.
NO_CHARS = re.compile(r"[øæåØÅÆ]")

In [15]:
def find_forms(article, word_forms):
    """Append every noun form found in a dictionary article to ``word_forms``.

    Only paradigms whose ``"tags"`` contain ``'NOUN'`` are considered, and
    inflections whose ``"word_form"`` is ``None`` are skipped. For each
    remaining form the following strings are appended, in this order:

    1. if the form contains ø/æ/å: three variants, each with one letter class
       replaced (ø -> "oe", å -> "aa", æ -> "ae"); a variant whose letter does
       not occur in the form is simply a copy of the form;
    2. the form itself;
    3. if the form is tagged both ``"Sing"`` and ``"Def"``: the form + "s",
       followed by the three variants + "s" when step 1 applied.

    Duplicates are not removed.

    Args:
        article: dictionary article (dict); articles with no ``"lemmas"`` add nothing.
        word_forms: list that is extended in place.

    Returns:
        The same ``word_forms`` list object that was passed in.
    """
    replacements = ((r"[øØ]", "oe"), (r"[åÅ]", "aa"), (r"[æÆ]", "ae"))

    for lemma in article.get("lemmas") or ():
        for paradigm in lemma['paradigm_info']:
            if 'NOUN' not in paradigm["tags"]:
                continue
            for form in paradigm["inflection"]:
                surface = form["word_form"]
                if surface is None:
                    continue

                # Transliterated spellings are only produced for forms that
                # actually contain one of the special letters.
                if NO_CHARS.search(surface) is not None:
                    variants = [re.sub(pattern, ascii_pair, surface)
                                for pattern, ascii_pair in replacements]
                else:
                    variants = []

                word_forms.extend(variants)
                word_forms.append(surface)

                # Definite singular forms additionally get an "-s" suffixed form.
                tags = form["tags"]
                if "Sing" in tags and "Def" in tags:
                    word_forms.append(surface + "s")
                    word_forms.extend(variant + "s" for variant in variants)
    return word_forms

In [18]:
def create_dataset(lemma2idx, nn_lemma2idx, wugs_glob=None, output_path="norwegian_forms.tsv"):
    """Collect inflected forms for every target word and write them to a TSV file.

    Target words are the basenames of the paths matched by ``wugs_glob``
    (deduplicated and sorted, so the row order is the same on every run). For
    each word, the article found through ``lemma2idx`` in the module-level
    ``data_bokmål`` and, if the lemma is present in ``nn_lemma2idx``, the article
    in the module-level ``data`` are passed to ``find_forms``. The resulting
    forms are joined with spaces into one row of the output table.

    Args:
        lemma2idx: maps a lemma to the index of its article in ``data_bokmål``.
        nn_lemma2idx: maps a lemma to the index of its article in ``data``.
        wugs_glob: glob pattern whose matches name the target words. Defaults to
            ``~/PycharmProjects/gloss-annotator/wugs/nor_dia_change/*/data/*``.
        output_path: destination of the tab-separated table with the columns
            ``words`` (original target word) and ``forms``.

    Raises:
        KeyError: if a target word (after the special cases below) is missing
            from ``lemma2idx``.

    Side effects:
        Prints the form list of every processed word and writes ``output_path``.
    """
    if wugs_glob is None:
        wugs_glob = os.path.expanduser('~/PycharmProjects/gloss-annotator/wugs/nor_dia_change/*/data/*')

    # The same word can be matched under several directories: deduplicate, then
    # sort, because plain set iteration order differs between interpreter runs.
    words_list = sorted({os.path.split(path)[-1] for path in glob(wugs_glob)})
    examples = []

    for target in words_list:
        if target == "Syden":  # original note: German word, does not inflect
            examples.append("Syden syden")
            continue

        # The target is looked up under a different spelling for this word.
        word = "teip" if target == "tape" else target

        # These targets are given in the definite form; cut the two-character
        # ending to obtain the lemma used as dictionary key.
        if word in {'formiddagen', "landet"}:
            word = word[:-2]

        if word == 'særforbund':  # not in the dictionary: inflect "forbund", re-attach "sær"
            lookup, prefix = 'forbund', 'sær'
        else:
            lookup, prefix = word, ''

        word_forms = find_forms(data_bokmål[lemma2idx[lookup]], [])

        # The second dictionary is optional: a lemma missing there is skipped.
        # Only the lookup is guarded, so errors inside find_forms are not hidden.
        nn_idx = nn_lemma2idx.get(lookup)
        if nn_idx is not None:
            word_forms = find_forms(data[nn_idx], word_forms)

        if prefix:
            word_forms = [prefix + wf for wf in word_forms]

        print(word_forms)
        examples.append(" ".join(word_forms))
        if not word_forms:
            # Diagnostic for words that yielded nothing; .get on the lemma that was
            # actually looked up, so the diagnostic itself cannot raise.
            print(word, lemma2idx.get(lookup))

    df = pd.DataFrame({"words": words_list, "forms": examples})
    df.to_csv(output_path, sep="\t", index=False)

In [19]:
# Build the table of word forms from both lemma indices; this prints the form list
# of each word and writes norwegian_forms.tsv to the current working directory.
create_dataset(lemma2idx, nn_lemma2idx)


['sete', 'setet', 'setets', 'seter', 'seta', 'sete', 'setet', 'setets', 'seter', 'setene', 'sete', 'seta', 'setas', 'seter', 'setene']
['stryk', 'stryken', 'strykens', 'stryker', 'strykene', 'stryk', 'stryket', 'strykets', 'stryk', 'stryka', 'stryk', 'stryket', 'strykets', 'stryk', 'strykene', 'stryk', 'stryket', 'strykets', 'stryk', 'stryka']
['tavle', 'tavlen', 'tavlens', 'tavler', 'tavlene', 'tavle', 'tavla', 'tavlas', 'tavler', 'tavlene', 'tavle', 'tavla', 'tavlas', 'tavler', 'tavlene']
['etterforskning', 'etterforskningen', 'etterforskningens', 'etterforskninger', 'etterforskningene', 'etterforskning', 'etterforskninga', 'etterforskningas', 'etterforskninger', 'etterforskningene', 'etterforsking', 'etterforskingen', 'etterforskingens', 'etterforskinger', 'etterforskingene', 'etterforsking', 'etterforskinga', 'etterforskingas', 'etterforskinger', 'etterforskingene']
['særforbund', 'særforbundet', 'særforbundets', 'særforbund', 'særforbunda', 'særforbund', 'særforbundet', 'særforbun