In [4]:
import json
from glob import glob
import gzip
import os
import re
import shutil
import string

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [5]:
with open("ordbok-nno-dump-2023-05-09.json", "r") as f:
    data = json.load(f)
    
print(len(data))

95436


In [2]:
with open("ordbok-nob-dump-2023-05-09.json", "r") as f:
    data_bokmål = json.load(f)
    
print(len(data_bokmål))

80124


In [18]:
data_bokmål[100].keys()

dict_keys(['body', 'lemmas', 'suggest', 'to_index', 'submitted', 'article_id'])

In [19]:
data_bokmål[100]['lemmas']

[{'id': 904,
  'hgno': 0,
  'lemma': 'acetyl',
  'junction': '',
  'split_inf': None,
  'final_lexeme': 'acetyl',
  'neg_junction': None,
  'paradigm_info': [{'to': None,
    'from': '1996-01-01',
    'tags': ['NOUN', 'Neuter'],
    'inflection': [{'tags': ['Sing', 'Ind'], 'word_form': 'acetyl'},
     {'tags': ['Sing', 'Def'], 'word_form': 'acetylet'},
     {'tags': ['Plur', 'Ind'], 'word_form': 'acetyl'},
     {'tags': ['Plur', 'Def'], 'word_form': 'acetyla'}],
    'paradigm_id': 669,
    'standardisation': 'STANDARD',
    'inflection_group': 'NOUN_regular'},
   {'to': None,
    'from': '1996-01-01',
    'tags': ['NOUN', 'Neuter'],
    'inflection': [{'tags': ['Sing', 'Ind'], 'word_form': 'acetyl'},
     {'tags': ['Sing', 'Def'], 'word_form': 'acetylet'},
     {'tags': ['Plur', 'Ind'], 'word_form': 'acetyl'},
     {'tags': ['Plur', 'Def'], 'word_form': 'acetylene'}],
    'paradigm_id': 679,
    'standardisation': 'STANDARD',
    'inflection_group': 'NOUN_regular'}],
  'initial_lexeme'

In [34]:
lemma2idx = {}

for i, word in enumerate(tqdm(data_bokmål)):
    if word.get("lemmas"):
        lemma = word['lemmas'][0]["lemma"]
        if lemma in lemma2idx:
            lemma += "1"
        lemma2idx[lemma] = i

HBox(children=(FloatProgress(value=0.0, max=80124.0), HTML(value='')))




In [35]:
nn_lemma2idx = {}

for i, word in enumerate(tqdm(data)):
    if word.get("lemmas"):
        lemma = word['lemmas'][0]["lemma"]
        if lemma in nn_lemma2idx:
            lemma += "1"
        nn_lemma2idx[lemma] = i

HBox(children=(FloatProgress(value=0.0, max=95436.0), HTML(value='')))




In [39]:
NO_CHARS = re.compile(r"[øæåØÅÆ]")

In [37]:
def find_forms(article, word_forms):
    if article.get("lemmas"):
        for lemma in article["lemmas"]:
            paradigms = lemma['paradigm_info']
            for paradigm in paradigms:
                if 'NOUN' in paradigm["tags"]:
                    inflections = paradigm["inflection"]
                    for form in inflections:
                        if form["word_form"] is not None:
                            if re.search(NO_CHARS, form["word_form"]) is not None:
                                word_forms.append(re.sub(r"[øØ]", "oe", form["word_form"]))
                                word_forms.append(re.sub(r"[åÅ]", "aa", form["word_form"]))
                                word_forms.append(re.sub(r"[æÆ]", "ae", form["word_form"]))
                            word_forms.append(form["word_form"])
                            if ("Sing" in form["tags"]) and ("Def" in form["tags"]):
                                word_forms.append(form["word_form"]+"s")
                                if re.search(NO_CHARS, form["word_form"]) is not None:
                                    word_forms.append(re.sub(r"[øØ]", "oe", form["word_form"])+"s")
                                    word_forms.append(re.sub(r"[åÅ]", "aa", form["word_form"])+"s")
                                    word_forms.append(re.sub(r"[æÆ]", "ae", form["word_form"])+"s")
    return word_forms

In [49]:
def create_dataset(lemma2idx, nn_lemma2idx):
    examples = []
    words = set()
    words_list = []
    
    for path in glob(os.path.expanduser('~/PycharmProjects/gloss-annotator/wugs/nor_dia_change/*/data/*')):
        word = os.path.split(path)[-1]
        words.add(word)
    
    for word in words:
        word_forms = []
        words_list.append(word)
            
        if word != "Syden": # немецкое слово, не изменяется
            if word == "tape":
                word = "teip"
            if word == 'særforbund': # нет в словаре
                nb = data_bokmål[lemma2idx['forbund']]
                word_forms = find_forms(nb, word_forms)
                try:
                    nn = data[nn_lemma2idx['forbund']]
                    word_forms = find_forms(nn, word_forms)
                except KeyError:
                    pass
                for i, wf in enumerate(word_forms):
                    word_forms[i] = 'sær' + wf
            else:
                nb = data_bokmål[lemma2idx[word]]
                word_forms = find_forms(nb, word_forms)
                try:
                    nn = data[nn_lemma2idx[word]]
                    word_forms = find_forms(nn, word_forms)
                except KeyError:
                    pass
            print(word_forms)
            examples.append(" ".join(word_forms) + "\n")
            if not word_forms:
                print(word, lemma2idx[word])
        else:
            examples.append("Syden syden\n")
    
    df = pd.DataFrame({"words": words_list, "forms": examples})
    df.to_csv("norwegian_forms.tsv", sep="\t", index=False)

In [50]:
create_dataset(lemma2idx, nn_lemma2idx)


['varsel', 'varselet', 'varselets', 'varsel', 'varsla', 'varsel', 'varselet', 'varselets', 'varsel', 'varslene', 'varsel', 'varslet', 'varslets', 'varsler', 'varsla', 'varsel', 'varslet', 'varslets', 'varsler', 'varslene', 'varsel', 'varselet', 'varselets', 'varsel', 'varsla']
['bystyrerepresentant', 'bystyrerepresentanten', 'bystyrerepresentantens', 'bystyrerepresentanter', 'bystyrerepresentantene', 'bystyrerepresentant', 'bystyrerepresentanten', 'bystyrerepresentantens', 'bystyrerepresentantar', 'bystyrerepresentantane']
['kanal', 'kanalen', 'kanalens', 'kanaler', 'kanalene', 'kanal', 'kanalen', 'kanalens', 'kanalar', 'kanalane']
['fil', 'filen', 'filens', 'filer', 'filene', 'fil', 'fila', 'filas', 'filer', 'filene', 'fil', 'fila', 'filas', 'filer', 'filene']
['melding', 'meldingen', 'meldingens', 'meldinger', 'meldingene', 'melding', 'meldinga', 'meldingas', 'meldinger', 'meldingene', 'melding', 'meldinga', 'meldingas', 'meldingar', 'meldingane']
['leilighet', 'leiligheten', 'leilig