In [1]:
import json
import gzip
import os
import re
import shutil
import string

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
# Base location of the ordbok dump files. The loading cells below join file names
# onto this path, so it is used as a directory.
# NOTE(review): the path ends in ".json" although it is treated as a folder — confirm.
data_path = os.path.expanduser("~/defmod/datasets/norwegian/ordbok-dump-2023-05-09.json")

In [3]:
# Load the dump that is later passed to create_dataset(..., "nynorsk").
# The encoding is given explicitly so that reading the Norwegian text (æ, ø, å)
# does not depend on the platform's default encoding.
nynorsk_dump_path = os.path.join(data_path, "ordbok-nno-dump-2023-05-09.json")
with open(nynorsk_dump_path, "r", encoding="utf-8") as f:
    data = json.load(f)

print(len(data))

95436


In [4]:
# Load the Bokmål dump.
# The encoding is given explicitly so that reading the Norwegian text (æ, ø, å)
# does not depend on the platform's default encoding.
bokmål_dump_path = os.path.join(data_path, "ordbok-nob-dump-2023-05-09.json")
with open(bokmål_dump_path, "r", encoding="utf-8") as f:
    data_bokmål = json.load(f)

print(len(data_bokmål))

80124


In [11]:
# Map the first lemma of every Bokmål entry to the entry's position in the dump.
# Entries without lemmas are skipped; if a lemma occurs more than once, the last
# position wins.
lemma2idx = {
    entry['lemmas'][0]["lemma"]: position
    for position, entry in enumerate(tqdm(data_bokmål))
    if entry.get("lemmas")
}

  0%|          | 0/80124 [00:00<?, ?it/s]

In [12]:
# Spot-check: position of the Bokmål entry whose first lemma is "atferdsforsking".
lemma2idx["atferdsforsking"]

2626

In [13]:
# Spot-check: position of the Bokmål entry whose first lemma is "næringsinnhold".
lemma2idx["næringsinnhold"]

35829

In [15]:
# Spot-check: position of the Bokmål entry whose first lemma is "næringsdrivende".
lemma2idx["næringsdrivende"]

35826

In [None]:
# Inspect the raw Bokmål entry at index 2626 (the index found for "atferdsforsking" above).
data_bokmål[2626]

Examples can carry their own explanation, e.g. {'quote': {'items': [], 'content': 'har en sagt a, får en (også) si b'}, 'type_': 'example', 'explanation': {'items': [], 'content': 'har en først begynt på noe, får en også fullføre det'}}

Subscripts (such as the 2 in O2) are written as ordinary digits.

In [17]:
# Accumulator filled by collect_items(..., collect_contractions=True): the three
# lists grow in parallel, one entry per abbreviation item encountered.
# It has to exist at module level, otherwise collect_items and the later
# pd.DataFrame(DICT_OF_CONTRACTIONS) cell raise NameError on a fresh kernel.
DICT_OF_CONTRACTIONS = {'contraction': [], 'type': [], 'lemma': []}

In [18]:
# Read the tab-separated abbreviation table stored next to the notebook and show
# which columns it provides.
CONTRACTIONS_FILENAME = "ordboka_contractions"
contractions = pd.read_csv(
    CONTRACTIONS_FILENAME + ".txt",
    sep="\t",
    encoding="utf-8",
)
contractions.columns

Index(['contraction', 'type', 'lemma', 'full'], dtype='object')

In [19]:
# Lookup table: abbreviation (column "contraction") -> its expansion (column "full").
# For a repeated abbreviation the last row wins.
contractions_dict = dict(zip(contractions["contraction"], contractions["full"]))

In [20]:
def collect_items(element_items, lemma, collect_contractions=False):
    """Convert the ``items`` list of a dictionary element into plain strings.

    Handling depends on each item's ``type_``:

    * ``usage`` / ``subscript`` / ``superscript``: the item's ``text``;
    * ``article_ref``: its ``word_form`` if present and non-empty, otherwise
      the lemma of the first referenced lemma;
    * any type in ``TYPES_WITH_IDS``: the expansion of ``id`` from
      ``contractions_dict``, or the raw ``id`` when it has no entry;
    * ``quote_inset``: its ``content`` with ``$`` placeholders filled from its
      own (recursively collected) items;
    * ``fraction``: ``"numerator/denominator"``.

    Items of any other type are printed and skipped. A ``quote_inset`` whose
    placeholders outnumber its items is reported and skipped as well.

    Args:
        element_items: list of item dicts (``element["items"]``).
        lemma: headword of the entry being processed; only stored alongside
            recorded abbreviations.
        collect_contractions: when true, every abbreviation item is also
            appended to the module-level ``DICT_OF_CONTRACTIONS``.

    Returns:
        list of the collected strings, in item order.
    """
    items = []
    for item in element_items:
        item_type = item['type_']
        if item_type in {'usage', 'subscript', 'superscript'}:
            items.append(item["text"])
        elif item_type == 'article_ref':
            # word_form may be missing when the word is not inflected.
            items.append(item.get('word_form') or item['lemmas'][0]['lemma'])
        elif item_type in TYPES_WITH_IDS:
            items.append(contractions_dict.get(item['id'], item['id']))
            if collect_contractions:
                DICT_OF_CONTRACTIONS['contraction'].append(item['id'])
                DICT_OF_CONTRACTIONS['type'].append(item_type)
                DICT_OF_CONTRACTIONS['lemma'].append(lemma)
        elif item_type == 'quote_inset':
            # Pass the flag on, so abbreviations nested inside an inset are
            # recorded too (previously the recursion silently dropped it).
            inset_items = collect_items(item["items"], lemma, collect_contractions)
            try:
                items.append(replace_dollars(inset_items, item['content']))
            except IndexError:
                print(f"{item} caused index error in quote inset")
        elif item_type == 'fraction':
            items.append(f"{item['numerator']}/{item['denominator']}")
        else:
            # Unknown item type: show it for inspection and leave it out.
            print(item)
    return items

In [21]:
dollar_pattern = re.compile(r"\$")


def replace_dollars(items, content):
    """Substitute the n-th ``$`` in ``content`` with ``items[n]``.

    Placeholders are located in the original ``content``, so a ``$`` that
    occurs inside a replacement is never treated as a placeholder itself.
    Surplus items are ignored.

    Raises:
        IndexError: if ``content`` has more ``$`` placeholders than ``items``
            has entries.
    """
    rebuilt = ""
    cursor = 0
    for position, placeholder in enumerate(dollar_pattern.finditer(content)):
        # Copy the text preceding this placeholder, then the replacement.
        rebuilt = rebuilt + content[cursor:placeholder.start()] + items[position]
        cursor = placeholder.end()
    return rebuilt + content[cursor:]

In [22]:
def replace_dollars_with_items(content, element_, lemma):
    """Fill the ``$`` placeholders of ``content`` from ``element_["items"]``.

    ``content`` is returned untouched when it has no ``$`` or when the element
    yields no items.
    """
    if '$' not in content:
        return content
    substitutions = collect_items(element_["items"], lemma)
    if not substitutions:
        # The entry about the dollar sign itself contains "$" but has no items.
        return content
    return replace_dollars(substitutions, content)

In [23]:
# Matches a run of "!", "?" or "." at the very end of a string; make_example uses it
# to decide whether an example already ends with sentence punctuation.
PUNCTUATION = re.compile(r"[!\?\.]+$")

In [24]:
# Question templates appended to an example by make_example (followed by the word
# and "?"). They are selected by position via the prompt_id argument.
PROMPTS = (
' Hva betyr ',  # prompt_id 0: Norwegian Bokmål wording
' What is the definition of ',  # prompt_id 1: English wording
' Kva betyr ',  # prompt_id 2: Norwegian Nynorsk wording
)

In [31]:
def make_example(df, prompt_id, add_prompt):
    """Normalise the ``example`` field of one row and optionally append a prompt.

    Meant to be used with ``DataFrame.apply(..., axis=1)``; despite its name,
    ``df`` is a single row with ``example`` and ``word`` fields.

    * A string example that does not already end in ``!``, ``?`` or ``.``
      gets a trailing ``.``.
    * With ``add_prompt`` the question ``PROMPTS[prompt_id] + word + "?"`` is
      appended after the (terminated) example.
    * A missing (non-string) example leaves the row untouched.

    Args:
        df: the row to update (modified in place and returned).
        prompt_id: index into ``PROMPTS``.
        add_prompt: whether to append the question.

    Returns:
        The row, always. Previously a row with a non-string example produced
        ``None``, which ``DataFrame.apply`` turns into an all-NaN row.
    """
    content = df.example
    if not isinstance(content, str):
        return df

    # Only add a full stop when the example does not already end a sentence.
    terminator = '' if re.search(PUNCTUATION, content.strip()) is not None else '.'
    if add_prompt:
        df.example = f'{content}{terminator}{PROMPTS[prompt_id]}{df.word}?'
    elif terminator:
        df.example = content + terminator
    return df

In [26]:
def handle_subdefinition(
    element_,
    current_definition,
    examples,
    gold_definitions,
    lemma,
    prompt_id,
    targets,
    pos,
    poses,
):
    """Collect the examples of a sub-definition into the running result lists.

    Walks ``element_['elements']``. Each ``explanation`` sets the gloss used for
    the examples that follow it to ``current_definition`` plus that explanation
    (only the latest explanation is kept). Each ``example`` appends one row to
    the four parallel lists: its quote text, the gloss in effect, ``lemma`` and
    ``pos``. ``$`` placeholders are resolved with ``replace_dollars_with_items``.

    Args:
        element_: the sub-definition element (must have an ``elements`` list).
        current_definition: gloss of the enclosing definition; may be empty.
        examples, gold_definitions, targets, poses: result lists, extended in place.
        lemma: headword of the entry.
        prompt_id: unused; kept so existing callers keep working.
        pos: part-of-speech tag of the entry.

    Returns:
        ``(examples, gold_definitions, targets)`` — the same list objects that
        were passed in (``poses`` is only updated in place).
    """
    sub_defined = current_definition
    for sub_definition in element_['elements']:
        kind = sub_definition['type_']
        if kind == 'explanation':
            content = replace_dollars_with_items(
                sub_definition["content"], sub_definition, lemma
            )
            # Without a parent gloss, use the explanation alone instead of
            # producing a gloss that starts with a stray space.
            sub_defined = f'{current_definition} {content}' if current_definition else content
        elif kind == 'example':
            quote = sub_definition['quote']
            content = replace_dollars_with_items(quote["content"], quote, lemma)
            gold_definitions.append(sub_defined)
            examples.append(content)
            poses.append(pos)
            targets.append(lemma)
    return examples, gold_definitions, targets

In [27]:
# Item types that collect_items resolves through their "id" field: the id is looked
# up in contractions_dict and used verbatim when it has no entry there.
TYPES_WITH_IDS = {
'relation',
'domain',
'entity',
'temporal',
'language',
'grammar',
'rhetoric',
}

In [28]:
# Import pprint under its own name instead of rebinding the builtin print.
# With print replaced by pprint, every status message in the notebook is shown as
# a quoted repr with a literal "\n" (e.g. '\nWriting to ...') rather than as text.
# Call pprint(...) explicitly where a nested structure should be pretty-printed.
from pprint import pprint

In [35]:
def create_dataset(data, prompt_id, spraak="norwegian", add_prompt=False):
    """Flatten dictionary entries into a word/gloss/example/POS table and save it.

    For every entry of ``data`` that has lemmas, the first lemma is the target
    word, and the POS is the part of its first paradigm's ``inflection_group``
    before the first underscore. The elements of each definition are walked:

    * ``definition`` / ``explanation`` texts are joined with ``", "`` into the
      current gloss;
    * every ``example`` yields one row pairing its quote with the current gloss;
    * elements that have nested elements restart the gloss; inside them,
      sub-definitions are delegated to ``handle_subdefinition``.

    ``$`` placeholders are resolved with ``replace_dollars_with_items``. After
    each definition one extra row (current gloss, last text seen) is appended.
    NOTE(review): for a definition without examples that last text is the
    definition itself — confirm this is intended.

    The table is de-duplicated, empty strings become NaN, and rows without a
    gloss are dropped. Two gzipped TSV files are written:

    * ``<spraak>/complete.tsv.gz`` — the table as built;
    * ``<spraak>_finetuning_<suffix>.tsv.gz`` in the working directory — the
      table after ``make_example``. ``suffix`` is ``no_examples_allowed``, or,
      with ``add_prompt``, the prompt with spaces and punctuation replaced by
      underscores (the ``word`` column is then dropped).

    Args:
        data: list of entry dicts from the dictionary dump.
        prompt_id: index into ``PROMPTS``; only matters when ``add_prompt`` is true.
        spraak: name used for the output folder and the file prefix.
        add_prompt: append the question prompt to every example.

    Returns:
        The DataFrame written to the second file.

    Raises:
        IndexError: re-raised (after printing the offending element) when a
            top-level element has more ``$`` placeholders than items.
    """
    examples, gold_definitions, targets, poses = [], [], [], []
    definition_types = {"definition", "explanation"}

    for word in tqdm(data):
        if not word.get("lemmas"):
            continue
        first_lemma = word['lemmas'][0]
        lemma = first_lemma["lemma"]
        pos = first_lemma['paradigm_info'][0]['inflection_group'].split("_")[0]

        for definition in word['body']['definitions']:
            if definition.get('elements') is None:
                # Only one such entry exists, and it is not on the website at all.
                continue
            current_definition, content = '', ''
            for element in definition["elements"]:
                if element.get('elements') is not None:
                    # Nested group (several definitions?): start a fresh gloss.
                    current_definition = ''
                    for element_ in element["elements"]:
                        if element_['type_'] in definition_types:
                            # A couple of entries have nested elements without
                            # being flagged as sub_definition; treat both alike.
                            if element_.get('sub_definition') or element_.get('elements'):
                                examples, gold_definitions, targets = handle_subdefinition(
                                    element_,
                                    current_definition,
                                    examples,
                                    gold_definitions,
                                    lemma,
                                    prompt_id,
                                    targets,
                                    pos,
                                    poses,
                                )
                            else:
                                content = replace_dollars_with_items(
                                    element_["content"], element_, lemma
                                )
                                if current_definition:
                                    current_definition = current_definition + ', ' + content
                                else:
                                    current_definition = content
                        elif element_['type_'] == 'example':
                            quote = element_['quote']
                            content = replace_dollars_with_items(quote["content"], quote, lemma)
                            gold_definitions.append(current_definition)
                            examples.append(content)
                            poses.append(pos)
                            targets.append(lemma)
                else:
                    try:
                        if element['type_'] in definition_types:
                            content = replace_dollars_with_items(
                                element['content'], element, lemma
                            )
                            if current_definition:
                                current_definition = current_definition + ', ' + content
                            else:
                                current_definition = content
                        elif element['type_'] == 'example':
                            quote = element['quote']
                            content = replace_dollars_with_items(quote['content'], quote, lemma)
                            gold_definitions.append(current_definition)
                            examples.append(content)
                            targets.append(lemma)
                            poses.append(pos)
                    except IndexError:
                        # Show the offending element, then re-raise the original
                        # error so its message and traceback are kept.
                        print(element)
                        raise
            # Closing row for the definition: gloss plus the last text seen.
            gold_definitions.append(current_definition)
            examples.append(content)
            targets.append(lemma)
            poses.append(pos)

    df = pd.DataFrame(
        {"word": targets, 'gloss': gold_definitions, 'example': examples, 'POS': poses}
    )
    print(df.shape)
    df = df.drop_duplicates()
    print(df.shape)
    df = df.replace('', np.nan)
    print(df[df.example.isna()])
    df = df.dropna(subset=["gloss"])
    print(df.shape)

    folder_path = os.path.expanduser(f'{spraak}/')
    # makedirs also creates missing parents and tolerates an existing folder.
    os.makedirs(folder_path, exist_ok=True)
    filename = os.path.join(folder_path, 'complete.tsv.gz')
    print(f"\nWriting to {filename}")
    df.to_csv(filename, sep="\t", index=False, encoding="utf-8", compression='gzip')
    print(df.columns)

    filename = "no_examples_allowed"
    df = df.apply(lambda x: make_example(x, prompt_id, add_prompt), axis=1)
    if add_prompt:
        df = df.drop("word", axis=1)
        # Turn the prompt into a file-name-safe suffix.
        filename = PROMPTS[prompt_id].translate(
            str.maketrans(string.punctuation + " ", '_' * (len(string.punctuation) + 1)),
        )

    filename = f'{spraak}_finetuning_{filename}.tsv.gz'
    print(df.columns)
    print(f"\nWriting to {filename}")
    df.to_csv(filename, sep="\t", index=False, encoding="utf-8", compression='gzip')
    return df

In [36]:
nb_0 = create_dataset(data_bokmål, 0)


  0%|          | 0/80124 [00:00<?, ?it/s]

(150774, 4)
(121659, 4)
             word gloss example   POS
98      absentere   NaN     NaN  VERB
187       ad acta   NaN     NaN   ADV
190          adam   NaN     NaN  NOUN
347      ad undas   NaN     NaN   ADV
679          akke   NaN     NaN  VERB
...           ...   ...     ...   ...
149547       kåte   NaN     NaN  VERB
150335       grav   NaN     NaN   ADV
150394  kuppelhue   NaN     NaN  NOUN
150588      velde   NaN     NaN  NOUN
150770    klesvei   NaN     NaN  NOUN

[532 rows x 4 columns]
(119476, 4)
'\nWriting to norwegian/complete.tsv.gz'
Index(['word', 'gloss', 'example', 'POS'], dtype='object')
Index(['word', 'gloss', 'example', 'POS'], dtype='object')
'\nWriting to norwegian_finetuning_no_examples_allowed.tsv.gz'


In [None]:
# Build the remaining dataset variants (the last call's DataFrame is displayed).
# NOTE(review): add_prompt is left at its default (False) in every call, so prompt_id
# has no effect on the output: each call writes "<spraak>_finetuning_no_examples_allowed.tsv.gz",
# and the two nynorsk calls overwrite the same file. Confirm whether add_prompt=True was intended.
create_dataset(data_bokmål, 1)
create_dataset(data, 2, "nynorsk")
create_dataset(data, 1, "nynorsk")

In [None]:
# Tabulate the abbreviations recorded in DICT_OF_CONTRACTIONS (one row per occurrence;
# it is only filled when collect_items is called with collect_contractions=True).
contractions_df = pd.DataFrame(DICT_OF_CONTRACTIONS)

In [None]:
# Keep the first occurrence of every abbreviation and preview the result.
contractions_df = contractions_df.drop_duplicates(subset=['contraction'])
contractions_df.head()

In [None]:
#contractions_df.to_csv(f'{CONTRACTIONS_FILENAME}.tsv', sep="\t", index=False, encoding="utf-8")

TODO: drop definitions such as "brukt som substantiv"? — don't drop them so far.