In [2]:
import json
from pathlib import Path

In [3]:
# Load the Japanese WordNet JSON dump into memory as nested dicts/lists.
json_path = Path("data/wordnet-jp.json")
# Open with an explicit UTF-8 encoding (the data contains Japanese text and the
# platform default encoding is not guaranteed to be UTF-8), and use a context
# manager so the file handle is closed as soon as parsing has finished.
with json_path.open(encoding="utf-8") as json_open:
    json_dict = json.load(json_open)

In [6]:
# Inspect the top-level keys of the parsed JSON document.
json_dict.keys()

dict_keys(['LexicalResource'])

In [7]:
# Count the entries stored directly under "LexicalResource".
len(json_dict["LexicalResource"])

3

In [8]:
# List the section names available under "LexicalResource".
json_dict["LexicalResource"].keys()

dict_keys(['GlobalInformation', 'Lexicon', 'SenseAxes'])

In [9]:
# Show the contents of the "GlobalInformation" section.
json_dict["LexicalResource"]["GlobalInformation"]

{'label': 'Japanese WordNet 1.1 by NICT'}

In [11]:
# List the keys available in the "Lexicon" section.
json_dict["LexicalResource"]["Lexicon"].keys()

dict_keys(['languageCoding', 'label', 'language', 'owner', 'version', 'LexicalEntry', 'Synset'])

In [12]:
# Number of items stored under Lexicon -> "LexicalEntry".
len(json_dict["LexicalResource"]["Lexicon"]["LexicalEntry"])

93834

In [13]:
# Number of items stored under Lexicon -> "Synset".
len(json_dict["LexicalResource"]["Lexicon"]["Synset"])

112345

In [16]:
# Report len() of every value stored in the "Lexicon" section.
# len() is applied uniformly, so what it measures depends on each value's type
# (e.g. number of characters for a string, number of items for a collection).
lexicon = json_dict["LexicalResource"]["Lexicon"]
for key in lexicon:
    print(f"{key}_len = {len(lexicon[key])}")

languageCoding_len = 9
label_len = 16
language_len = 3
owner_len = 4
version_len = 3
LexicalEntry_len = 93834
Synset_len = 112345


In [20]:
# Peek at the first element of SenseAxes -> "SenseAxis" to see its structure.
json_dict["LexicalResource"]["SenseAxes"]['SenseAxis'][0]

{'id': 'sa_jpn-1.1-0',
 'relType': 'eq_synonym',
 'Target': [{'ID': 'jpn-1.1-01785341-a'}, {'ID': 'eng-30-01785341-a'}]}

In [40]:
# Preview the "Lemma" record of the first ten lexical entries.
lexical_entries = json_dict["LexicalResource"]["Lexicon"]["LexicalEntry"]
first_entries = lexical_entries[:10]
for entry in first_entries:
    print(entry["Lemma"])

{'writtenForm': '夜半', 'partOfSpeech': 'n'}
{'writtenForm': '骨膜', 'partOfSpeech': 'n'}
{'writtenForm': '応じる', 'partOfSpeech': 'v'}
{'writtenForm': 'フレグランス', 'partOfSpeech': 'n'}
{'writtenForm': '職掌', 'partOfSpeech': 'n'}
{'writtenForm': '終始一貫して', 'partOfSpeech': 'r'}
{'writtenForm': 'ノースカロライナ', 'partOfSpeech': 'n'}
{'writtenForm': 'メセン', 'partOfSpeech': 'n'}
{'writtenForm': '奥ゆかしげ', 'partOfSpeech': 'a'}
{'writtenForm': '雄勁', 'partOfSpeech': 'n'}


In [43]:
from importlib import reload

In [44]:
import wordnet_db

In [63]:
# Re-import wordnet_db so that edits to the module are picked up without
# restarting the kernel, then rebind the helper from the freshly loaded module
# (a name bound before the reload would still point at the old function).
reload(wordnet_db)
getSynonym = wordnet_db.getSynonym
# Query the helper with "犬" (dog); the returned value is the cell output.
getSynonym("犬")

['いぬ',
 'まわし者',
 'イヌ',
 'スパイ',
 'ドッグ',
 '回し者',
 '回者',
 '密偵',
 '工作員',
 '廻し者',
 '廻者',
 '探',
 '探り',
 '洋犬',
 '秘密捜査員',
 '諜報員',
 '諜者',
 '間者',
 '間諜',
 '隠密',
 '飼い犬',
 '飼犬']

In [49]:
# Re-import wordnet_db so that edits to the module are picked up without
# restarting the kernel, then rebind the helper from the freshly loaded module.
reload(wordnet_db)
getHypernym = wordnet_db.getHypernym
# Query the helper with "犬" (dog).
# NOTE(review): no output is recorded for this cell — confirm what
# getHypernym returns for this input.
getHypernym("犬")

In [50]:
import sqlite3
# sqlite3.connect() silently creates a new, empty database when the file does
# not exist; the mistake would then only surface later as a confusing
# "no such table" error. Check for the file first and fail with a clear message.
db_path = Path("./data/wordnet-jp.db")
if not db_path.is_file():
    raise FileNotFoundError(f"WordNet SQLite database not found: {db_path}")
conn = sqlite3.connect(str(db_path))

In [51]:
class node:
    """A named tree node that can print its subtree as indented text.

    Attributes:
        name: label printed for this node; it is concatenated with the
            indentation string, so it must be a ``str``.
        children: iterable of child ``node`` objects, or ``None`` for a node
            without children.
    """

    def __init__(self, name, children=None):
        self.name = name  # str
        self.children = children  # list of node, or None when there are none

    # For displaying results
    def display(self, indent=0):
        """Print this node followed, recursively, by all of its descendants.

        Each level is indented by one more space than its parent.

        Args:
            indent: number of leading spaces for this node's own line.
        """
        # The node's own line is printed whether or not it has children.
        print(' ' * indent + self.name)
        # Identity check: "no children" is stored as None, not as an empty list.
        if self.children is not None:
            for child in self.children:
                child.display(indent + 1)

In [52]:
# Extract the concept pairs that are in a hypernym-hyponym relationship.
hierarchy_dict = {}  # key: hypernym synset ID, value: list of its hyponym synset IDs
n_term_set = set()  # every synset ID that occurs as a hyponym

# Each row with link='hypo' is read as a (hypernym, hyponym) pair.
cur = conn.execute("select synset1,synset2 from synlink where link='hypo'")
for b_term, n_term in cur:
    # setdefault creates the hyponym list the first time a hypernym is seen.
    hierarchy_dict.setdefault(b_term, []).append(n_term)
    n_term_set.add(n_term)

print("上位語に含まれる単語の数 ： %s" % len(hierarchy_dict))

# Top-level concepts: hypernyms that never occur as a hyponym of anything.
top_concepts = list(set(hierarchy_dict) - n_term_set)
print("上位語に含まれる単語の中で下位語に含まれない単語の数 ： %s" % len(top_concepts))

上位語に含まれる単語の数 ： 20008
上位語に含まれる単語の中で下位語に含まれない単語の数 ： 346


In [53]:
# Build a lookup table from synset (concept) ID to the synset's name.
cur = conn.execute("select synset,name from synset")
# key: synset ID, value: synset name
synset_name_dict = {synset_id: synset_name for synset_id, synset_name in cur}

# Sanity check: print only the first mapping (nothing when the table is empty).
if synset_name_dict:
    k, v = next(iter(synset_name_dict.items()))
    print("%s : %s" % (k, v))

07125096-n : expletive


In [60]:
# Preview the first ten hypernyms, each followed by the names of its hyponyms.
preview_keys = list(hierarchy_dict)[:10]
for key in preview_keys:
    # Print the hypernym name first and stay on the same line ...
    print(synset_name_dict[key], end=" => ")
    # ... then finish the line with the space-separated hyponym names.
    hyponym_names = [synset_name_dict[value] for value in hierarchy_dict[key]]
    print(" ".join(hyponym_names))

measles => german_measles
dwelling => cliff_dwelling condominium fixer-upper fireside hermitage homestead house pile_dwelling lodge messuage semi-detached_house vacation_home yurt
acetum => chili_vinegar cider_vinegar wine_vinegar
atomic_weapon => atom_bomb fusion_bomb megaton_bomb
apnea => periodic_apnea_of_the_newborn sleep_apnea
teashop => buttery
oil_palm => african_oil_palm american_oil_palm
blood_transfusion => exchange_transfusion
cut_of_meat => joint confit chop chine leg side_of_meat forequarter hindquarter cut_of_beef rib entrecote shank shin brisket steak loin sirloin undercut neck shoulder cut_of_veal cut_of_mutton cut_of_lamb saddle rack cut_of_pork
atomic_number => magic_number
