In [1]:
import numpy as np
from scipy.spatial import distance
import random

import gensim

import nltk
from nltk.corpus import wordnet 
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to /Users/ryogo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
synset_filepath = './data/synsets.txt'
lexemes_filepath = './data/lexemes.txt'
mapping_filepath = './data/mapping.txt'
word2vec_filepath = './data/GoogleNews-vectors-negative300.bin'

In [3]:
def read_synset(synset_filepath):
    with open(synset_filepath) as f:
        sysnet_text = f.read().splitlines()
    sysnet_text = [i.split(' ') for i in sysnet_text]
    sysnet_text = sysnet_text[1:]
    synset_dict = {}
    for line in sysnet_text:
        synset_dict[line[0]] = np.array([np.float(i) for i in line[1:]])
    return synset_dict

In [4]:
def read_lexemes_dict(lexemes_filepath):
    with open(lexemes_filepath) as f:
        lexemes_text = f.read().splitlines()
    lexemes_text = [i.split(' ') for i in lexemes_text][1:]
    lexemes_dict = {}
    for line in lexemes_text:
        lexemes_dict[line[0]] = np.array([np.float64(i) for i in line[1:]])
    return lexemes_dict

In [5]:
def read_lexemes_list(lexemes_filepath):
    with open(lexemes_filepath) as f:
        lexemes_text = f.read().splitlines()
    lexemes_text = [i.split(' ') for i in lexemes_text][1:]
    lexemes_list = []
    for line in lexemes_text:
        lexemes_list.append([line[0], np.array([np.float64(i) for i in line[1:]])])
    return lexemes_list

In [6]:
def read_mapping(mapping_filepath):
    with open(mapping_filepath) as f:
        mapping_text = f.read().splitlines()
    mapping_dict = {}
    for line in mapping_text:
        if len(line) == 18:
            synset_name = line[:-1]
            mapping_dict[synset_name] = ['']
        else:
            synset_name = line.split(' ')[0]
            lexemes = [lexeme for lexeme in (line.split(' ')[1]).split(',')][:-1]
            mapping_dict[synset_name] = lexemes
    return mapping_dict

In [7]:
synset_dict = read_synset(synset_filepath)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  synset_dict[line[0]] = np.array([np.float(i) for i in line[1:]])


KeyboardInterrupt: 

In [8]:
mapping_dict = read_mapping(mapping_filepath)

In [9]:
lexemes_dict = read_lexemes_dict(lexemes_filepath)

In [10]:
lexemes_list = read_lexemes_list(lexemes_filepath)

In [11]:
# word_embed = gensim.models.KeyedVectors.load_word2vec_format(word2vec_filepath, binary=True)

In [12]:
# word_embed.most_similar(['suit'])

In [13]:
# word_embed['suit']

In [14]:
# wn-2.1-00900771-nはlawsuit, suitなどの概念
# synset_dict['wn-2.1-00900771-n']

In [15]:
wordnet.synset_from_sense_key('matter%1:03:00::')

Synset('matter.n.03')

In [16]:
wordnet.lemma_from_key('abstraction%1:03:00::')

Lemma('abstraction.n.06.abstraction')

In [17]:
wordnet.synsets('word')[0]

Synset('word.n.01')

In [18]:
wordnet.synsets('word')[0].lemmas()[0].key()

'word%1:10:00::'

In [19]:
def get_key_from_value(dic, val):
    for key, sense_keys in dic.items():
        for sense_key in sense_keys:
            if sense_key == val:
                return key
    return None

In [34]:
def get_verctor_from_word(word, lexemes_dict, mapping_dict, synset_idx=0):
    try:
        sense_key = wordnet.synsets(word)[synset_idx].lemmas()[0].key()
        vector_key = get_key_from_value(mapping_dict, sense_key)
    
        lexemes_vector = lexemes_dict[f'{word}-{vector_key}']
        return lexemes_vector
    except (KeyError, IndexError):
        return None

In [21]:

# print()
# v2 = get_verctor_from_word('matter', synset_idx=2)
# np.dot(v1, v2)/(np.linalg.norm(v1)*np.linalg.norm(v2))

In [22]:
v1 = get_verctor_from_word('beauty', synset_idx=0)
M = [lexeme[1] for lexeme in lexemes_list]

cos = 1 - distance.cdist([v1], M, metric='cosine')[0]


In [23]:
def get_most_similar_lexemes(word, rank_range=10, is_single=True):
    vector = get_verctor_from_word(word, synset_idx=0)
    cos_similarities = 1 - distance.cdist([vector], M, metric='cosine')[0]
    sorted_indices = np.argsort(cos_similarities)[::-1]
    most_similar_lexemes = []
    for i in range(rank_range):
        idx = sorted_indices[i+1] 
        most_similar_lexeme = lexemes_list[idx][0]
        cos_similarity = cos_similarities[idx]
        most_similar_lexemes.append([most_similar_lexeme, cos_similarity])
    if is_single:
        n_random = random.randint(0, len(most_similar_lexemes)-1)
        most_similar_lexeme = most_similar_lexemes[n_random][0][:-18]
        return most_similar_lexeme
    else:
        return most_similar_lexemes


In [24]:
get_most_similar_lexemes('table', is_single=False)

[['table-wn-2.1-03817683-n', 0.6717800528884559],
 ['table-wn-2.1-06287241-n', 0.6485356899937872],
 ['table-wn-2.1-06985773-n', 0.632223787891677],
 ['contents-wn-2.1-05435692-n', 0.6245781296883698],
 ['table-wn-2.1-03816979-n', 0.5325503949596477],
 ['drawer-wn-2.1-02815939-n', 0.4530191103317075],
 ['shelf-wn-2.1-03651067-n', 0.42058332823175015],
 ['container-wn-2.1-02692952-n', 0.41954335135017584],
 ['drawer-wn-2.1-08234021-n', 0.411639526908014],
 ['tray-wn-2.1-03901966-n', 0.4095814242002326]]

In [25]:
# word_embed.most_similar(['table'])

In [26]:
sense_key = wordnet.synsets('word')[0].lemmas()[0].key()
sense_key

'word%1:10:00::'

In [27]:
get_key_from_value(mapping_dict, sense_key)

'wn-2.1-05297961-n'

In [28]:
def get_hypernum(word, idx=0, distance=1):
    synsets = wordnet.synsets(word)
    try:
        synset = synsets[idx]
    except IndexError:
        print("No such word")
        return None
    try:
        hypernum = synset.hypernym_paths()[0][-(distance+1)]
    except IndexError:
        print("No such hypernum")
        return None
    hypernum_lexemes = hypernum.lemmas()[0].key()[:-10]
    return hypernum_lexemes

In [29]:
def get_hyponym(word, idx=0):
    synsets = wordnet.synsets(word)
    try:
        synset = synsets[idx]
    except IndexError:
        print("No such word")
        return None

    hyponyms = synset.hyponyms()
    if not hyponyms:
        print("No hyponyms")
        return None
    else:
        n_random = random.randint(0, len(hyponyms)-1)
        hyponym = hyponyms[n_random]
        hyponym_lexemes = hyponym.lemmas()[0].key()[:-10]
        return hyponym_lexemes

In [30]:
get_hypernum('apple', distance=1)

'edible_fruit'

In [31]:
get_hyponym('stress')

'sentence_stress'

In [40]:
import random

import numpy as np
from scipy.spatial import distance
import nltk
from nltk.corpus import wordnet 
nltk.download("wordnet")


def get_most_similar_lexemes(word, lexemes_dict, lexemes_list, mapping_dict, rank_range=10, is_single=True):
    vector = get_verctor_from_word(word, lexemes_dict, mapping_dict, synset_idx=0)
    if vector is not None:
        all_vectors = [lexeme[1] for lexeme in lexemes_list]
        cos_similarities = 1 - distance.cdist([vector], all_vectors, metric='cosine')[0]
        sorted_indices = np.argsort(cos_similarities)[::-1]
        most_similar_lexemes = []
        for i in range(rank_range):
            idx = sorted_indices[i+1] 
            most_similar_lexeme = lexemes_list[idx][0]
            cos_similarity = cos_similarities[idx]
            most_similar_lexemes.append([most_similar_lexeme, cos_similarity])
        if is_single:
            n_random = random.randint(0, len(most_similar_lexemes)-1)
            most_similar_lexeme = most_similar_lexemes[n_random][0][:-18]
            return most_similar_lexeme
        else:
            return most_similar_lexemes
    else:
        return None

def get_verctor_from_word(word, lexemes_dict, mapping_dict, synset_idx=0):
    try:
        sense_key = wordnet.synsets(word)[synset_idx].lemmas()[0].key()
        vector_key = get_key_from_value(mapping_dict, sense_key)
    
        lexemes_vector = lexemes_dict[f'{word}-{vector_key}']
        return lexemes_vector
    except (KeyError, IndexError):
        return None
    
def get_key_from_value(dic, val):
    for key, sense_keys in dic.items():
        for sense_key in sense_keys:
            if sense_key == val:
                return key
    return None

[nltk_data] Downloading package wordnet to /Users/ryogo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [67]:
idx = 0
input_word  = 'chair'

input_word_vector = get_verctor_from_word(input_word, lexemes_dict, mapping_dict)
if input_word_vector is not None:
    synsets = wordnet.synsets(input_word)
    try:
        synset = synsets[idx]
    except IndexError:
        print("No such word")
        # return None

    hyponyms = synset.hyponyms()
    print(hyponyms)
    if not hyponyms:
        print("No hyponyms")
        # return None
    else:
        sense_keys = [hyponym.lemmas()[0].key() for hyponym in hyponyms]
        vectors_keys = [get_key_from_value(mapping_dict, sense_key) for sense_key in sense_keys]
        words = [sense_key[:-10] for sense_key in sense_keys]
        lexemes_vectors = []
        words_not_none = []
        for vector_key, word in zip(vectors_keys, words):
            try:
                lexemes_vectors.append(lexemes_dict[f'{word}-{vector_key}'])
                words_not_none.append(word)
            except KeyError:  
                print('skipped')
                pass
        try:
            cos_similarities = 1 - distance.cdist([input_word_vector], lexemes_vectors, metric='cosine')[0]
            most_similar_idx = np.argmin(cos_similarities)
            hyponym_lexemes = words_not_none[most_similar_idx]
        except ValueError:
            return None

        # return hyponym_lexemes
else:
    # return None
    pass

[Synset('armchair.n.01'), Synset('barber_chair.n.01'), Synset('chair_of_state.n.01'), Synset('chaise_longue.n.01'), Synset('eames_chair.n.01'), Synset('fighting_chair.n.01'), Synset('folding_chair.n.01'), Synset('highchair.n.01'), Synset('ladder-back.n.01'), Synset('lawn_chair.n.01'), Synset('rocking_chair.n.01'), Synset('straight_chair.n.01'), Synset('swivel_chair.n.01'), Synset('tablet-armed_chair.n.01'), Synset('wheelchair.n.01')]
skipped
skipped
skipped
skipped
skipped
skipped
[array([ 4.73990e-02, -6.71220e-02, -5.36120e-02,  3.86000e-04,
       -3.14000e-04, -3.74600e-02,  5.03290e-02, -7.46180e-02,
        6.71640e-02,  1.72300e-03, -6.15000e-04, -1.07200e-02,
        2.73750e-02, -4.80300e-02, -3.65600e-03,  8.23000e-03,
        3.19530e-02, -9.38000e-04,  2.75040e-02,  2.72980e-02,
        4.94000e-02,  3.95340e-02, -1.12720e-02,  5.85200e-03,
        2.10070e-02, -1.37530e-02, -7.58180e-02,  6.67480e-02,
       -3.20250e-02, -9.30080e-02, -6.75590e-02,  3.87150e-02,
       -2

In [52]:
lexemes_vectors[0]

array([ 1.99780e-02, -2.33020e-02,  6.78820e-02, -3.73750e-02,
       -1.16078e-01, -1.63382e-01, -1.37588e-01, -2.49113e-01,
        2.92700e-02, -5.50840e-02,  1.31600e-02, -8.28230e-02,
        1.05890e-02, -8.10680e-02, -5.62730e-02,  1.21742e-01,
       -1.05700e-01,  5.00750e-02,  1.19756e-01,  7.10190e-02,
        9.69350e-02,  1.91140e-02,  6.79320e-02,  1.10611e-01,
        1.20372e-01,  1.86000e-02, -1.60236e-01,  1.55509e-01,
        1.42624e-01, -1.65603e-01,  3.29700e-02, -1.04362e-01,
       -4.18980e-02, -8.17610e-02,  7.13670e-02, -1.58514e-01,
        1.40002e-01,  1.51093e-01, -1.06600e-01,  3.26350e-02,
       -7.66720e-02, -1.40699e-01, -7.57990e-02, -1.26209e-01,
       -5.00220e-02, -5.51650e-02, -9.62820e-02, -6.63180e-02,
        1.58528e-01,  7.42600e-03, -1.30737e-01, -5.86010e-02,
       -8.51400e-02,  8.60550e-02, -1.31305e-01,  9.26530e-02,
       -8.13250e-02,  2.74877e-01, -8.99680e-02,  9.00340e-02,
       -7.49790e-02, -4.99260e-02, -5.89550e-02, -9.006

In [57]:
words_not_none, cos_similarities

(['ambulance',
  'bus',
  'cab',
  'compact',
  'convertible',
  'coupe',
  'cruiser',
  'electric',
  'hardtop',
  'hatchback',
  'horseless_carriage',
  'hot_rod',
  'jeep',
  'limousine',
  'loaner',
  'minicar',
  'minivan',
  'racer',
  'roadster',
  'sedan',
  'sport_utility',
  'subcompact'],
 array([-0.14163443, -0.25180293, -0.19444333, -0.28929809, -0.22012383,
        -0.19296742, -0.29526422, -0.24647144, -0.20611394, -0.26929983,
        -0.2248085 , -0.24444539, -0.20970778, -0.11490214, -0.13408958,
        -0.30866293, -0.25706729, -0.24737141, -0.23409254, -0.27393682,
        -0.20971055, -0.19605953]))

# 実験

## 入力

In [None]:
value_list = ['car']
mean_list = ['simplicity']
state_list = ['tire']
attr_list = ['safety']

In [None]:
word_lists = [value_list, mean_list, state_list, attr_list]

## 出力

### wordnet+語義ベクトル

In [None]:
def get_word(input_word):
    p_random = random.random()
    if p_random < 0.5:
        output_word = get_most_similar_lexemes(input_word, rank_range=10)
        method = 'lexemes'
    elif p_random < 0.8:
        output_word = get_hyponym(input_word, idx=0)
        method = 'hyponym'
    else:
        output_word = get_hypernum(input_word, idx=0)
        method = 'hypernum'
    return output_word, method

In [None]:
get_word('beauty')

In [None]:
result = []
for word_list in word_lists:
    for word in word_list:
        word, method = get_word(word)
        result.append([word, method])

In [None]:
result

In [None]:
def test(word_lists):
    new_result = word_lists.copy()
    for word_list in word_lists:
        word = word_list[0]
        try:
            word, method = get_word(word)
            new_result.append([word, method])
        except KeyError:
            pass
        except AttributeError:
            pass
    return new_result

In [None]:
for _ in range(4):
    result = test(result)


In [None]:
lexemes = []
hyponym = []
hypernum = []

for i in result:
    if i[1] == 'lexemes':
        lexemes.append(i[0])
    elif i[1] == 'hyponym':
        hyponym.append(i[0])
    else:
        hypernum.append(i[0])
print('### lexemes ###')
print(lexemes)
print('### hyponym ###')
print(hyponym)
print('### hypernum ###')
print(hypernum)

In [76]:
def test(x):
    if x == 1:
        return None, None
    else:
        return 1, 2

In [80]:
x, y = test(1)

In [None]:
import math
import random
import pickle
import gensim

topword = 20 # ?
word2vec_filepath = 'data/GoogleNews-vectors-negative300.bin'
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_filepath, binary=True)
print('読み込み完了')

def generate_similar_space(space_name, main_list, renkan):

    for i in range(1):# ????
        wordlist = []
        # ランダムに選択する
        n_random = math.floor(random.uniform(0, len(main_list)))
        n_random = 0  # ここを消せばランダムなる

        入力単語集合 = [main_list[n_random]]
        
        # 最も類似度が高い単語を抽出
        kyouki = word2vec_model.most_similar(positive=入力単語集合, topn=topword)

        # 記録リストの調整
        for result in kyouki:
            wordlist.append(result[0])  # 結果からワードだけを取り出している，値だけでも可能
        print(space_name + '空間内での共起') # space_nameは価値、意味、状態、属性のどれか
        print(main_list[n_random])
        print(wordlist, end='')


        # topword:一度に提示する共起単語の数
        for j in range(topword):
            # 重複チェック
            if not wordlist[j] in value_list and \
            not wordlist[j] in mean_list and \
            not wordlist[j] in state_list and \
            not wordlist[j] in attr_list:
                # if not wordlist[j] in usedv and not wordlist[j] in usedm and not wordlist[j] in useds and not wordlist[j] in useda:
                main_list.append(wordlist[j])  # 今まで共起した単語と重複しないj+1番めの順位の共起語を持ってくる
                renkan.append([main_list[n_random], wordlist[j]])
                break

    print('採用された単語：' + wordlist[j]) # ??
    print(renkan)

# 与えられた属性要素，状態要素から新たな状態要素を発想 seni=遷移
def generate_different_space(x, beforelist, afterlist, renkan):

    r_random = math.floor(random.uniform(0, len(renkan)))
    bn_random = math.floor(random.uniform(1, len(beforelist)))

    # renkan[r_random][0] #価値1
    # 選択した連関と選ばれた単語が等しい場合だめなので、異なるまでランダムを回す
    if x == '属性から状態' or x == '状態から意味' or x == '意味から価値':  # 要するに下から上の場合
        while renkan[r_random][1] == beforelist[bn_random]:  
            r_random = math.floor(random.uniform(0, len(renkan)))
            bn_random = math.floor(random.uniform(1, len(beforelist)))
    else:  # 要するに上から下の場合
        while renkan[r_random][0] == beforelist[bn_random]:
            r_random = math.floor(random.uniform(0, len(renkan)))
            bn_random = math.floor(random.uniform(1, len(beforelist)))

    # 既存1-既存2+発生源1=新要素
    r_random = 0  # ここを消せばランダムな連関が選択される 0にすると自分で入力した要素の連関

    if x == '属性から状態' or x == '状態から意味' or x == '意味から価値':  # 要するに下から上の場合
        words_positive = [renkan[r_random][0], beforelist[bn_random]]
        word_negetive = [renkan[r_random][1]]

        kyouki = word2vec_model.most_similar(positive=words_positive, negative=word_negetive, topn=topword)

    else:  # 要するに上から下の場合
        words_positive = [renkan[r_random][1], beforelist[bn_random]]
        word_negetive = [renkan[r_random][0]]

        kyouki = word2vec_model.most_similar(positive=words_positive, negative=word_negetive, topn=topword)

    # 共起を表示する
    wordlist = []
    for result in kyouki:
        wordlist.append(result[0])
    print(x + 'を発想')
    if x == '属性から状態' or x == '状態から意味' or x == '意味から価値':  # 要するに下から上の場合
        print(renkan[r_random][1] + ' - ' + renkan[r_random][0] + ' + ' + beforelist[bn_random])
    else:
        print(renkan[r_random][1] + ' - ' + renkan[r_random][0] + ' + ' + beforelist[bn_random])
    print(wordlist, end='')

    # リストの調整
    # afuse.append(afterlist[0])
    # afterlist.remove(afterlist[0])
    for j in range(topword):
        # 重複チェック
        if not wordlist[j] in value_list \
        and not wordlist[j] in mean_list \
        and not wordlist[j] in state_list \
        and not wordlist[j] in attr_list:
            # if not wordlist[j] in usedv and not wordlist[j] in usedm and not wordlist[j] in useds and not wordlist[j] in useda:
            afterlist.append(wordlist[j])  # 重複した場合j番めの順位の共起語を持ってくる
            break
    print('採用された単語：' + wordlist[j])
    # 連関の記録
    if x == '属性から状態' or x == '状態から意味' or x == '意味から価値':
        renkan.append([wordlist[j], beforelist[bn_random]])
    else:
        renkan.append([beforelist[bn_random], wordlist[j]])


renkan = []
value_list = ['safety']
mean_list = ['toughness']
state_list = ['velocity']
attr_list = ['tire']
generate_similar_space('value', value_list, renkan)
generate_similar_space('meaning', mean_list, renkan)
generate_similar_space('state', state_list, renkan)
generate_similar_space('attribute', attr_list, renkan)

generate_different_space('意味から価値', mean_list, value_list, renkan)


In [None]:
from word_extractor.lexemes_vector import get_most_similar_lexemes
from word_extractor.wordnet import get_hypernum, get_hyponym, get_not_similar_hyponym
from utils.dataloader import read_lexemes_dict_and_list, read_mapping

synset_filepath = './data/synsets.txt'
lexemes_filepath = './data/lexemes.txt'
mapping_filepath = './data/mapping.txt'
word2vec_filepath = './data/GoogleNews-vectors-negative300.bin'

lexemes_dict, lexemes_list = read_lexemes_dict_and_list(lexemes_filepath)
mapping_dict = read_mapping(mapping_filepath)

def extract_words(input_words, lexemes_dict, lexemes_list, mapping_dict):
    """フローチャート通りに単語を抽出する関数

    Args:
        extracted_words ([type]): [description]

    Returns: extracted_words, links

    ex)
    extracted_words = [
        {id:0, label:"word1"},
        {id:1, label:"word2"},
        ...
    ], 
    links = [
        {source:0, target:1},
        {source:0, target:2},
        ...
    ];
    """

    extracted_words = []
    links = []

    for word in input_words:
        num_words = len(extracted_words)
        extracted_words.append({'id':num_words, 'label':word})
        # 機能1〜4
        word1 = get_hypernum(word)
        word2 = get_hyponym(word)
        word3 = get_not_similar_hyponym(word)
        word4 = get_most_similar_lexemes(word, lexemes_dict, lexemes_list, mapping_dict)
        print(word1, word2, word3, word4)

        count = 0
        for output_word in [word1, word2, word3, word4]:
            if output_word is None:
                print('Got None')
            else:
                count += 1
                input_id = num_words
                output_id = input_id + count
                extracted_words.append({'id':output_id, 'label':output_word})
                links.append({'source':input_id, 'target':output_id})
        # 機能5 ()

    return extracted_words, links

input_words = ['car', 'toughness', 'velocity', 'tire']
extract_words(input_words, lexemes_dict, lexemes_list, mapping_dict)

In [2]:
import numpy as np

from word_extractor.lexemes_vector import get_most_similar_lexemes, get_verctor_from_word, get_verctor_from_sense_key, get_word_from_vector
from word_extractor.wordnet import get_hypernum, get_hyponym, get_not_similar_hyponym
from utils.dataloader import read_lexemes_dict_and_list, read_mapping

input_words = ['car', 'toughness', 'velocity', 'tire']
synset_filepath = './data/synsets.txt'
lexemes_filepath = './data/lexemes.txt'
mapping_filepath = './data/mapping.txt'
word2vec_filepath = './data/GoogleNews-vectors-negative300.bin'

lexemes_dict, lexemes_list = read_lexemes_dict_and_list(lexemes_filepath)
mapping_dict = read_mapping(mapping_filepath)


In [3]:

def extract_words(input_words, lexemes_dict, lexemes_list, mapping_dict):

    extracted_words = []
    links = []
    word_histries = []

    # 機能1〜4
    for word in input_words:
        num_words = len(extracted_words)
        extracted_words.append({'id':num_words, 'label':word})
        word_histries.append(word)

        # 抽出
        word1, sense_key1 = get_hypernum(word)
        word2, sense_key2 = get_hyponym(word)
        word3, sense_key3 = get_not_similar_hyponym(word, lexemes_dict, mapping_dict)
        word4, sense_key4 = get_most_similar_lexemes(word, lexemes_dict, lexemes_list, mapping_dict)
        print(word1, word2, word3, word4, sense_key1, sense_key2, sense_key3, sense_key4)
        word_histries.append(sense_key1)
        word_histries.append(sense_key2)
        word_histries.append(sense_key3)
        word_histries.append(sense_key4)
        

        # エッジを定義
        count = 0
        for output_word in [word1, word2, word3, word4]:
            if output_word is None:
                print('Got None')
            else:
                count += 1
                input_id = num_words
                output_id = input_id + count
                extracted_words.append({'id':output_id, 'label':output_word})
                links.append({'source':input_id, 'target':output_id})
    print(word_histries)

    for idx in range(int(len(word_histries) / 5)):
        input_word_vector = get_verctor_from_word(word_histries[5*idx], lexemes_dict, mapping_dict)
        word1_vector, word2_vector, word3_vector, word4_vector = [np.array(get_verctor_from_sense_key(sense_key, lexemes_dict, mapping_dict), dtype=np.float64) for sense_key in word_histries[5*idx+1:5*idx+5]]
        if input_word_vector is not None: #Noneじゃなかったら
            if word1_vector is not None:
                word5_1_vector = input_word_vector - word3_vector + word1_vector
                word5_1, sense_key5_1 = get_word_from_vector(word5_1_vector, lexemes_list, mapping_dict)
                print('##############', word5_1)
                #TODO word1とかのextracted_wordsでのIDを取得
                    # その単語の前にいくつのNoneがあるかを計算
                #TODO extracted_words, linksを定義
                
            if word3_vector is not None:
                word5_3_vector = input_word_vector - word3_vector + word3_vector
                word5_3, sense_key5_3 = get_word_from_vector(word5_3_vector, lexemes_list, mapping_dict)
                print('##############', word5_3)
            if word4_vector is not None:
                word5_4_vector = input_word_vector - word3_vector + word4_vector
                word5_4, sense_key5_4 = get_word_from_vector(word5_4_vector, lexemes_list, mapping_dict)
                print('##############', word5_4)
    


    return extracted_words, links

In [19]:
extracted_words = []
links = []
word_histries = []
word_histries_2 = []

# 機能1〜4
for word in input_words:
    num_words = len(extracted_words)
    extracted_words.append({'id':num_words, 'label':word})
    word_histries.append(word)

    # 抽出
    word1, sense_key1 = get_hypernum(word)
    word2, sense_key2 = get_hyponym(word)
    word3, sense_key3 = get_not_similar_hyponym(word, lexemes_dict, mapping_dict)
    word4, sense_key4 = get_most_similar_lexemes(word, lexemes_dict, lexemes_list, mapping_dict)

    for sense_key in [sense_key1, sense_key2, sense_key3, sense_key4]:
        word_histries.append(sense_key1)
    word_histries_2.append([word, sense_key1, sense_key2, sense_key3, sense_key4])

    # エッジを定義
    count = 0
    for output_word in [word1, word2, word3, word4]:
        if output_word is None:
            # print('Got None')
            pass
        else:
            count += 1
            input_id = num_words
            output_id = input_id + count
            extracted_words.append({'id':output_id, 'label':output_word})
            links.append({'source':input_id, 'target':output_id})

skipped
skipped
skipped
skipped
skipped
skipped
skipped
skipped
skipped
skipped
skipped
skipped
skipped
skipped
skipped
skipped
skipped
skipped
skipped


In [20]:
# extracted_words, links

In [34]:
import itertools
import random

def rand_ints_nodup(a, b, k):
    ns = []
    while len(ns) < k:
        n = random.randint(a, b)
        if not n in ns:
            ns.append(n)
    return ns

def word_histries_to_vectors(word_histries):
    word_input_vector = get_verctor_from_word(word_histries[0], lexemes_dict, mapping_dict)
    word1_vector, word2_vector, word3_vector, word4_vector = [np.array(get_verctor_from_sense_key(sense_key, lexemes_dict, mapping_dict), dtype=np.float64) for sense_key in word_histries[1:]]

    return [word_input_vector, word1_vector, word2_vector, word3_vector, word4_vector]

def check_id(idx, word_histries):
    num_none = word_histries[:idx].count(None)
    id = idx - num_none
    return id

# 機能5(2つの入力単語の組み合わせ)
# 組み合わせをランダム6こ選ぶ
# num_input_words = len(input_words)
num_input_words = 4
combi_all = list(itertools.combinations(range(num_input_words), 2))
idx_conbi = rand_ints_nodup(0, len(combi_all)-1, 6)
combi_list = [combi_all[idx] for idx in idx_conbi]

for idx1, idx2 in combi_list:
    id_input_1 = check_id(idx1*5, word_histries)
    id_input_2 = check_id(idx2*5, word_histries)

    word_idx1_vector, _, _, word_idx1_3_vector, _ = word_histries_to_vectors(word_histries_2[idx1])
    word_idx2_vector, word_idx2_1_vector, word_idx2_2_vector, word_idx2_3_vector, word_idx2_4_vector= word_histries_to_vectors(word_histries_2[idx2])
    
    if (word_idx1_vector is not None) and (word_idx1_3_vector is not None): #Noneじゃなかったら
        for word_vector in [word_idx2_vector, word_idx2_1_vector, word_idx2_2_vector, word_idx2_3_vector, word_idx2_4_vector]:
            if word_vector is not None:
                output_word_vector_1 = word_idx1_vector - word_idx1_3_vector + word_vector
                output_word, sense_key_output_word = get_word_from_vector(output_word_vector_1, lexemes_list, mapping_dict)

                wordlist = [word_dic.get('label') for word_dic in extracted_words]
                if not output_word in wordlist:
                    print('##############', output_word)
                    id_output = len(extracted_words)
                    extracted_words.append({'id':id_output, 'label':output_word})
                    links.append({'source':id_input_1, 'target':id_output})
                    links.append({'source':id_input_2, 'target':id_output})


    


    



In [15]:
def check_id(idx, word_histries):
    num_none = word_histries[:idx].count(None)
    id = idx - num_none
    return id

for idx in range(int(len(word_histries) / 5)):
    input_word_vector = get_verctor_from_word(word_histries[5*idx], lexemes_dict, mapping_dict)
    word1_vector, word2_vector, word3_vector, word4_vector = [np.array(get_verctor_from_sense_key(sense_key, lexemes_dict, mapping_dict), dtype=np.float64) for sense_key in word_histries[5*idx+1:5*idx+5]]
    if (input_word_vector is not None) and (word3_vector is not None): #Noneじゃなかったら
        if word1_vector is not None:
            output_word_vector = input_word_vector - word3_vector + word1_vector
            output_word, sense_key_output_word = get_word_from_vector(output_word_vector, lexemes_list, mapping_dict)
            id_input, id_1, id_3 = [check_id(i, word_histries) for i in [idx*5, idx*5+1, idx*5+3]]
            print('##############', output_word, id_input, id_1, id_3)
            output_id = len(extracted_words)
            extracted_words.append({'id':output_id, 'label':output_word})
            for input_id in [id_input, id_1, id_3]:
                links.append({'source':input_id, 'target':output_id})
        if word4_vector is not None:
            output_word_vector = input_word_vector - word3_vector + word4_vector
            output_word, sense_key_output_word = get_word_from_vector(output_word_vector, lexemes_list, mapping_dict)
            id_input, id_3, id_4 = [check_id(i, word_histries) for i in [idx*5, idx*5+3, idx*5+4]]
            print('##############', output_word, id_input, id_3, id_4)
            output_id = len(extracted_words)
            extracted_words.append({'id':output_id, 'label':output_word})
            for input_id in [id_input, id_3, id_4]:
                links.append({'source':input_id, 'target':output_id})

############## car 0 1 3
############## car 0 3 4
############## radioactively 5 6 8
############## radioactively 5 8 8
############## radioactively 9 10 12
############## radioactively 9 12 13
############## radioactively 14 15 17
############## radioactively 14 17 18


In [16]:
extracted_words, links

([{'id': 0, 'label': 'car'},
  {'id': 1, 'label': 'motor_vehicle'},
  {'id': 2, 'label': 'sport_utility'},
  {'id': 3, 'label': 'minicar'},
  {'id': 4, 'label': 'mill'},
  {'id': 5, 'label': 'toughness'},
  {'id': 6, 'label': 'endurance'},
  {'id': 7, 'label': 'legs'},
  {'id': 8, 'label': 'endurance'},
  {'id': 9, 'label': 'velocity'},
  {'id': 10, 'label': 'rate'},
  {'id': 11, 'label': 'airspeed'},
  {'id': 12, 'label': 'groundspeed'},
  {'id': 13, 'label': 'airspeed'},
  {'id': 14, 'label': 'tire'},
  {'id': 15, 'label': 'hoop'},
  {'id': 16, 'label': 'pneumatic_tire'},
  {'id': 17, 'label': 'pneumatic_tire'},
  {'id': 18, 'label': 'tubeless_tire'},
  {'id': 19, 'label': 'car'},
  {'id': 20, 'label': 'car'},
  {'id': 21, 'label': 'radioactively'},
  {'id': 22, 'label': 'radioactively'},
  {'id': 23, 'label': 'radioactively'},
  {'id': 24, 'label': 'radioactively'},
  {'id': 25, 'label': 'radioactively'},
  {'id': 26, 'label': 'radioactively'}],
 [{'source': 0, 'target': 1},
  {'sou

In [4]:
input_words = ['car', 'toughness', 'velocity', 'tire']
extract_words(input_words, lexemes_dict, lexemes_list, mapping_dict)

skipped
skipped
skipped
skipped
skipped
skipped
skipped
skipped
skipped
motor_vehicle sport_utility minicar gas motor_vehicle%1:06:00:: sport_utility%1:06:00:: limousine%1:06:00:: gas%1:27:02::
skipped
endurance legs None toughness endurance%1:07:00:: legs%1:07:00:: None toughness%1:07:02::
Got None
skipped
skipped
skipped
skipped
skipped
skipped
skipped
rate speed_of_light groundspeed coefficient rate%1:28:00:: speed_of_light%1:28:00:: angular_velocity%1:19:00:: coefficient%1:23:00::
skipped
skipped
hoop wagon_tire pneumatic_tire pneumatic_tire hoop%1:06:00:: wagon_tire%1:06:00:: car_tire%1:06:00:: pneumatic_tire%1:06:00::
['car', 'motor_vehicle%1:06:00::', 'sport_utility%1:06:00::', 'limousine%1:06:00::', 'gas%1:27:02::', 'toughness', 'endurance%1:07:00::', 'legs%1:07:00::', None, 'toughness%1:07:02::', 'velocity', 'rate%1:28:00::', 'speed_of_light%1:28:00::', 'angular_velocity%1:19:00::', 'coefficient%1:23:00::', 'tire', 'hoop%1:06:00::', 'wagon_tire%1:06:00::', 'car_tire%1:06:00::'

([{'id': 0, 'label': 'car'},
  {'id': 1, 'label': 'motor_vehicle'},
  {'id': 2, 'label': 'sport_utility'},
  {'id': 3, 'label': 'minicar'},
  {'id': 4, 'label': 'gas'},
  {'id': 5, 'label': 'toughness'},
  {'id': 6, 'label': 'endurance'},
  {'id': 7, 'label': 'legs'},
  {'id': 8, 'label': 'toughness'},
  {'id': 9, 'label': 'velocity'},
  {'id': 10, 'label': 'rate'},
  {'id': 11, 'label': 'speed_of_light'},
  {'id': 12, 'label': 'groundspeed'},
  {'id': 13, 'label': 'coefficient'},
  {'id': 14, 'label': 'tire'},
  {'id': 15, 'label': 'hoop'},
  {'id': 16, 'label': 'wagon_tire'},
  {'id': 17, 'label': 'pneumatic_tire'},
  {'id': 18, 'label': 'pneumatic_tire'}],
 [{'source': 0, 'target': 1},
  {'source': 0, 'target': 2},
  {'source': 0, 'target': 3},
  {'source': 0, 'target': 4},
  {'source': 5, 'target': 6},
  {'source': 5, 'target': 7},
  {'source': 5, 'target': 8},
  {'source': 9, 'target': 10},
  {'source': 9, 'target': 11},
  {'source': 9, 'target': 12},
  {'source': 9, 'target': 13}