In [2]:
import numpy as np
import pandas as pd
import gensim

from word_extractor.lexemes_vector import get_most_similar_lexemes, get_verctor_from_word, get_verctor_from_sense_key, get_word_from_vector, get_analogy
from word_extractor.wordnet import get_hypernum, get_hyponym, get_not_similar_hyponym
from utils.dataloader import read_lexemes_dict_and_list, read_mapping

input_words = ['car', 'toughness', 'velocity', 'tire']
synset_filepath = './data/synsets.txt'
lexemes_filepath = './data/lexemes.txt'
mapping_filepath = './data/mapping.txt'
word2vec_filepath = './data/GoogleNews-vectors-negative300.bin'

lexemes_dict, lexemes_list = read_lexemes_dict_and_list(lexemes_filepath)
mapping_dict = read_mapping(mapping_filepath)
word_embed = gensim.models.KeyedVectors.load_word2vec_format(word2vec_filepath, binary=True)

[nltk_data] Downloading package wordnet to /Users/ryogo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/ryogo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [33]:
def show_word2vec_and_autoextend(word, synset_idx=0):
    print('####    word2vec     #####')
    for i, word_w2 in enumerate(word_embed.most_similar([word])):
        print(f'{word_w2[0]}')

    print('####    AutoExtend     #####')
    most_similar_lexemes = get_most_similar_lexemes(word, lexemes_dict, lexemes_list, mapping_dict, rank_range=10, is_single=False, synset_idx=synset_idx)
    for i, lexeme in enumerate(most_similar_lexemes[0]):
        print(f'{lexeme[0][:-18]}')

In [58]:
word_embed.most_similar(['cat'])

[('cats', 0.8099379539489746),
 ('dog', 0.760945737361908),
 ('kitten', 0.7464985251426697),
 ('feline', 0.7326234579086304),
 ('beagle', 0.7150582671165466),
 ('puppy', 0.7075453400611877),
 ('pup', 0.6934291124343872),
 ('pet', 0.6891531348228455),
 ('felines', 0.6755931973457336),
 ('chihuahua', 0.6709762215614319)]

In [59]:
word_embed.most_similar(['cat'])

[('cats', 0.8099379539489746),
 ('dog', 0.760945737361908),
 ('kitten', 0.7464985251426697),
 ('feline', 0.7326234579086304),
 ('beagle', 0.7150582671165466),
 ('puppy', 0.7075453400611877),
 ('pup', 0.6934291124343872),
 ('pet', 0.6891531348228455),
 ('felines', 0.6755931973457336),
 ('chihuahua', 0.6709762215614319)]

# 実験１ 単語実験

## 語義曖昧性

Suit

In [34]:
show_word2vec_and_autoextend('suit')

####    word2vec     #####
suits
lawsuit
Suit
Atta_chakki_delivery
lawsuits
countersuit
polka_dot_clown
His_showmanship_rhinestone
complaint
lawsuit_alleging
####    AutoExtend     #####
zoot_suit
garment
gabardine
tailcoat
tuxedo
suit
tux
suit
suit
trousers


In [23]:
show_word2vec_and_autoextend('chair')

####    word2vec     #####
chairs
Chair
chairperson
chairwoman
chairman
Vice_Chair
Co_Chair
chairing
Chairs
cochair
####    AutoExtend     #####
seat
sofa
armchair
stool
recliner
chaise_longue
swivel_chair
rocking_chair
chaise
lawn_chair


In [24]:
show_word2vec_and_autoextend('table', synset_idx=1)

####    word2vec     #####
tables
ConocoPhillips_BPAmerica
Capitalized_Included
tray
dining_room
banquette
rapping_cappella
sideboard
linen_tablecloth
Tables
####    AutoExtend     #####
worktable
table
bookcase
chair
room
sideboard
tray
workbench
furniture
credenza


## 上位下位

In [13]:
def show_word2vec_and_hypo_hyper(word, synset_idx=0):
    # print('####    word2vec     #####')
    # for i, word_w2 in enumerate(word_embed.most_similar([word])):
    #     print(f'{word_w2[0]}')
    
    # print('####    word2vec + 上位下位     #####')
    # for i, word_w2 in enumerate(word_embed.most_similar([word])):
    #     if i < 6:
    #         print(f'{word_w2[0]}')
    #     else:
    #         break

    print(f'{get_hypernum(word, idx=synset_idx)[0]}')
    print(f'{get_hypernum(get_hypernum(word, idx=synset_idx)[0])[0]}')
    print(f'{get_hypernum(get_hypernum(get_hypernum(word, idx=synset_idx)[0])[0])[0]}')
    print(f'{get_hyponym(word, idx=synset_idx)[0]}')
    print(f'{get_hyponym(word, idx=synset_idx)[0]}')
    print(f'{get_hyponym(word, idx=synset_idx)[0]}')
    print(f'{get_hyponym(word, idx=synset_idx)[0]}')
    print(f'{get_hyponym(word, idx=synset_idx)[0]}')
    print(f'{get_hyponym(word, idx=synset_idx)[0]}')
    
    

In [14]:
show_word2vec_and_hypo_hyper('suit')

garment
clothing
consumer_goods
pinstripe
single-breasted_suit
business_suit
slack_suit
zoot_suit
business_suit


In [15]:
show_word2vec_and_hypo_hyper('table', synset_idx=1)

furniture
furnishing
accessory
card_table
worktable
stand
pedestal_table
console_table
tea_table


In [16]:
show_word2vec_and_hypo_hyper('dog', synset_idx=0)

canine
tooth
bone
pug
lapdog
lapdog
mexican_hairless
puppy
basenji


In [17]:
show_word2vec_and_hypo_hyper('chair', synset_idx=0)

seat
space
attribute
tablet-armed_chair
barber_chair
tablet-armed_chair
straight_chair
chair_of_state
wheelchair


## 類推

In [2]:
from word_extractor.lexemes_vector import AutoextendExtractor
from word_extractor.wordnet import get_hypernum, get_hyponym, get_not_similar_hyponym
import gensim
import random

lexemes_filepath = './data/lexemes.txt'
mapping_filepath = './data/mapping.txt'
autoextend = AutoextendExtractor(lexemes_filepath, mapping_filepath)

word2vec_filepath = './data/GoogleNews-vectors-negative300.bin'
word_embed = gensim.models.KeyedVectors.load_word2vec_format(word2vec_filepath, binary=True)


# 重複なし乱数
def rand_ints_nodup(a, b, k):
  ns = []
  while len(ns) < k:
    n = random.randint(a, b)
    if not n in ns:
      ns.append(n)
  return ns

[nltk_data] Downloading package wordnet to /Users/ryogo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/ryogo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
def analogy_by_similarity(word):
    idx1, idx2, idx3  = rand_ints_nodup(0, 9, 3)
    words = word_embed.most_similar([word])
    results = word_embed.most_similar(positive=[words[idx1][0], words[idx2][0]],negative=[words[idx3][0]])
    output_word = None
    for result_word in results:
        if (result_word != words[idx1][0]) and \
           (result_word != words[idx2][0]) and \
           (result_word != words[idx3][0]):
           
            output_word = result_word
            break
        
    print(output_word)

In [16]:
def analogy_by_hypo_hyper(word, synset_idx=0):
    sensekey_input = autoextend.get_senseky_from_word(word, synset_idx=0)
    similar1 = autoextend.get_most_similar_lexemes(word, synset_idx=0)[1]
    similar2 = autoextend.get_most_similar_lexemes(word, synset_idx=0)[1]
    similar3 = autoextend.get_most_similar_lexemes(word, synset_idx=0)[1]
    hyper = get_hypernum(word, idx=synset_idx)[1]
    hyper_hyper = get_hypernum(get_hypernum(word, idx=synset_idx)[0])[1]
    hypo1 = get_hyponym(word, idx=synset_idx)[1]
    hypo2 = get_hyponym(word, idx=synset_idx)[1]
    hypo3 = get_hyponym(word, idx=synset_idx)[1]

    word_list = [sensekey_input, similar1, similar2, similar3, hyper, hyper_hyper, hypo1, hypo2, hypo3]
    
    for i in word_list:
        if i is not None:
            print(i)
        else:
            print('sensekey is None')

    for _ in range(20):
        idx1, idx2, idx3 = rand_ints_nodup(0, len(word_list)-1, 3)
        autoextend.get_analogy_by_sensekey(word_list[idx1], word_list[idx2], word_list[idx3])

    analogy_words = []
    analogy_words.append(autoextend.get_analogy_by_sensekey(hyper, sensekey_input, similar1))
    analogy_words.append(autoextend.get_analogy_by_sensekey(hyper, sensekey_input, similar2))
    analogy_words.append(autoextend.get_analogy_by_sensekey(sensekey_input, hypo1, similar1))
    analogy_words.append(autoextend.get_analogy_by_sensekey(sensekey_input, hypo1, similar2))
    analogy_words.append(autoextend.get_analogy_by_sensekey(sensekey_input, hypo2, similar1))
    analogy_words.append(autoextend.get_analogy_by_sensekey(sensekey_input, hypo2, similar2))
    analogy_words.append(autoextend.get_analogy_by_sensekey(hyper_hyper, sensekey_input, similar1))
    analogy_words.append(autoextend.get_analogy_by_sensekey(hyper_hyper, sensekey_input, similar2))

    


In [17]:
def analogy_by_hypo_hyper_other(word1, word2, synset_idx=0):
    sensekey_input1 = autoextend.get_senseky_from_word(word1, synset_idx=0)
    sensekey_input2 = autoextend.get_senseky_from_word(word2, synset_idx=0)
    similar1 = autoextend.get_most_similar_lexemes(word1, synset_idx=0)[1]
    similar2 = autoextend.get_most_similar_lexemes(word1, synset_idx=0)[1]
    hyper = get_hypernum(word1, idx=synset_idx)[1]
    hyper_hyper = get_hypernum(get_hypernum(word1, idx=synset_idx)[0])[1]
    hyper_hyper_hyper = get_hypernum(get_hypernum(get_hypernum(word1, idx=synset_idx)[0])[0])[1]
    hypo1 = get_hyponym(word1, idx=synset_idx)[1]
    hypo2 = get_hyponym(word1, idx=synset_idx)[1]
    
    print(sensekey_input1, sensekey_input2 ,similar1, similar2, hyper, hyper_hyper, hypo1, hypo2)

    autoextend.get_analogy_by_sensekey(hyper, sensekey_input1, sensekey_input2)
    autoextend.get_analogy_by_sensekey(sensekey_input1, hypo1, sensekey_input2)
    autoextend.get_analogy_by_sensekey(sensekey_input1, hypo2, sensekey_input2)
    autoextend.get_analogy_by_sensekey(hyper_hyper, sensekey_input1, sensekey_input2)

    autoextend.get_analogy_by_sensekey(hyper, sensekey_input2, sensekey_input1)
    autoextend.get_analogy_by_sensekey(sensekey_input2, hypo1, sensekey_input1)
    autoextend.get_analogy_by_sensekey(sensekey_input2, hypo2, sensekey_input1)
    autoextend.get_analogy_by_sensekey(hyper_hyper, sensekey_input2, sensekey_input1)

    print('hyper____')
    autoextend.get_analogy_by_sensekey(hyper_hyper_hyper, sensekey_input1, sensekey_input2)
    autoextend.get_analogy_by_sensekey(hyper_hyper_hyper, sensekey_input1, hypo1)
    autoextend.get_analogy_by_sensekey(hyper_hyper_hyper, sensekey_input2, hypo2)
    autoextend.get_analogy_by_sensekey(hyper_hyper_hyper, sensekey_input2, similar1)
    autoextend.get_analogy_by_sensekey(hyper_hyper_hyper, sensekey_input2, similar2)


In [18]:
analogy_by_hypo_hyper_other('table', 'beauty')

table%1:14:00:: beauty%1:07:00:: table%1:06:01:: contents%1:10:00:: array%1:14:00:: arrangement%1:14:00:: contents%1:10:00:: actuarial_table%1:14:00::
array - table + beauty = loveliness
table - contents + beauty = glamor
None
arrangement - table + beauty = loveliness
array - beauty + table = slender
beauty - contents + table = glamor
None
arrangement - beauty + table = slender
hyper____
planning - table + beauty = loveliness
planning - table + contents = contained
None
planning - beauty + table = slender
planning - beauty + contents = table


In [19]:
analogy_by_hypo_hyper_other('dryer', 'fun')

dryer%1:06:00:: fun%1:04:00:: washing_machine%1:06:00:: stove%1:06:00:: appliance%1:06:00:: device%1:06:00:: clothes_dryer%1:06:00:: clothes_dryer%1:06:00::
appliance - dryer + fun = durables
dryer - clothes_dryer + fun = hair_dryer
dryer - clothes_dryer + fun = hair_dryer
device - dryer + fun = carpet
appliance - fun + dryer = clothes_dryer
fun - clothes_dryer + dryer = hair_dryer
fun - clothes_dryer + dryer = hair_dryer
device - fun + dryer = clothes_dryer
hyper____
instrumentality - dryer + fun = bag
instrumentality - dryer + clothes_dryer = unlawfully
instrumentality - fun + clothes_dryer = dryer
instrumentality - fun + washing_machine = washer
instrumentality - fun + stove = cooker


In [20]:
analogy_by_hypo_hyper_other('dryer', 'strange')

dryer%1:06:00:: strange%3:00:00:: water_heater%1:06:00:: microwave_oven%1:06:00:: appliance%1:06:00:: device%1:06:00:: clothes_dryer%1:06:00:: hand_blower%1:06:00::
appliance - dryer + strange = durables
dryer - clothes_dryer + strange = hair_dryer
None
device - dryer + strange = carpet
appliance - strange + dryer = clothes_dryer
strange - clothes_dryer + dryer = hair_dryer
None
device - strange + dryer = clothes_dryer
hyper____
instrumentality - dryer + strange = preserver
instrumentality - dryer + clothes_dryer = unlawfully
None
instrumentality - strange + water_heater = stove
instrumentality - strange + microwave_oven = microwave


In [21]:
def compare_analogy(word, synset_idx=0):
    print('####    類似度の類推     #####')
    for _ in range(10):
        analogy_by_similarity(word)
    
    print('####    上位下位の類推     #####')
    analogy_by_hypo_hyper(word, synset_idx=synset_idx)

In [22]:
compare_analogy('suit', synset_idx=0)

####    類似度の類推     #####
('Complaint', 0.5637338161468506)
('Mustard_pakki', 0.5386776924133301)
('Al_Edhari', 0.49727824330329895)
('pompadoured_hair', 0.6859167814254761)
('lawsuit', 0.671337366104126)
('pompadoured_hair', 0.6738342642784119)
('lawsuits_alleging', 0.5511248707771301)
('lawsuit', 0.5822794437408447)
('alleging', 0.5666015148162842)
('suit', 0.5710189938545227)
####    上位下位の類推     #####
suit%1:06:00::
gabardine%1:06:03::
zoot_suit%1:06:00::
suit%1:06:01::
garment%1:06:00::
clothing%1:06:00::
zoot_suit%1:06:00::
slack_suit%1:06:00::
double-breasted_suit%1:06:00::
None
None
None
None
suit - suit + gabardine = trousers
None
None
None
clothing - garment + suit = apparel
zoot_suit - clothing + suit = porkpie_hat
None
None
None
None
suit - zoot_suit + garment = screamer
gabardine - zoot_suit + garment = screamer
None
None
None
None
garment - suit + gabardine = trousers
garment - suit + zoot_suit = jacket
suit - zoot_suit + gabardine = screamer
suit - zoot_suit + zoot_suit = 

In [10]:
compare_analogy('table', synset_idx=1)

####    類似度の類推     #####
('trays', 0.5813458561897278)
('Ethan_Hession', 0.44691479206085205)
('wine_goblet', 0.5873473286628723)
('drill_sergeant_Perkovic', 0.47566887736320496)
('reconciles_EBITDA', 0.44722437858581543)
('dinning_room', 0.608176052570343)
('reconciles_EBITDA', 0.44722437858581543)
('drill_sergeant_Perkovic', 0.45419415831565857)
('toasted_cashews', 0.4707796573638916)
('tables', 0.5040887594223022)
####    上位下位の類推     #####
table%1:14:00::
table%1:14:01::
tray%1:06:00::
drawer%1:18:01::
furniture%1:06:00::
furnishing%1:06:00::
stand%1:06:04::
conference_table%1:06:00::
tea_table%1:06:00::
None
None
table - drawer + tray = toast
None
None
table - tray + table = draft
None
stand - drawer + furniture = cobblestone
None
None
furniture - table + table = sideboard
furniture - table + tray = turntable
table - stand + table = club
table - stand + tray = lazy_susan
None
None
None
None


In [17]:
compare_analogy('wheel', synset_idx=0)

####    類似度の類推     #####
('Nancy_Pekarek_spokeswoman', 0.505904495716095)
('stealth_removable', 0.580708384513855)
('rider_Shinichi_Nakatomi', 0.5683863162994385)
('No.##_Roush_Racing', 0.5009022951126099)
('Rear_wheel', 0.5558632612228394)
('footpeg', 0.609529435634613)
####    上位下位の類推     #####
wheel%1:06:00:: nosewheel%1:06:00:: paddlewheel%1:06:00:: machine%1:06:02:: device%1:06:00:: wagon_wheel%1:06:00:: daisy_print_wheel%1:06:00::
None
None
None
None
None
None
device - nosewheel + paddlewheel = sternwheeler
None
nosewheel - device + wheel = landing_gear
None
machine - wheel + nosewheel = tailplane
machine - wheel + paddlewheel = sternwheeler
None
None
None
None
device - wheel + nosewheel = tailplane
device - wheel + paddlewheel = sternwheeler


In [18]:
compare_analogy('tire', synset_idx=1)

####    類似度の類推     #####
('tire', 0.6417492032051086)
('tire', 0.6120626330375671)
('beadlock', 0.583296000957489)
('Pirelli_tire', 0.5449734926223755)
('Corteco', 0.5167824625968933)
('tire', 0.5241057872772217)
####    上位下位の類推     #####
tire%1:06:00:: tubeless_tire%1:06:00:: tubeless_tire%1:06:00:: devolve%2:29:00:: delegating%1:04:00:: retire%2:37:00:: poop_out%2:29:00::
None
retire - tubeless_tire + tubeless_tire = withdraw
None
tubeless_tire - delegating + tubeless_tire = tubeless
None
None
None
None
retire - tubeless_tire + tire = connexion
None
None
None
tire - retire + tubeless_tire = pneumatic_tire
tire - retire + tubeless_tire = pneumatic_tire
None
None
delegating - tire + tubeless_tire = decentralizing
delegating - tire + tubeless_tire = decentralizing


In [33]:
compare_analogy('chair')

####    類似度の類推     #####
('Co_Chair', 0.6625238060951233)
('Chairs', 0.6326252222061157)
('chairperson', 0.6012726426124573)
('chair', 0.6340569853782654)
('cochairs', 0.5543997287750244)
('Chairwoman', 0.7123143672943115)
('Co_Chair', 0.6384541392326355)
('Chair_Elect', 0.60837721824646)
('Chairperson', 0.6735294461250305)
('chaired', 0.555755078792572)
####    上位下位の類推     #####
chair%1:06:00:: stool%1:06:00:: chaise_longue%1:06:00:: seat%1:06:01:: space%1:15:00:: folding_chair%1:06:00:: straight_chair%1:06:00::
None
None
None
None
chair - barber_chair + chaise_longue = chaise
None
chaise_longue - armchair + space = chaise
space - folding_chair + chair = livingroom
None
None
seat - chair + stool = footstool
seat - chair + chaise_longue = chaise
chair - folding_chair + stool = milking_stool
chair - folding_chair + chaise_longue = chaise
None
None
space - chair + stool = harborage
space - chair + chaise_longue = chaise


In [20]:
compare_analogy('suit')

####    類似度の類推     #####
('pompadoured_hair', 0.6018907427787781)
('alleging', 0.580407977104187)
('Wearing_beige', 0.5911272764205933)
('Al_Edhari', 0.47900304198265076)
('lawsuit', 0.5446116328239441)
('sported_bushy_reddish_sideburns', 0.5645790100097656)
####    上位下位の類推     #####
suit%1:06:00:: tuxedo%1:06:00:: trousers%1:06:00:: garment%1:06:00:: clothing%1:06:00:: slack_suit%1:06:00:: slack_suit%1:06:00::
None
None
None
garment - clothing + trousers = jacket
None
None
clothing - garment + tuxedo = tux
trousers - garment + clothing = clothes
None
None
garment - suit + tuxedo = tux
garment - suit + trousers = skirt
None
None
None
None
clothing - suit + tuxedo = tux
clothing - suit + trousers = clothes


In [None]:
analogy_by_hypo_hyper_other('beauty', 'table')

beauty%1:07:00:: prettiness%1:07:00:: loveliness%1:07:00:: appearance%1:07:00:: quality%1:07:00:: pulchritude%1:07:00:: pulchritude%1:07:00::
appearance beauty beauty
['slender', 'art', 'light', 'light', 'light', 'clear', 'light', 'mercantilism', 'shape', 'mug']
appearance - beauty + beauty = slender
appearance beauty beauty
['slender', 'art', 'light', 'light', 'light', 'clear', 'light', 'mercantilism', 'shape', 'mug']
appearance - beauty + beauty = slender
beauty pulchritude pulchritude
['forward', 'blow', 'blow', 'have', 'turn', 'sheer', 'fresh', 'blow', 'inflation', 'heavy']
beauty - pulchritude + pulchritude = forward
beauty pulchritude pulchritude
['forward', 'blow', 'blow', 'have', 'turn', 'sheer', 'fresh', 'blow', 'inflation', 'heavy']
beauty - pulchritude + pulchritude = forward
beauty pulchritude pulchritude
['forward', 'blow', 'blow', 'have', 'turn', 'sheer', 'fresh', 'blow', 'inflation', 'heavy']
beauty - pulchritude + pulchritude = forward
beauty pulchritude pulchritude
['f

In [10]:
analogy_by_hypo_hyper_other('beauty', 'chair')

beauty%1:07:00:: loveliness%1:07:00:: comeliness%1:07:00:: appearance%1:07:00:: quality%1:07:00:: picturesqueness%1:07:00:: handsomeness%1:07:00::
appearance beauty beauty
['chair', 'swivel_chair', 'armchair', 'overstuffed_chair', 'highchair', 'recliner', 'rocking_chair', 'wheelchair', 'reclining_chair', 'seat']
appearance - beauty + beauty = chair
appearance beauty beauty
['chair', 'swivel_chair', 'armchair', 'overstuffed_chair', 'highchair', 'recliner', 'rocking_chair', 'wheelchair', 'reclining_chair', 'seat']
appearance - beauty + beauty = chair
beauty picturesqueness picturesqueness
None
beauty picturesqueness picturesqueness
None
beauty handsomeness handsomeness
['heavy', 'chair', 'quilt', 'bookfair', 'crochet', 'mat', 'peoria', 'picnic', 'heavy', 'fair']
beauty - handsomeness + handsomeness = heavy
beauty handsomeness handsomeness
['heavy', 'chair', 'quilt', 'bookfair', 'crochet', 'mat', 'peoria', 'picnic', 'heavy', 'fair']
beauty - handsomeness + handsomeness = heavy
quality bea

NameError: name 'compare_analogy' is not defined