In [1]:
import numpy as np
import pandas as pd
import gensim

from word_extractor.lexemes_vector import get_most_similar_lexemes, get_verctor_from_word, get_verctor_from_sense_key, get_word_from_vector
from word_extractor.wordnet import get_hypernum, get_hyponym, get_not_similar_hyponym
from utils.dataloader import read_lexemes_dict_and_list, read_mapping

# Sample query words; not referenced by the cells visible here — presumably used further down (TODO confirm).
input_words = ['car', 'toughness', 'velocity', 'tire']
# Data file locations, given relative to the notebook's working directory.
# NOTE(review): synset_filepath is not read by any cell visible here — confirm it is used later.
synset_filepath = './data/synsets.txt'
lexemes_filepath = './data/lexemes.txt'
mapping_filepath = './data/mapping.txt'
word2vec_filepath = './data/GoogleNews-vectors-negative300.bin'

# Load the lexeme data (returned as a dict and a list) and the mapping via the project helpers.
lexemes_dict, lexemes_list = read_lexemes_dict_and_list(lexemes_filepath)
mapping_dict = read_mapping(mapping_filepath)
# Load the word2vec vectors from the binary-format file.
word_embed = gensim.models.KeyedVectors.load_word2vec_format(word2vec_filepath, binary=True)

[nltk_data] Downloading package wordnet to /Users/ryogo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/ryogo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [33]:
def show_word2vec_and_autoextend(word, synset_idx=0):
    """Print the nearest neighbours of ``word`` from word2vec and from AutoExtend.

    Args:
        word: Query word. It is looked up in the module-level ``word_embed``
            and passed to ``get_most_similar_lexemes``.
        synset_idx: Forwarded unchanged to ``get_most_similar_lexemes``
            (presumably selects which synset of ``word`` is used — confirm
            in that helper).

    Returns:
        None. Results are written to stdout, one entry per line, under a
        header line for each method.
    """
    print('####    word2vec     #####')
    neighbours = word_embed.most_similar([word])
    for neighbour in neighbours:
        # Only the first field of each entry (the token) is displayed.
        print(f'{neighbour[0]}')

    print('####    AutoExtend     #####')
    rankings = get_most_similar_lexemes(
        word,
        lexemes_dict,
        lexemes_list,
        mapping_dict,
        rank_range=10,
        is_single=False,
        synset_idx=synset_idx,
    )
    first_ranking = rankings[0]
    for entry in first_ranking:
        lexeme_key = entry[0]
        # Drop the last 18 characters of the key before printing
        # (presumably a fixed-width sense suffix — TODO confirm).
        print(f'{lexeme_key[:-18]}')

# 実験１ 単語実験

## 語義曖昧性

Suit

In [34]:
# 'suit' is ambiguous (clothing vs. lawsuit): compare its word2vec neighbours with the AutoExtend ones.
show_word2vec_and_autoextend('suit')

####    word2vec     #####
suits
lawsuit
Suit
Atta_chakki_delivery
lawsuits
countersuit
polka_dot_clown
His_showmanship_rhinestone
complaint
lawsuit_alleging
####    AutoExtend     #####
zoot_suit
garment
gabardine
tailcoat
tuxedo
suit
tux
suit
suit
trousers


In [23]:
# Compare word2vec and AutoExtend neighbours for 'chair' (default synset_idx=0).
show_word2vec_and_autoextend('chair')

####    word2vec     #####
chairs
Chair
chairperson
chairwoman
chairman
Vice_Chair
Co_Chair
chairing
Chairs
cochair
####    AutoExtend     #####
seat
sofa
armchair
stool
recliner
chaise_longue
swivel_chair
rocking_chair
chaise
lawn_chair


In [24]:
# Compare word2vec and AutoExtend neighbours for 'table', passing synset_idx=1 instead of the default 0.
show_word2vec_and_autoextend('table', synset_idx=1)

####    word2vec     #####
tables
ConocoPhillips_BPAmerica
Capitalized_Included
tray
dining_room
banquette
rapping_cappella
sideboard
linen_tablecloth
Tables
####    AutoExtend     #####
worktable
table
bookcase
chair
room
sideboard
tray
workbench
furniture
credenza


## 上位下位

In [50]:
def show_word2vec_and_hypo_hyper(word, synset_idx=0):
    """Print word2vec neighbours of ``word``, then a list mixing them with WordNet relations.

    The first section lists every neighbour returned by
    ``word_embed.most_similar``. The second section lists the first six of
    those neighbours followed by four lines obtained from the project
    helpers: the hypernym of ``word``, the hypernym of that hypernym, and two
    ``get_hyponym`` results.

    The embedding is queried once and the result reused for both sections,
    and the hypernym of ``word`` is looked up once and reused for the
    second-level lookup.

    Args:
        word: Query word for the embedding and for the hypernym/hyponym helpers.
        synset_idx: Passed as ``idx`` to ``get_hypernum`` and ``get_hyponym``
            for ``word`` (presumably selects the synset — confirm in those
            helpers). It is not passed to the second-level hypernym lookup.

    Returns:
        None. Everything is written to stdout, one entry per line.
    """
    print('####    word2vec     #####')
    # One nearest-neighbour query serves both sections below.
    neighbours = list(word_embed.most_similar([word]))
    for neighbour in neighbours:
        print(f'{neighbour[0]}')

    print('####    word2vec + 上位下位     #####')
    # Only the top six neighbours are kept here; four WordNet lines follow.
    for neighbour in neighbours[:6]:
        print(f'{neighbour[0]}')

    hypernym = get_hypernum(word, idx=synset_idx)[0]
    print(f'{hypernym}')
    # NOTE(review): the second-level lookup goes through the hypernym's word
    # with get_hypernum's default idx, so it can land on a different sense
    # (the recorded run for 'dog' printed canine -> tooth).
    print(f'{get_hypernum(hypernym)[0]}')
    # NOTE(review): these two calls are identical, yet the recorded output
    # shows two different hyponyms — presumably get_hyponym does not return a
    # fixed order. Kept as two separate calls on purpose; confirm intent.
    print(f'{get_hyponym(word, idx=synset_idx)[0]}')
    print(f'{get_hyponym(word, idx=synset_idx)[0]}')
    
    

In [57]:
# 'suit' with the default synset_idx=0: plain word2vec neighbours, then the top six plus hypernym/hyponym lines.
show_word2vec_and_hypo_hyper('suit')

####    word2vec     #####
suits
lawsuit
Suit
Atta_chakki_delivery
lawsuits
countersuit
polka_dot_clown
His_showmanship_rhinestone
complaint
lawsuit_alleging
####    word2vec + 上位下位     #####
suits
lawsuit
Suit
Atta_chakki_delivery
lawsuits
countersuit
garment
clothing
slack_suit
double-breasted_suit


In [52]:
# 'table' with synset_idx=1 passed through to the hypernym/hyponym helpers.
show_word2vec_and_hypo_hyper('table', synset_idx=1)

####    word2vec     #####
tables
ConocoPhillips_BPAmerica
Capitalized_Included
tray
dining_room
banquette
rapping_cappella
sideboard
linen_tablecloth
Tables
####    word2vec + 上位下位     #####
tables
ConocoPhillips_BPAmerica
Capitalized_Included
tray
dining_room
banquette
furniture
furnishing
card_table
desk


In [53]:
# 'dog' with synset_idx=0; note the second hypernym line in the recorded output ('tooth') follows a different sense of 'canine'.
show_word2vec_and_hypo_hyper('dog', synset_idx=0)

####    word2vec     #####
dogs
puppy
pit_bull
pooch
cat
golden_retriever
German_shepherd
Rottweiler
beagle
pup
####    word2vec + 上位下位     #####
dogs
puppy
pit_bull
pooch
cat
golden_retriever
canine
tooth
spitz
toy_dog


In [56]:
# 'chair' with synset_idx=0: word2vec neighbours, then the top six plus hypernym/hyponym lines.
show_word2vec_and_hypo_hyper('chair', synset_idx=0)

####    word2vec     #####
chairs
Chair
chairperson
chairwoman
chairman
Vice_Chair
Co_Chair
chairing
Chairs
cochair
####    word2vec + 上位下位     #####
chairs
Chair
chairperson
chairwoman
chairman
Vice_Chair
seat
space
armchair
rocking_chair


## 類推