# 言語処理100本ノック 2020 (Rev 2)



## 第7章: 単語ベクトル

### 60. 単語ベクトルの読み込みと表示

In [35]:
# https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing

# https://stmind.hatenablog.com/entry/2017/06/18/230106
# https://blog.amedama.jp/entry/gensim-fasttext-pre-trained-word-vectors

import gensim

googlenews_w2v = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [10]:
print(googlenews_w2v["United_States"])
print(len(googlenews_w2v["United_States"]))


[-3.61328125e-02 -4.83398438e-02  2.35351562e-01  1.74804688e-01
 -1.46484375e-01 -7.42187500e-02 -1.01562500e-01 -7.71484375e-02
  1.09375000e-01 -5.71289062e-02 -1.48437500e-01 -6.00585938e-02
  1.74804688e-01 -7.71484375e-02  2.58789062e-02 -7.66601562e-02
 -3.80859375e-02  1.35742188e-01  3.75976562e-02 -4.19921875e-02
 -3.56445312e-02  5.34667969e-02  3.68118286e-04 -1.66992188e-01
 -1.17187500e-01  1.41601562e-01 -1.69921875e-01 -6.49414062e-02
 -1.66992188e-01  1.00585938e-01  1.15722656e-01 -2.18750000e-01
 -9.86328125e-02 -2.56347656e-02  1.23046875e-01 -3.54003906e-02
 -1.58203125e-01 -1.60156250e-01  2.94189453e-02  8.15429688e-02
  6.88476562e-02  1.87500000e-01  6.49414062e-02  1.15234375e-01
 -2.27050781e-02  3.32031250e-01 -3.27148438e-02  1.77734375e-01
 -2.08007812e-01  4.54101562e-02 -1.23901367e-02  1.19628906e-01
  7.44628906e-03 -9.03320312e-03  1.14257812e-01  1.69921875e-01
 -2.38281250e-01 -2.79541016e-02 -1.21093750e-01  2.47802734e-02
  7.71484375e-02 -2.81982


### 61. 単語の類似度

In [11]:
# https://qiita.com/Qiitaman/items/fa393d93ce8e61a857b1

import numpy as np

def cos_sim(v1, v2):
  return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

X = googlenews_w2v["United_States"]
Y = googlenews_w2v["U.S."]

print(cos_sim(X, Y))


0.7310775



### 62. 類似度の高い単語10件

In [13]:
# https://qiita.com/DancingEnginee1/items/b10c8ef7893d99aa53be

print(googlenews_w2v.most_similar('United_States'))

[('Unites_States', 0.7877248525619507), ('Untied_States', 0.7541370987892151), ('United_Sates', 0.7400725483894348), ('U.S.', 0.7310774922370911), ('theUnited_States', 0.6404394507408142), ('America', 0.6178409457206726), ('UnitedStates', 0.6167312264442444), ('Europe', 0.6132988929748535), ('countries', 0.6044804453849792), ('Canada', 0.6019068956375122)]


### 63. 加法構成性によるアナロジー

In [14]:
# https://www.pc-koubou.jp/magazine/9905

googlenews_w2v.most_similar(positive=[u"Spain",u"Athens"], negative=["Madrid"],topn=10)

[('Greece', 0.6898480653762817),
 ('Aristeidis_Grigoriadis', 0.5606847405433655),
 ('Ioannis_Drymonakos', 0.5552908778190613),
 ('Greeks', 0.545068621635437),
 ('Ioannis_Christou', 0.5400862693786621),
 ('Hrysopiyi_Devetzi', 0.5248445272445679),
 ('Heraklio', 0.5207759737968445),
 ('Athens_Greece', 0.516880989074707),
 ('Lithuania', 0.5166866183280945),
 ('Iraklion', 0.5146791338920593)]


### 64. アナロジーデータでの実験

In [29]:
import pandas as pd

cols=["category","v1","v2","v3","v4"]
df = pd.DataFrame(columns=cols)
with open('questions-words.txt') as f:
  for i,l in enumerate(f):
    s = l.strip().split()
    if s[0] == ':':
      cat = s[1]
    else:
      row = [cat]
      row += s
      df = df.append(pd.DataFrame([[cat]+s],columns=cols), ignore_index=True)
    # if i > 5:
      # break
df

Unnamed: 0,category,v1,v2,v3,v4
0,capital-common-countries,Athens,Greece,Baghdad,Iraq
1,capital-common-countries,Athens,Greece,Bangkok,Thailand
2,capital-common-countries,Athens,Greece,Beijing,China
3,capital-common-countries,Athens,Greece,Berlin,Germany
4,capital-common-countries,Athens,Greece,Bern,Switzerland
...,...,...,...,...,...
19539,gram9-plural-verbs,write,writes,talk,talks
19540,gram9-plural-verbs,write,writes,think,thinks
19541,gram9-plural-verbs,write,writes,vanish,vanishes
19542,gram9-plural-verbs,write,writes,walk,walks


In [92]:
# https://note.nkmk.me/python-pandas-map-applymap-apply/
# https://qiita.com/MysteriousMonkey/items/a238da520993f1f4b0cf

from tqdm.notebook import tqdm_notebook

def calc_most_similar(row):
  r = googlenews_w2v.most_similar(positive=[row["v2"],row["v3"]], negative=[row["v1"]],topn=1)
  print(r)
  return pd.Series( list(r)[0] )

tqdm_notebook.pandas(desc="progress: ")

#dh = pd.DataFrame(df.head())
df[["word","Score"]] = df.progress_apply(calc_most_similar,axis=1)

progress:   0%|          | 0/19544 [00:00<?, ?it/s]

[('Iraqi', 0.635187029838562)]
[('Thailand', 0.7137669920921326)]
[('China', 0.7235777378082275)]
[('Germany', 0.6734623312950134)]
[('Switzerland', 0.4919748306274414)]
[('Egypt', 0.7527808547019958)]
[('Australia', 0.5837324857711792)]
[('Viet_Nam', 0.6276342272758484)]
[('Cuba', 0.6460990905761719)]
[('Finland', 0.68999844789505)]
[('Pakistan', 0.7233324646949768)]
[('Afghan', 0.6160916090011597)]
[('Britain', 0.5646187663078308)]
[('Spain', 0.7036612629890442)]
[('Russia', 0.7382973432540894)]
[('Norway', 0.6470744013786316)]
[('Canada', 0.5912168622016907)]
[('France', 0.6724624037742615)]
[('Italy', 0.6826189756393433)]
[('Sweden', 0.6757245063781738)]
[('Iran', 0.7671369910240173)]
[('Japan', 0.725609540939331)]
[('Thailand', 0.6745353937149048)]
[('China', 0.6915649175643921)]
[('Germany', 0.6057409048080444)]
[('coach_Bobby_Curlings', 0.4347156882286072)]
[('Egypt', 0.6848487854003906)]
[('Mr_Rudd', 0.6186108589172363)]
[('Vietnam', 0.749355137348175)]
[('Cuba', 0.713325500488

In [93]:
df.to_csv('ch07-64-result.txt')
df

Unnamed: 0,category,v1,v2,v3,v4,word,Score
0,capital-common-countries,Athens,Greece,Baghdad,Iraq,Iraqi,0.635187
1,capital-common-countries,Athens,Greece,Bangkok,Thailand,Thailand,0.713767
2,capital-common-countries,Athens,Greece,Beijing,China,China,0.723578
3,capital-common-countries,Athens,Greece,Berlin,Germany,Germany,0.673462
4,capital-common-countries,Athens,Greece,Bern,Switzerland,Switzerland,0.491975
...,...,...,...,...,...,...,...
19539,gram9-plural-verbs,write,writes,talk,talks,talked,0.544719
19540,gram9-plural-verbs,write,writes,think,thinks,thinks,0.617773
19541,gram9-plural-verbs,write,writes,vanish,vanishes,disappear,0.600271
19542,gram9-plural-verbs,write,writes,walk,walks,walks,0.553434



### 65. アナロジータスクでの正解率

- <https://www.soh-devlog.tokyo/nlp100-7-65/>

|意味的アナロジー|文法的アナロジー|
|--|--|
|capital-common-countries, capital-world, currency, city-in-state, family|gram1-adjective-to-adverb, gram2-opposite, gram3-comparative, gram4-superlative, gram5-present-participle, gram6-nationality-adjective, gram7-past-tense, gram8-plural, gram9-plural-verbs|



In [79]:
df["category"].unique()

array(['capital-common-countries', 'capital-world', 'currency',
       'city-in-state', 'family', 'gram1-adjective-to-adverb',
       'gram2-opposite', 'gram3-comparative', 'gram4-superlative',
       'gram5-present-participle', 'gram6-nationality-adjective',
       'gram7-past-tense', 'gram8-plural', 'gram9-plural-verbs'],
      dtype=object)

In [81]:
sem = df[~df["category"].str.contains("gram")]
syn = df[df["category"].str.contains("gram")]

In [85]:
print("意味的アナロジー({})正解率：{}".format(len(sem), (sem["v4"] == sem["word"]).sum() / len(sem)))
print("文法的アナロジー({})正解率：{}".format(len(syn), (syn["v4"] == syn["word"]).sum() / len(syn)))

意味的アナロジー(8869)正解率：0.0004510091329349419
文法的アナロジー(10675)正解率：0.0


In [87]:
sem[sem["v4"]!=sem["word"]]

Unnamed: 0,category,v1,v2,v3,v4,word,Score
0,capital-common-countries,Athens,Greece,Baghdad,Iraq,Iraqi,0.635187
5,capital-common-countries,Athens,Greece,Cairo,Egypt,,
6,capital-common-countries,Athens,Greece,Canberra,Australia,,
7,capital-common-countries,Athens,Greece,Hanoi,Vietnam,,
8,capital-common-countries,Athens,Greece,Havana,Cuba,,
...,...,...,...,...,...,...,...
8864,family,uncle,aunt,son,daughter,,
8865,family,uncle,aunt,sons,daughters,,
8866,family,uncle,aunt,stepbrother,stepsister,,
8867,family,uncle,aunt,stepfather,stepmother,,



### 66. WordSimilarity-353での評価