コーパス（対象単語 bank を含む文の集合）を読み込み，確認用に表示

In [None]:
import google.colab.drive

# Mount Google Drive so the seminar files under MyDrive become readable.
google.colab.drive.mount('/content/drive/')

# Load the corpus, one sentence per line.
# - The encoding is passed explicitly so decoding does not depend on the
#   runtime's locale settings.
# - Blank lines are skipped so that later cells never try to embed an empty
#   sentence.
sentences_in_corpus = []
with open('/content/drive/MyDrive/nlpseminar2024/simple_corpus.txt',
          encoding='utf-8') as f:
  for data_line in f:
    data_line = data_line.rstrip()  # drop the trailing newline/whitespace
    if data_line:
      sentences_in_corpus.append(data_line)

# Print the loaded sentences as a sanity check.
for sentence in sentences_in_corpus:
  print(sentence)

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
He drew some money from the bank.
John got a bank transfer form to make a bank transfer.
My father had worked as a bank clerk for a long time.
Someone raided a bank.
It functions as a data bank.
They walked along the river bank.
The city stands on the right bank of the Saine.
There is a sand bank between the two towns.
They sat down against the bank by the wayside.
The heavy rain broke the bank.


BERTの準備（BERTを使用するための定型処理）

In [None]:
!pip install transformers
import torch
import transformers
from transformers import AutoTokenizer
from transformers import BertModel

# Name of the pretrained checkpoint to load.  Switch to 'bert-base-uncased'
# to ignore the difference between upper and lower case.
model_name = 'bert-base-cased'

# Run on the first GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
  device = torch.device('cuda:0')
else:
  device = torch.device('cpu')

# Load the pretrained model and its matching tokenizer.
bert_model = BertModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Move the model to the selected device and switch it to evaluation mode.
bert_model.to(device)
bert_model.eval()
print('Working on:', device)

Working on: cuda:0


対象文中の全単語をベクトル化

In [None]:
import copy
# Build one [tokens, token_index, token_vector] entry for every word piece of
# every corpus sentence:
#   tokens       -- the tokenized sentence the word piece belongs to
#   token_index  -- position of the word piece inside `tokens`
#   token_vector -- its contextual vector as a NumPy array
vectors_with_indices = []
print('Device:', device)
for sentence in sentences_in_corpus:
  # Convert the sentence into token ids and an attention mask.
  tokenized = tokenizer(sentence,
                        return_tensors='pt', padding=True, truncation=True)
  token_ids = tokenized['input_ids'].to(device)
  mask_ids = tokenized['attention_mask'].to(device)

  # Inference only: skip building the autograd graph to save memory and time.
  with torch.no_grad():
    bert_output = bert_model(token_ids, mask_ids)

  tokens = tokenizer.tokenize(sentence)
  # Use the output of the last layer (another layer, or an average over
  # layers, would also work).
  last_hidden_state = bert_output.last_hidden_state[0]

  # Row 0 is the sentence-initial [CLS] token and the final row is the
  # closing [SEP] token.  Both are dropped so that row i lines up with
  # tokens[i]; keeping [SEP] would create an entry whose index is out of
  # range for `tokens`.
  token_vectors = last_hidden_state[1:-1].to('cpu').numpy()  # GPU -> CPU

  # The slice keeps every stored index valid for `tokens` even if the two
  # lengths ever disagree.
  for token_index, token_vector in enumerate(token_vectors[:len(tokens)]):
    vectors_with_indices.append([tokens,        # tokenized sentence
                                 token_index,   # position of the word piece
                                 token_vector]) # word-piece vector

Device: cuda:0


対象文中の対象フレーズの位置を見つける処理

In [None]:
# Make the seminar helper module util.py importable; the actual span-search
# logic lives in that file.  The directory is given as an absolute path (the
# same Drive folder the corpus is read from) so the import does not depend on
# the current working directory, and it is only added once even when this
# cell is re-run.
import sys
util_dir = '/content/drive/MyDrive/nlpseminar2024'
if util_dir not in sys.path:
  sys.path.append(util_dir)
import util

# Alternative example sentence:
#target_sentence = 'There is a large river bank.'
target_sentence = 'I drew some bucks from the bank.'
target_word = 'bank'  # the word to analyse

print('Target word:', target_word)
# Even a single word may be split into several sub-word tokens.
tokens_of_target_word = tokenizer.tokenize(target_word)

target_words = tokenizer.tokenize(target_sentence)
print(target_words)
# Spans of the target word inside the tokenized sentence, printed as
# (begin, end) token-index pairs.
target_span = util.find_target_spans(target_words, tokens_of_target_word)
print(target_span, target_sentence)

Target word: bank
['I', 'drew', 'some', 'bucks', 'from', 'the', 'bank', '.']
[(6, 7)] I drew some bucks from the bank.


対象単語をベクトルに変換

In [None]:
# Fail early with a readable message when the target word does not occur in
# the target sentence, instead of an opaque error on target_span[0] below.
# NOTE(review): assumes util.find_target_spans returns an empty (falsy)
# result when nothing matches -- confirm against util.py.
if not target_span:
  raise ValueError(
      f'Target word {target_word!r} was not found in: {target_sentence!r}')

tokenized = tokenizer(target_sentence,
                      return_tensors='pt', padding=True, truncation=True)
token_ids = tokenized['input_ids'].to(device)
mask_ids = tokenized['attention_mask'].to(device)

# Feed the sentence to BERT.  Inference only, so no autograd graph is built.
with torch.no_grad():
  bert_output = bert_model(token_ids, mask_ids)

# The BERT output starts with the special [CLS] token, so every index is
# shifted by one relative to the tokenized sentence (hence the "+ 1").
# Only the first occurrence of the target word is used.
begin_index, end_index = target_span[0]
token_vectors = bert_output.last_hidden_state[0, begin_index+1:end_index+1]

# Average over the sub-word vectors to obtain a fixed-size vector.
mean_token_vector = torch.mean(token_vectors, dim=0)

# Convert to a NumPy vector on the CPU.
target_vector = mean_token_vector.to('cpu').detach().numpy().copy()
print(target_vector[0:10])  # show the first ten components

[-0.51508087  0.09701937 -0.08343408  0.23143525  0.13654947  0.07786781
  0.01467468 -0.38582292 -0.5197119  -0.00053196]


対象フレーズのベクトルとコーパス中の全単語に対するベクトル間の類似度を計算

In [None]:
import numpy as np  # ベクトル，行列計算モジュール
# Norm (length) of the target word's vector; it is the same for every
# comparison, so it is computed once up front.
norm_of_target_vec = np.linalg.norm(target_vector)

# Cosine similarity between the target vector and every corpus token vector:
# dot product divided by the product of the two norms.  Each result keeps the
# tokenized sentence and the token position alongside the score.
similarities = [
    (np.dot(token_vector, target_vector)
     / (norm_of_target_vec * np.linalg.norm(token_vector)),
     tokens,
     token_index)
    for tokens, token_index, token_vector in vectors_with_indices
]

# Rank the entries from most to least similar.
sorted_sims = list(similarities)
sorted_sims.sort(key=lambda entry: entry[0], reverse=True)

結果の表示

In [None]:
# Header: which word and sentence the ranking below refers to.
separator = '------'
print(separator)
print('Target word:', target_word)
print('Target sentence:', target_sentence)
print(separator)

top_n = 5  # number of best matches to show
tag = '*'  # marker placed around the matched token so it stands out

for rank_entry in sorted_sims[:top_n]:
  cos_sim, tokens, token_index = rank_entry
  # Work on a copy so the token list stored with the vectors stays untouched.
  output_tokens = list(tokens)
  output_tokens[token_index] = ''.join((tag, output_tokens[token_index], tag))
  sentence_for_output = ' '.join(output_tokens)

  print(cos_sim, sentence_for_output)

------
Target word: bank
Target sentence: I drew some bucks from the bank.
------
0.97155577 He drew some money from the *bank* .
0.90962034 Someone raided a *bank* .
0.8676604 My father had worked as a *bank* clerk for a long time .
0.8359408 John got a *bank* transfer form to make a bank transfer .
0.81758964 John got a bank transfer form to make a *bank* transfer .
