In [None]:
# ここではコサイン類似度を計算
# 本当の前処理(ストップワード除外等)は00_preprocessingを作成

In [1]:
!pip install transformers==4.5.0 fugashi==1.1.0 ipadic==1.0.0
!pip install datasets

Collecting transformers==4.5.0
  Downloading transformers-4.5.0-py3-none-any.whl (2.1 MB)
[K     |████████████████████████████████| 2.1 MB 5.3 MB/s 
[?25hCollecting fugashi==1.1.0
  Downloading fugashi-1.1.0-cp37-cp37m-manylinux1_x86_64.whl (486 kB)
[K     |████████████████████████████████| 486 kB 11.1 MB/s 
[?25hCollecting ipadic==1.0.0
  Downloading ipadic-1.0.0.tar.gz (13.4 MB)
[K     |████████████████████████████████| 13.4 MB 407 kB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 32.6 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 38.4 MB/s 
Building wheels for collected packages: ipadic
  Building wheel for ipadic (setup.py) ... [?25l[?25hdone
  Created wheel for ipadic: filename=ipadic-1.0.0-py3-none-any.whl s

In [18]:
# ライブラリ
import numpy as np
import pandas as pd
import torch
#from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertJapaneseTokenizer, BertModel, BertConfig
from google.colab import drive

In [3]:
# パラメータ
MODEL_NAME = 'cl-tohoku/bert-base-japanese-whole-word-masking'

device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

drive.mount("/content/drive/")
INPUT_PATH = "/content/drive/My Drive/NLP/work/10_my_task/01_bert_sentence_cls/input/" 
MODEL_PATH = "/content/drive/My Drive/NLP/work/10_my_task/01_bert_sentence_cls/output/model/" 
VECTOR_PATH = "/content/drive/My Drive/NLP/work/10_my_task/01_bert_sentence_cls/output/" 
OUTPUT_PATH = "/content/drive/My Drive/NLP/work/10_my_task/01_bert_sentence_cls/output/" 

cuda
Mounted at /content/drive/


In [13]:
# モジュールロード
tokenizer = BertJapaneseTokenizer.from_pretrained(MODEL_NAME) 

config = BertConfig.from_json_file(MODEL_PATH + '/last.bin/config.json')
model = BertModel(config).to(device)
model.load_state_dict(torch.load(MODEL_PATH + "/last.bin/pytorch_model.bin"))
# print(model)

Downloading:   0%|          | 0.00/258k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/110 [00:00<?, ?B/s]

<All keys matched successfully>

In [6]:
# データ
df = pd.read_csv(INPUT_PATH + "sample_3.csv", encoding='cp932') # shift-jis
print(df.shape)
df.head(3)

(20, 4)


Unnamed: 0,Q,T,Template,Train
0,振込限度額を教えてください。,振込限度額は100万円です。,1,1
1,１回の振込上限額はいくらですか？,振込限度額は100万円です。,1,0
2,口座開設したいです。,口座開設方法はこちらになります。,2,1


In [15]:
# データ選択
df = df[df['Train']==1].reset_index(drop=True)[['Q','T','Template']]
print(df.shape)
df.head(3)

(10, 3)


Unnamed: 0,Q,T,Template
0,振込限度額を教えてください。,振込限度額は100万円です。,1
1,口座開設したいです。,口座開設方法はこちらになります。,2
2,口座解約したいです。,口座解約の方法はこちらです。,3


In [16]:
# tokenize
def tokenize_text(tokenizer, text):
  encoding = tokenizer(text, max_length=512, padding='max_length', truncation=True)
  encoding = {k: torch.tensor(v) for k, v in encoding.items()}
  return encoding

In [17]:
# encoding(Q)
encodings = []
for q in df['Q'].tolist():
  encoding = tokenize_text(tokenizer, q)
  encodings.append(encoding)
print(len(encodings))

10


In [20]:
# ベクトル化(Q)
vec_q = torch.tensor([]).to(device)
for batch in DataLoader(encodings, batch_size=4):
  input_ids = batch['input_ids'].to(device)
  attention_mask = batch['attention_mask'].to(device)

  with torch.no_grad():
    cls = model(input_ids, attention_mask, return_dict=True).last_hidden_state[:, 0, :] # CLSトークン
    #print(cls.shape)
    vec_q = torch.cat((vec_q, cls), 0)
print(vec_q.shape)

torch.Size([10, 768])


In [21]:
# encoding(QT)
encodings = []
for qt in (df['Q'] + "[SEP]" + df['T']).tolist():
  encoding = tokenize_text(tokenizer, qt)
  encodings.append(encoding)
print(len(encodings))

10


In [22]:
# ベクトル化(QT)
vec_qt = torch.tensor([]).to(device)
for batch in DataLoader(encodings, batch_size=4):
  input_ids = batch['input_ids'].to(device)
  attention_mask = batch['attention_mask'].to(device)

  with torch.no_grad():
    cls = model(input_ids, attention_mask, return_dict=True).last_hidden_state[:, 0, :] # CLSトークン
    #print(cls.shape)
    vec_qt = torch.cat((vec_qt, cls), 0)
print(vec_qt.shape)

torch.Size([10, 768])


In [52]:
# tensor->numpy
vec_q_ary = vec_q.cpu().numpy()
vec_qt_ary = vec_qt.cpu().numpy()

# 正規化用L2ノルム
l2_q = np.linalg.norm(vec_q_ary, ord=2, axis=1, keepdims=True)
l2_qt = np.linalg.norm(vec_qt_ary, ord=2, axis=1, keepdims=True)

# 正規化
vec_q_normalized = vec_q_ary / l2_q
vec_qt_normalized = vec_qt_ary/ l2_qt

# コサイン類似度
cossim = vec_q_normalized.dot(vec_qt_normalized.T)
print(cossim.shape)
print(cossim[2,1], cossim[1,2])

(10, 10)
0.8352119 0.79373693


In [53]:
# 出力
np.save(OUTPUT_PATH + "cossim_matrix.npy", cossim)