In [1]:
# Install pinned versions of transformers and of fugashi / ipadic (the MeCab
# wrapper and dictionary that BertJapaneseTokenizer relies on).
!pip install transformers==4.5.0 fugashi==1.1.0 ipadic==1.0.0
# NOTE(review): `datasets` is installed without a version pin and is not
# imported by any cell visible here — confirm it is needed, and pin it if so.
!pip install datasets

Collecting transformers==4.5.0
  Downloading transformers-4.5.0-py3-none-any.whl (2.1 MB)
[K     |████████████████████████████████| 2.1 MB 13.5 MB/s 
[?25hCollecting fugashi==1.1.0
  Downloading fugashi-1.1.0-cp37-cp37m-manylinux1_x86_64.whl (486 kB)
[K     |████████████████████████████████| 486 kB 39.8 MB/s 
[?25hCollecting ipadic==1.0.0
  Downloading ipadic-1.0.0.tar.gz (13.4 MB)
[K     |████████████████████████████████| 13.4 MB 34.3 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 36.4 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 22.9 MB/s 
Building wheels for collected packages: ipadic
  Building wheel for ipadic (setup.py) ... [?25l[?25hdone
  Created wheel for ipadic: filename=ipadic-1.0.0-py3-none-any.whl

In [2]:
# ライブラリ
import numpy as np
import pandas as pd
import torch
#from torch import nn, optim
#from torch.utils.data import Dataset, DataLoader
from transformers import BertJapaneseTokenizer, BertModel, BertConfig
from google.colab import drive

In [3]:
# Parameters
MODEL_NAME = 'cl-tohoku/bert-base-japanese-whole-word-masking'

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = "cuda"
else:
  device = "cpu"
print(device)

# Google Drive has to be mounted before any of the paths below can be read.
drive.mount("/content/drive/")

# Directories on the mounted drive (each one ends with a trailing slash).
# VECTOR_PATH and OUTPUT_PATH currently point at the same directory.
INPUT_PATH = "/content/drive/My Drive/NLP/work/10_my_task/01_bert_sentence_cls/input/"
MODEL_PATH = "/content/drive/My Drive/NLP/work/10_my_task/01_bert_sentence_cls/output/model/"
VECTOR_PATH = "/content/drive/My Drive/NLP/work/10_my_task/01_bert_sentence_cls/output/"
OUTPUT_PATH = "/content/drive/My Drive/NLP/work/10_my_task/01_bert_sentence_cls/output/"

cuda
Mounted at /content/drive/


In [4]:
# Load modules: the pretrained tokenizer and the BERT weights saved under MODEL_PATH.
tokenizer = BertJapaneseTokenizer.from_pretrained(MODEL_NAME)

# MODEL_PATH may or may not end with "/"; normalise it so the joined paths
# never contain a doubled separator.
model_dir = MODEL_PATH.rstrip("/") + "/last.bin"

config = BertConfig.from_json_file(model_dir + "/config.json")
model = BertModel(config).to(device)

# map_location remaps the stored tensors onto `device`, so a checkpoint that
# was saved on a GPU can still be loaded when this notebook runs on a CPU-only
# runtime (without it torch.load tries to restore tensors to the saved device).
state_dict = torch.load(model_dir + "/pytorch_model.bin", map_location=device)

# Kept as the last expression so the cell still displays the key-matching report.
model.load_state_dict(state_dict)
# print(model)

Downloading:   0%|          | 0.00/258k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/110 [00:00<?, ?B/s]

<All keys matched successfully>

In [5]:
# Data: read the sample CSV (the file is Shift-JIS encoded)
sample_csv = INPUT_PATH + "sample_3.csv"
df = pd.read_csv(sample_csv, encoding='shift-jis')
print(df.shape)
df.head(3)

(20, 4)


Unnamed: 0,Q,T,Template,Train
0,振込限度額を教えてください。,振込限度額は100万円です。,1,1
1,１回の振込上限額はいくらですか？,振込限度額は100万円です。,1,0
2,口座開設したいです。,口座開設方法はこちらになります。,2,1


In [7]:
# Data selection: split the rows on the Train flag (1 -> df_train, 0 -> df_test),
# keeping only the Q / T / Template columns and renumbering each split from 0.
qt_columns = ['Q', 'T', 'Template']
train_mask = df['Train'] == 1
test_mask = df['Train'] == 0

df_train = df.loc[train_mask, qt_columns].reset_index(drop=True)
df_test = df.loc[test_mask, qt_columns].reset_index(drop=True)

print(df_train.shape, df_test.shape)
df_train.head(3)

(10, 3) (10, 3)


Unnamed: 0,Q,T,Template
0,振込限度額を教えてください。,振込限度額は100万円です。,1
1,口座開設したいです。,口座開設方法はこちらになります。,2
2,口座解約したいです。,口座解約の方法はこちらです。,3


In [8]:
# INPUT

# Sentence vectors of the QT texts (training data), and the template number
# that corresponds to each of those vectors.
vec_qt = np.load(OUTPUT_PATH + "vec_bert.npy")
df_templ = pd.read_csv(OUTPUT_PATH + "template.csv")

# Both shapes are printed so the row counts can be compared at a glance.
print(vec_qt.shape)
print(df_templ.shape)
df_templ.head(3)

(10, 768)
(10, 1)


Unnamed: 0,Template
0,1
1,2
2,3


In [9]:
# tokenize
def tokenize_text(tokenizer, text):
  """Tokenize ``text`` and return the tokenizer's fields as torch tensors.

  The tokenizer is called with ``max_length=512``, ``padding='max_length'``
  and ``truncation=True``; every field of its result is then wrapped in a
  ``torch.Tensor``.

  Args:
    tokenizer: callable tokenizer returning a mapping of field name to values.
    text: the text to tokenize.

  Returns:
    dict mapping each field name returned by the tokenizer to a tensor.
  """
  raw_fields = tokenizer(text, max_length=512, padding='max_length', truncation=True)

  tensors = {}
  for field_name, values in raw_fields.items():
    tensors[field_name] = torch.tensor(values)
  return tensors

In [10]:
# embedding
def bert_embedding_single_text(q, tokenizer, model, device):
  """Embed a single text as the model's hidden state at sequence position 0.

  Args:
    q: the text to embed.
    tokenizer: tokenizer handed to ``tokenize_text``.
    model: model called with ``(input_ids, attention_mask, return_dict=True)``
      whose output exposes ``last_hidden_state``.
    device: device the input tensors are moved to before the forward pass.

  Returns:
    numpy array holding ``last_hidden_state[:, 0, :]`` for a batch of one,
    copied back to the CPU.
  """
  encoded = tokenize_text(tokenizer, q)

  # The tokenizer output is a single sequence; add a leading batch dimension.
  ids = encoded['input_ids'].unsqueeze(0).to(device)
  mask = encoded['attention_mask'].unsqueeze(0).to(device)

  # Inference only: switch to eval mode and skip gradient tracking.
  model.eval()
  with torch.no_grad():
    outputs = model(ids, mask, return_dict=True)
    first_token_state = outputs.last_hidden_state[:, 0, :]

  return first_token_state.cpu().numpy()

In [11]:
# Rank templates by the L2 distance between sentence vectors and return the nearest ones (top 3 by default)
def predict_template(vec_q, vec_qt, df_templ, top_k=3):
  """Return the template numbers of the reference vectors closest to ``vec_q``.

  Args:
    vec_q: query sentence vector, shape ``(dim,)`` or ``(1, dim)``.
    vec_qt: reference sentence vectors, shape ``(n, dim)``.
    df_templ: DataFrame whose 'Template' column gives, row by row (by
      position), the template number belonging to the same row of ``vec_qt``.
    top_k: how many nearest entries to return (default 3).

  Returns:
    List of at most ``top_k`` template numbers ordered by increasing L2
    distance. Duplicates are kept when several reference rows share a
    template. Empty when there are no reference vectors.

  Raises:
    ValueError: if ``vec_qt`` and ``df_templ`` have different row counts.
  """
  vec_qt = np.asarray(vec_qt)
  templates = df_templ['Template'].to_numpy()

  if len(templates) != len(vec_qt):
    raise ValueError(
        "vec_qt has %d rows but df_templ has %d rows" % (len(vec_qt), len(templates)))
  if len(vec_qt) == 0:
    return []

  # Broadcasting subtracts the single query row from every reference row, so
  # the query does not have to be tiled to the size of vec_qt.
  query = np.asarray(vec_q).reshape(1, -1)
  distances = np.linalg.norm(vec_qt - query, ord=2, axis=1)

  # Pair distances with templates by position rather than by DataFrame index:
  # an index-aligned concat would mismatch rows (and introduce NaN) whenever
  # df_templ does not carry a default 0..n-1 index. The stable sort keeps ties
  # in their original row order, which makes the ranking deterministic.
  nearest = np.argsort(distances, kind="stable")[:top_k]

  # TODO: confident list

  return templates[nearest].tolist()

In [12]:
def evaluation(df, tokenizer, model, device, vec_qt, df_templ):
  """Measure how often the expected template is retrieved for each question.

  Every text in ``df['Q']`` is embedded with ``bert_embedding_single_text``
  and ranked against ``vec_qt`` by ``predict_template``; the returned
  template numbers are compared with the expected one in ``df['Template']``.

  Args:
    df: DataFrame with a 'Q' column (question text) and a 'Template' column
      (expected template number).
    tokenizer: forwarded unchanged to ``bert_embedding_single_text``.
    model: forwarded unchanged to ``bert_embedding_single_text``.
    device: forwarded unchanged to ``bert_embedding_single_text``.
    vec_qt: forwarded unchanged to ``predict_template``.
    df_templ: forwarded unchanged to ``predict_template``.

  Returns:
    ``[r_at_1, r_at_3]``: the fraction of rows whose expected template is the
    first prediction, and the fraction whose expected template appears
    anywhere in the predictions returned by ``predict_template``.
    ``[0.0, 0.0]`` when ``df`` has no rows.
  """
  total = len(df)
  if total == 0:
    # Nothing to score; dividing by the row count would raise ZeroDivisionError.
    return [0.0, 0.0]

  correct_at_1 = 0
  correct_at_3 = 0

  for q, ans in zip(df['Q'], df['Template']):
    # encoding
    vec_q = bert_embedding_single_text(q, tokenizer, model, device)

    # predict
    prediction = predict_template(vec_q, vec_qt, df_templ)

    # agg — guard the top-1 check so an empty prediction list counts as a miss
    if prediction and prediction[0] == ans:
      correct_at_1 += 1

    if ans in prediction:
      correct_at_3 += 1

  return [correct_at_1 / total, correct_at_3 / total]

In [13]:
# Print [r_at_1, r_at_3] for the training split first, then for the test split.
for split in (df_train, df_test):
  print(evaluation(split, tokenizer, model, device, vec_qt, df_templ))

[0.7, 1.0]
[0.2, 0.6]


In [56]:
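# NOTE: superseded scratch version of the loop that now lives in evaluation();
# it is kept commented out for reference only and does nothing when run.
# correct_at_1 = 0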
# correct_at_3 = 0

# for i in range(len(df)):
#   # encoding
#   q = df['Q'].iloc[i]
#   vec_q = bert_embedding_single_text(q, tokenizer, model, device)
#   # print(vec_q.shape)

#   # predict
#   prediction = predict_template(vec_q, vec_qt, df_templ)
#   #print(prediction)

#   # ans
#   ans = df['Template'].iloc[i]
#   #print(ans)

#   # agg
#   if prediction[0]==ans:
#     correct_at_1 += 1

#   if ans in prediction:
#     correct_at_3 += 1

# r_at_1 = correct_at_1/len(df)
# r_at_3 = correct_at_3 / len(df)
# print(r_at_1, r_at_3)