In [9]:
from transformers import pipeline

question_answerer = pipeline('question-answering')
question_answerer({
    'context': "A fire caused by human error has occred in the plant",
    'question': 'Why does the fire occured ?',
})

{'score': 0.8579399585723877, 'start': 17, 'end': 28, 'answer': 'human error'}

# Question Answering

In [7]:
from transformers import BertJapaneseTokenizer, AutoModelForQuestionAnswering
import torch

# 入力テキスト
context = "本日お昼頃、高崎方面へ自転車で出かけました。"
question="どこへ出かけた？"

# モデルとトークナイザーの準備
model = AutoModelForQuestionAnswering.from_pretrained('output/')  
tokenizer = BertJapaneseTokenizer.from_pretrained('cl-tohoku/bert-base-japanese-whole-word-masking') 

# 推論の実行
inputs = tokenizer.encode_plus(question, context, add_special_tokens=True, return_tensors="pt")
input_ids = inputs["input_ids"].tolist()[0]
output = model(**inputs)
answer_start = torch.argmax(output.start_logits)  
answer_end = torch.argmax(output.end_logits) + 1 
answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))

# 結果出力
print("質問: "+question)
print("応答: "+answer)

404 Client Error: Not Found for url: https://huggingface.co/output//resolve/main/config.json


OSError: Can't load config for 'output/'. Make sure that:

- 'output/' is a correct model identifier listed on 'https://huggingface.co/models'

- or 'output/' is the correct path to a directory containing a config.json file



# Embedding

In [5]:
# !pip install fugashi

# !pip install ipadic

# https://www.nogawanogawa.com/entry/bert_embedding
# ↑内のリファレンスも参考になりそう
import numpy as np
import pandas as pd
import torch
from transformers import BertJapaneseTokenizer, BertForSequenceClassification
tokenizer = BertJapaneseTokenizer.from_pretrained('cl-tohoku/bert-base-japanese-whole-word-masking')
sent = "吾輩は猫である。名前はまだない。"
tokenized_text = tokenizer.tokenize(sent)
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
tokens_tensor = torch.tensor([indexed_tokens])
model = BertForSequenceClassification.from_pretrained(
    "cl-tohoku/bert-base-japanese-whole-word-masking", # 日本語Pre trainedモデルの指定
    num_labels = 2, # ラベル数（今回はBinayなので2、数値を増やせばマルチラベルも対応可）
    output_attentions = False, # アテンションベクトルを出力するか
    output_hidden_states = True, # 隠れ層を出力するか
)

model.eval()
with torch.no_grad(): # 勾配計算なし
    all_encoder_layers = model(tokens_tensor)
    
embedding = all_encoder_layers[1][-2].numpy()[0]
t = np.mean(embedding, axis=0)
t = t.reshape(1, 768)
pd.DataFrame(t)

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialize

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,0.279862,-0.378882,-0.039784,-0.339315,-0.673706,0.026561,0.099009,0.057616,-0.64411,0.077871,...,0.04993,0.312057,-0.245344,0.038624,-0.098731,0.567021,-0.036311,-0.281213,-0.378871,-0.459427


- bert を使うのが当たり前；異なる表現だが同じ意図の問い合わせを，ベクトル空間上で特定の関係をもつペアとしてとらえる例
  - https://www.ai-shift.co.jp/techblog/183

- bert と w2v の違い；bert はサブワード単位；トークナイズもサブワード単位で行わないといけない
  - https://ichi.pro/word-2-vec-to-bert-no-chigai-211699796194769