## 文書分類

In [1]:
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
text_classification_pipeline = pipeline(
    model="llm-book/bert-base-japanese-v3-marc_ja"
)

In [3]:
positive_text = "明日の運動会が楽しみだ。"
print(text_classification_pipeline(positive_text)[0])

{'label': 'positive', 'score': 0.9991204142570496}


In [4]:
negative_text = "明日の遠足は雨予報なので行きたくない。"
print(text_classification_pipeline(negative_text)[0])

{'label': 'negative', 'score': 0.9057907462120056}


## 自然言語推論

二つのテキストの論理関係を予測するタスク。

* `llm-book/bert-base-japanese-v3-jnli`を使う

In [6]:
nli_pipeline = pipeline(
    model="llm-book/bert-base-japanese-v3-jnli"
)

In [7]:
text = "二人の男性がジェット機を見ています"
entailment_text = "ジェット機を見ている人が二人います"
# entailment: 含意
print(nli_pipeline({
    "text": text,
    "text_pair": entailment_text
}))

{'label': 'entailment', 'score': 0.9964311122894287}


In [8]:
contradition_text = "二人の男性が飛んでいます"
# contradition: 矛盾
print(nli_pipeline({
    "text": text,
    "text_pair": contradition_text
}))

{'label': 'contradiction', 'score': 0.9990535378456116}


In [9]:
neutral_text = "2人の男性が、白い飛行機を眺めています"
# neutral: 中立
print(nli_pipeline(
    {"text": text,
    "text_pair": neutral_text}
))

{'label': 'neutral', 'score': 0.9959145188331604}


## 意味的類似度計算

In [10]:
text_sim_pipeline = pipeline(
    model="llm-book/bert-base-japanese-v3-jsts",
    function_to_apply="none"
)

In [11]:
text = "川べりでサーフボードを持った人たちがいます"
sim_text = "サーファーたちが川べりにたっています"

result = text_sim_pipeline({
    "text": text,
    "text_pair": sim_text
})
print(result)

{'label': 'LABEL_0', 'score': 3.258415937423706}


In [12]:
dissim_text = "トイレの壁に黒いタオルがかけられています"

result = text_sim_pipeline({
    "text": text,
    "text_pair": dissim_text
})

print(result)

{'label': 'LABEL_0', 'score': 0.04162188991904259}


In [13]:
from torch.nn.functional import cosine_similarity

sim_enc_pipeline = pipeline(
    model="llm-book/bert-base-japanese-v3-unsup-simcse-jawiki",
    task="feature-extraction"
)

config.json: 100%|██████████| 634/634 [00:00<00:00, 566kB/s]
pytorch_model.bin: 100%|██████████| 445M/445M [00:12<00:00, 34.3MB/s] 
tokenizer_config.json: 100%|██████████| 529/529 [00:00<00:00, 1.46MB/s]
vocab.txt: 100%|██████████| 231k/231k [00:00<00:00, 38.7MB/s]
special_tokens_map.json: 100%|██████████| 125/125 [00:00<00:00, 527kB/s]


In [15]:
text_emb = sim_enc_pipeline(text, return_tensors=True)[0][0]
sim_emb = sim_enc_pipeline(sim_text, return_tensors=True)[0][0]

# calculate cosine similarity
sim_pair_score = cosine_similarity(text_emb, sim_emb, dim=0)
print(sim_pair_score.item())


0.8597443699836731


In [16]:
dissim_emb = sim_enc_pipeline(dissim_text, return_tensors=True)[0][0]
dissim_pair_score = cosine_similarity(text_emb, dissim_emb, dim=0)
print(dissim_pair_score.item())

0.4588705599308014


## 固有表現認識

In [17]:
from pprint import pprint

ner_pipeline = pipeline(
    model="llm-book/bert-base-japanese-v3-ner-wikipedia-dataset",
    aggregation="simple"
)

config.json: 100%|██████████| 1.93k/1.93k [00:00<00:00, 3.23MB/s]
pytorch_model.bin: 100%|██████████| 443M/443M [00:12<00:00, 36.9MB/s] 
tokenizer_config.json: 100%|██████████| 529/529 [00:00<00:00, 1.86MB/s]
vocab.txt: 100%|██████████| 231k/231k [00:00<00:00, 707kB/s]
special_tokens_map.json: 100%|██████████| 125/125 [00:00<00:00, 462kB/s]


TypeError: TokenClassificationPipeline._sanitize_parameters() got an unexpected keyword argument 'aggregation'

In [None]:
text = "大谷翔平は岩手県水沢市出身のプロ野球選手"