正例も負例もBM25で検索(名詞と形容詞と動詞)

In [1]:
import random
import glob
from tqdm import tqdm

import torch
import pandas as pd
import numpy as np 
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from mlflow import log_metric, log_param, log_artifact
from transformers import BertJapaneseTokenizer, BertForSequenceClassification
import pytorch_lightning as pl

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from typing import Dict, List, Tuple
import numpy as np
from scipy import sparse as sp
import json

def is_key_exist(d: Dict, key: object) -> bool:
    """Return True when ``key`` is present in ``d``.

    Membership is tested directly, so a key whose stored value is ``None``
    (or any other falsy value) still counts as present.

    :param d: dictionary to look in
    :param key: key to look for
    :return: True if the key exists, False otherwise
    """
    return key in d


class Okapi:
    """BM25-style term-weighting vectorizer with a fit / transform interface.

    ``fit`` learns the vocabulary, the per-word IDF and the average document
    length; ``transform`` turns documents into a sparse weight matrix with one
    row per document and one column per vocabulary word.
    """

    def __init__(self, extract_words_func: callable, b: float = 0.75, k1: float = 1.2, delta: float = 1.0,
                 norm: bool = True):
        """
        :param extract_words_func: callable that turns one document into a list of words.
        :param b: constant scaling the document-length term of the weight denominator
        :param k1: constant used in both the numerator and the denominator of the weight
        :param delta: constant added to the numerator of the weight
        :param norm: when True, each document's weights are divided by their sum
        :raises RuntimeError: if ``extract_words_func`` is not callable
        """
        self.K1, self.B, self.delta = k1, b, delta  # constants
        self.norm = norm  # whether to normalize each document vector
        self.word2id_dict = {}  # word -> column index
        self.idf = np.array([])  # inverse document frequency, indexed by word id
        self.avg_word_count_in_doc = 0  # average number of words per document
        if not callable(extract_words_func):
            raise RuntimeError("extract_words_funcは呼び出し可能オブジェクトでなければいけません")
        self.extract_words_func = extract_words_func

    def fit_transform(self, documents: List[str]):
        """Fit on ``documents`` and return their weight matrix.

        :param documents: documents to learn from and to weight
        :return: the result of ``transform(documents)``
        """
        self.fit(documents)
        return self.transform(documents)

    def fit(self, documents: List[str]):
        """
        Set up the vectorizer: vocabulary, IDF and average document length.

        All previously learned state is discarded first, so calling ``fit``
        again yields the same result as fitting a fresh instance.

        :param documents: documents to learn from
        :raises ZeroDivisionError: if ``documents`` is empty
        """
        # Start from a clean slate; otherwise a second fit would add counts
        # on top of already log-transformed IDF values.
        self.word2id_dict = {}
        doc_freq: List[float] = []  # number of documents containing each word
        total_words = 0
        for document in documents:
            words = self.extract_words_func(document)
            total_words += len(words)
            # dict.fromkeys de-duplicates while keeping first-occurrence order,
            # so word ids stay deterministic.
            for word in dict.fromkeys(words):
                word_id = self.word2id_dict.get(word)
                if word_id is None:
                    self.word2id_dict[word] = len(doc_freq)
                    doc_freq.append(1.0)
                else:
                    # word already seen in an earlier document
                    doc_freq[word_id] += 1.0
        documents_len = len(documents)
        # the small epsilon keeps the denominator away from zero
        self.idf = np.log2(documents_len / (np.array(doc_freq, dtype=float) + 0.0000001))
        self.avg_word_count_in_doc = total_words / documents_len

    def transform(self, documents: List[str]) -> sp.lil_matrix:
        """
        Weight each document.

        :param documents: documents to weight
        :return: object of scipy.sparse.lil_matrix with shape (len(documents), vocabulary size)
        """
        result = sp.lil_matrix((len(documents), len(self.word2id_dict)))
        for i, doc in enumerate(documents):
            # term frequencies of the known words
            frequencies, words_count = self._terms_frequency(doc)
            # combined weight per word
            weights = {ind: self._bm25_weight(ind, freq, words_count) for ind, freq in frequencies.items()}

            if self.norm:
                total = sum(weights.values())
                # A zero total (no known words, or weights cancelling out) cannot
                # be normalized; leave the weights untouched in that case.
                if total != 0:
                    weights = {ind: weight / total for ind, weight in weights.items()}

            # write the vector into the sparse matrix
            for ind, weight in weights.items():
                result[i, ind] = weight
        return result

    def _terms_frequency(self, doc: str) -> Tuple[Dict[int, float], int]:
        """
        Count how often each known word occurs in a document.

        :param doc: document text
        :return: (mapping word id -> frequency, total number of extracted words)
        """
        word_weight_dict: Dict[int, float] = {}  # key: word id, value: frequency
        words = self.extract_words_func(doc)

        # Term Frequency
        for word in words:
            word_id = self.word2id_dict.get(word)
            if word_id is None:
                # TODO: handling of words that are not in the vocabulary
                continue
            word_weight_dict[word_id] = word_weight_dict.get(word_id, 0.0) + 1.0
        return word_weight_dict, len(words)

    def _bm25_weight(self, word_index: int, word_freq: float, word_count_in_doc: int) -> float:
        """
        Compute the Okapi BM25+ style weight of one word in one document.

        NOTE(review): delta is added inside the numerator here, whereas the
        published BM25+ formula adds delta after the fraction — confirm this is intended.

        :param word_index: word id (index into ``self.idf``)
        :param word_freq: frequency of the word in the document
        :param word_count_in_doc: total number of words in the document
        :return: weight of the word
        """
        return self.idf[word_index] * (self.delta + (word_freq * (self.K1 + 1.0))) / (
                word_freq + self.K1 * (1.0 - self.B + self.B * (word_count_in_doc / self.avg_word_count_in_doc)))

    def get_feature_names(self) -> List[str]:
        """
        Return the list of words that receive a weight, in column order.

        :return: vocabulary words
        """
        return list(self.word2id_dict)

    def similarity(self, text, documents, result):
        """
        Score every document against ``text``.

        For each word extracted from ``text`` that is in the vocabulary
        (repeated words count repeatedly), the word's weight in each document
        is added to that document's score.

        :param text: query text
        :param documents: the documents; only their number is used
        :param result: weight matrix indexed as ``result[row, word id]``
        :return: list with one score per document
        """
        doc_count = len(documents)
        scores = [0] * doc_count
        for word in self.extract_words_func(text):
            # look the word up once instead of once per document
            word_id = self.word2id_dict.get(word)
            if word_id is None:
                continue
            for i in range(doc_count):
                # add up the matching weights
                scores[i] += result[i, word_id]
        return scores
                    


from janome.tokenizer import Tokenizer
t = Tokenizer()
def tokenize(text):
    """Extract nouns, adjectives and verbs from ``text``.

    Each token produced by the module-level tokenizer ``t`` is kept only when
    its first part-of-speech field is '名詞', '形容詞' or '動詞', its second
    field is neither '空白' nor '*', and the extracted word does not contain
    ',*,'.

    :param text: text to analyse
    :return: list of the kept words, in order of appearance
    """
    kept_classes = ['名詞', '形容詞', '動詞']
    skipped_subclasses = ['空白', '*']
    words = []
    for token in t.tokenize(text):  # morphological analysis
        pos_fields = token.part_of_speech.split(',')
        major, minor = pos_fields[0], pos_fields[1]
        if major not in kept_classes:
            continue
        if minor in skipped_subclasses:
            continue
        # the first whitespace-separated chunk of the token's string form is taken as the word
        word = str(token).split()[0]
        if ',*,' in word:
            continue
        words.append(word)
    return words

# Search corpus ("database"): load the records and keep each record's 'Utterance' text.
# The file is opened in a with-block so the handle is closed once parsing is done.
with open("../../data/PoliInfo3-FormalRun-FactVerification/Pref13_tokyo.json", "r", encoding="utf-8") as corpus_file:
    dataset = json.load(corpus_file)

a = [sample['Utterance'] for sample in dataset]

# Fit the BM25 vectorizer on the corpus and keep the document/word weight matrix.
f = tokenize
o = Okapi(f)
result = o.fit_transform(a)

In [4]:
import json

# Number of top-scoring corpus utterances concatenated into each context.
TOP_K = 7

# Open the training data in a with-block so the handle is closed after parsing.
with open('../../data/PoliInfo3-FormalRun-FactVerification/PoliInfo3_FactVerification_Formal_Train.json', 'r',
          encoding='utf-8') as json_open:
    json_load = json.load(json_open)

context = []
labels = []
utterance_sammary = []
for v in json_load:
    # Validate the label first: a sample without a usable label must not add
    # a context, otherwise the three lists would silently go out of step.
    entailment = v["DocumentEntailment"]
    if entailment not in (True, False):
        raise ValueError(f"unexpected DocumentEntailment value: {entailment!r}")

    # Score every corpus utterance against the summary and keep the best TOP_K.
    scores = o.similarity(v["UtteranceSummary"], a, result)
    scores2 = np.array(scores)
    scores2_argsort = np.argsort(-scores2)[:TOP_K]
    text = "".join(dataset[i]["Utterance"] for i in scores2_argsort)

    context.append(text)
    utterance_sammary.append(v["UtteranceSummary"])
    labels.append(int(entailment))  # False -> 0, True -> 1

In [5]:
# Show the collected 0/1 entailment labels.
print(labels)

[1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 

In [6]:
# The first TRAIN_VAL_SIZE samples form the train/validation pool; the remainder is held out for testing.
TRAIN_VAL_SIZE = 921

train_val_utterance_summary, test_utterance_summary = (
    utterance_sammary[:TRAIN_VAL_SIZE], utterance_sammary[TRAIN_VAL_SIZE:]
)
train_val_context, test_context = context[:TRAIN_VAL_SIZE], context[TRAIN_VAL_SIZE:]
train_val_labels, test_labels = labels[:TRAIN_VAL_SIZE], labels[TRAIN_VAL_SIZE:]

In [7]:
# 1. Use the BERT tokenizer to split text into tokens and convert them to IDs
## Prepare the tokenizer
tokenizer = BertJapaneseTokenizer.from_pretrained('cl-tohoku/bert-base-japanese-whole-word-masking')

In [8]:
# Find the longest (summary, context) pair, measured in BERT tokens
max_len = []

# Process one pair at a time
for sent1, sent2 in zip(train_val_utterance_summary, train_val_context):
    token_words_1 = tokenizer.tokenize(sent1)
    token_words_2 = tokenizer.tokenize(sent2)
    # store the combined token count of the pair
    max_len.append(len(token_words_1) + len(token_words_2))

# A sentence pair is encoded as [CLS] A [SEP] B [SEP], i.e. three special tokens.
# The model's position-embedding table has 512 entries, so longer inputs cannot
# be fed to it; cap the length and let truncation=True shorten such pairs.
BERT_MAX_POSITIONS = 512
max_length = min(max(max_len) + 3, BERT_MAX_POSITIONS)

# Show the resulting maximum length
print('最大単語数: ', max_length)

最大単語数:  229


In [9]:
dataset_for_loader = []

# Encode each (summary, context, label) triple as one padded sentence-pair example.
# The tokenizer receives the two texts separately and inserts the separator
# tokens itself, so no manual "[SEP]" concatenation is needed.
for x, y, label in zip(train_val_utterance_summary, train_val_context, train_val_labels):
    encoding = tokenizer(
        x,
        y,
        max_length=max_length,
        padding='max_length',
        truncation=True
    )

    encoding['labels'] = label  # attach the label
    encoding = {k: torch.tensor(v) for k, v in encoding.items()}
    dataset_for_loader.append(encoding)

In [10]:
# Show how many encoded examples were built.
print(len(dataset_for_loader))

921


In [11]:
# Split into train:val = 9:1
train_size = int(0.9 * len(dataset_for_loader))
val_size = len(dataset_for_loader) - train_size

# Randomly split the dataset
# NOTE(review): no seed or generator is set in the visible code, so the split
# presumably differs between runs — confirm whether reproducibility is needed.
train_dataset, val_dataset = random_split(dataset=dataset_for_loader, lengths=[train_size, val_size])

In [13]:
# Create the data loaders (training batches are shuffled, validation batches are not)
dataloader_train = DataLoader(
    train_dataset, batch_size=16, shuffle=True
)
dataloader_val = DataLoader(val_dataset, batch_size=16)
#dataloader_test = DataLoader(test_dataset, batch_size=256)

In [42]:
# Dump every training batch for a visual sanity check.
for idx, batch in enumerate(dataloader_train):
    header = f'# batch {idx}'
    print(header)
    print(batch)

# batch 0
{'input_ids': tensor([[   2, 6411,  237,  ...,    6, 4411,    3],
        [   2, 7657, 1100,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([ True, False])}
# batch 1
{'input_ids': tensor([[  2, 130, 489,  ...,   0,   0,   0],
        [  2, 280,   5,  ...,   0,   0,   0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([False,  True])}
# batch 2
{'input_ids': tensor([[    2, 14317,    17,  ...,    11,  1630,     3],
        [    2,  7000, 14461,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tens

In [14]:
# Load the pretrained Japanese BERT with a two-label classification head and move it to GPU index 5.
# NOTE(review): this instance does not appear to be used for training (the Lightning
# module below loads its own copy) and `bert_sc` is reassigned after fine-tuning.
MODEL_NAME = 'cl-tohoku/bert-base-japanese-whole-word-masking'
bert_sc = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
bert_sc = bert_sc.cuda(5)

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialize

In [15]:
# 6-14
class BertForSequenceClassification_pl(pl.LightningModule):
    """PyTorch Lightning wrapper around ``BertForSequenceClassification``."""

    def __init__(self, model_name, num_labels, lr):
        """
        :param model_name: name of the Transformers model to load
        :param num_labels: number of labels
        :param lr: learning rate
        """
        super().__init__()

        # Store the constructor arguments; e.g. lr becomes available as
        # self.hparams.lr, and they are saved automatically with checkpoints.
        self.save_hyperparameters()

        # Load BERT
        self.bert_sc = BertForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels
        )

    def training_step(self, batch, batch_idx):
        """Return the loss for one training mini-batch (``batch_idx`` is unused)."""
        output = self.bert_sc(**batch)
        loss = output.loss
        self.log('train_loss', loss)  # logged under the name 'train_loss'
        return loss

    def validation_step(self, batch, batch_idx):
        """Log the loss for one validation mini-batch as 'val_loss'."""
        output = self.bert_sc(**batch)
        val_loss = output.loss
        self.log('val_loss', val_loss)

    def test_step(self, batch, batch_idx):
        """Log the accuracy and the number of label-1 samples for one test mini-batch."""
        labels = batch['labels']
        # Build the model inputs without the labels instead of popping them,
        # so the batch handed in by the caller is left unmodified.
        inputs = {key: value for key, value in batch.items() if key != 'labels'}
        output = self.bert_sc(**inputs)
        labels_predicted = output.logits.argmax(-1)
        num_correct = (labels_predicted == labels).sum().item()
        # despite its name, this counts the samples whose true label is 1
        num_correct1 = (labels == 1).sum().item()
        accuracy = num_correct / labels.size(0)
        self.log('accuracy', accuracy)  # logged under the name 'accuracy'
        self.log('num_correct1', num_correct1)

    def predict_step(self, batch, batch_idx):
        """Return the predicted label of the first sample in ``batch``.

        NOTE(review): only ``scores[0]`` is used, so any further samples in the
        batch are ignored — confirm callers always pass batches of size one.
        """
        output = self.bert_sc(**batch)
        scores = output.logits
        labels_predicted = scores[0].argmax(-1).cpu().numpy().tolist()
        return labels_predicted

    def configure_optimizers(self):
        """Return the optimizer used for training: Adam with the stored learning rate."""
        return torch.optim.Adam(self.parameters(), lr=self.hparams.lr)

In [16]:
# 6-15
# Condition under which model weights are saved during training:
# keep only the single checkpoint with the lowest 'val_loss', weights only, under model/.
checkpoint = pl.callbacks.ModelCheckpoint(
    monitor='val_loss',
    mode='min',
    save_top_k=1,
    save_weights_only=True,
    dirpath='model/',
)

# Training configuration: GPU index 5, at most 10 epochs, with the checkpoint callback.
trainer = pl.Trainer(
    gpus=[5], 
    max_epochs=10,
    callbacks = [checkpoint]
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores


In [17]:
# 6-16
# Instantiate the PyTorch Lightning model
model = BertForSequenceClassification_pl(
    MODEL_NAME, num_labels=2, lr=1e-5
)

# Move the module to GPU index 5 (the same index the Trainer is configured with).
model = model.cuda(5)

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialize

In [18]:
# Fine-tune on the training loader, passing the validation loader alongside it.
trainer.fit(model, dataloader_train, dataloader_val)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6]

  | Name    | Type                          | Params
----------------------------------------------------------
0 | bert_sc | BertForSequenceClassification | 110 M 
----------------------------------------------------------
110 M     Trainable params
0         Non-trainable params
110 M     Total params
442.476   Total estimated model params size (MB)


                                                              



Epoch 9: 100%|██████████| 58/58 [00:19<00:00,  2.97it/s, loss=0.512, v_num=0]


1

In [21]:
#http://localhost:6009/#scalars&runSelectionState=eyJsaWdodG5pbmdfbG9ncy92ZXJzaW9uXzAiOmZhbHNlLCJsaWdodG5pbmdfbG9ncy92ZXJzaW9uXzEiOmZhbHNlLCJsaWdodG5pbmdfbG9ncy92ZXJzaW9uXzIiOnRydWV9
!tensorboard --logdir ./

TensorFlow installation not found - running with reduced feature set.

NOTE: Using experimental fast data loading logic. To disable, pass
    "--load_fast=false" and report issues on GitHub. More details:
    https://github.com/tensorflow/tensorboard/issues/4784

Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.8.0 at http://localhost:6009/ (Press CTRL+C to quit)
^C


In [35]:
# Run the test loop and print the logged 'accuracy' metric.
# NOTE(review): `dataloader_test` appears only in a commented-out line of the
# data-loader cell, so this cell presumably raises NameError on a fresh run — confirm how it was built.
test = trainer.test(test_dataloaders=dataloader_test)
print(f'Accuracy: {test[0]["accuracy"]:.2f}')

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]


Testing: 100%|██████████| 22/22 [00:11<00:00,  2.00it/s]
--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'accuracy': 0.8405744433403015, 'num_correct1': 123.02835845947266}
--------------------------------------------------------------------------------
Accuracy: 0.84


In [18]:
# Show the raw result returned by trainer.test.
print(test)

[{'accuracy': 0.8538447618484497, 'precision': 1.9264768362045288, 'recall': 1.7602777481079102, 'f_value': 1.8387457132339478}]


In [19]:
# 6-17
best_model_path = checkpoint.best_model_path # file of the best model
# Print the best checkpoint's path and its monitored score (validation loss).
print('ベストモデルのファイル: ', checkpoint.best_model_path)
print('ベストモデルの検証データに対する損失: ', checkpoint.best_model_score)

ベストモデルのファイル:  /disk/ssd14tc/mamemiya/textual_entailment/factcheck/bm25-finetuning/model/epoch=1-step=103.ckpt
ベストモデルの検証データに対する損失:  tensor(0.6805, device='cuda:5')


In [20]:
# 6-20
# Load the PyTorch Lightning model from the best checkpoint
model = BertForSequenceClassification_pl.load_from_checkpoint(
    best_model_path
) 

# Save the wrapped model in Transformers format under ./model_transformers
model.bert_sc.save_pretrained('./model_transformers') 

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialize

In [21]:
# 6-21
# Reload the fine-tuned classifier from ./model_transformers
bert_sc = BertForSequenceClassification.from_pretrained(
    './model_transformers'
)

# Move the model to GPU index 5
bert_sc.cuda(5)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [22]:
# Short aliases for the held-out test summaries, contexts and labels.
t1_2 = test_utterance_summary
t2_2 = test_context
labels2 = test_labels

In [23]:
predicted = []
correct_labels = []

# Make sure dropout is disabled, and send inputs to whichever device the
# model currently lives on instead of repeating a hard-coded GPU index.
bert_sc.eval()
device = bert_sc.device

# Classify the held-out pairs one at a time.
for x, y, label in zip(t1_2, t2_2, labels2):

    correct_labels.append(label)

    encoding = tokenizer(
        x,
        y,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    encoding = {k: v.to(device) for k, v in encoding.items()}

    with torch.no_grad():
        # call the module itself rather than .forward so registered hooks run
        output = bert_sc(**encoding)
    scores = output.logits
    # the batch holds a single pair, so row 0 carries its logits
    labels_predicted = scores[0].argmax(-1).item()
    predicted.append(labels_predicted)

#print(predicted)
    
#print(predicted)

In [24]:
# Confusion-matrix counts and the derived metrics for the test predictions.
test_num = len(predicted)
num_correct = 0
tp = 0
fp = 0
fn = 0
tn = 0

for pred, gold in zip(predicted, correct_labels):
    if pred == gold:
        num_correct += 1

    if pred == 1 and gold == 1:
        tp += 1
    elif pred == 1 and gold == 0:
        fp += 1
    elif pred == 0 and gold == 1:
        fn += 1
    elif pred == 0 and gold == 0:
        tn += 1

# Every ratio falls back to 0.0 when its denominator is zero (no samples,
# no positive labels, or no positive predictions) instead of raising.
accuracy = num_correct / test_num if test_num else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
precision = tp / (tp + fp) if (tp + fp) else 0.0
f_value = 2 * recall * precision / (precision + recall) if (precision + recall) else 0.0
print("accuracy: " + str(accuracy))
print("recall: " + str(recall))
print("precision: " + str(precision))
print("f_value: " + str(f_value))

accuracy: 0.4803921568627451
recall: 0.6451612903225806
precision: 0.5633802816901409
f_value: 0.6015037593984963


In [26]:
# Show the predicted label for every test sample.
print(predicted)

[0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1]


In [26]:
# Classify one hand-written sentence pair as entailment / non-entailment

hypothesis = ""
premise = "改革を行わない"

encoding = tokenizer(
    hypothesis,
    premise,
    max_length=max_length,
    padding='max_length',
    truncation=True,
    return_tensors='pt'
)

# Put the inputs on the model's own device: a hard-coded GPU index that differs
# from the device the model was moved to makes the forward pass fail.
encoding = {k: v.to(bert_sc.device) for k, v in encoding.items()}

with torch.no_grad():
    output = bert_sc(**encoding)
scores = output.logits
# the batch holds a single pair, so row 0 carries its logits
labels_predicted = scores[0].argmax(-1).item()
if labels_predicted == 1:
    print("entailment")
else:
    print("non-entailment")

non-entailment
