In [1]:
import os
import sys
import torch
import torchvision
from tqdm import tqdm_notebook as tqdm
from transformers import BertModel, BertTokenizer

w_dir = %pwd
work_dir = os.path.dirname(w_dir)
work_dir

I1220 06:38:14.415892 139918188459840 file_utils.py:39] PyTorch version 1.1.0 available.


'/work'

In [2]:
sys.path.append(w_dir+'/fgc_support_retri')

In [3]:
from fgc_support_retri.ser_extractor import *
from fgc_support_retri.utils import read_fgc, read_hotpot
from fgc_support_retri.eval import evalaluate_f1

In [4]:
class SER_sent_extract_V2:
    def __init__(self):
        device = torch.device("cpu")
        bert_model_name = config.BERT_EMBEDDING
        bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name)
        bert_indexer = BertSentV1Idx(bert_tokenizer)
        model = BertSentenceSupModel_V2.from_pretrained(bert_model_name)
        model_path = config.TRAINED_MODELS / '20191219_test2' / 'model_epoch20_eval_recall_0.524_f1_0.465.m'
        model.load_state_dict(torch.load(model_path, map_location=device))
        model.to(device)
        model.eval()
        
        self.tokenizer = bert_tokenizer
        self.model = model
        self.bert_indexer = bert_indexer
        self.device = device
    
    def predict(self, items):
        predictions = []
        for item in tqdm(items):
            with torch.no_grad():
                train_set = SerSentenceDataset([item], transform=torchvision.transforms.Compose([BertSentV2Idx(self.tokenizer)]))
                batch = bert_sentV2_collate([sample for sample in train_set])
                for key in ['input_ids', 'token_type_ids', 'attention_mask', 'tf_type', 'idf_type']:
                    batch[key] = batch[key].to(self.device)
                prediction = self.model.predict(batch, threshold=0.5)
                prediction.sort()
                item['sup_prediction'] = prediction
                predictions.append(prediction)
                
        return predictions 

In [5]:
fgc_items = read_fgc(config.FGC_DEV)

no gold supporting evidence
{'QID': 'D032Q10', 'QTYPE': '进阶题', 'QTEXT': '第二次簽訂的北美貿易協定從簽署至生效過了幾日?', 'SENTS': [{'text': '第二次签订的北美贸易协定从签署至生效过了几日?', 'start': 0, 'end': 23}], 'ANSWER': [{'ATEXT': '資訊不足無法判定', 'ATOKEN': [{'text': '资讯不足无法判定', 'start': -1}], 'ATEXT_CN': '资讯不足无法判定'}], 'ATYPE': 'Date-Duration', 'AMODE': 'Date-Duration', 'ASPAN': [], 'SHINT': [], 'QTEXT_CN': '第二次签订的北美贸易协定从签署至生效过了几日?'}
no gold supporting evidence
{'QID': 'D049Q04', 'QTYPE': '申论', 'QTEXT': '「雅婷逐字稿」的命名起源為何?', 'SENTS': [{'text': '「雅婷逐字稿」的命名起源为何?', 'start': 0, 'end': 15}], 'ANSWER': [{'ATEXT': '', 'ATOKEN': [{'text': '', 'start': 0}], 'ATEXT_CN': ''}], 'ATYPE': 'Event', 'AMODE': 'Single-Span-Extraction', 'ASPAN': [], 'SHINT': [], 'QTEXT_CN': '「雅婷逐字稿」的命名起源为何?'}
no gold supporting evidence
{'QID': 'D086Q03', 'QTYPE': '申论', 'QTEXT': '不可再生能源的意義是什麼？', 'SENTS': [{'text': '不可再生能源的意义是什么？', 'start': 0, 'end': 13}], 'ANSWER': [{'ATEXT': '', 'ATOKEN': [{'text': '', 'start': 0}], 'ATEXT_CN': ''}], 'ATYPE': 'Object', 'AMODE': 'Sing

In [6]:
extractor = SER_sent_extract_V2()

predictions = []
predictions = extractor.predict(fgc_items)
precision, recall, f1 = evalaluate_f1(fgc_items, predictions)
print(precision)
print(recall)
print(f1)

I1220 06:38:15.843599 139918188459840 tokenization_utils.py:375] loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt from cache at /root/.cache/torch/transformers/8a0c070123c1f794c42a29c6904beb7c1b8715741e235bee04aca2c7636fc83f.9b42061518a39ca00b8b52059fd2bede8daa613f8a8671500e518a8c29de8c00
I1220 06:38:23.559087 139918188459840 configuration_utils.py:152] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json from cache at /root/.cache/torch/transformers/8a3b1cfe5da58286e12a0f5d7d182b8d6eca88c08e26c332ee3817548cf7e60a.0c16faba8be66db3f02805c912e4cf94d3c9cffc1f12fa1a39906f9270f76d33
I1220 06:38:23.562205 139918188459840 configuration_utils.py:169] Model config {
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_

HBox(children=(IntProgress(value=0, max=213), HTML(value='')))


0.4293785310734463
0.47010309278350515
0.44881889763779526


In [9]:
for item in fgc_items:
    
    new_sents = []
    for s_i, s in enumerate(item['SENTS']):
        new_sents.append({s_i: s['text']})
    del item['SENTS']
    item['SENTS'] = new_sents


KeyError: 'text'

In [10]:
fgc_items[5]

{'QID': 'D004Q07',
 'SUP_EVIDENCE': [11, 19],
 'QTEXT': '蘇東坡死後葬在哪裡?',
 'ANS': '郟縣小峨眉山',
 'ASPAN': [{'text': '葬', 'start': 230, 'end': 231},
  {'text': '郏县', 'start': 232, 'end': 234},
  {'text': '小峨眉山', 'start': 234, 'end': 238},
  {'text': '苏轼', 'start': 144, 'end': 146}],
 'sup_prediction': [19],
 'SENTS': [{0: '元祐元年（1086年），'},
  {1: '宋哲宗即位，'},
  {2: '高太皇太后垂帘听政，'},
  {3: '回朝任礼部郎中、中书舍人、翰林学士，'},
  {4: '元祐四年（1089年）拜龙图阁学士，'},
  {5: '曾出任杭州、颍州等知州职务，'},
  {6: '官至礼部尚书。'},
  {7: '\n绍圣元年（1094年）被哲宗贬谪至惠州、儋州（海南岛）。'},
  {8: '\n元符三年（1100年），'},
  {9: '宋徽宗即位，'},
  {10: '向太后垂帘听政，'},
  {11: '下诏让苏轼北还。'},
  {12: '\n建中靖国元年（1101年），'},
  {13: '夏天因冷饮过度，'},
  {14: '下痢不止，又误服黄芪，'},
  {15: '结果病情恶化，'},
  {16: '「齿间出血如蚯蚓者无数」，'},
  {17: '七月二十八日于常州孙氏馆病卒，'},
  {18: '享年六十四岁。'},
  {19: '由弟苏辙归葬于郏县小峨眉山。'},
  {20: '南宋宋孝宗追赠谥号「文忠」。'},
  {21: '\n苏轼疲于应付新旧党争，'},
  {22: '遇事「如食内有蝇，吐之乃已」，'},
  {23: '苏轼既反对王安石比较急进的改革措施，'},
  {24: '也不同意旧党司马光尽废新法，'},
  {25: '所以虽然新党一直称苏轼为旧党，'},
  {26: '但其实他在新旧两党之间均受排斥，'},
  {27: '仕途坎坷，时常远贬外方，'},
  {28: