In [None]:
from collections import Counter, defaultdict
import logging
import re
import json
import jieba
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [None]:
logger = logging.getLogger('les2')
logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler()
# console_handler.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)

In [None]:
with open('./data/question.json', 'r', encoding='utf-8') as f:
    train_set = json.loads(f.read())

In [None]:
# 训练集共20000篇文章
len(train_set)

In [None]:
def precision_recall_f1(prediction, ground_truth):
    if not isinstance(prediction, list):
        prediction_tokens = prediction.split()
    else:
        prediction_tokens = prediction
    if not isinstance(ground_truth, list):
        ground_truth_tokens = ground_truth.split()
    else:
        ground_truth_tokens = ground_truth
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0, 0, 0
    p = 1.0 * num_same / len(prediction_tokens)
    r = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * p * r) / (p + r)
    return p, r, f1

In [None]:
def recall(prediction, ground_truth):
    return precision_recall_f1(prediction, ground_truth)[1]


def f1_score(prediction, ground_truth):
    return precision_recall_f1(prediction, ground_truth)[2]

In [None]:
def metric_max_over_ground_truths(metric_fn, prediction, ground_truth):
    score = metric_fn(prediction, ground_truth)
    return score

In [None]:
# 找到最相关的段落和在段落中的位置
def find_fake_answer(sample):
    for a_idx, answer_token in enumerate(sample['questions']):
        most_related_para = -1
        most_related_para_len = 999999
        max_related_score = 0
#         print('a_idx=',a_idx, 'answer_token=',answer_token)
        for p_idx, para_tokens in enumerate(sample['segmented_article_content']):
            related_score = metric_max_over_ground_truths(recall,
                                                          para_tokens,
                                                          answer_token['segmented_answer'])
#             print('p_idx=',p_idx,'related_score=',related_score)
            if related_score > max_related_score \
                    or (related_score == max_related_score
                        and len(para_tokens) < most_related_para_len):
                most_related_para = p_idx
                most_related_para_len = len(para_tokens)
                max_related_score = related_score
        sample['questions'][a_idx]['most_related_para'] = most_related_para
        most_related_para_tokens = sample['segmented_article_content'][most_related_para]
        
        answer_tokens = set(answer_token['segmented_answer'])
        best_match_score = 0
        best_match_span = [-1, -1]
        best_fake_answer = None
        
        for start_tidx in range(len(most_related_para_tokens)):
            if most_related_para_tokens[start_tidx] not in answer_tokens:
                continue
            for end_tidx in range(len(most_related_para_tokens) - 1, start_tidx - 1, -1):
                span_tokens = most_related_para_tokens[start_tidx: end_tidx + 1]
                match_score = metric_max_over_ground_truths(f1_score, span_tokens,
                                                                answer_token['segmented_answer'])
                if match_score == 0:
                    break
                if match_score > best_match_score:
                    best_match_span = [start_tidx, end_tidx]
                    best_match_score = match_score
                    best_fake_answer = ''.join(span_tokens)
        sample['questions'][a_idx]['answer_spans'] = best_match_span
        sample['questions'][a_idx]['fake_answers'] = best_fake_answer
        sample['questions'][a_idx]['match_scores'] = best_match_score
    return sample

In [None]:
def clean_data(sample):
    # 文章内容和标题分段->分词：将标题插入到分段后的首位置
    sample['segmented_article_title'] = \
        list(jieba.cut(''.join(re.split(r'\u3000+|\s+|\t+',sample['article_title'].strip()))))
    
    sample_splited_para = re.split(r'\u3000+|\s+|\t+',sample['article_content'].strip())
    if len(sample_splited_para) == 1 and len(sample_splited_para[0]) > 200:
        sample_splited_para = re.split(r'\。',sample['article_content'].strip())
    sample_splited_list = []
    for para in sample_splited_para:
        sample_splited_list.append(list(jieba.cut(para.strip(), cut_all=False)))
    sample_splited_list.insert(0, sample['segmented_article_title'])

    sample['segmented_article_content'] = sample_splited_list
       
    # 问题和答案分词处理
    for i,question in enumerate(sample['questions']):
        sample['questions'][i]['segmented_question'] = \
            list(jieba.cut(''.join(question['question'].strip().split('\u3000+|\s+|\t+'))))
        sample['questions'][i]['segmented_answer'] = \
            list(jieba.cut(''.join(question['answer'].strip().split('\u3000+|\s+|\t+'))))
    return sample

In [None]:
def store_prerpocess_data():
    preprocess_data = []
    for i in range(1,201):
        with open('./data/preprocessed_%d.json' % i, 'r', encoding='utf-8') as f:
            d = json.load(f)
        preprocess_data.extend(d)
    with open('./data/preprocessed.json', 'w', encoding='utf-8') as f:
            json.dump(preprocess_data, f)

In [None]:
# 数据去重
with open('./data/preprocessed.json', 'r', encoding='utf-8') as f:
    data_preprocessed = json.load(f)

In [None]:
title = Counter([data_preprocessed[i]['article_title'] for i in range(len(data_preprocessed))])

In [None]:
len(title)

In [None]:
{x : title[x] for x in title if title[x] >= 2 }

In [None]:
title_set = set()
data_qc = []
for sample in data_preprocessed:
    title = sample['article_title']
    if title in title_set:
        continue
    else:
        title_set.add(title)
        data_qc.append(sample)

In [None]:
len(data_qc)

In [None]:
data_qc[100]

In [None]:
with open('./data/preprocessed_qc.json', 'w', encoding='utf-8') as f:
    json.dump(data_qc, f)

In [None]:
def train_test_split(dataset,train_percent=0.9):
    index = np.arange(len(dataset))
    np.random.shuffle(index)

    train_size = int(len(dataset) * train_percent)
    train_index = index[:train_size]
    test_index = index[train_size:]
    train_set, test_set = [], []
    for index in train_index:
        train_set.append(dataset[index])
    for index in test_index:
        test_set.append(dataset[index])
        
    return train_set, test_set

In [None]:
trainset, testset = train_test_split(data_qc)

In [None]:
class LESDataset(object):
    def __init__(self, max_p_len, max_q_len,vocab, train_file=None, test_file=None):
        self.max_p_len = max_p_len
        self.max_q_len = max_q_len
        self.vocab = vocab
        if train_file:
            self.train_set = self._load_dataset(train_file)
        if test_file:
            self.test_set = self._load_dataset(test_file)

    def _load_dataset(self, data_path, train=True):
        """
        加载数据集
        :param data_path:
        :return:
        """
        with open(data_path, 'r', encoding='utf-8') as f:
            data_set = json.load(f)
        if train:
            data = []
            for sample in data_set:
                for qa_pairs in sample['questions']:
                    if qa_pairs['answer_spans'][0] == -1:
                        continue
                    data.append({'question':qa_pairs['segmented_question'],
                                'passage':sample['segmented_article_content'][qa_pairs['most_related_para']],
                                'answer_span':qa_pairs['answer_spans']})
        return data
    
    def word_iter(self, set_name):
        if set_name == 'train':
            data_set = self.train_set

        for sample in data_set:
            for question in sample['questions']:
                for word in question['segmented_question']:
                    yield word
                for word in sample['segmented_article_content'][question['most_related_para']]:
                    yield word
                    
    def gen_mini_batches(self, set_name, batch_size, pad_id=0,shuffle=True):
        if set_name == 'train':
            data = self.train_set
            
        data_size = len(data)
        indices = np.arange(data_size)
        if shuffle:
            np.random.shuffle(indices)
        for batch_start in np.arange(0, data_size, batch_size):
            batch_indices = indices[batch_start:batch_start+batch_size]
            batch_data = [data[i] for i in batch_indices]
            yield self._one_mini_batch(batch_data, pad_id)
            
    def _one_mini_batch(self, batch_data_raw, pad_id):
        batch_data = {'question_token_ids':[],
                     'question_length':[],
                     'passage_token_ids':[],
                     'passage_length':[],
                     'start_id':[],
                     'end_id':[]}
        for qa_pairs in batch_data_raw:
            batch_data['question_token_ids'].append(self.convert_to_ids(qa_pairs['question'])),
            batch_data['question_length'].append(len(qa_pairs['question']))
            batch_data['passage_token_ids'].append(self.convert_to_ids(qa_pairs['passage']))
            batch_data['passage_length'].append(len(qa_pairs['passage']))
            batch_data['start_id'].append(qa_pairs['answer_span'][0])
            batch_data['end_id'].append(qa_pairs['answer_span'][1])
            
        batch_data = self._dynamic_padding(batch_data, pad_id)
        return batch_data
    
    def _dynamic_padding(self, batch_data, pad_id):
        pad_p_len = min(self.max_p_len, max(batch_data['passage_length']))
        pad_q_len = min(self.max_q_len, max(batch_data['question_length']))
        batch_data['passage_token_ids'] = [(ids + [pad_id] * (pad_p_len - len(ids)))[:pad_p_len]
                                                for ids in batch_data['passage_token_ids']]
        batch_data['question_token_ids'] = [(ids + [pad_id] * (pad_q_len - len(ids)))[:pad_q_len]
                                                for ids in batch_data['question_token_ids']]
        return batch_data
            
        
    def convert_to_ids(self,tokens):
        ids = []
        for token in tokens:
            ids.append(self.vocab.token2id[token.lower()])
        return ids
        
        

In [None]:
class Vocab(object):

    def __init__(self, filename=None, lower=False):
        self.id2token = {}
        self.token2id = {}
        self.token_cnt = defaultdict(int)
        self.lower = lower

        self.embed_dim = None
        self.embeddings = None

        self.pad_token = '<blank>'
        self.unk_token = '<unk>'

        self.initial_tokens = []
        self.initial_tokens.extend([self.pad_token, self.unk_token])

        for token in self.initial_tokens:
            self.add(token)


    def add(self, token, cnt=True):
        token = token.lower() if self.lower else token

        if token in self.token2id:
            idx = self.token2id[token]
        else:
            idx = len(self.token2id)
            self.token2id[token] = idx
            self.id2token[idx] = token
        if cnt:
            self.token_cnt[token] += 1

        return idx
    
    def randomly_init_embeddings(self, embed_dim):
        self.embed_dim = embed_dim
        self.embeddings = np.random.rand(len(self.token2id), embed_dim)
        
        for token in [self.pad_token, self.unk_token]:
            self.embeddings[self.token2id[token]] = np.zeros([embed_dim])

In [None]:
les_dataset = LESDataset(300, 60,vocab ,train_file='F:\\jupyter_file\\MC\\data\\trainset.json')

In [None]:
batches = les_dataset.gen_mini_batches('train', 32)

In [None]:
batch = next(batches)

In [None]:
batch['question_token_ids']

In [1]:
aa = [1,2,3,4,5,6,7,89,0]

In [2]:
b = [1,3,6]

In [3]:
aa[b]

TypeError: list indices must be integers or slices, not list

In [1]:
import tensorflow as tf

In [33]:
k = tf.Variable(3, 'k')

In [34]:
b = tf.Variable(2, 'b')

In [35]:
x = tf.placeholder('int32', [None],name='x')

In [36]:
y = tf.add(tf.multiply(k, x) , b)

In [37]:
sess = tf.Session()

In [38]:
init_op = tf.global_variables_initializer()

In [39]:
sess.run(init_op)

In [40]:
sess.run(y, feed_dict={x:[3]})

array([11])

In [41]:
vs = tf.trainable_variables()
for v in vs:
    print(v)

Tensor("Variable/read:0", shape=(), dtype=int32)
Tensor("Variable_1/read:0", shape=(), dtype=int32)
Tensor("Variable_2/read:0", shape=(), dtype=int32)
Tensor("Variable_3/read:0", shape=(), dtype=int32)
Tensor("Variable_4/read:0", shape=(), dtype=int32)
Tensor("Variable_5/read:0", shape=(), dtype=int32)


In [7]:
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)

In [8]:
v1 = tf.Variable([1.0,2.3], name='v1')
v2 = tf.Variable(55.5, name='v2')

In [10]:
init_op = tf.global_variables_initializer()

In [12]:
saver = tf.train.Saver()

In [13]:
ckpt_path = './test.ckpt'

In [14]:
sess.run(init_op)

In [17]:
save_path = saver.save(sess, ckpt_path, global_step=1)

In [42]:
import numpy as np

In [43]:
array = np.arange(25).reshape(5,5)

In [44]:
array

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14],
       [15, 16, 17, 18, 19],
       [20, 21, 22, 23, 24]])

In [45]:
b = tf.nn.embedding_lookup(array, [[1, 3],[2,3]])

In [47]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(b))
    print(c)

[[[ 5  6  7  8  9]
  [15 16 17 18 19]]

 [[10 11 12 13 14]
  [15 16 17 18 19]]]


NameError: name 'c' is not defined