In [1]:
from argparse import Namespace
from collections import Counter
import json
import os
import string

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm_notebook
from nltk.tokenize import word_tokenize
from sklearn.model_selection import StratifiedShuffleSplit

# Arg

In [2]:
args = Namespace(
    SQuADpath="data/SQuAd/BatchSQuAD",
    train_proportion=0.7,
    val_proportion=0.15,
    test_proportion=0.15,
    seed=1337
)

np.random.seed(args.seed)

# Vocabulary

In [3]:
class Vocabulary(object):
    

    def __init__(self, token_to_idx=None):
        

        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx

        self._idx_to_token = {idx: token 
                              for token, idx in self._token_to_idx.items()}
        
    def to_serializable(self):
        
        return {'token_to_idx': self._token_to_idx}

    @classmethod
    def from_serializable(cls, contents):
        
        return cls(**contents)

    def add_token(self, token):
        
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index
            
    def add_many(self, tokens):
        
        return [self.add_token(token) for token in tokens]

    def lookup_token(self, token):
        
        return self._token_to_idx[token]

    def lookup_index(self, index):
        
        if index not in self._idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return self._idx_to_token[index]

    def __str__(self):
        return "<Vocabulary(size=%d)>" % len(self)

    def __len__(self):
        return len(self._token_to_idx)

In [4]:
class SequenceVocabulary(Vocabulary):
    def __init__(self, token_to_idx=None, unk_token="<UNK>",
                 mask_token="<MASK>", begin_seq_token="<BEGIN>",
                 end_seq_token="<END>"):    #<END> TOKEN 和 <BEGIN> TOKEN的作用是？ 在非生成模型中有用吗？
        
        super(SequenceVocabulary, self).__init__(token_to_idx)

        self._mask_token = mask_token
        self._unk_token = unk_token
        self._begin_seq_token = begin_seq_token
        self._end_seq_token = end_seq_token

        self.mask_index = self.add_token(self._mask_token)
        self.unk_index = self.add_token(self._unk_token)
        self.begin_seq_index = self.add_token(self._begin_seq_token)
        self.end_seq_index = self.add_token(self._end_seq_token)

    def to_serializable(self):
        contents = super(SequenceVocabulary, self).to_serializable()
        contents.update({'unk_token': self._unk_token,
                         'mask_token': self._mask_token,
                         'begin_seq_token': self._begin_seq_token,
                         'end_seq_token': self._end_seq_token})
        return contents

    def lookup_token(self, token):
        
        if self.unk_index >= 0:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]

# Vectovizer

In [5]:
class Vectorizer(object):
    def __init__(self, question_vocab, passage_vocab, max_question_length):
        self.question_vocab = question_vocab
        self.passage_vocab = passage_vocab
        self.max_question_length = max_question_length
        
    def _vectorize(self, indices, vector_length=-1, mask_index=0):
        
        if vector_length < 0:
            vector_length = len(indices)
        
        vector = np.zeros(vector_length, dtype=np.int64)
        vector[:len(indices)] = indices
        vector[len(indices):] = mask_index

        return vector 

    def _get_question_indices(self, text):
        
        indices = [self.question_vocab.begin_seq_index]
        indices.extend([self.question_vocab.lookup_token(token) for token in text])
        indices.append(self.question_vocab.begin_seq_index)
        
        return indices
    
    def _get_passage_indices(self, text):
        
        indices = [self.passage_vocab.begin_seq_index]
        indices.extend([self.passage_vocab.lookup_token(token) for token in text])
        indices.append(self.passage_vocab.begin_seq_index)
        
        return indices
    
    def vectorize(self, question, passage, use_dataset_max_lengths=True):
        
        source_vector_length = -1
        target_vector_length = -1
        
        if use_dataset_max_lengths:
            question_vector_length = self.max_question_length
            
        question_indices = self._get_question_indices(question)
        question_vector = self._vectorize(question_indices, 
                                        vector_length=question_vector_length, 
                                        mask_index=self.question_vocab.mask_index)
        
        passage_indices = self._get_passage_indices(passage)
        
        passage_vector = self._vectorize(passage_indices,
                                        mask_index=self.passage_vocab.mask_index)
        return {"question_vector": question_vector, 
                "passage_vector": passage_vector, 
                "question_length": len(question_indices)} #实际长度
    
    @classmethod
    def from_dataframe(cls, bitext_df, context, cutof=5): 
        """
        词频passage和question放在一起计算来cutof,
        vocab是分开放的，embedding也是分开的，所以训练也是分开的
        """
        
        question_vocab = SequenceVocabulary()
        passage_vocab = SequenceVocabulary()
        
        max_question_length = 0
        sum_word = []
        
        for question in bitext_df.question:
            sum_word.extend(question)
        
        for text in context.passage:
            sum_word.extend(text)
        
        word_freq = Counter(sum_word) 
        
        for question in bitext_df.question:
            if len(question) > max_question_length:
                max_question_length = len(question)
            for word in question:
                if word_freq[word] >= cutof:
                    question_vocab.add_token(word)
        
        for text in context.passage:
            for word in text:
                if word_freq[word] >= cutof:
                    passage_vocab.add_token(word)
        
        return cls(question_vocab, passage_vocab, max_question_length)
    
    @classmethod
    def from_serializable(cls, contents):
        title_vocab = SequenceVocabulary.from_serializable(contents['question_vocab'])
        category_vocab = Vocabulary.from_serializable(contents['passage_vocab'])

        return cls(title_vocab=title_vocab, category_vocab=category_vocab)

    def to_serializable(self):
        return {'question_vocab': self.question_vocab.to_serializable(),
                'passage_vocab': self.passage_vocab.to_serializable()}
    

# Dataset

In [6]:
class QADataset(Dataset):
    
    def __init__(self, data_df, vectorizer):
        self.data_df = data_df
        self._vectorizer = vectorizer
        
        self.train_df = self.data_df[self.data_df.split=='train']
        self.train_size = len(self.train_df)

        self.val_df = self.data_df[self.data_df.split=='val']
        self.validation_size = len(self.val_df)

        self.test_df = self.data_df[self.data_df.split=='test']
        self.test_size = len(self.test_df)

        self._lookup_dict = {'train': (self.train_df, self.train_size),
                             'val': (self.val_df, self.validation_size),
                             'test': (self.test_df, self.test_size)}

        self.set_split('train')
        
    @classmethod
    def load_from_df(cls, data_df, vec):
        
        train_subset = data_df[data_df.split=='train']
        return cls(train_subset, vec)
    
    @classmethod
    def load_dataset_and_load_vectorizer(cls, data_df, vectorizer_filepath):
        
        vectorizer = cls.load_vectorizer_only(vectorizer_filepath)
        return cls(data_df, vectorizer)
    
    @staticmethod
    def load_vectorizer_only(vectorizer_filepath):
        
        with open(vectorizer_filepath) as fp:
            return Vectorizer.from_serializable(json.load(fp))
        
    def save_vectorizer(self, vectorizer_filepath):
        
        with open(vectorizer_filepath, "w") as fp:
            json.dump(self._vectorizer.to_serializable(), fp)
            
    def get_vectorizer(self):
     
        return self._vectorizer
    
    def set_split(self, split="train"):
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]
        
    def __len__(self):
        return self._target_size
    
    def __getitem__(self, index):
        
        row = self._target_df.iloc[index]

        vector_dict = self._vectorizer.vectorize(row.question, row.context)

        return {"question": vector_dict["question_vector"], 
                "passage": vector_dict["passage_vector"], 
                "question_length": vector_dict["question_length"],
                "answer_start": row.answer_start,
                "none_answer": row.is_impossible}
    def get_num_batches(self, batch_size):
        
        return len(self)//batch_size

# Model

In [1]:
class MJHQAmodel(nn.Module):
    def __init__(self, hidden_state, :
        

SyntaxError: unexpected EOF while parsing (<ipython-input-1-97a335f2d85d>, line 1)

# utils

In [7]:
def generate_nmt_batches(dataset, batch_size, shuffle=True, 
                            drop_last=True, device="cpu"):
    
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
                            shuffle=shuffle, drop_last=drop_last)

    for data_dict in dataloader:
        lengths = data_dict['question_length'].numpy()
        sorted_length_indices = lengths.argsort()[::-1].tolist()
        
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name][sorted_length_indices].to(device)
        yield out_data_dict

In [8]:
def text2list(text_df, column=0):
    
    text_df.loc[:,column] = text_df.loc[:,column].apply(word_tokenize)
    
def mungdata(path, data_batch=10):
    """
    passage单独另外放一起，避免重复计算词频
    """
    QADataset = []
    passage = []
    for i in tqdm_notebook(range(data_batch)):
        with open(path+"/batch_{}.json".format(i+1)) as fp:
            batch = json.load(fp)
            for para in tqdm_notebook(batch):
                title = para['title']
                for par in para['paragraphs']:
                    text = {}
                    text['passage'] = word_tokenize(par['context'].lower())
                    passage.append(text)
                    for question in par['qas']:
                        sample = {}
                        sample['title'] =  title
                        sample['context'] = word_tokenize(par['context'].lower())
                        sample['question'] = word_tokenize(question['question'].lower())
                        sample['is_impossible'] = question['is_impossible']
                        if not question['is_impossible']:
                            sample['answer_start'] = question['answers'][0]['answer_start']
                            sample['answer_text'] = question['answers'][0]['text']
                        else:
                            sample['answer_start'] = -1
                            sample['answer_text'] = ''
                        QADataset.append(sample)
    QADataset = pd.DataFrame(QADataset)
    QADataset['split'] = 0
    passage = pd.DataFrame(passage)
   #由Title分层采样  
    split = StratifiedShuffleSplit(n_splits=1, test_size=args.test_proportion+args.val_proportion, random_state=args.seed)

    for train_index, test_index in split.split(QADataset, QADataset.title):
        QADataset.iloc[train_index,-1] = 'train'#修改值的常规方式，其他方式可能会在copy上修改，没用
        QADataset.iloc[test_index,-1] = 'test'

    split = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=args.seed)

    for _, test_index in split.split(QADataset[QADataset.split=='train'], QADataset[QADataset.split=='train'].title):
        QADataset.iloc[test_index,-1] = 'val'
        
    return pd.DataFrame(QADataset), pd.DataFrame(passage)


In [9]:
class GLOVE(object):
    def __init__(self, dic):
        
        self.word_embedding = dic
        self.size = len(dic)
        
    @classmethod
    def loaddic(cls, path):
        
        with open(path, encoding="utf-8") as fp:
            GLoVe = {}
            for word_e in tqdm_notebook(fp.readlines()):
                word_e = word_e.split(" ")
                GLoVe[word_e[0]] = [float(x) for x in word_e[1:]]
            
        return cls(GLoVe)
    
    def dic2matrix(self, vocab):
        """
        因为array没有append方法，1.所以转成list再转回来
        2.也可以用np.append(好像有点不太对,注意要输入axis)
        """
        matrix = list(np.random.randn(4,50))
        for idx in range(4, len(vocab)):
            if vocab.lookup_index(idx) in self.word_embedding:
                matrix.append(np.array(self.word_embedding[vocab.lookup_index(idx)]))
            else:
                matrix.append(np.random.randn(50))
        return np.array(matrix)

# test

In [10]:
Glove = GLOVE.loaddic('data/GLoVe/glove.6B.50d.txt')


HBox(children=(IntProgress(value=0, max=400000), HTML(value='')))




In [11]:
Vectorize = Vectorizer.from_dataframe(*mungdata('data/SQuAd/BatchSQuAD', data_batch=1))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=45), HTML(value='')))




In [12]:
Glove.dic2matrix(Vectorize.passage_vocab)

array([[-0.70318731, -0.49028236, -0.32181433, ..., -1.04630036,
         0.13988892, -1.73065584],
       [-0.13062312, -1.31026002, -2.17131242, ...,  0.33035865,
        -0.54251518, -1.05202857],
       [-0.77572065, -0.12322816, -0.53693127, ...,  0.8159017 ,
        -0.50309222,  0.1448642 ],
       ...,
       [-0.51983   , -0.28089   , -1.594     , ..., -0.12471   ,
         0.21602   , -0.21437   ],
       [ 0.29329   , -0.49989   , -0.17533   , ..., -0.54471   ,
        -0.22229   ,  0.55851   ],
       [ 0.72141   , -0.15303   , -1.3826    , ...,  0.17129   ,
         0.24375   , -0.34653   ]])

In [13]:
dataset = QADataset.load_from_df(mungdata('data/SQuAd/BatchSQuAD', data_batch=1)[0], Vectorize)

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=45), HTML(value='')))




 #有一个问题 就是预训练的weights怎么放进模型里面 有何种方式

In [14]:
dataset[0]

{'question': array([ 2, 11, 12,  5,  6, 13, 14,  4, 15, 16, 17, 18, 10,  2,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0], dtype=int64),
 'passage': array([ 2,  4,  1,  1,  5,  1,  1,  6,  5,  7,  8,  9, 10, 11,  6, 12, 13,
        14, 15, 10, 16, 10, 17, 18, 19, 20, 21,  7, 19, 22, 23, 24, 10, 25,
        10, 26, 27, 23, 28, 29, 19, 30, 31, 32, 33, 34, 10, 19, 35, 36, 37,
        23, 38, 39, 40, 32, 41, 15, 42, 43, 44, 45,  1, 46, 47, 34, 21, 48,
        49, 50, 51, 10, 52, 53, 10, 38, 54, 55, 56, 42, 38, 57, 47, 58, 59,
        60, 42, 61, 62, 21, 63, 64, 65, 38, 66, 42,  4, 47, 67, 68, 10, 69,
        23, 70,  5, 71,  6, 10, 72, 73, 50, 32, 33, 74, 75, 76, 10, 77, 78,
        79, 80, 19, 81, 38, 82, 83, 84, 85, 86, 87, 88, 23, 70, 89, 19, 87,
        90, 91, 89, 21,  2], dtype=int64),
 'question_length': 14,
 'answer_start': 207,
 'none_answer': False}

In [15]:
for batch in generate_nmt_batches(dataset, 5, shuffle=True, 
                            drop_last=True, device="cpu"):
    print(batch)
    

RuntimeError: invalid argument 0: Sizes of tensors must match except in dimension 0. Got 76 and 40 in dimension 1 at C:\w\1\s\tmp_conda_3.7_055457\conda\conda-bld\pytorch_1565416617654\work\aten\src\TH/generic/THTensor.cpp:689