In [1]:
import pandas as pd
import time
import math
import re
import json

import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss, MSELoss, BCEWithLogitsLoss
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from transformers import BertModel, BertTokenizer, AdamW, BertPreTrainedModel
from transformers import get_linear_schedule_with_warmup

In [2]:
def asMinutes(s): #s = time.time()-start_time
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

In [3]:
class Timer(object):
    """ A quick tic-toc timer
    Credit: http://stackoverflow.com/questions/5849800/tic-toc-functions-analog-in-python
    """
    def __init__(self, name=None, verbose=True):
        self.name = name
        self.verbose = verbose
        self.elapsed = None

    def __enter__(self):
        self.tstart = time.time()
        return self

    def __exit__(self, type, value, traceback):
        self.elapsed = time.time() - self.tstart
        if self.verbose:
            if self.name:
                print ('[%s]' % self.name,)
        print ('Total executing time for : %s' % self.elapsed)

In [4]:
def clean(context, answer, question, start_pos):
    q_len = len(question)
    if q_len > 40:
        question = question[-40:]
        q_len = 40
    if len(context) + q_len + 3 <= 512:
        return context, question
    else:
        remain = 512 - 3 - q_len
        if answer == '':
            return context[:remain], question
        if start_pos + len(answer) -1 < remain:
            return context[:remain], question
        else:
            return None, None

        
def load_data_to_df(data):
#     punc = '[，＠＃＄％＾＆＊，。、“\/.……%$!@#$「」：；%^&*()《》_+？—]'
    para_id, context, question, q_id, answer, answer_start, answer_end, answerable, answer_start= [], [], [], [], [], [], [], [], []
    for i in range(len(data['data'])):
        for article in data['data'][i]['paragraphs']:
            for qa in article['qas']:
                start_pos = qa['answers'][0]['answer_start']
                temp_context, temp_question = clean(article['context'], qa['answers'][0]['text'], qa['question'], start_pos)
                if temp_context == None:
                    continue
                answer_start.append(start_pos)
                question.append(temp_question)
                context.append(temp_context)
                answer.append(qa['answers'][0]['text'])
                para_id.append(data['data'][i]['id'])
                q_id.append(qa['id'])
                answerable.append(qa['answerable'])
    temp = {
        'para_id': para_id,
        'context': context,
        'question': question,
        'q_id': q_id,
        'answer': answer ,
        'answerable': answerable,
        'start_pos' : answer_start
    }
    df = pd.DataFrame(temp)
    return df

In [5]:
with open('./data/train.json' , 'rb') as f:
    train_data = json.loads(f.read())
with open('./data/dev.json' , 'rb') as f:
    valid_data = json.loads(f.read())
train_df = load_data_to_df(train_data)
valid_df = load_data_to_df(valid_data)

In [6]:
train_df.shape

(37341, 7)

In [7]:
valid_df.shape

(4843, 7)

In [8]:
# index = 35443
# question = train_df.iloc[index].question[-40:]
# context = train_df.iloc[index].context[:(512-len(question) - 3)]
# start_pos = train_df.iloc[index].start_pos

In [9]:
class QADataset(Dataset):
    def __init__(self, df, mode , tokenizer):
        assert mode in ["train", "test"]  # 
        self.mode = mode
        self.df = df
        self.tokenizer = tokenizer  # transformer 中的 BERT tokenizer
    
    def __getitem__(self, idx):
        data = self.df.iloc[idx]
        question = data['question']
        context = data['context']
        start_pos, end_pos = -1, -1
        if self.mode == 'train':
            encoded = self.tokenizer.encode_plus(
                question, 
                context, 
                pad_to_max_length = True,
                add_special_tokens  = True,
                return_tensors = 'pt',
                return_token_type_ids = True,
                return_attention_mask = True,
                truncation_strategy = 'only_second'
            )
            inputs_ids = encoded['input_ids'].squeeze(0)
            token_type_ids = encoded['token_type_ids'].squeeze(0)
            attention_mask = encoded['attention_mask'].squeeze(0)
            answer = data['answer']
            start_pos = data['start_pos']
            answerable = 1 if data['answerable'] else 0
            if answerable == 1:
#                 start_pos, end_pos, answerable = self.reassign(inputs_ids, answer, )
#                 print(answer)
                start_pos, end_pos, answerable = self.reassign(question, answer, context[:start_pos])
            start_pos, end_pos, answerable = torch.tensor(start_pos, dtype=torch.long),  torch.tensor(end_pos, dtype=torch.long), torch.tensor(answerable, dtype=torch.long)
            return (inputs_ids, token_type_ids, attention_mask, start_pos, end_pos, answerable)
        encoded = self.tokenizer.encode_plus(
                question, 
                context, 
                max_length  = 512,
                pad_to_max_length = True,
                add_special_tokens  = True,
                return_tensors = 'pt',
                return_token_type_ids = True,
                return_attention_mask = True,
                truncation_strategy = 'only_second'
            )
        
        inputs_ids = encoded['input_ids'][0]
        token_type_ids = encoded['token_type_ids'][0]
        attention_mask = encoded['attention_mask'][0]
        return (inputs_ids, token_type_ids, attention_mask, data['q_id'])
    
    
    def reassign(self, question, answer, b_context):
        q_len = len(self.tokenizer.encode(question, add_special_tokens = False, return_tensors ='pt')[0]) 
        b_len = len(self.tokenizer.encode(b_context, add_special_tokens = False, return_tensors ='pt')[0])
        ans_len = len(self.tokenizer.encode(answer, add_special_tokens = False, return_tensors ='pt')[0])
        start_pos = q_len + b_len + 2
        end_pos = start_pos + ans_len -1
        return start_pos, end_pos, 1

    
    def __len__(self):
        return len(self.df)

In [11]:
class BertQA(BertPreTrainedModel):
    def __init__(self, config):
        super(BertQA, self).__init__(config)
        self.num_labels = config.num_labels
        self.bert = BertModel(config)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
        self.ans_outputs = nn.Linear(config.hidden_size, 2) # crossentropy
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        start_positions=None,
        end_positions=None,
        answerable = None,
        args = None
    ):

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )
        #outputs 
        
        sequence_output = outputs[0] # sequence_output : torch.Size([batch_size, 512, 768])
        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1) #torch.Size([batch_size, 512,])
        end_logits = end_logits.squeeze(-1)
        
        #answerable
        cls = outputs[1] # cls : torch.Size([batch_size, 768])
        cls = self.dropout(cls)
        ans_logits = self.ans_outputs(cls)

        outputs = (start_logits, end_logits, ans_logits) 
        if start_positions is not None and end_positions is not None :
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms

            ignored_index = start_logits.size(1)
            start_positions.clamp_(0, ignored_index)
            end_positions.clamp_(0, ignored_index)
            start_logits = start_logits.masked_fill(token_type_ids == 0 ,  -math.inf)
            end_logits = end_logits.masked_fill(token_type_ids == 0 , -math.inf)

            loss_fct = CrossEntropyLoss(ignore_index = 0)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            
            weight = torch.tensor([0.7, 0.3]).cuda(args.device)
            loss_fct1 =  CrossEntropyLoss(weight = weight)
            ans_loss = loss_fct1(ans_logits, answerable)
            
            total_loss = (start_loss + end_loss + ans_loss) / 3
            outputs = (total_loss, start_loss+ end_loss, ans_loss) + outputs

           
        return outputs  # (loss), start_logits, end_logits, (hidden_states), (attentions)

In [12]:
class Args():
    def __init__(self):
        self.epoch = 4
        self.lr = 1e-5
        self.batch_size = 10
        self.gpu = torch.cuda.is_available()
        self.device = 0

In [13]:
def get_optimizer(model, trainloader_len, epoch, lr):
    optimizer = AdamW(model.parameters(), lr = lr)
    total_steps = trainloader_len * epoch
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 100, num_training_steps = total_steps)
    return optimizer, scheduler

In [14]:
def train(data, model, optimizer, scheduler, args):
    inputs_ids, token_type_ids, attention_mask, start_pos, end_pos, answerable = [i.cuda(args.device) if args.gpu else i for i in data ]
    outputs = model(inputs_ids, attention_mask= attention_mask, token_type_ids= token_type_ids ,start_positions = start_pos, end_positions = end_pos, answerable = answerable, args = args)
    loss, SE_loss, ans_loss = outputs [0], outputs[1], outputs[2]
    loss.backward() # calculate gradient
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()
    scheduler.step()
    model.zero_grad()
    return loss, SE_loss, ans_loss

def evals(data, model, args):
    with torch.no_grad():
        inputs_ids, token_type_ids, attention_mask, start_pos, end_pos, answerable = [i.cuda(args.device) if args.gpu else i for i in data ]
        outputs = model(inputs_ids, attention_mask= attention_mask, token_type_ids= token_type_ids ,start_positions = start_pos, end_positions = end_pos, answerable = answerable, args =args)
        loss, SE_loss, ans_loss = outputs [0], outputs[1], outputs[2]
    return loss, SE_loss, ans_loss

In [15]:
def train_iters():
    args = Args()
    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese', do_lower_case=True)

    trainset = QADataset(train_df, "train", tokenizer=tokenizer)
    trainloader = DataLoader(trainset, batch_size = args.batch_size, drop_last = True, shuffle= True)

    validset = QADataset(valid_df, "train", tokenizer=tokenizer)
    validloader = DataLoader(validset, batch_size = args.batch_size, drop_last = True, shuffle= True)
    model = BertQA.from_pretrained('bert-base-chinese')
    if args.gpu:
        model = model.cuda(args.device)
    model.train()
#     model.bert.embeddings.requires_grad = False
    train_loss_list, val_loss_list = [], []
    for i in range(args.epoch):
        index, total, total_SE_loss, total_ans_loss, total_loss, val_loss, min_val_loss = 0, 0, 0, 0, 0, 0, 100000
        optimizer, scheduler = get_optimizer(model, len(trainloader), args.epoch, args.lr)
        start_time = time.time()
        with Timer():
            for data in trainloader:
                loss, SE_loss, ans_loss = train(data, model, optimizer, scheduler, args)
                total += 1
                total_SE_loss += SE_loss
                total_ans_loss += ans_loss
                total_loss += loss 
                print(f'Epoch : { i + 1} / { args.epoch } ,iterations: {index}, SE_Loss: {total_SE_loss/ total}, ANS_Loss :{total_ans_loss/total},Average_Training Loss : {total_loss / total }', end = '\r')
                index += 1
            print(f'\n Total training time: {time.time() - start_time} ', end = '\n')
        train_loss_list.append(total_loss/ total)
        
        index, total, total_loss ,index = 0, 0, 0, 0
        start_time = time.time()
        with Timer():
            for data in validloader:
                loss, SE_loss, ans_loss = evals(data, model, args)
                total += 1
                total_loss += loss 
                print(f'Epoch : { i + 1} / { args.epoch } ,iterations: {index}, Average_Training Loss : {total_loss / total }', end = '\r')
                index += 1
            print(f'\n Total Validation time: {time.time() - start_time } ', end = '\n')
            val_loss = total_loss / total
            val_loss_list.append(val_loss)
        if val_loss < min_val_loss: 
            min_val_loss = val_loss
            checkpoint_path = f'./model/bertqa_ckpt_{i}.pt'
            torch.save({
                    'epoch': i,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'train_loss_list' : train_loss_list,
                    }, checkpoint_path)
    return train_loss_list, val_loss_list

In [25]:
train_iters()

## May be better:
- freeze embedding layer
- lr

## Prediction part

In [19]:
def clean_test(context, question):
    q_len = len(question)
    if q_len > 40:
        question = question[-40:]
        q_len = 40
    if len(context) + q_len + 3 < 512:
        return context, question
    else:
        remain = 512 - 3 - q_len
        return context[:remain], question
    
def load_data_to_df_test(test):
#     punc = '[，＠＃＄％＾＆＊，。、“\/.……%$!@#$「」：；%^&*()《》_+？—]'
    para_id, context, question, q_id, = [],[],[],[]
    for i in range(len(test['data'])):
        for article in test['data'][i]['paragraphs']:
            for qa in article['qas']:
                temp_context, temp_question = clean_test(article['context'], qa['question'])
                context.append(temp_context)
                question.append(temp_question)
                para_id.append(test['data'][i]['id'])
                q_id.append(qa['id'])

    data = {
        'para_id': para_id,
        'context': context,
        'question': question,
        'q_id': q_id,
    }
    test_df = pd.DataFrame(data=data)
    return test_df

In [20]:
class Args_test():
    def __init__(self):
        self.batch_size = 5
        self.gpu = torch.cuda.is_available()
        self.device = 0

In [21]:
def test(data, model, tokenizer, args):
    inputs_ids, token_type_ids, attention_mask = [i.cuda(args.device) if args.gpu else i for i in data[:3]]
    q_id = data[3]
    (start_score, end_score, answerable) = model(inputs_ids, attention_mask= attention_mask, token_type_ids= token_type_ids, args = args)
    a_val, a_idx = answerable.max(-1)

    start_score = start_score.masked_fill(token_type_ids == 0 ,  -100)
    end_score = end_score.masked_fill(token_type_ids == 0 ,  -100)
    s_val, s_idx = start_score.max(-1)
    e_val, e_idx = end_score.max(-1)
    ret = dict()
    for i in range(len(s_idx)):
        if a_idx[i] == 0:
            ans = ''
        else:
            ans = ''
            if s_idx[i] <= e_idx[i]:
                ans = inputs_ids[i][s_idx[i]: e_idx[i] + 1].tolist()
                ans = ''.join([tokenizer.convert_ids_to_tokens(i) for i in ans])
                spe_tok = ['#', '[CLS]', '[SEP]', '[UNK]', ' ']
                for j in spe_tok:
                    ans = ans.replace(j, '')
        ret[q_id[i]] = ans[:30]
    return ret


In [22]:
def test_iters(model_name):
    args = Args_test()
    
    with open('./data/dev.json' , 'rb') as f:
        test_data = json.loads(f.read())
    test_df = load_data_to_df_test(test_data)
    
    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese', do_lower_case=True)
    testset = QADataset(test_df, "test", tokenizer=tokenizer)
    testloader = DataLoader(testset, batch_size = args.batch_size)
    
    # load model
    model = BertQA.from_pretrained('bert-base-chinese')
    ckpt = torch.load(f'./model/{model_name}')
    model.load_state_dict(ckpt['model_state_dict'])
    if args.gpu:
        model = model.cuda(args.device)
    prediction = dict()
    index = 0
    with Timer():
        with torch.no_grad():
            for data in testloader:
                pred = test(data, model, tokenizer, args)
                prediction.update(pred)
                print(f'prediction iterations: {index}, ans :{pred}', end = '\r')
                index += 1
    return prediction  

In [23]:
prediction = test_iters('bertqa_ckpt_3.pt')

Total executing time for : 83.69748377799988ba49754326d04cd0f8bd1545dc13ea4c864dd4c48c4a6b2d': '1992年', '0ffb7690f0a695f9f4d5e4d6d2e3e3a5446d5f88a61469b37b8d1bea': '2001年', 'e2213cd2c51f247019d27a4041141929740f65f390c4f78c30e33665': '2002年', '5aa22d46fb63e21de0bad92ea2ad6557f5be0b488db2de89b8d4c4e0': '美國'}497ad27c18e06836b5a04c127983ee05997f4a1f2dd8ccbd13d339e': '庾澄慶'}'}厥如尼字母、回汗國'}fa370563a84ea97ec039fe0cd4a': '11世紀'}


In [24]:
with open('prediction.json','w') as f:
    f.write(json.dumps(prediction, ensure_ascii=False))      