In [1]:
import string
import re
from nltk.corpus import wordnet

# stemming using by PorterStemmer: https://tartarus.org/martin/PorterStemmer/

class Tokenization():
    
    def __init__(self, version='en'):        
        self.stopwords = set()     
        if version == 'en':
            # shorten patterns  
            self.shorten_patterns = [
                           (r'won\'t', 'will not'),
                           (r'can\'t', 'cannot'),
                           (r'i\'m', 'i am'),
                           (r'(\w+)\'ll', '\g<1> will'),
                           (r'(\w+)n\'t', '\g<1> not'),
                           (r'(\w+)\'ve', '\g<1> have'),
                           (r'(\w+)\'s', '\g<1> is'),
                           (r'(\w+)\'re', '\g<1> are') ]         
             # repeat patterns
            self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
            self.repl = r'\1\2\3'
            # remove punctuation pattern, only [a-zA-Z0-9]
            self.punctuation_pattern = re.compile("[^\w]+", re.U)
            # remove number pattern, only [a-zA-Z]
            self.pattern = re.compile("[^\D]+", re.U)
            
        elif version == 'zh-tw':
            self.pattern = ''
            
    def shorten_replace(self, text):
        s = text
        for (pattern, repl) in self.shorten_patterns:
            s = re.sub(pattern, repl, s)
        return s
        
    def remove_punctuation(self, text):
        output_text = ''
        for c in text:
            if c not in string.punctuation:
                output_text += ''.join(c)
            else:
                output_text += ''.join(' ')
        return output_text
    
    def load_stopword_userdict(self, file_name):
        with open(file_name, 'r') as f:
            for word in f.readlines():
                if word not in self.stopwords:
                    self.stopwords.add(word.strip())            
        f.close()
        
    def load_stopword_dict(self):        
        self.load_stopword_userdict('stopwords.txt')
        self.stopwords = list(self.stopwords)
        self.stopwords.sort()
    
    def remove_stopwords(self, text):
        word_sequence = text.split()
        return ' '.join(word for word in word_sequence if word not in self.stopwords) 
                    
    def word_segmentation(self, pattern, word_sequence):
        return ' '.join(word for word in re.sub(pattern, ' ', word_sequence).split())
        
    def stemming(self, text):
        output_text = ''
        for word in text.split():
            output_text += self.porter_stem.stem(word, 0, len(word)-1)
            output_text += ' '
        return output_text
    
    def cut(self, text, shorten=True, punctuation=True, number=True, stopword=True, stemming=True):
        self.text = text.lower()      
        if shorten:
            self.text = self.shorten_replace(self.text)
        if punctuation:
            self.text = self.word_segmentation(self.punctuation_pattern, self.text)
        if number:
            self.text = self.word_segmentation(self.pattern, self.text)      
        if stopword:
            self.load_stopword_dict()
            self.text = self.remove_stopwords(self.text)
        if stemming:
            self.porter_stem = PorterStemmer()
            self.text = self.stemming(self.text)
        word_sequence = self.text.split()
        return ' '.join(word for word in word_sequence)

In [2]:
from time import time
from datetime import timedelta
from copy import deepcopy

import random
import numpy as np
import pandas as pd
from ml_metrics import mapk

import torch
from torch.optim import AdamW
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertForMultipleChoice

# Random seed
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# CUDA device
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


## Settings

In [3]:
# Input files
document_csv_path = '../input/ntust-ir2020-homework6/documents.csv'
training_csv_path = '../input/ntust-ir2020-homework6/train_queries.csv'
testing_csv_path = '../input/ntust-ir2020-homework6/test_queries.csv'

# Input limitation
max_query_length = 64
max_input_length = 512
num_negatives = 3   # num. of negative documents to pair with a positive document

# Model finetuning
model_name_or_path = "bert-base-uncased"
max_epochs = 6
learning_rate = 2e-5
dev_set_ratio = 0.2   # make a ratio of training set as development set for rescoring weight sniffing
max_patience = 0      # earlystop if avg. loss on development set doesn't decrease for num. of epochs
batch_size = 4    # num. of inputs = 8 requires ~9200 MB VRAM (num. of inputs = batch_size * (num_negatives + 1))
num_workers = 2   # num. of jobs for pytorch dataloader

# Save paths
save_model_path = "models/bert_base_uncased"  # assign `None` for not saving the model
save_submission_path = "bm25_bert_rescoring.csv"
K = 1000   # for MAP@K

## Preparing

In [4]:
my_tokenizer = Tokenization()

In [5]:
# Build and save BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained(model_name_or_path)
if save_model_path is not None:
    save_tokenizer_path = "%s/tokenizer" % (save_model_path)
    tokenizer.save_pretrained(save_tokenizer_path)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




In [6]:
# Collect mapping of all document id and text
doc_id_to_text = {}
doc_df = pd.read_csv(document_csv_path)
doc_df.fillna("<Empty Document>", inplace=True)
id_text_pair = zip(doc_df["doc_id"], doc_df["doc_text"])
for i, pair in enumerate(id_text_pair, start=1):
    doc_id, doc_text = pair
    doc_id_to_text[doc_id] = doc_text
    
    print("Progress: %d/%d\r" % (i, len(doc_df)), end='')
    
doc_df.tail()

Progress: 1/100000Progress: 2/100000Progress: 3/100000Progress: 4/100000Progress: 5/100000Progress: 6/100000Progress: 7/100000Progress: 8/100000Progress: 9/100000Progress: 10/100000Progress: 11/100000Progress: 12/100000Progress: 13/100000Progress: 14/100000Progress: 15/100000Progress: 16/100000Progress: 17/100000Progress: 18/100000Progress: 19/100000Progress: 20/100000Progress: 21/100000Progress: 22/100000Progress: 23/100000Progress: 24/100000Progress: 25/100000Progress: 26/100000Progress: 27/100000Progress: 28/100000Progress: 29/100000Progress: 30/100000Progress: 31/100000Progress: 32/100000Progress: 33/100000Progress: 34/100000Progress: 35/100000Progress: 36/100000Progress: 37/100000Progress: 38/100000Progress: 39/100000Progress: 40/100000Progress: 41/100000Progress: 42/100000Progress: 43/100000Progress: 44/100000Progress: 45/100000Progress: 46/100000Progress: 47/100000Progress: 48/100000Progress: 49/100000Progress: 50/100000Progress:

Progress: 2435/100000Progress: 2436/100000Progress: 2437/100000Progress: 2438/100000Progress: 2439/100000Progress: 2440/100000Progress: 2441/100000Progress: 2442/100000Progress: 2443/100000Progress: 2444/100000Progress: 2445/100000Progress: 2446/100000Progress: 2447/100000Progress: 2448/100000Progress: 2449/100000Progress: 2450/100000Progress: 2451/100000Progress: 2452/100000Progress: 2453/100000Progress: 2454/100000Progress: 2455/100000Progress: 2456/100000Progress: 2457/100000Progress: 2458/100000Progress: 2459/100000Progress: 2460/100000Progress: 2461/100000Progress: 2462/100000Progress: 2463/100000Progress: 2464/100000Progress: 2465/100000Progress: 2466/100000Progress: 2467/100000Progress: 2468/100000Progress: 2469/100000Progress: 2470/100000Progress: 2471/100000Progress: 2472/100000Progress: 2473/100000Progress: 2474/100000Progress: 2475/100000Progress: 2476/100000Progress: 2477/100000Progress: 2478/100000Progress: 2479/100000Progress: 

Progress: 4434/100000Progress: 4435/100000Progress: 4436/100000Progress: 4437/100000Progress: 4438/100000Progress: 4439/100000Progress: 4440/100000Progress: 4441/100000Progress: 4442/100000Progress: 4443/100000Progress: 4444/100000Progress: 4445/100000Progress: 4446/100000Progress: 4447/100000Progress: 4448/100000Progress: 4449/100000Progress: 4450/100000Progress: 4451/100000Progress: 4452/100000Progress: 4453/100000Progress: 4454/100000Progress: 4455/100000Progress: 4456/100000Progress: 4457/100000Progress: 4458/100000Progress: 4459/100000Progress: 4460/100000Progress: 4461/100000Progress: 4462/100000Progress: 4463/100000Progress: 4464/100000Progress: 4465/100000Progress: 4466/100000Progress: 4467/100000Progress: 4468/100000Progress: 4469/100000Progress: 4470/100000Progress: 4471/100000Progress: 4472/100000Progress: 4473/100000Progress: 4474/100000Progress: 4475/100000Progress: 4476/100000Progress: 4477/100000Progress: 4478/100000Progress: 

Progress: 6934/100000Progress: 6935/100000Progress: 6936/100000Progress: 6937/100000Progress: 6938/100000Progress: 6939/100000Progress: 6940/100000Progress: 6941/100000Progress: 6942/100000Progress: 6943/100000Progress: 6944/100000Progress: 6945/100000Progress: 6946/100000Progress: 6947/100000Progress: 6948/100000Progress: 6949/100000Progress: 6950/100000Progress: 6951/100000Progress: 6952/100000Progress: 6953/100000Progress: 6954/100000Progress: 6955/100000Progress: 6956/100000Progress: 6957/100000Progress: 6958/100000Progress: 6959/100000Progress: 6960/100000Progress: 6961/100000Progress: 6962/100000Progress: 6963/100000Progress: 6964/100000Progress: 6965/100000Progress: 6966/100000Progress: 6967/100000Progress: 6968/100000Progress: 6969/100000Progress: 6970/100000Progress: 6971/100000Progress: 6972/100000Progress: 6973/100000Progress: 6974/100000Progress: 6975/100000Progress: 6976/100000Progress: 6977/100000Progress: 6978/100000Progress: 

Progress: 9292/100000Progress: 9293/100000Progress: 9294/100000Progress: 9295/100000Progress: 9296/100000Progress: 9297/100000Progress: 9298/100000Progress: 9299/100000Progress: 9300/100000Progress: 9301/100000Progress: 9302/100000Progress: 9303/100000Progress: 9304/100000Progress: 9305/100000Progress: 9306/100000Progress: 9307/100000Progress: 9308/100000Progress: 9309/100000Progress: 9310/100000Progress: 9311/100000Progress: 9312/100000Progress: 9313/100000Progress: 9314/100000Progress: 9315/100000Progress: 9316/100000Progress: 9317/100000Progress: 9318/100000Progress: 9319/100000Progress: 9320/100000Progress: 9321/100000Progress: 9322/100000Progress: 9323/100000Progress: 9324/100000Progress: 9325/100000Progress: 9326/100000Progress: 9327/100000Progress: 9328/100000Progress: 9329/100000Progress: 9330/100000Progress: 9331/100000Progress: 9332/100000Progress: 9333/100000Progress: 9334/100000Progress: 9335/100000Progress: 9336/100000Progress: 

Progress: 11433/100000Progress: 11434/100000Progress: 11435/100000Progress: 11436/100000Progress: 11437/100000Progress: 11438/100000Progress: 11439/100000Progress: 11440/100000Progress: 11441/100000Progress: 11442/100000Progress: 11443/100000Progress: 11444/100000Progress: 11445/100000Progress: 11446/100000Progress: 11447/100000Progress: 11448/100000Progress: 11449/100000Progress: 11450/100000Progress: 11451/100000Progress: 11452/100000Progress: 11453/100000Progress: 11454/100000Progress: 11455/100000Progress: 11456/100000Progress: 11457/100000Progress: 11458/100000Progress: 11459/100000Progress: 11460/100000Progress: 11461/100000Progress: 11462/100000Progress: 11463/100000Progress: 11464/100000Progress: 11465/100000Progress: 11466/100000Progress: 11467/100000Progress: 11468/100000Progress: 11469/100000Progress: 11470/100000Progress: 11471/100000Progress: 11472/100000Progress: 11473/100000Progress: 11474/100000Progress: 11475/100000Progress: 1

Progress: 13806/100000Progress: 13807/100000Progress: 13808/100000Progress: 13809/100000Progress: 13810/100000Progress: 13811/100000Progress: 13812/100000Progress: 13813/100000Progress: 13814/100000Progress: 13815/100000Progress: 13816/100000Progress: 13817/100000Progress: 13818/100000Progress: 13819/100000Progress: 13820/100000Progress: 13821/100000Progress: 13822/100000Progress: 13823/100000Progress: 13824/100000Progress: 13825/100000Progress: 13826/100000Progress: 13827/100000Progress: 13828/100000Progress: 13829/100000Progress: 13830/100000Progress: 13831/100000Progress: 13832/100000Progress: 13833/100000Progress: 13834/100000Progress: 13835/100000Progress: 13836/100000Progress: 13837/100000Progress: 13838/100000Progress: 13839/100000Progress: 13840/100000Progress: 13841/100000Progress: 13842/100000Progress: 13843/100000Progress: 13844/100000Progress: 13845/100000Progress: 13846/100000Progress: 13847/100000Progress: 13848/100000Progress: 1

Progress: 15420/100000Progress: 15421/100000Progress: 15422/100000Progress: 15423/100000Progress: 15424/100000Progress: 15425/100000Progress: 15426/100000Progress: 15427/100000Progress: 15428/100000Progress: 15429/100000Progress: 15430/100000Progress: 15431/100000Progress: 15432/100000Progress: 15433/100000Progress: 15434/100000Progress: 15435/100000Progress: 15436/100000Progress: 15437/100000Progress: 15438/100000Progress: 15439/100000Progress: 15440/100000Progress: 15441/100000Progress: 15442/100000Progress: 15443/100000Progress: 15444/100000Progress: 15445/100000Progress: 15446/100000Progress: 15447/100000Progress: 15448/100000Progress: 15449/100000Progress: 15450/100000Progress: 15451/100000Progress: 15452/100000Progress: 15453/100000Progress: 15454/100000Progress: 15455/100000Progress: 15456/100000Progress: 15457/100000Progress: 15458/100000Progress: 15459/100000Progress: 15460/100000Progress: 15461/100000Progress: 15462/100000Progress: 1

Progress: 16931/100000Progress: 16932/100000Progress: 16933/100000Progress: 16934/100000Progress: 16935/100000Progress: 16936/100000Progress: 16937/100000Progress: 16938/100000Progress: 16939/100000Progress: 16940/100000Progress: 16941/100000Progress: 16942/100000Progress: 16943/100000Progress: 16944/100000Progress: 16945/100000Progress: 16946/100000Progress: 16947/100000Progress: 16948/100000Progress: 16949/100000Progress: 16950/100000Progress: 16951/100000Progress: 16952/100000Progress: 16953/100000Progress: 16954/100000Progress: 16955/100000Progress: 16956/100000Progress: 16957/100000Progress: 16958/100000Progress: 16959/100000Progress: 16960/100000Progress: 16961/100000Progress: 16962/100000Progress: 16963/100000Progress: 16964/100000Progress: 16965/100000Progress: 16966/100000Progress: 16967/100000Progress: 16968/100000Progress: 16969/100000Progress: 16970/100000Progress: 16971/100000Progress: 16972/100000Progress: 16973/100000Progress: 1

Progress: 18819/100000Progress: 18820/100000Progress: 18821/100000Progress: 18822/100000Progress: 18823/100000Progress: 18824/100000Progress: 18825/100000Progress: 18826/100000Progress: 18827/100000Progress: 18828/100000Progress: 18829/100000Progress: 18830/100000Progress: 18831/100000Progress: 18832/100000Progress: 18833/100000Progress: 18834/100000Progress: 18835/100000Progress: 18836/100000Progress: 18837/100000Progress: 18838/100000Progress: 18839/100000Progress: 18840/100000Progress: 18841/100000Progress: 18842/100000Progress: 18843/100000Progress: 18844/100000Progress: 18845/100000Progress: 18846/100000Progress: 18847/100000Progress: 18848/100000Progress: 18849/100000Progress: 18850/100000Progress: 18851/100000Progress: 18852/100000Progress: 18853/100000Progress: 18854/100000Progress: 18855/100000Progress: 18856/100000Progress: 18857/100000Progress: 18858/100000Progress: 18859/100000Progress: 18860/100000Progress: 18861/100000Progress: 1

Progress: 20930/100000Progress: 20931/100000Progress: 20932/100000Progress: 20933/100000Progress: 20934/100000Progress: 20935/100000Progress: 20936/100000Progress: 20937/100000Progress: 20938/100000Progress: 20939/100000Progress: 20940/100000Progress: 20941/100000Progress: 20942/100000Progress: 20943/100000Progress: 20944/100000Progress: 20945/100000Progress: 20946/100000Progress: 20947/100000Progress: 20948/100000Progress: 20949/100000Progress: 20950/100000Progress: 20951/100000Progress: 20952/100000Progress: 20953/100000Progress: 20954/100000Progress: 20955/100000Progress: 20956/100000Progress: 20957/100000Progress: 20958/100000Progress: 20959/100000Progress: 20960/100000Progress: 20961/100000Progress: 20962/100000Progress: 20963/100000Progress: 20964/100000Progress: 20965/100000Progress: 20966/100000Progress: 20967/100000Progress: 20968/100000Progress: 20969/100000Progress: 20970/100000Progress: 20971/100000Progress: 20972/100000Progress: 2

Progress: 23314/100000Progress: 23315/100000Progress: 23316/100000Progress: 23317/100000Progress: 23318/100000Progress: 23319/100000Progress: 23320/100000Progress: 23321/100000Progress: 23322/100000Progress: 23323/100000Progress: 23324/100000Progress: 23325/100000Progress: 23326/100000Progress: 23327/100000Progress: 23328/100000Progress: 23329/100000Progress: 23330/100000Progress: 23331/100000Progress: 23332/100000Progress: 23333/100000Progress: 23334/100000Progress: 23335/100000Progress: 23336/100000Progress: 23337/100000Progress: 23338/100000Progress: 23339/100000Progress: 23340/100000Progress: 23341/100000Progress: 23342/100000Progress: 23343/100000Progress: 23344/100000Progress: 23345/100000Progress: 23346/100000Progress: 23347/100000Progress: 23348/100000Progress: 23349/100000Progress: 23350/100000Progress: 23351/100000Progress: 23352/100000Progress: 23353/100000Progress: 23354/100000Progress: 23355/100000Progress: 23356/100000Progress: 2

Progress: 25429/100000Progress: 25430/100000Progress: 25431/100000Progress: 25432/100000Progress: 25433/100000Progress: 25434/100000Progress: 25435/100000Progress: 25436/100000Progress: 25437/100000Progress: 25438/100000Progress: 25439/100000Progress: 25440/100000Progress: 25441/100000Progress: 25442/100000Progress: 25443/100000Progress: 25444/100000Progress: 25445/100000Progress: 25446/100000Progress: 25447/100000Progress: 25448/100000Progress: 25449/100000Progress: 25450/100000Progress: 25451/100000Progress: 25452/100000Progress: 25453/100000Progress: 25454/100000Progress: 25455/100000Progress: 25456/100000Progress: 25457/100000Progress: 25458/100000Progress: 25459/100000Progress: 25460/100000Progress: 25461/100000Progress: 25462/100000Progress: 25463/100000Progress: 25464/100000Progress: 25465/100000Progress: 25466/100000Progress: 25467/100000Progress: 25468/100000Progress: 25469/100000Progress: 25470/100000Progress: 25471/100000Progress: 2

Progress: 27884/100000Progress: 27885/100000Progress: 27886/100000Progress: 27887/100000Progress: 27888/100000Progress: 27889/100000Progress: 27890/100000Progress: 27891/100000Progress: 27892/100000Progress: 27893/100000Progress: 27894/100000Progress: 27895/100000Progress: 27896/100000Progress: 27897/100000Progress: 27898/100000Progress: 27899/100000Progress: 27900/100000Progress: 27901/100000Progress: 27902/100000Progress: 27903/100000Progress: 27904/100000Progress: 27905/100000Progress: 27906/100000Progress: 27907/100000Progress: 27908/100000Progress: 27909/100000Progress: 27910/100000Progress: 27911/100000Progress: 27912/100000Progress: 27913/100000Progress: 27914/100000Progress: 27915/100000Progress: 27916/100000Progress: 27917/100000Progress: 27918/100000Progress: 27919/100000Progress: 27920/100000Progress: 27921/100000Progress: 27922/100000Progress: 27923/100000Progress: 27924/100000Progress: 27925/100000Progress: 27926/100000Progress: 2

Progress: 29928/100000Progress: 29929/100000Progress: 29930/100000Progress: 29931/100000Progress: 29932/100000Progress: 29933/100000Progress: 29934/100000Progress: 29935/100000Progress: 29936/100000Progress: 29937/100000Progress: 29938/100000Progress: 29939/100000Progress: 29940/100000Progress: 29941/100000Progress: 29942/100000Progress: 29943/100000Progress: 29944/100000Progress: 29945/100000Progress: 29946/100000Progress: 29947/100000Progress: 29948/100000Progress: 29949/100000Progress: 29950/100000Progress: 29951/100000Progress: 29952/100000Progress: 29953/100000Progress: 29954/100000Progress: 29955/100000Progress: 29956/100000Progress: 29957/100000Progress: 29958/100000Progress: 29959/100000Progress: 29960/100000Progress: 29961/100000Progress: 29962/100000Progress: 29963/100000Progress: 29964/100000Progress: 29965/100000Progress: 29966/100000Progress: 29967/100000Progress: 29968/100000Progress: 29969/100000Progress: 29970/100000Progress: 2

Progress: 32414/100000Progress: 32415/100000Progress: 32416/100000Progress: 32417/100000Progress: 32418/100000Progress: 32419/100000Progress: 32420/100000Progress: 32421/100000Progress: 32422/100000Progress: 32423/100000Progress: 32424/100000Progress: 32425/100000Progress: 32426/100000Progress: 32427/100000Progress: 32428/100000Progress: 32429/100000Progress: 32430/100000Progress: 32431/100000Progress: 32432/100000Progress: 32433/100000Progress: 32434/100000Progress: 32435/100000Progress: 32436/100000Progress: 32437/100000Progress: 32438/100000Progress: 32439/100000Progress: 32440/100000Progress: 32441/100000Progress: 32442/100000Progress: 32443/100000Progress: 32444/100000Progress: 32445/100000Progress: 32446/100000Progress: 32447/100000Progress: 32448/100000Progress: 32449/100000Progress: 32450/100000Progress: 32451/100000Progress: 32452/100000Progress: 32453/100000Progress: 32454/100000Progress: 32455/100000Progress: 32456/100000Progress: 3

Progress: 34427/100000Progress: 34428/100000Progress: 34429/100000Progress: 34430/100000Progress: 34431/100000Progress: 34432/100000Progress: 34433/100000Progress: 34434/100000Progress: 34435/100000Progress: 34436/100000Progress: 34437/100000Progress: 34438/100000Progress: 34439/100000Progress: 34440/100000Progress: 34441/100000Progress: 34442/100000Progress: 34443/100000Progress: 34444/100000Progress: 34445/100000Progress: 34446/100000Progress: 34447/100000Progress: 34448/100000Progress: 34449/100000Progress: 34450/100000Progress: 34451/100000Progress: 34452/100000Progress: 34453/100000Progress: 34454/100000Progress: 34455/100000Progress: 34456/100000Progress: 34457/100000Progress: 34458/100000Progress: 34459/100000Progress: 34460/100000Progress: 34461/100000Progress: 34462/100000Progress: 34463/100000Progress: 34464/100000Progress: 34465/100000Progress: 34466/100000Progress: 34467/100000Progress: 34468/100000Progress: 34469/100000Progress: 3

Progress: 36927/100000Progress: 36928/100000Progress: 36929/100000Progress: 36930/100000Progress: 36931/100000Progress: 36932/100000Progress: 36933/100000Progress: 36934/100000Progress: 36935/100000Progress: 36936/100000Progress: 36937/100000Progress: 36938/100000Progress: 36939/100000Progress: 36940/100000Progress: 36941/100000Progress: 36942/100000Progress: 36943/100000Progress: 36944/100000Progress: 36945/100000Progress: 36946/100000Progress: 36947/100000Progress: 36948/100000Progress: 36949/100000Progress: 36950/100000Progress: 36951/100000Progress: 36952/100000Progress: 36953/100000Progress: 36954/100000Progress: 36955/100000Progress: 36956/100000Progress: 36957/100000Progress: 36958/100000Progress: 36959/100000Progress: 36960/100000Progress: 36961/100000Progress: 36962/100000Progress: 36963/100000Progress: 36964/100000Progress: 36965/100000Progress: 36966/100000Progress: 36967/100000Progress: 36968/100000Progress: 36969/100000Progress: 3

Progress: 38411/100000Progress: 38412/100000Progress: 38413/100000Progress: 38414/100000Progress: 38415/100000Progress: 38416/100000Progress: 38417/100000Progress: 38418/100000Progress: 38419/100000Progress: 38420/100000Progress: 38421/100000Progress: 38422/100000Progress: 38423/100000Progress: 38424/100000Progress: 38425/100000Progress: 38426/100000Progress: 38427/100000Progress: 38428/100000Progress: 38429/100000Progress: 38430/100000Progress: 38431/100000Progress: 38432/100000Progress: 38433/100000Progress: 38434/100000Progress: 38435/100000Progress: 38436/100000Progress: 38437/100000Progress: 38438/100000Progress: 38439/100000Progress: 38440/100000Progress: 38441/100000Progress: 38442/100000Progress: 38443/100000Progress: 38444/100000Progress: 38445/100000Progress: 38446/100000Progress: 38447/100000Progress: 38448/100000Progress: 38449/100000Progress: 38450/100000Progress: 38451/100000Progress: 38452/100000Progress: 38453/100000Progress: 3

Progress: 40426/100000Progress: 40427/100000Progress: 40428/100000Progress: 40429/100000Progress: 40430/100000Progress: 40431/100000Progress: 40432/100000Progress: 40433/100000Progress: 40434/100000Progress: 40435/100000Progress: 40436/100000Progress: 40437/100000Progress: 40438/100000Progress: 40439/100000Progress: 40440/100000Progress: 40441/100000Progress: 40442/100000Progress: 40443/100000Progress: 40444/100000Progress: 40445/100000Progress: 40446/100000Progress: 40447/100000Progress: 40448/100000Progress: 40449/100000Progress: 40450/100000Progress: 40451/100000Progress: 40452/100000Progress: 40453/100000Progress: 40454/100000Progress: 40455/100000Progress: 40456/100000Progress: 40457/100000Progress: 40458/100000Progress: 40459/100000Progress: 40460/100000Progress: 40461/100000Progress: 40462/100000Progress: 40463/100000Progress: 40464/100000Progress: 40465/100000Progress: 40466/100000Progress: 40467/100000Progress: 40468/100000Progress: 4

Progress: 42569/100000Progress: 42570/100000Progress: 42571/100000Progress: 42572/100000Progress: 42573/100000Progress: 42574/100000Progress: 42575/100000Progress: 42576/100000Progress: 42577/100000Progress: 42578/100000Progress: 42579/100000Progress: 42580/100000Progress: 42581/100000Progress: 42582/100000Progress: 42583/100000Progress: 42584/100000Progress: 42585/100000Progress: 42586/100000Progress: 42587/100000Progress: 42588/100000Progress: 42589/100000Progress: 42590/100000Progress: 42591/100000Progress: 42592/100000Progress: 42593/100000Progress: 42594/100000Progress: 42595/100000Progress: 42596/100000Progress: 42597/100000Progress: 42598/100000Progress: 42599/100000Progress: 42600/100000Progress: 42601/100000Progress: 42602/100000Progress: 42603/100000Progress: 42604/100000Progress: 42605/100000Progress: 42606/100000Progress: 42607/100000Progress: 42608/100000Progress: 42609/100000Progress: 42610/100000Progress: 42611/100000Progress: 4

Progress: 44786/100000Progress: 44787/100000Progress: 44788/100000Progress: 44789/100000Progress: 44790/100000Progress: 44791/100000Progress: 44792/100000Progress: 44793/100000Progress: 44794/100000Progress: 44795/100000Progress: 44796/100000Progress: 44797/100000Progress: 44798/100000Progress: 44799/100000Progress: 44800/100000Progress: 44801/100000Progress: 44802/100000Progress: 44803/100000Progress: 44804/100000Progress: 44805/100000Progress: 44806/100000Progress: 44807/100000Progress: 44808/100000Progress: 44809/100000Progress: 44810/100000Progress: 44811/100000Progress: 44812/100000Progress: 44813/100000Progress: 44814/100000Progress: 44815/100000Progress: 44816/100000Progress: 44817/100000Progress: 44818/100000Progress: 44819/100000Progress: 44820/100000Progress: 44821/100000Progress: 44822/100000Progress: 44823/100000Progress: 44824/100000Progress: 44825/100000Progress: 44826/100000Progress: 44827/100000Progress: 44828/100000Progress: 4

Progress: 46924/100000Progress: 46925/100000Progress: 46926/100000Progress: 46927/100000Progress: 46928/100000Progress: 46929/100000Progress: 46930/100000Progress: 46931/100000Progress: 46932/100000Progress: 46933/100000Progress: 46934/100000Progress: 46935/100000Progress: 46936/100000Progress: 46937/100000Progress: 46938/100000Progress: 46939/100000Progress: 46940/100000Progress: 46941/100000Progress: 46942/100000Progress: 46943/100000Progress: 46944/100000Progress: 46945/100000Progress: 46946/100000Progress: 46947/100000Progress: 46948/100000Progress: 46949/100000Progress: 46950/100000Progress: 46951/100000Progress: 46952/100000Progress: 46953/100000Progress: 46954/100000Progress: 46955/100000Progress: 46956/100000Progress: 46957/100000Progress: 46958/100000Progress: 46959/100000Progress: 46960/100000Progress: 46961/100000Progress: 46962/100000Progress: 46963/100000Progress: 46964/100000Progress: 46965/100000Progress: 46966/100000Progress: 4

Progress: 49424/100000Progress: 49425/100000Progress: 49426/100000Progress: 49427/100000Progress: 49428/100000Progress: 49429/100000Progress: 49430/100000Progress: 49431/100000Progress: 49432/100000Progress: 49433/100000Progress: 49434/100000Progress: 49435/100000Progress: 49436/100000Progress: 49437/100000Progress: 49438/100000Progress: 49439/100000Progress: 49440/100000Progress: 49441/100000Progress: 49442/100000Progress: 49443/100000Progress: 49444/100000Progress: 49445/100000Progress: 49446/100000Progress: 49447/100000Progress: 49448/100000Progress: 49449/100000Progress: 49450/100000Progress: 49451/100000Progress: 49452/100000Progress: 49453/100000Progress: 49454/100000Progress: 49455/100000Progress: 49456/100000Progress: 49457/100000Progress: 49458/100000Progress: 49459/100000Progress: 49460/100000Progress: 49461/100000Progress: 49462/100000Progress: 49463/100000Progress: 49464/100000Progress: 49465/100000Progress: 49466/100000Progress: 4

Progress: 51923/100000Progress: 51924/100000Progress: 51925/100000Progress: 51926/100000Progress: 51927/100000Progress: 51928/100000Progress: 51929/100000Progress: 51930/100000Progress: 51931/100000Progress: 51932/100000Progress: 51933/100000Progress: 51934/100000Progress: 51935/100000Progress: 51936/100000Progress: 51937/100000Progress: 51938/100000Progress: 51939/100000Progress: 51940/100000Progress: 51941/100000Progress: 51942/100000Progress: 51943/100000Progress: 51944/100000Progress: 51945/100000Progress: 51946/100000Progress: 51947/100000Progress: 51948/100000Progress: 51949/100000Progress: 51950/100000Progress: 51951/100000Progress: 51952/100000Progress: 51953/100000Progress: 51954/100000Progress: 51955/100000Progress: 51956/100000Progress: 51957/100000Progress: 51958/100000Progress: 51959/100000Progress: 51960/100000Progress: 51961/100000Progress: 51962/100000Progress: 51963/100000Progress: 51964/100000Progress: 51965/100000Progress: 5

Progress: 54062/100000Progress: 54063/100000Progress: 54064/100000Progress: 54065/100000Progress: 54066/100000Progress: 54067/100000Progress: 54068/100000Progress: 54069/100000Progress: 54070/100000Progress: 54071/100000Progress: 54072/100000Progress: 54073/100000Progress: 54074/100000Progress: 54075/100000Progress: 54076/100000Progress: 54077/100000Progress: 54078/100000Progress: 54079/100000Progress: 54080/100000Progress: 54081/100000Progress: 54082/100000Progress: 54083/100000Progress: 54084/100000Progress: 54085/100000Progress: 54086/100000Progress: 54087/100000Progress: 54088/100000Progress: 54089/100000Progress: 54090/100000Progress: 54091/100000Progress: 54092/100000Progress: 54093/100000Progress: 54094/100000Progress: 54095/100000Progress: 54096/100000Progress: 54097/100000Progress: 54098/100000Progress: 54099/100000Progress: 54100/100000Progress: 54101/100000Progress: 54102/100000Progress: 54103/100000Progress: 54104/100000Progress: 5

Progress: 56422/100000Progress: 56423/100000Progress: 56424/100000Progress: 56425/100000Progress: 56426/100000Progress: 56427/100000Progress: 56428/100000Progress: 56429/100000Progress: 56430/100000Progress: 56431/100000Progress: 56432/100000Progress: 56433/100000Progress: 56434/100000Progress: 56435/100000Progress: 56436/100000Progress: 56437/100000Progress: 56438/100000Progress: 56439/100000Progress: 56440/100000Progress: 56441/100000Progress: 56442/100000Progress: 56443/100000Progress: 56444/100000Progress: 56445/100000Progress: 56446/100000Progress: 56447/100000Progress: 56448/100000Progress: 56449/100000Progress: 56450/100000Progress: 56451/100000Progress: 56452/100000Progress: 56453/100000Progress: 56454/100000Progress: 56455/100000Progress: 56456/100000Progress: 56457/100000Progress: 56458/100000Progress: 56459/100000Progress: 56460/100000Progress: 56461/100000Progress: 56462/100000Progress: 56463/100000Progress: 56464/100000Progress: 5

Progress: 58422/100000Progress: 58423/100000Progress: 58424/100000Progress: 58425/100000Progress: 58426/100000Progress: 58427/100000Progress: 58428/100000Progress: 58429/100000Progress: 58430/100000Progress: 58431/100000Progress: 58432/100000Progress: 58433/100000Progress: 58434/100000Progress: 58435/100000Progress: 58436/100000Progress: 58437/100000Progress: 58438/100000Progress: 58439/100000Progress: 58440/100000Progress: 58441/100000Progress: 58442/100000Progress: 58443/100000Progress: 58444/100000Progress: 58445/100000Progress: 58446/100000Progress: 58447/100000Progress: 58448/100000Progress: 58449/100000Progress: 58450/100000Progress: 58451/100000Progress: 58452/100000Progress: 58453/100000Progress: 58454/100000Progress: 58455/100000Progress: 58456/100000Progress: 58457/100000Progress: 58458/100000Progress: 58459/100000Progress: 58460/100000Progress: 58461/100000Progress: 58462/100000Progress: 58463/100000Progress: 58464/100000Progress: 5

Progress: 60579/100000Progress: 60580/100000Progress: 60581/100000Progress: 60582/100000Progress: 60583/100000Progress: 60584/100000Progress: 60585/100000Progress: 60586/100000Progress: 60587/100000Progress: 60588/100000Progress: 60589/100000Progress: 60590/100000Progress: 60591/100000Progress: 60592/100000Progress: 60593/100000Progress: 60594/100000Progress: 60595/100000Progress: 60596/100000Progress: 60597/100000Progress: 60598/100000Progress: 60599/100000Progress: 60600/100000Progress: 60601/100000Progress: 60602/100000Progress: 60603/100000Progress: 60604/100000Progress: 60605/100000Progress: 60606/100000Progress: 60607/100000Progress: 60608/100000Progress: 60609/100000Progress: 60610/100000Progress: 60611/100000Progress: 60612/100000Progress: 60613/100000Progress: 60614/100000Progress: 60615/100000Progress: 60616/100000Progress: 60617/100000Progress: 60618/100000Progress: 60619/100000Progress: 60620/100000Progress: 60621/100000Progress: 6

Progress: 62652/100000Progress: 62653/100000Progress: 62654/100000Progress: 62655/100000Progress: 62656/100000Progress: 62657/100000Progress: 62658/100000Progress: 62659/100000Progress: 62660/100000Progress: 62661/100000Progress: 62662/100000Progress: 62663/100000Progress: 62664/100000Progress: 62665/100000Progress: 62666/100000Progress: 62667/100000Progress: 62668/100000Progress: 62669/100000Progress: 62670/100000Progress: 62671/100000Progress: 62672/100000Progress: 62673/100000Progress: 62674/100000Progress: 62675/100000Progress: 62676/100000Progress: 62677/100000Progress: 62678/100000Progress: 62679/100000Progress: 62680/100000Progress: 62681/100000Progress: 62682/100000Progress: 62683/100000Progress: 62684/100000Progress: 62685/100000Progress: 62686/100000Progress: 62687/100000Progress: 62688/100000Progress: 62689/100000Progress: 62690/100000Progress: 62691/100000Progress: 62692/100000Progress: 62693/100000Progress: 62694/100000Progress: 6

Progress: 64760/100000Progress: 64761/100000Progress: 64762/100000Progress: 64763/100000Progress: 64764/100000Progress: 64765/100000Progress: 64766/100000Progress: 64767/100000Progress: 64768/100000Progress: 64769/100000Progress: 64770/100000Progress: 64771/100000Progress: 64772/100000Progress: 64773/100000Progress: 64774/100000Progress: 64775/100000Progress: 64776/100000Progress: 64777/100000Progress: 64778/100000Progress: 64779/100000Progress: 64780/100000Progress: 64781/100000Progress: 64782/100000Progress: 64783/100000Progress: 64784/100000Progress: 64785/100000Progress: 64786/100000Progress: 64787/100000Progress: 64788/100000Progress: 64789/100000Progress: 64790/100000Progress: 64791/100000Progress: 64792/100000Progress: 64793/100000Progress: 64794/100000Progress: 64795/100000Progress: 64796/100000Progress: 64797/100000Progress: 64798/100000Progress: 64799/100000Progress: 64800/100000Progress: 64801/100000Progress: 64802/100000Progress: 6

Progress: 66920/100000Progress: 66921/100000Progress: 66922/100000Progress: 66923/100000Progress: 66924/100000Progress: 66925/100000Progress: 66926/100000Progress: 66927/100000Progress: 66928/100000Progress: 66929/100000Progress: 66930/100000Progress: 66931/100000Progress: 66932/100000Progress: 66933/100000Progress: 66934/100000Progress: 66935/100000Progress: 66936/100000Progress: 66937/100000Progress: 66938/100000Progress: 66939/100000Progress: 66940/100000Progress: 66941/100000Progress: 66942/100000Progress: 66943/100000Progress: 66944/100000Progress: 66945/100000Progress: 66946/100000Progress: 66947/100000Progress: 66948/100000Progress: 66949/100000Progress: 66950/100000Progress: 66951/100000Progress: 66952/100000Progress: 66953/100000Progress: 66954/100000Progress: 66955/100000Progress: 66956/100000Progress: 66957/100000Progress: 66958/100000Progress: 66959/100000Progress: 66960/100000Progress: 66961/100000Progress: 66962/100000Progress: 6

Progress: 69419/100000Progress: 69420/100000Progress: 69421/100000Progress: 69422/100000Progress: 69423/100000Progress: 69424/100000Progress: 69425/100000Progress: 69426/100000Progress: 69427/100000Progress: 69428/100000Progress: 69429/100000Progress: 69430/100000Progress: 69431/100000Progress: 69432/100000Progress: 69433/100000Progress: 69434/100000Progress: 69435/100000Progress: 69436/100000Progress: 69437/100000Progress: 69438/100000Progress: 69439/100000Progress: 69440/100000Progress: 69441/100000Progress: 69442/100000Progress: 69443/100000Progress: 69444/100000Progress: 69445/100000Progress: 69446/100000Progress: 69447/100000Progress: 69448/100000Progress: 69449/100000Progress: 69450/100000Progress: 69451/100000Progress: 69452/100000Progress: 69453/100000Progress: 69454/100000Progress: 69455/100000Progress: 69456/100000Progress: 69457/100000Progress: 69458/100000Progress: 69459/100000Progress: 69460/100000Progress: 69461/100000Progress: 6

Progress: 70758/100000Progress: 70759/100000Progress: 70760/100000Progress: 70761/100000Progress: 70762/100000Progress: 70763/100000Progress: 70764/100000Progress: 70765/100000Progress: 70766/100000Progress: 70767/100000Progress: 70768/100000Progress: 70769/100000Progress: 70770/100000Progress: 70771/100000Progress: 70772/100000Progress: 70773/100000Progress: 70774/100000Progress: 70775/100000Progress: 70776/100000Progress: 70777/100000Progress: 70778/100000Progress: 70779/100000Progress: 70780/100000Progress: 70781/100000Progress: 70782/100000Progress: 70783/100000Progress: 70784/100000Progress: 70785/100000Progress: 70786/100000Progress: 70787/100000Progress: 70788/100000Progress: 70789/100000Progress: 70790/100000Progress: 70791/100000Progress: 70792/100000Progress: 70793/100000Progress: 70794/100000Progress: 70795/100000Progress: 70796/100000Progress: 70797/100000Progress: 70798/100000Progress: 70799/100000Progress: 70800/100000Progress: 7

Progress: 72902/100000Progress: 72903/100000Progress: 72904/100000Progress: 72905/100000Progress: 72906/100000Progress: 72907/100000Progress: 72908/100000Progress: 72909/100000Progress: 72910/100000Progress: 72911/100000Progress: 72912/100000Progress: 72913/100000Progress: 72914/100000Progress: 72915/100000Progress: 72916/100000Progress: 72917/100000Progress: 72918/100000Progress: 72919/100000Progress: 72920/100000Progress: 72921/100000Progress: 72922/100000Progress: 72923/100000Progress: 72924/100000Progress: 72925/100000Progress: 72926/100000Progress: 72927/100000Progress: 72928/100000Progress: 72929/100000Progress: 72930/100000Progress: 72931/100000Progress: 72932/100000Progress: 72933/100000Progress: 72934/100000Progress: 72935/100000Progress: 72936/100000Progress: 72937/100000Progress: 72938/100000Progress: 72939/100000Progress: 72940/100000Progress: 72941/100000Progress: 72942/100000Progress: 72943/100000Progress: 72944/100000Progress: 7

Progress: 74418/100000Progress: 74419/100000Progress: 74420/100000Progress: 74421/100000Progress: 74422/100000Progress: 74423/100000Progress: 74424/100000Progress: 74425/100000Progress: 74426/100000Progress: 74427/100000Progress: 74428/100000Progress: 74429/100000Progress: 74430/100000Progress: 74431/100000Progress: 74432/100000Progress: 74433/100000Progress: 74434/100000Progress: 74435/100000Progress: 74436/100000Progress: 74437/100000Progress: 74438/100000Progress: 74439/100000Progress: 74440/100000Progress: 74441/100000Progress: 74442/100000Progress: 74443/100000Progress: 74444/100000Progress: 74445/100000Progress: 74446/100000Progress: 74447/100000Progress: 74448/100000Progress: 74449/100000Progress: 74450/100000Progress: 74451/100000Progress: 74452/100000Progress: 74453/100000Progress: 74454/100000Progress: 74455/100000Progress: 74456/100000Progress: 74457/100000Progress: 74458/100000Progress: 74459/100000Progress: 74460/100000Progress: 7

Progress: 76892/100000Progress: 76893/100000Progress: 76894/100000Progress: 76895/100000Progress: 76896/100000Progress: 76897/100000Progress: 76898/100000Progress: 76899/100000Progress: 76900/100000Progress: 76901/100000Progress: 76902/100000Progress: 76903/100000Progress: 76904/100000Progress: 76905/100000Progress: 76906/100000Progress: 76907/100000Progress: 76908/100000Progress: 76909/100000Progress: 76910/100000Progress: 76911/100000Progress: 76912/100000Progress: 76913/100000Progress: 76914/100000Progress: 76915/100000Progress: 76916/100000Progress: 76917/100000Progress: 76918/100000Progress: 76919/100000Progress: 76920/100000Progress: 76921/100000Progress: 76922/100000Progress: 76923/100000Progress: 76924/100000Progress: 76925/100000Progress: 76926/100000Progress: 76927/100000Progress: 76928/100000Progress: 76929/100000Progress: 76930/100000Progress: 76931/100000Progress: 76932/100000Progress: 76933/100000Progress: 76934/100000Progress: 7

Progress: 78417/100000Progress: 78418/100000Progress: 78419/100000Progress: 78420/100000Progress: 78421/100000Progress: 78422/100000Progress: 78423/100000Progress: 78424/100000Progress: 78425/100000Progress: 78426/100000Progress: 78427/100000Progress: 78428/100000Progress: 78429/100000Progress: 78430/100000Progress: 78431/100000Progress: 78432/100000Progress: 78433/100000Progress: 78434/100000Progress: 78435/100000Progress: 78436/100000Progress: 78437/100000Progress: 78438/100000Progress: 78439/100000Progress: 78440/100000Progress: 78441/100000Progress: 78442/100000Progress: 78443/100000Progress: 78444/100000Progress: 78445/100000Progress: 78446/100000Progress: 78447/100000Progress: 78448/100000Progress: 78449/100000Progress: 78450/100000Progress: 78451/100000Progress: 78452/100000Progress: 78453/100000Progress: 78454/100000Progress: 78455/100000Progress: 78456/100000Progress: 78457/100000Progress: 78458/100000Progress: 78459/100000Progress: 7

Progress: 80416/100000Progress: 80417/100000Progress: 80418/100000Progress: 80419/100000Progress: 80420/100000Progress: 80421/100000Progress: 80422/100000Progress: 80423/100000Progress: 80424/100000Progress: 80425/100000Progress: 80426/100000Progress: 80427/100000Progress: 80428/100000Progress: 80429/100000Progress: 80430/100000Progress: 80431/100000Progress: 80432/100000Progress: 80433/100000Progress: 80434/100000Progress: 80435/100000Progress: 80436/100000Progress: 80437/100000Progress: 80438/100000Progress: 80439/100000Progress: 80440/100000Progress: 80441/100000Progress: 80442/100000Progress: 80443/100000Progress: 80444/100000Progress: 80445/100000Progress: 80446/100000Progress: 80447/100000Progress: 80448/100000Progress: 80449/100000Progress: 80450/100000Progress: 80451/100000Progress: 80452/100000Progress: 80453/100000Progress: 80454/100000Progress: 80455/100000Progress: 80456/100000Progress: 80457/100000Progress: 80458/100000Progress: 8

Progress: 82607/100000Progress: 82608/100000Progress: 82609/100000Progress: 82610/100000Progress: 82611/100000Progress: 82612/100000Progress: 82613/100000Progress: 82614/100000Progress: 82615/100000Progress: 82616/100000Progress: 82617/100000Progress: 82618/100000Progress: 82619/100000Progress: 82620/100000Progress: 82621/100000Progress: 82622/100000Progress: 82623/100000Progress: 82624/100000Progress: 82625/100000Progress: 82626/100000Progress: 82627/100000Progress: 82628/100000Progress: 82629/100000Progress: 82630/100000Progress: 82631/100000Progress: 82632/100000Progress: 82633/100000Progress: 82634/100000Progress: 82635/100000Progress: 82636/100000Progress: 82637/100000Progress: 82638/100000Progress: 82639/100000Progress: 82640/100000Progress: 82641/100000Progress: 82642/100000Progress: 82643/100000Progress: 82644/100000Progress: 82645/100000Progress: 82646/100000Progress: 82647/100000Progress: 82648/100000Progress: 82649/100000Progress: 8

Progress: 84850/100000Progress: 84851/100000Progress: 84852/100000Progress: 84853/100000Progress: 84854/100000Progress: 84855/100000Progress: 84856/100000Progress: 84857/100000Progress: 84858/100000Progress: 84859/100000Progress: 84860/100000Progress: 84861/100000Progress: 84862/100000Progress: 84863/100000Progress: 84864/100000Progress: 84865/100000Progress: 84866/100000Progress: 84867/100000Progress: 84868/100000Progress: 84869/100000Progress: 84870/100000Progress: 84871/100000Progress: 84872/100000Progress: 84873/100000Progress: 84874/100000Progress: 84875/100000Progress: 84876/100000Progress: 84877/100000Progress: 84878/100000Progress: 84879/100000Progress: 84880/100000Progress: 84881/100000Progress: 84882/100000Progress: 84883/100000Progress: 84884/100000Progress: 84885/100000Progress: 84886/100000Progress: 84887/100000Progress: 84888/100000Progress: 84889/100000Progress: 84890/100000Progress: 84891/100000Progress: 84892/100000Progress: 8

Progress: 87140/100000Progress: 87141/100000Progress: 87142/100000Progress: 87143/100000Progress: 87144/100000Progress: 87145/100000Progress: 87146/100000Progress: 87147/100000Progress: 87148/100000Progress: 87149/100000Progress: 87150/100000Progress: 87151/100000Progress: 87152/100000Progress: 87153/100000Progress: 87154/100000Progress: 87155/100000Progress: 87156/100000Progress: 87157/100000Progress: 87158/100000Progress: 87159/100000Progress: 87160/100000Progress: 87161/100000Progress: 87162/100000Progress: 87163/100000Progress: 87164/100000Progress: 87165/100000Progress: 87166/100000Progress: 87167/100000Progress: 87168/100000Progress: 87169/100000Progress: 87170/100000Progress: 87171/100000Progress: 87172/100000Progress: 87173/100000Progress: 87174/100000Progress: 87175/100000Progress: 87176/100000Progress: 87177/100000Progress: 87178/100000Progress: 87179/100000Progress: 87180/100000Progress: 87181/100000Progress: 87182/100000Progress: 8

Progress: 89295/100000Progress: 89296/100000Progress: 89297/100000Progress: 89298/100000Progress: 89299/100000Progress: 89300/100000Progress: 89301/100000Progress: 89302/100000Progress: 89303/100000Progress: 89304/100000Progress: 89305/100000Progress: 89306/100000Progress: 89307/100000Progress: 89308/100000Progress: 89309/100000Progress: 89310/100000Progress: 89311/100000Progress: 89312/100000Progress: 89313/100000Progress: 89314/100000Progress: 89315/100000Progress: 89316/100000Progress: 89317/100000Progress: 89318/100000Progress: 89319/100000Progress: 89320/100000Progress: 89321/100000Progress: 89322/100000Progress: 89323/100000Progress: 89324/100000Progress: 89325/100000Progress: 89326/100000Progress: 89327/100000Progress: 89328/100000Progress: 89329/100000Progress: 89330/100000Progress: 89331/100000Progress: 89332/100000Progress: 89333/100000Progress: 89334/100000Progress: 89335/100000Progress: 89336/100000Progress: 89337/100000Progress: 8

Progress: 91414/100000Progress: 91415/100000Progress: 91416/100000Progress: 91417/100000Progress: 91418/100000Progress: 91419/100000Progress: 91420/100000Progress: 91421/100000Progress: 91422/100000Progress: 91423/100000Progress: 91424/100000Progress: 91425/100000Progress: 91426/100000Progress: 91427/100000Progress: 91428/100000Progress: 91429/100000Progress: 91430/100000Progress: 91431/100000Progress: 91432/100000Progress: 91433/100000Progress: 91434/100000Progress: 91435/100000Progress: 91436/100000Progress: 91437/100000Progress: 91438/100000Progress: 91439/100000Progress: 91440/100000Progress: 91441/100000Progress: 91442/100000Progress: 91443/100000Progress: 91444/100000Progress: 91445/100000Progress: 91446/100000Progress: 91447/100000Progress: 91448/100000Progress: 91449/100000Progress: 91450/100000Progress: 91451/100000Progress: 91452/100000Progress: 91453/100000Progress: 91454/100000Progress: 91455/100000Progress: 91456/100000Progress: 9

Progress: 93913/100000Progress: 93914/100000Progress: 93915/100000Progress: 93916/100000Progress: 93917/100000Progress: 93918/100000Progress: 93919/100000Progress: 93920/100000Progress: 93921/100000Progress: 93922/100000Progress: 93923/100000Progress: 93924/100000Progress: 93925/100000Progress: 93926/100000Progress: 93927/100000Progress: 93928/100000Progress: 93929/100000Progress: 93930/100000Progress: 93931/100000Progress: 93932/100000Progress: 93933/100000Progress: 93934/100000Progress: 93935/100000Progress: 93936/100000Progress: 93937/100000Progress: 93938/100000Progress: 93939/100000Progress: 93940/100000Progress: 93941/100000Progress: 93942/100000Progress: 93943/100000Progress: 93944/100000Progress: 93945/100000Progress: 93946/100000Progress: 93947/100000Progress: 93948/100000Progress: 93949/100000Progress: 93950/100000Progress: 93951/100000Progress: 93952/100000Progress: 93953/100000Progress: 93954/100000Progress: 93955/100000Progress: 9

Progress: 96168/100000Progress: 96169/100000Progress: 96170/100000Progress: 96171/100000Progress: 96172/100000Progress: 96173/100000Progress: 96174/100000Progress: 96175/100000Progress: 96176/100000Progress: 96177/100000Progress: 96178/100000Progress: 96179/100000Progress: 96180/100000Progress: 96181/100000Progress: 96182/100000Progress: 96183/100000Progress: 96184/100000Progress: 96185/100000Progress: 96186/100000Progress: 96187/100000Progress: 96188/100000Progress: 96189/100000Progress: 96190/100000Progress: 96191/100000Progress: 96192/100000Progress: 96193/100000Progress: 96194/100000Progress: 96195/100000Progress: 96196/100000Progress: 96197/100000Progress: 96198/100000Progress: 96199/100000Progress: 96200/100000Progress: 96201/100000Progress: 96202/100000Progress: 96203/100000Progress: 96204/100000Progress: 96205/100000Progress: 96206/100000Progress: 96207/100000Progress: 96208/100000Progress: 96209/100000Progress: 96210/100000Progress: 9

Progress: 98412/100000Progress: 98413/100000Progress: 98414/100000Progress: 98415/100000Progress: 98416/100000Progress: 98417/100000Progress: 98418/100000Progress: 98419/100000Progress: 98420/100000Progress: 98421/100000Progress: 98422/100000Progress: 98423/100000Progress: 98424/100000Progress: 98425/100000Progress: 98426/100000Progress: 98427/100000Progress: 98428/100000Progress: 98429/100000Progress: 98430/100000Progress: 98431/100000Progress: 98432/100000Progress: 98433/100000Progress: 98434/100000Progress: 98435/100000Progress: 98436/100000Progress: 98437/100000Progress: 98438/100000Progress: 98439/100000Progress: 98440/100000Progress: 98441/100000Progress: 98442/100000Progress: 98443/100000Progress: 98444/100000Progress: 98445/100000Progress: 98446/100000Progress: 98447/100000Progress: 98448/100000Progress: 98449/100000Progress: 98450/100000Progress: 98451/100000Progress: 98452/100000Progress: 98453/100000Progress: 98454/100000Progress: 9

Unnamed: 0,doc_id,doc_text
99995,LA123190-0105,CLERKS AT 13 STORES ARRESTED AFTER MINORS BUY ...
99996,LA123190-0108,LOOKING TO 1991; \n THE NEW YEAR PROMISES TRE...
99997,LA123190-0117,"LOCAL; \n GIRL, 14, DIES IN DRIVE-BY INCIDENT..."
99998,LA123190-0119,"GREECE, ISRAEL HIT BY EXODUS FROM ALBANIA \n ..."
99999,LA123190-0124,<Empty Document>


# Training

## Split a ratio of training set as development set

In [7]:
train_df = pd.read_csv(training_csv_path)
dev_df, train_df = np.split(train_df, [int(dev_set_ratio*len(train_df))])
dev_df.reset_index(drop=True, inplace=True)
train_df.reset_index(drop=True, inplace=True)

print("train_df shape:", train_df.shape)
print("dev_df shape:", dev_df.shape)
train_df.tail()

train_df shape: (96, 5)
dev_df shape: (24, 5)


Unnamed: 0,query_id,query_text,pos_doc_ids,bm25_top1000,bm25_top1000_scores
91,641,Valdez wildlife marine life,FT911-1460 FT931-15213 FT931-16010 FT933-7162 ...,LA120989-0014 LA032390-0003 LA040889-0009 LA03...,34.16304495 32.97577181 31.31040724 30.8527172...
92,642,Tiananmen Square protesters,FBIS3-1941 FBIS3-2223 FBIS3-2224 FBIS3-26281 F...,FT922-10319 FT931-8730 FT942-5501 FBIS4-24379 ...,32.38429409 30.71831856 29.63771818 29.4676000...
93,648,family leave law,FBIS3-43072 FBIS3-61562 FBIS4-25261 FR940323-0...,FR941202-0-00181 FR941202-0-00176 FR941202-0-0...,24.51293307 23.98772391 23.42756181 23.0616218...
94,649,computer viruses,FBIS3-40468 FBIS3-42979 FBIS3-43017 FBIS4-5044...,FT944-9024 FBIS4-50440 FT921-5724 FT941-13624 ...,27.84369436 27.24267123 26.98326939 26.9108106...
95,650,tax evasion indicted,LA011689-0065 LA012589-0008 LA012889-0016 LA02...,LA040889-0060 LA053189-0041 LA092590-0146 LA06...,29.72207523 27.98961258 27.73561512 27.3372072...


## Build instances for training/development set

In [8]:
%%time
doc_id_to_token_ids = {}
def preprocess_df(df):
    ''' Preprocess DataFrame into training instances for BERT. '''
    instances = []
    
    # Parse CSV
    for i, row in df.iterrows():
        query_id, query_text, pos_doc_ids, bm25_top1000, _ = row
        pos_doc_id_list = pos_doc_ids.split()
        pos_doc_id_set = set(pos_doc_id_list)
        bm25_top1000_list = bm25_top1000.split()
        bm25_top1000_set = set(bm25_top1000_list)

        # Pair BM25 neg. with pos. samples
        labeled_pos_neg_list = []
        for pos_doc_id in pos_doc_id_list:
            neg_doc_id_set = bm25_top1000_set - pos_doc_id_set
            neg_doc_ids = random.sample(neg_doc_id_set, num_negatives)
            pos_position = random.randint(0, num_negatives)
            pos_neg_doc_ids = neg_doc_ids
            pos_neg_doc_ids.insert(pos_position, pos_doc_id)
            labeled_sample = (pos_neg_doc_ids, pos_position)
            labeled_pos_neg_list.append(labeled_sample)
            
        # Make query tokens for BERT
        query = ' '.join([ word for word in my_tokenizer.cut(query_text, shorten=False, stopword=False, stemming=False).split()[:max_query_length] ])
        query_tokens = tokenizer.tokenize(query)
        print(len(query_tokens))
        if len(query_tokens) > max_query_length:  # truncation
            query_tokens = query_tokens[:max_query_length]
        query_token_ids = tokenizer.convert_tokens_to_ids(query_tokens)
        query_token_ids.insert(0, tokenizer.cls_token_id)
        query_token_ids.append(tokenizer.sep_token_id)

        # Make input instances for all query/doc pairs
        for doc_ids, label in labeled_pos_neg_list:
            paired_input_ids = []
            paired_attention_mask = []
            paired_token_type_ids = []
            
            # Merge all pos/neg inputs as a single sample
            for doc_id in doc_ids:
                if doc_id in doc_id_to_token_ids:
                    doc_token_ids = doc_id_to_token_ids[doc_id]
                else:
                    doc_text = doc_id_to_text[doc_id]
                    doc_text = ' '.join([ word for word in my_tokenizer.cut(doc_text, shorten=False, stopword=False, stemming=False).split()[:max_input_length] ])
                    doc_tokens = tokenizer.tokenize(doc_text)
                    doc_token_ids = tokenizer.convert_tokens_to_ids(doc_tokens)
                    doc_id_to_token_ids[doc_id] = doc_token_ids
                doc_token_ids.append(tokenizer.sep_token_id)

                # make input sequences for BERT
                input_ids = query_token_ids + doc_token_ids
                token_type_ids = [0 for token_id in query_token_ids]
                token_type_ids.extend(1 for token_id in doc_token_ids)
                if len(input_ids) > max_input_length:  # truncation
                    input_ids = input_ids[:max_input_length]
                    token_type_ids = token_type_ids[:max_input_length]
                attention_mask = [1 for token_id in input_ids]
                
                # convert and collect inputs as tensors
                input_ids = torch.LongTensor(input_ids)
                attention_mask = torch.FloatTensor(attention_mask)
                token_type_ids = torch.LongTensor(token_type_ids)
                paired_input_ids.append(input_ids)
                paired_attention_mask.append(attention_mask)
                paired_token_type_ids.append(token_type_ids)
            label = torch.LongTensor([label]).squeeze()
            
            # Pre-pad tensor pairs for efficiency
            paired_input_ids = pad_sequence(paired_input_ids, batch_first=True)
            paired_attention_mask = pad_sequence(paired_attention_mask, batch_first=True)
            paired_token_type_ids = pad_sequence(paired_token_type_ids, batch_first=True)

            # collect all inputs as a dictionary
            instance = {}
            instance['input_ids'] = paired_input_ids.T  # transpose for code efficiency
            instance['attention_mask'] = paired_attention_mask.T
            instance['token_type_ids'] = paired_token_type_ids.T
            instance['label'] = label
            instances.append(instance)

        print("Progress: %d/%d\r" % (i+1, len(df)), end='')
    print()
    return instances

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 7.15 µs


In [9]:
train_instances = preprocess_df(train_df)
dev_instances = preprocess_df(dev_df)

print("num. train_instances: %d" % len(train_instances))
print("num. dev_instances: %d" % len(dev_instances))
print("input_ids.T shape:", train_instances[0]['input_ids'].T.shape)
train_instances[0]['input_ids'].T

2
2rogress: 1/96
1rogress: 2/96
4rogress: 3/96
3rogress: 4/96
4rogress: 5/96
2rogress: 6/96
8rogress: 7/96
3rogress: 8/96
3rogress: 9/96
4rogress: 10/96
4rogress: 11/96
3rogress: 12/96
2rogress: 13/96
5rogress: 14/96
10ogress: 15/96
4rogress: 16/96
3rogress: 17/96
3rogress: 18/96
2rogress: 19/96
2rogress: 20/96
2rogress: 21/96
3rogress: 22/96
3rogress: 23/96
3rogress: 24/96
3rogress: 25/96
2rogress: 26/96
3rogress: 27/96
3rogress: 28/96
2rogress: 29/96
4rogress: 30/96
1rogress: 31/96
2rogress: 32/96
2rogress: 33/96
1rogress: 34/96
2rogress: 35/96
6rogress: 36/96
3rogress: 37/96
3rogress: 38/96
3rogress: 39/96
2rogress: 40/96
3rogress: 41/96
4rogress: 42/96
2rogress: 43/96
3rogress: 44/96
1rogress: 45/96
3rogress: 46/96
5rogress: 47/96
4rogress: 48/96
3rogress: 49/96
2rogress: 50/96
4rogress: 51/96
3rogress: 52/96
3rogress: 53/96
5rogress: 54/96
5rogress: 55/96
2rogress: 56/96
4rogress: 57/96
2rogress: 58/96
3rogress: 59/96
2rogress: 60/96
3rogress: 61/96
2rogress: 62/96
4rogress: 63/96

tensor([[  101,  3199,  3036,  ...,     0,     0,     0],
        [  101,  3199,  3036,  ...,     0,     0,     0],
        [  101,  3199,  3036,  ...,     0,     0,     0],
        [  101,  3199,  3036,  ...,  2085,  2022, 24681]])

## Build dataset and dataloader for PyTorch

In [10]:
class TrainingDataset(Dataset):
    def __init__(self, instances):
        self.instances = instances
    
    def __len__(self):
        return len(self.instances)
        
    def __getitem__(self, i):
        instance = self.instances[i]
        input_ids = instance['input_ids']
        attention_mask = instance['attention_mask']
        token_type_ids = instance['token_type_ids']
        label = instance['label']
        return input_ids, attention_mask, token_type_ids, label
    
def get_train_dataloader(instances, batch_size=2, num_workers=4):
    def collate_fn(batch):
        input_ids, attention_mask, token_type_ids, labels = zip(*batch)
        input_ids = pad_sequence(input_ids, batch_first=True).transpose(1,2).contiguous()  # re-transpose
        attention_mask = pad_sequence(attention_mask, batch_first=True).transpose(1,2).contiguous()
        token_type_ids = pad_sequence(token_type_ids, batch_first=True).transpose(1,2).contiguous()
        labels = torch.stack(labels)
        return input_ids, attention_mask, token_type_ids, labels
    
    dataset = TrainingDataset(instances)
    dataloader = DataLoader(dataset, collate_fn=collate_fn, shuffle=True, \
                            batch_size=batch_size, num_workers=num_workers)
    return dataloader

# Demo
# dataloader = get_train_dataloader(train_instances)
# for batch in dataloader:
#     input_ids, attention_mask, token_type_ids, labels = batch
#     break
    
# print(input_ids.shape)
# print(input_ids)

## Initialize and finetune BERT

In [11]:
model = BertForMultipleChoice.from_pretrained(model_name_or_path)
model.cuda()

optimizer = AdamW(model.parameters(), lr=learning_rate)
optimizer.zero_grad()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMultipleChoice: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForMultipleChoice from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMultipleChoice from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-base-uncased and are newly

### (TO-DO!) Define validation function for earlystopping

In [12]:
def validate(model, instances):
    total_loss = 0
    model.eval()
    dataloader = get_train_dataloader(instances, batch_size=batch_size, num_workers=num_workers)
    for batch in dataloader:
        batch = (tensor.cuda() for tensor in batch)
        input_ids, attention_mask, token_type_ids, labels = batch
        
        ''' TO-DO: 
        1. Compute the cross-entropy loss (using built-in loss of BertForMultipleChoice)
          (Hint: You need to call a function of model which takes all the 4 tensors in the batch as inputs)
          
        2. Sum up the loss of all dev-set samples
          (Hint: The built-in loss is averaged, so you should multiply it with the batch size)
        '''
        with torch.no_grad():
        # forward + backward + optimize
            loss = model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask, labels=labels)[0]
        total_loss += loss.item() * batch_size
        
    avg_loss = total_loss / len(instances)
    return avg_loss

### (TO-DO!) Let's train this beeg boy ;-)

In [13]:
patience, best_dev_loss = 0, 1e10
best_state_dict = model.state_dict()

start_time = time()
dataloader = get_train_dataloader(train_instances, batch_size=batch_size, num_workers=num_workers)
for epoch in range(1, max_epochs+1):
    model.train()
    for i, batch in enumerate(dataloader, start=1):
        batch = (tensor.cuda() for tensor in batch)
        input_ids, attention_mask, token_type_ids, labels = batch
        
        # Backpropogation
        ''' TO-DO: 
        1. Compute the cross-entropy loss (using built-in loss of BertForMultipleChoice)
          (Hint: You need to call a function of model which takes all the 4 tensors in the batch as inputs)
         
        2. Perform backpropogation on the loss (i.e. compute gradients)
        3. Optimize the model.
          (Hint: These two lines of codes can be found in PyTorch tutorial)
        '''
        # forward + backward + optimize
        loss = model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask, labels=labels)[0]
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        
        # Progress bar with timer ;-)
        elapsed_time = time() - start_time
        elapsed_time = timedelta(seconds=int(elapsed_time))
        print("Epoch: %d/%d | Batch: %d/%d | loss=%.5f | %s      \r" \
              % (epoch, max_epochs, i, len(dataloader), loss, elapsed_time), end='')
        
    # Save parameters of each epoch
    if save_model_path is not None:
        save_checkpoint_path = "%s/epoch_%d" % (save_model_path, epoch)
        model.save_pretrained(save_checkpoint_path)
        
    # Get avg. loss on development set
    print("Epoch: %d/%d | Validating...                           \r" % (epoch, max_epochs), end='')
    dev_loss = validate(model, dev_instances)
    elapsed_time = time() - start_time
    elapsed_time = timedelta(seconds=int(elapsed_time))
    print("Epoch: %d/%d | dev_loss=%.5f | %s                      " \
          % (epoch, max_epochs, dev_loss, elapsed_time))
    
    # Track best checkpoint and earlystop patience
    if dev_loss < best_dev_loss:
        patience = 0
        best_dev_loss = dev_loss
        best_state_dict = deepcopy(model.state_dict())
        if save_model_path is not None:
            model.save_pretrained(save_model_path)
    else:
        patience += 1
    
    if patience > max_patience:
        print('Earlystop at epoch %d' % epoch)
        break

Epoch: 1/3 | dev_loss=0.81031 | 0:29:30                      
Epoch: 2/3 | dev_loss=0.74683 | 0:59:01                      
Epoch: 3/3 | dev_loss=1.03970 | 1:28:35                      
Earlystop at epoch 3


In [14]:
# Restore parameters with best loss on development set
model.load_state_dict(best_state_dict)

<All keys matched successfully>

# Testing

In [15]:
class TestingDataset(Dataset):
    def __init__(self, instances):
        self.instances = instances
    
    def __len__(self):
        return len(self.instances)
        
    def __getitem__(self, i):
        instance = self.instances[i]
        input_ids = instance['input_ids']
        attention_mask = instance['attention_mask']
        token_type_ids = instance['token_type_ids']
        input_ids = torch.LongTensor(input_ids)
        attention_mask = torch.FloatTensor(attention_mask)
        token_type_ids = torch.LongTensor(token_type_ids)
        return input_ids, attention_mask, token_type_ids, 
    
def get_test_dataloader(instances, batch_size=8, num_workers=4):
    def collate_fn(batch):
        input_ids, attention_mask, token_type_ids = zip(*batch)
        input_ids = pad_sequence(input_ids, batch_first=True).unsqueeze(1)  # predict as single choice
        attention_mask = pad_sequence(attention_mask, batch_first=True).unsqueeze(1)
        token_type_ids = pad_sequence(token_type_ids, batch_first=True).unsqueeze(1)
        return input_ids, attention_mask, token_type_ids
    
    dataset = TestingDataset(instances)
    dataloader = DataLoader(dataset, collate_fn=collate_fn, shuffle=False, \
                            batch_size=batch_size, num_workers=num_workers)
    return dataloader

## (TO-DO!) Define function to predict BERT scores

In [16]:
def predict_query_doc_scores(model, df):
    model.eval()
    start_time = time()

    # Parse CSV
    query_id_list = df["query_id"]
    query_text_list = df["query_text"]
    bm25_top1000_list = df["bm25_top1000"]

    # Treat {1 query, K documents} as a dataset for prediction
    query_doc_scores = []
    query_doc_ids = []
    rows = zip(query_id_list, query_text_list, bm25_top1000_list)
    for qi, row in enumerate(rows, start=1):
        query_id, query_text, bm25_top1000 = row
        bm25_doc_id_list = bm25_top1000.split()
        query_doc_ids.append(bm25_doc_id_list)

        #################################################
        #    Collect all instances of query/doc pairs
        #################################################
        query_instances = []

        # Make query tokens for BERT        
        query = ' '.join([ word for word in my_tokenizer.cut(query_text, shorten=False, stopword=False, stemming=False).split()[:max_query_length] ])
        query_tokens = tokenizer.tokenize(query)
        if len(query_tokens) > max_query_length:  # truncation
            query_tokens = query_tokens[:max_query_length]
        query_token_ids = tokenizer.convert_tokens_to_ids(query_tokens)
        query_token_ids.insert(0, tokenizer.cls_token_id)
        query_token_ids.append(tokenizer.sep_token_id)

        # Make input instances for all query/doc pairs
        for i, doc_id in enumerate(bm25_doc_id_list, start=1):
            if doc_id in doc_id_to_token_ids:
                doc_token_ids = doc_id_to_token_ids[doc_id]
            else:
                doc_text = doc_id_to_text[doc_id]
                doc_text = ' '.join([ word for word in my_tokenizer.cut(doc_text, shorten=False, stopword=False, stemming=False).split()[:max_input_length] ])
                doc_tokens = tokenizer.tokenize(doc_text)
                doc_token_ids = tokenizer.convert_tokens_to_ids(doc_tokens)
                doc_id_to_token_ids[doc_id] = doc_token_ids
            doc_token_ids.append(tokenizer.sep_token_id)

            # make input sequences for BERT
            input_ids = query_token_ids + doc_token_ids
            token_type_ids = [0 for token_id in query_token_ids]
            token_type_ids.extend(1 for token_id in doc_token_ids)
            if len(input_ids) > max_input_length:  # truncation
                input_ids = input_ids[:max_input_length]
                token_type_ids = token_type_ids[:max_input_length]
            attention_mask = [1 for token_id in input_ids]

            # convert and collect inputs as tensors
            input_ids = torch.LongTensor(input_ids)
            attention_mask = torch.FloatTensor(attention_mask)
            token_type_ids = torch.LongTensor(token_type_ids)


            # collect all inputs as a dictionary
            instance = {}
            instance['input_ids'] = input_ids
            instance['attention_mask'] = attention_mask
            instance['token_type_ids'] = token_type_ids
            query_instances.append(instance)

        #################################################################
        #    Predict relevance scores for all BM25-top-1000 documents
        #################################################################
        doc_scores = np.empty((0,1))

        # Predict scores for each document
        dataloader = get_test_dataloader(query_instances, batch_size=batch_size*(num_negatives+1), num_workers=num_workers)
        for di, batch in enumerate(dataloader, start=1):
            batch = (tensor.cuda() for tensor in batch)
            input_ids, attention_mask, token_type_ids = batch
            
            ''' TO-DO: 
            1. Compute the logits as relevance scores (using the same function of how you compute built-in loss)
              (Hint: You need to call a function of model which takes all the 3 tensors in the batch as inputs)
         
            2. The scores are still on GPU. Reallocate them on CPU, and convert into numpy arrays.
              (Hint: You need to call two functions on the `scores` tensors. You can find them in PyTorch tutorial.)
            '''
            with torch.no_grad():
                scores = model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)[0]

            # merge all scores into a big numpy array
            scores = scores.detach().cpu().numpy()
            doc_scores = np.vstack((doc_scores, scores))

            # Progress bar with timer ;-)
            elapsed_time = time() - start_time
            elapsed_time = timedelta(seconds=int(elapsed_time))
            print("Query: %d/%d | Progress: %d/%d | %s      \r" \
                  % (qi, len(df), di, len(dataloader), elapsed_time), end='')

        # merge all query/BM25 document pair scores
        query_doc_scores.append(doc_scores)
    query_doc_scores = np.hstack(query_doc_scores).T

    print()
    return query_doc_scores, query_doc_ids

## (TO-DO!) Find best weight of BERT for BM25 rescoring on training set

In [17]:
dev_query_doc_scores, dev_query_doc_ids = predict_query_doc_scores(model, dev_df)

print('---- Grid search weight for "BM25 + weight * BERT" ----')
best_map_score, best_bert_weight = -100, 0.0
bert_scores = dev_query_doc_scores
n_query = dev_query_doc_scores.shape[0]

# Get MAP@K of BM25 baseline
query_pos_doc_ids = dev_df['pos_doc_ids'].values.tolist()
actual = [doc_ids.split() for doc_ids in query_pos_doc_ids]
bm25_predicted = [doc_id_list[:K] for doc_id_list in dev_query_doc_ids]
map_score = mapk(actual, bm25_predicted, k=K)
best_map_score = map_score
print("weight=%.1f: %.5f  (BM25 baseline)" % (0, 100*map_score))

Query: 24/24 | Progress: 63/63 | 0:07:42      
---- Grid search weight for "BM25 + weight * BERT" ----
weight=0.0: 25.68422  (BM25 baseline)


In [18]:
# Collect BM25 scores into same format of BERT scores
''' TO-DO: 
1. Convert the BM25 top-1000 scores into 2d numpy arrays
2. BM25 scores should have the same shape and orders as `dev_query_doc_scores` (i.e. BERT scores)
  (Hint: If there are 24 dev-set queries, the shape should be (24, 1000) )
'''

bm25_scores = np.array([ bm25_score.split() for bm25_score in dev_df['bm25_top1000_scores'] ], dtype='float64')

# Grid search for BM25 + BERT rescoring
low_bound, high_bound, scale = 0, 5, 1000
grids = [i / scale for i in range(low_bound * scale+1, high_bound * scale+1)]
for weight in grids:
    
    ''' TO-DO: 
    1. Compute the weighted scores using `bm25_scores`, `weight`, and `bert_scores`
    '''
    weighted_scores = bm25_scores + (weight * bert_scores)
    
    # sort index and map to document ids as output
    rescore_argsort = np.flip(weighted_scores.argsort(), axis=1)
    predicted = []
    for i in range(n_query):  # num. of queries
        predicted.append([dev_query_doc_ids[i][idx] for idx in rescore_argsort[i]][:K])
    map_score = mapk(actual, predicted, k=K)
    
    # show part of results for human evaluation
    if weight * 10 % 2 == 0:
        print("weight=%.1f: %.5f" % (weight, 100*map_score))
        
    # track weight with best MAP@10
    if map_score > best_map_score:
        best_map_score = map_score
        best_bert_weight = weight
print("\nHighest MAP@%d = %.5f found at weight=%.3f" % (K, 100*best_map_score, best_bert_weight))

weight=0.2: 28.42041
weight=0.4: 30.76358
weight=0.6: 31.65468
weight=0.8: 32.85853
weight=1.0: 33.53569
weight=1.2: 34.25076
weight=1.4: 34.18605
weight=1.6: 34.29145
weight=1.8: 34.32184
weight=2.0: 34.31561
weight=2.2: 34.17616
weight=2.4: 34.07996
weight=2.6: 34.08997
weight=2.8: 33.98798
weight=3.0: 34.03795
weight=3.2: 33.97797
weight=3.4: 33.83464
weight=3.6: 33.81573
weight=3.8: 33.79253
weight=4.0: 33.77961
weight=4.2: 33.41195
weight=4.4: 33.33863
weight=4.6: 33.18084
weight=4.8: 33.04788
weight=5.0: 32.97333

Highest MAP@1000 = 34.42586 found at weight=1.271


## (TO-DO!) Rescore testing set with BERT for submission

In [19]:
# Predict BERT scores for testing set
test_df = pd.read_csv(testing_csv_path)
query_id_list = test_df["query_id"]
n_query = len(query_id_list)
test_query_doc_scores, test_query_doc_ids = predict_query_doc_scores(model, test_df)
bert_scores = test_query_doc_scores

Query: 80/80 | Progress: 63/63 | 0:25:27      


In [20]:
# Rescore query/document score with BM25 + BERT
bm25_scores = [scores.split() for scores in test_df["bm25_top1000_scores"]]  # parse into 2d list of string
bm25_scores = [[float(score) for score in scores] for scores in bm25_scores]  # convert to float
bm25_scores = np.array(bm25_scores)

''' TO-DO: 
1. Compute the weighted scores using `bm25_scores`, `best_bert_weight`, and `bert_scores`
'''
weighted_scores = bm25_scores + (best_bert_weight * bert_scores)

# Rerank document ids with new scores
rescore_argsort = np.flip(weighted_scores.argsort(), axis=1)
ranked_doc_id_list = []
for i in range(n_query):  # num. of queries
    ranked_doc_id_list.append([test_query_doc_ids[i][idx] for idx in rescore_argsort[i]][:K])
ranked_doc_ids = [' '.join(doc_id_list) for doc_id_list in ranked_doc_id_list]

# Save reranked results for submission
data = {'query_id': query_id_list, 'ranked_doc_ids': ranked_doc_ids}
submission_df = pd.DataFrame(data)
submission_df.reset_index(drop=True, inplace=True)
submission_df.to_csv(save_submission_path, index=False)
print("Saved submission file as `%s`" % save_submission_path)

Saved submission file as `bm25_bert_rescoring.csv`
