In [1]:
import os
import re
import logging
import math
import random

# Single seed for the notebook; it is also passed as `random_state` to train_test_split below.
RANDOM_SEED = 18
random.seed(RANDOM_SEED)  # seeds only Python's built-in `random` module

from typing import List, Union
from tqdm.auto import tqdm

from sklearn.model_selection import train_test_split
import pandas as pd

import torch
from torch.utils.data import Dataset, DataLoader

import sentence_transformers as st
import sentence_transformers.losses
from sentence_transformers.datasets import NoDuplicatesDataLoader
from sentence_transformers.readers import InputExample

# `random.seed` above does not touch torch's generators, so seed torch too;
# otherwise torch-side randomness differs between runs.
torch.manual_seed(RANDOM_SEED)

# Prefer the GPU when torch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'using device: {device}')

using device: cuda


In [2]:
from bert_utils.data import *
from bert_utils.siamese import *

In [3]:
%load_ext autoreload
%autoreload 2

## TODO:

* build pipeline for Cross-Encoder

----
* siamese net:
    * train:
        * uses MultipleNegativesRankingLoss. we provide only positive examples.
          
          for example i, all the other examples (j != i) in the batch are treated as negative examples.
          
          thus we don't need to sample negative examples explicitly.
          
          and there is no need to compute embeddings for negative examples because we reuse
          embeddings for positive examples when treating them as negatives.
          
        * NoDuplicatesDataLoader is not parallel (does it impact training speed?)

    * evaluation:
        * sample negative cases randomly
        
        
* cross-encoder:
    * evaluation:
        * ideally: rerank siamese predictions -> i.e. compute classification metrics for K * |q| examples
        * for simplicity (no connection with siamese net), let's sample negative examples randomly
      
----

Notes:
* one of the main bottlenecks is reading texts from disk.
  it takes ~1.5 min to read ~11k articles (the size of the evaluation set with negative examples) from an SSD

## functions

In [4]:
def process_title(title: pd.Series):
    """Normalise a Series of article titles.

    Each title is lower-cased and stripped of surrounding whitespace; then
    every character that is neither a word character nor whitespace is
    replaced by an underscore.

    Args:
        title: Series of raw title strings.

    Returns:
        A new Series with the normalised titles.
    """
    lowered = title.str.lower()
    trimmed = lowered.str.strip()
    # Punctuation (including '-') becomes '_'; existing '_' stays '_'.
    return trimmed.str.replace(r'([^\w\s])|_|-', '_', regex=True)

## data

In [5]:
# Root directory of the parsed wiki dump; articles are looked up under the same directory.
data_root_dp = '/media/rtn/Windows 10/work/univier/wiki_extract/wiki_parsed'
articles_dp = data_root_dp

# filepaths.csv lists one row per article (filename plus its paths on disk).
filemap = pd.read_csv(os.path.join(data_root_dp, 'filepaths.csv'))
# Filenames are used as join keys further down, so they have to be unique.
assert not filemap.duplicated('filename').any()

# The row position doubles as an integer id for each article.
filemap['article_id'] = range(len(filemap))
article_ids = set(filemap['article_id'])

print(filemap.shape)

(223619, 4)


In [6]:
# Smoke test: resolve the 'poetry' article through `filemap` and display its text.
get_article_text(get_article_path('poetry', filemap))

'Poetry is a type of art form and a type of literature. Poetry uses the qualities of words, in different ways, to be artistic. Poetry can be as short as a few words, or as long as a book. A poem as short as one line is called a monostich. A poem that is as long as a book is an epic. There are many "poetic forms" (forms of poetry). Some of forms are: Sonnet, Haiku, Ballad, Stev, Prose poem, Ode, Free verse, Blank verse, thematic, limerick and nursery rhymes. Poetry can be used to describe (comparing, talking about, or expressing emotion) many things. It can make sense or be nonsense, it can rhyme or not. It can have many shapes and sizes; it can be serious or funny. "To say something poetically" means to give information in an artistic way. A more modern approach is digital poetry. Computers and webtechnology is used to express poetry and make it interactive. So called interdisciplinary poetry (wich means combination of different forms of poetry) are made possible by linking the poetic 

In [7]:
# Test queries: a headerless TSV holding a query and the title of its target article.
labels_test = (
    pd.read_csv('data/queries.tsv', sep='\t', header=None)
    .set_axis(['query', 'title'], axis=1)
)
# Normalise titles so they can be matched against `filemap['filename']` later.
labels_test['title'] = process_title(labels_test['title'])

print(labels_test.shape)
labels_test.head(3)

(200, 2)


Unnamed: 0,query,title
0,animals that have shells and live in water,shell__zoology_
1,how many different types of scorpions are there,scorpion
2,describe the structure of a scientific name fo...,binomial_nomenclature


In [8]:
# Training queries: same headerless (query, title) TSV layout as the test file.
labels_train_raw = (
    pd.read_csv('data/train.tsv', sep='\t', header=None)
    .set_axis(['query', 'title'], axis=1)
)
# Normalise titles so they can be matched against `filemap['filename']` later.
labels_train_raw['title'] = process_title(labels_train_raw['title'])

print(labels_train_raw.shape)
labels_train_raw.head(3)

(50000, 2)


Unnamed: 0,query,title
0,where does the most metabolic activity in the ...,cytoplasm
1,what kind of dog played in turner and hooch,dogue_de_bordeaux
2,when is there gonna be an eclipse 2017,solar_eclipse_of_august_21__2017


In [9]:
# Preview the file map, including the `article_id` column added above.
filemap.head()

Unnamed: 0,filename,path,html_path,article_id
0,000_emergency,/media/rtn/Windows 10/work/univier/wiki_extrac...,/media/rtn/Windows 10/work/univier/wiki_extrac...,0
1,0s_bc,/media/rtn/Windows 10/work/univier/wiki_extrac...,/media/rtn/Windows 10/work/univier/wiki_extrac...,1
2,0_0_0,/media/rtn/Windows 10/work/univier/wiki_extrac...,/media/rtn/Windows 10/work/univier/wiki_extrac...,2
3,0_4_0,/media/rtn/Windows 10/work/univier/wiki_extrac...,/media/rtn/Windows 10/work/univier/wiki_extrac...,3
4,0_6_0,/media/rtn/Windows 10/work/univier/wiki_extrac...,/media/rtn/Windows 10/work/univier/wiki_extrac...,4


### filter train, test samples

In [10]:
# Compare the known article filenames with the titles referenced by the training labels;
# the printed `s2 - s1` count is the number of label titles that have no article file.
two_sets_stats(filemap['filename'], labels_train_raw['title'])

init shapes: (223619, 50000)
unique elements: (223619, 16731)
s1 & s2: 15430
s1 ^ s2: 209490
s1 - s2: 208189
s2 - s1: 1301


In [11]:
# Same comparison for the test labels: which test titles have a matching article file.
two_sets_stats(filemap['filename'], labels_test['title'])

init shapes: (223619, 200)
unique elements: (223619, 142)
s1 & s2: 137
s1 ^ s2: 223487
s1 - s2: 223482
s2 - s1: 5


In [12]:
def attach_article_ids(labels: pd.DataFrame, filemap: pd.DataFrame) -> pd.DataFrame:
    """Keep only labels whose title has an article file and add its `article_id`.

    Args:
        labels: frame with a normalised `title` column.
        filemap: frame with unique `filename` values and an `article_id` column.

    Returns:
        A new frame with the rows of `labels` whose `title` equals some
        `filemap['filename']`, plus the matching `article_id` column.

    Raises:
        pandas.errors.MergeError: if `filemap['filename']` is not unique.
    """
    return (
        labels
        # Drop an id column left over from a previous run of this cell, so that
        # re-running does not produce `article_id_x` / `article_id_y`.
        .drop(columns='article_id', errors='ignore')
        .merge(
            filemap[['filename', 'article_id']],
            left_on='title', right_on='filename', how='inner',
            # Each title may match at most one file; otherwise rows would be duplicated.
            validate='many_to_one',
        )
        .drop(columns='filename')
    )


labels_train_raw = attach_article_ids(labels_train_raw, filemap)
labels_test = attach_article_ids(labels_test, filemap)

print(labels_train_raw.shape[0])
print(labels_test.shape[0])

45260
194


In [13]:
# Check that the training labels now carry an `article_id` column.
labels_train_raw.head(2)

Unnamed: 0,query,title,article_id
0,where does the most metabolic activity in the ...,cytoplasm,52039
1,what is the structure that contains ions subst...,cytoplasm,52039


In [14]:
# Check that the test labels now carry an `article_id` column.
labels_test.head(2)

Unnamed: 0,query,title,article_id
0,animals that have shells and live in water,shell__zoology_,180649
1,how many different types of scorpions are there,scorpion,177775


## siamese model

### prepare data

In [15]:
# Every query must occur exactly once; with repeated queries the split would
# have to be done per query instead of per row.
assert labels_train_raw['query'].is_unique

# Hold out 15% of the labelled pairs for validation.
labels_train, labels_val = train_test_split(
    labels_train_raw,
    test_size=0.15,
    shuffle=True,
    random_state=RANDOM_SEED,
)

print(labels_train.shape, labels_val.shape, sep='\n')

(38471, 3)
(6789, 3)


In [16]:
# Extend the validation positives with negative examples at a 1:1 ratio.
val_dataset = add_negative_samples_to_val_set(
    labels_val=labels_val,
    filemap=filemap,
    all_article_ids=article_ids,
    val_neg_to_pos_factor=1,
)
print(val_dataset.shape)

(13578, 4)


In [17]:
# Distinct article titles in: all training labels, the train split, the validation set.
print(
    labels_train_raw['title'].nunique(),
    labels_train['title'].nunique(),
    val_dataset['title'].nunique(),
    sep='\n',
)

15430
14136
11162


In [18]:
# Wrap the train split in a WikiQADataset (not defined in this notebook;
# presumably star-imported from bert_utils — TODO confirm).
train_dataset = WikiQADataset(labels_train, filemap=filemap)

In [19]:
# Inspect one training example: `.texts` holds the query followed by the article text.
train_dataset[2].texts

['how do they draw teams for the world cup',
 'The knockout stage of the 2018 FIFA World Cup is the second stage of the competition, just after the group stage. It began on 30 June with the round of 16 and ended on 15 July with the final match, held at the Luzhniki Stadium in Moscow. The top two teams from each group (16 in total) would advance to the knockout stage to compete in a single-elimination. In the knockout stage, if a match is level at the end of 90 minutes of normal playing time, extra time would be played (two periods of 15 minutes each), where each team is allowed to make a fourth substitution. If still tied after extra time, the match would be decided by a penalty shoot-out to determine the winners. A third place play-off will also be played between the two losing teams of the semi-finals. The top two placed teams from each of the eight groups will qualify for the knockout stage.']

In [20]:
# Pretrained sentence encoder used as the starting point for the siamese model.
# TODO: this model has max_seq_length 128; consider switching to another model.
model = st.SentenceTransformer('distilbert-base-nli-mean-tokens')
model.to(device);  # trailing ';' suppresses the module repr in the cell output

### train siamese model

In [41]:
# Hyper-parameters come first: the data loader below needs `train_batch_size`,
# which previously was only defined after its first use (NameError on a fresh kernel).
num_epochs = 2
train_batch_size = 16
val_batch_size = 64

train_dl = CustomNoDuplicatesDataLoader(train_dataset, batch_size=train_batch_size)

# Warm up over the first 10% of all training steps; evaluate every 10% of an epoch.
warmup_steps = math.ceil(len(train_dl) * num_epochs * 0.1)
eval_steps = int(len(train_dl) * 0.1)

# Output locations for intermediate checkpoints and the best model.
out_root_dp = '/media/rtn/data/fajly2/nn/bert'
checkpoints_dp = os.path.join(out_root_dp, 'siam', 'checkpoints')
output_dp = os.path.join(out_root_dp, 'siam', 'output')
print(f'checkpoints_dp: "{checkpoints_dp}"')
print(f'output_dp: "{output_dp}"')

print(f"num_epochs: {num_epochs}")
print(f'train_batch_size: {train_batch_size}')
print(f'val_batch_size: {val_batch_size}')
print(f"warmup steps: {warmup_steps}")
print(f'len(train_dl): {len(train_dl)}')
print(f'eval_steps: {eval_steps}')

checkpoints_dp: "/media/rtn/data/fajly2/nn/bert/siam/checkpoints"
output_dp: "/media/rtn/data/fajly2/nn/bert/siam/output"
num_epochs: 2
train_batch_size: 16
val_batch_size: 64
warmup steps: 481
len(train_dl): 2404
eval_steps: 240


In [34]:
# Ranking loss trained on positive (query, article) pairs only; per the notes at the top,
# the other examples in a batch serve as negatives.
train_loss = st.losses.MultipleNegativesRankingLoss(model=model)

In [None]:
# Load the article text for every validation row in this cell, so the evaluator
# does not rely on a variable that is only created further down the notebook
# (which fails on Restart & Run All).
val_dataset_articles = [
    get_article_text(get_article_path(title, filemap))
    for title in tqdm(val_dataset['title'], desc='loading val articles')
]

# Binary evaluator over (query, article text) pairs with 0/1 labels.
val_evaluator = CustomBinaryClassificationEvaluator(
    sentences1=val_dataset['query'].tolist(),
    sentences2=val_dataset_articles,
    labels=val_dataset['label'].tolist(),
    batch_size=val_batch_size
)

In [26]:
# Fine-tune the siamese model; keyword arguments are grouped by purpose.
model.fit(
    # what to train on, and for how long
    train_objectives=[(train_dl, train_loss)],
    epochs=num_epochs,
    # Debug cap on steps per epoch. Note that `warmup_steps` and `eval_steps`
    # were derived from len(train_dl), not from this cap.
    steps_per_epoch=300,  # for debug

    # optimisation
    optimizer_params={'lr': 2e-5},
    weight_decay=0.01,
    scheduler='WarmupLinear',
    warmup_steps=warmup_steps,
    use_amp=False,

    # periodic evaluation on the validation set
    evaluator=val_evaluator,
    evaluation_steps=eval_steps,

    # checkpoints and best-model output
    checkpoint_path=checkpoints_dp,
    checkpoint_save_steps=eval_steps,
    output_path=output_dp,
    save_best_model=True,
)

## debug

In [21]:
# Preview the validation set, including the added `label` column.
val_dataset.head()

Unnamed: 0,query,title,article_id,label
0,the governor general who annexed sindh in brit...,charles_james_napier,40657,1
1,where does the thirteen reasons why take place,13_reasons_why,635,1
2,what kind of engine does a honda crv have,honda_cr_v,90180,1
3,does steven king have a cameo in the new it movie,it__2017_movie_,97006,1
4,how many goals has rooney scored for dc,wayne_rooney,215242,1


In [22]:
# Class balance check: number of rows per label in the validation set.
val_dataset['label'].value_counts()

1    6789
0    6789
Name: label, dtype: int64

In [23]:
# Column dtypes of the validation set.
val_dataset.dtypes

query         object
title         object
article_id     int64
label          int64
dtype: object

In [24]:
# Read the article text behind every validation row, in row order.
val_dataset_articles = [
    get_article_text(get_article_path(t, filemap))
    for t in tqdm(val_dataset['title'])
]

  0%|          | 0/13578 [00:00<?, ?it/s]

In [26]:
# Standalone evaluation of the current model; the returned score is displayed
# and the results CSV read in the next cell is expected under 'tmp'.
val_evaluator(model, output_path='tmp')

Batches:   0%|          | 0/277 [00:00<?, ?it/s]

0.8021357742181541

In [32]:
# Show the metrics CSV found in the 'tmp' output directory used by the evaluator call above.
pd.read_csv('tmp/validation_evaluator_results.csv')

Unnamed: 0,epoch,steps,f1_threshold,f1,precision,recall,accuracy,ap
0,-1,-1,0.415949,0.802136,0.83183,0.774488,0.808956,0.900265
