In [None]:
# Prepare for Paperspace. Manage these via conda or pipenv on your own machine
!pip --quiet install nmslib flask torch transformers sklearn pyarrow seaborn spacy[cuda92] torchtext
%run init_container.py

In [None]:
import os
import requests
import random
import pickle
from itertools import islice
import multiprocessing as mp

import pandas as pd
import json
import sklearn
import spacy
from spacy.tokens import DocBin
from spacy.strings import hash_string
from unidecode import unidecode

import nmslib
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
from itertools import islice
from torchtext.data import Field, Dataset, BucketIterator, Example, RawField
from tqdm.notebook import tqdm

from scipy.special import softmax

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from transformers import *
import seaborn as sns
%matplotlib inline

In [None]:
from qa.constants import *

In [None]:
# Load the small English spaCy pipeline. `spen` is the shared parser used for
# lemmas, stop-word flags and named entities throughout the notebook.
# GPU use is left disabled; uncomment the next line to opt in.
#spacy.prefer_gpu()
spen = spacy.load("en_core_web_sm")

In [None]:
# from_pretrained() downloads and caches the weights on first use, so the
# first run of this cell is slow.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForQuestionAnswering.from_pretrained(
    'bert-large-uncased-whole-word-masking-finetuned-squad')
model = model.to(device)

In [None]:
# Load the SQuAD training file. JSON is UTF-8 by specification, so the encoding
# is given explicitly instead of relying on the platform default.
# `doc` (the parsed JSON) is read again by the sampling and trial cells below.
with open(SQUAD_TRAIN, encoding="utf-8") as f:
    doc = json.load(f)

In [None]:
# Flatten the SQuAD JSON into flat lists: topic titles, paragraph texts, and
# the text of every question that is not flagged as impossible.
topics = [topic["title"] for topic in doc["data"]]
paragraphs = [
    pgraph["context"]
    for topic in doc["data"]
    for pgraph in topic["paragraphs"]
]
questions = [
    qa["question"]
    for topic in doc["data"]
    for pgraph in topic["paragraphs"]
    for qa in pgraph["qas"]
    if not qa["is_impossible"]
]
        

In [None]:
# Spot-check: draw one random topic -> paragraph -> question and show the
# title, question text and paragraph text together.
topic = random.choice(doc["data"])
paragraph = random.choice(topic["paragraphs"])
question = random.choice(paragraph["qas"])
topic["title"], question["question"], paragraph["context"]

In [None]:
# Corpus size (paragraphs, answerable questions) plus a few random samples of each.
len(paragraphs), len(questions), random.sample(paragraphs, 2), random.sample(questions, 10)

In [None]:
# All topic titles, in file order, as one comma-separated string.
", ".join(topics)

In [None]:
%%time

# Parse every paragraph with spaCy and cache the result on disk, so the
# pipeline only has to run over the corpus when no cache is present.
if os.path.isfile(DOCBIN_CACHE) and os.path.isdir(VOCAB_CACHE):
    # Cache hit: restore the vocab, then the serialized docs.
    spen.vocab.from_disk(VOCAB_CACHE)
    with open(DOCBIN_CACHE, "rb") as f:
        doc_bin = DocBin().from_bytes(f.read())
else:
    doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=True)
    # The loop variable must not be called `doc`: that name holds the loaded
    # SQuAD JSON, which the sampling and trial cells further down still read.
    for parsed in spen.pipe(tqdm(paragraphs)):
        doc_bin.add(parsed)
    with open(DOCBIN_CACHE, "wb") as f:
        f.write(doc_bin.to_bytes())
    spen.vocab.to_disk(VOCAB_CACHE)

# `docs` has to exist on both paths (later cells use it), and building it from
# the DocBin either way keeps fresh and cached runs identical.
docs = list(doc_bin.get_docs(spen.vocab))

In [None]:
%%time

def lemmatize_preproc(doc):
    """Analyzer for the TF-IDF vectorizer.

    Takes an iterable of tokens exposing ``lemma_`` and ``is_stop`` and returns
    the lower-cased, unidecode-folded lemma of every token that is not a stop
    word, in order.
    """
    terms = []
    for tok in doc:
        if tok.is_stop:
            continue
        terms.append(unidecode(tok.lemma_.lower()))
    return terms

if os.path.isfile(VECTOR_CACHE):
    # NOTE: pickle.load can execute arbitrary code; only load a cache file
    # that this notebook wrote itself.
    with open(VECTOR_CACHE, "rb") as f:
        cache = pickle.load(f)
    tfidf = cache["tfidf"]
    vectorizer = cache["vectorizer"]
else:
    # NOTE(review): scikit-learn documents stop_words and ngram_range as not
    # applying when `analyzer` is a callable, so only min_df/max_df are
    # expected to have an effect here - confirm before relying on n-grams.
    vectorizer = TfidfVectorizer(
        analyzer=lemmatize_preproc,
        stop_words='english', min_df=10, max_df=.5, ngram_range=(1,3))
    tfidf = vectorizer.fit_transform(docs)
    with open(VECTOR_CACHE, "wb") as f:
        pickle.dump({"vectorizer": vectorizer, "tfidf": tfidf}, f)

# Size of the fitted TF-IDF vocabulary.
len(vectorizer.vocabulary_)

## Buckets

In [None]:
class VocabSTOI:
    """String-to-index lookup that delegates to a tokenizer.

    ``lookup[token]`` returns the id the wrapped tokenizer assigns to that
    single token via ``convert_tokens_to_ids``.
    """

    def __init__(self, tokenizer):
        self.tok = tokenizer

    def __getitem__(self, key):
        ids = self.tok.convert_tokens_to_ids([key])
        return ids[0]
    
class VocabITOS:
    """Index-to-string lookup that delegates to a tokenizer.

    ``lookup[token_id]`` returns the token the wrapped tokenizer maps that
    single id to via ``convert_ids_to_tokens``.
    """

    def __init__(self, tokenizer):
        self.tok = tokenizer

    def __getitem__(self, key):
        tokens = self.tok.convert_ids_to_tokens([key])
        return tokens[0]
    
class BertVocab:
    """Minimal vocab object exposing ``stoi`` and ``itos`` lookups.

    Both lookups are thin wrappers around the same tokenizer, so no separate
    vocabulary table is stored.
    """

    def __init__(self, tokenizer):
        self.stoi = VocabSTOI(tokenizer)
        self.itos = VocabITOS(tokenizer)
def ident(x):
    """Identity function: return the argument unchanged.

    Passed as the ``tokenize`` callable of SpacyBertField, where the actual
    tokenization is done by the ``preprocessing`` step instead.
    """
    return x

def default_preproc(s):
    """Tokenize `s` with the module-level tokenizer and wrap it in [CLS]/[SEP].

    At most 510 tokens of `s` are kept, so the returned list has at most
    512 entries including the two markers.
    """
    pieces = tokenizer.tokenize(s)[:510]
    return ["[CLS]", *pieces, "[SEP]"]

class SpacyBertField(Field):
    """Field whose preprocessing turns input text into BERT token lists.

    By default the text is tokenized by `default_preproc` and tokens are mapped
    to ids through a `BertVocab` wrapping the module-level tokenizer. Padding
    uses the tokenizer's own pad token and batches are batch-first.

    NOTE(review): despite the class name, the cells in this notebook feed it
    plain strings (space-joined lemmas), not spaCy documents.
    """
    
    def __init__(self,
                 # Default is evaluated once, when the class is defined, so all
                 # instances share the same BertVocab object.
                 vocab=BertVocab(tokenizer),
                 preprocessing=default_preproc,
                 **kwargs):
        super().__init__(
            pad_token=tokenizer.pad_token,
            preprocessing=preprocessing,
            # Identity tokenize: the real tokenization happens in `preprocessing`.
            tokenize=ident,
            batch_first=True,
            **kwargs)
        self.vocab = vocab

    def build_vocab(self, *args, **kw):
        """No-op: the vocabulary is supplied via `vocab`, nothing is built."""
        pass

In [None]:
# Width of one pooled vector; used to size the `embeds` matrix.
# NOTE(review): presumably the hidden size of the loaded BERT-large model - confirm.
REDUCTION_DIMS = 1024
def reduce_embeds(toks, emb, pad_id=0):
    """Max-pool token embeddings over the sequence axis, ignoring padding.

    Args:
        toks: integer tensor of token ids, shape (batch, seq).
        emb: float tensor of per-token embeddings, shape (batch, seq, dim).
        pad_id: token id that marks padding in `toks` (default 0).

    Returns:
        Tensor of shape (batch, dim) with the element-wise maximum over the
        non-padding positions of each sequence.

    Padded positions are masked out so that the vector of a sequence does not
    depend on how much padding its batch added; an unpadded single query and
    the same text inside a padded batch therefore pool to the same vector.
    A row consisting only of padding yields -inf in every component.
    """
    is_pad = (toks == pad_id).unsqueeze(-1)
    pooled, _ = emb.masked_fill(is_pad, float("-inf")).max(dim=1)
    return pooled

In [None]:
def _content_lemmas(parsed):
    """Space-join the lemmas of all non-stop-word tokens of one parsed doc."""
    return ' '.join(tok.lemma_ for tok in parsed if not tok.is_stop)

# One lemma string per parsed paragraph, in the same order as `docs`.
texts = [_content_lemmas(parsed) for parsed in tqdm(docs)]

In [None]:
%%time
# 'index' carries the paragraph's position through batching untouched;
# 'context' is tokenized into BERT ids by SpacyBertField.
fields = [('index', RawField()), ('context', SpacyBertField())]

def examplify(args):
    """Build one Example from an ``(index, text)`` pair, using `fields`."""
    (i, text) = args
    return Example.fromlist([i, text], fields)

with mp.Pool() as pool:
    # pool.map() consumes its whole input before any work is done, so a tqdm
    # wrapped around the input finishes immediately. imap() yields results
    # lazily and in input order, so wrapping the results shows real progress.
    examples = list(tqdm(
        pool.imap(examplify, enumerate(texts), chunksize=256),
        total=len(texts)))

In [None]:
def _neg_context_len(ex):
    """Sort key: the negated token count of the example's context."""
    return -len(ex.context)

# Deterministic batches of 24, sorted by the key above and not shuffled.
ds = Dataset(examples, fields)
buckets = BucketIterator(
    dataset=ds,
    batch_size=24,
    device=device,
    shuffle=False,
    sort=True,
    sort_key=_neg_context_len,
)

In [None]:
# One pooled vector per paragraph; each batch writes into the rows named by
# its `index` field, so row i of `embeds` belongs to texts[i].
embeds = np.zeros((len(texts), REDUCTION_DIMS), dtype=np.float32)
with torch.no_grad():
    for batch in tqdm(buckets):
        token_embs = model.bert.embeddings(batch.context)
        embeds[batch.index] = reduce_embeds(batch.context, token_embs).cpu()

# Either I messed up or z-normalization completely destroys the embedding
# Accuracy went from 60% to 2%.
# Just subtracting mean accounts for most of this drop.
# Should try PCA whitening instead.


In [None]:
# Range check on the pooled embeddings: global min, max and mean.
embeds.min(), embeds.max(), embeds.mean()

In [None]:
# Number of embedding components larger than 10.
(embeds > 10).sum()

In [None]:
def embed_sentence(query):
    """Embed one query with the same pipeline used for the indexed paragraphs.

    Args:
        query: a raw string, or an already parsed spaCy document.

    Returns:
        numpy array with one row: the pooled embedding of the query.

    The query is reduced to its non-stop-word lemmas, tokenized with
    `default_preproc` and pooled with `reduce_embeds`.
    """
    if isinstance(query, str):
        query = spen(query)
    lemmas = ' '.join(word.lemma_ for word in query if not word.is_stop)
    # Reuse default_preproc so queries are wrapped and truncated exactly like
    # the paragraphs: the token list is cut to 510 entries *before* [SEP] is
    # appended, so an over-long query still ends with [SEP].
    query_ids = tokenizer.convert_tokens_to_ids(default_preproc(lemmas))
    X = torch.tensor(query_ids, device=device).unsqueeze(0)
    # No gradients are needed; this also makes the .numpy() call below safe
    # when the caller has not wrapped the call in torch.no_grad().
    with torch.no_grad():
        query_emb = model.bert.embeddings(X)
        pooled = reduce_embeds(X, query_emb)
    return pooled.cpu().numpy()

In [None]:
# Smoke test: embed one short sentence and look at the shape of the result.
with torch.no_grad():
    foo = embed_sentence("how now brown cow")
foo.shape

In [None]:
%%time
# Approximate nearest-neighbour (HNSW) index over the paragraph embeddings,
# using cosine similarity.
hnsw_build_params = {'post': 2, 'efConstruction': 500, 'M': 64}
hnsw_query_params = {'ef': 500}

index = nmslib.init(method='hnsw', space='cosinesimil')
index.addDataPointBatch(embeds)
index.createIndex(hnsw_build_params, print_progress=True)
index.setQueryTimeParams(hnsw_query_params)

In [None]:
%%time
# Brute-force cosine index over the same embeddings; used by the brute_* and
# combined_* functions below.
bfidx = nmslib.init(method='brute_force', space='cosinesimil')
bfidx.addDataPointBatch(embeds)
bfidx.createIndex(print_progress=True)

In [None]:
# Compare the HNSW and brute-force results for one indexed paragraph.
index.knnQuery(embeds[3000], k=30), bfidx.knnQuery(embeds[3000], k=30), 

In [None]:
# Example retrieval: embed a free-text question and fetch its 50 nearest
# paragraphs from the brute-force index.
with torch.no_grad():
    query = embed_sentence("What are common pieces of computers?")
results, dists = bfidx.knnQuery(query, k=50)
results

In [None]:
# Show the text of the first 20 retrieved paragraphs next to their ids.
[(i,docs[i].text) for i in results[:20]]

In [None]:
def knn_embed_trial(k=20):
    """Run one retrieval trial against the HNSW embedding index.

    Samples a random question, retrieves its `k` nearest paragraphs from
    `index` and returns True if the question's own paragraph is among them.
    """
    sampled_topic = random.choice(doc["data"])
    sampled_paragraph = random.choice(sampled_topic["paragraphs"])
    sampled_qa = random.choice(sampled_paragraph["qas"])
    with torch.no_grad():
        query_vec = embed_sentence(sampled_qa["question"])
        neighbour_ids, _ = index.knnQuery(query_vec, k=k)
    target = sampled_paragraph["context"]
    return any(docs[i].text == target for i in neighbour_ids)

In [None]:
# Hits out of 1000 random questions, HNSW embedding index, k=25.
hits = sum(knn_embed_trial(25) for _ in tqdm(range(1000)))

hits

In [None]:
def brute_embed_trial(k=20):
    """Run one retrieval trial against the brute-force embedding index.

    Same procedure as the HNSW trial, but neighbours come from `bfidx`.
    Returns True if the sampled question's paragraph is among the `k` results.
    """
    source_paragraph = random.choice(random.choice(doc["data"])["paragraphs"])
    asked = random.choice(source_paragraph["qas"])["question"]
    with torch.no_grad():
        hits_ids, _ = bfidx.knnQuery(embed_sentence(asked), k=k)
    retrieved_texts = [docs[i].text for i in hits_ids]
    return source_paragraph["context"] in retrieved_texts

In [None]:
# Hits out of 1000 random questions, brute-force embedding index, k=25.
hits = sum(brute_embed_trial(25) for _ in tqdm(range(1000)))

hits

In [None]:
def brute_tfidf_trial(k=20):
    """Run one retrieval trial using TF-IDF scores only.

    Samples a random question, scores every paragraph by the product of its
    TF-IDF row with the question's TF-IDF vector, and returns True if the
    question's own paragraph is among the `k` highest-scoring ones.
    """
    topic = random.choice(doc["data"])
    paragraph = random.choice(topic["paragraphs"])
    question = random.choice(paragraph["qas"])["question"]

    query = vectorizer.transform([spen(question)])
    scores = (tfidf * query.T).toarray()
    # scores has a single column; reverse the ascending argsort to rank best-first.
    ranked = np.argsort(scores, axis=0)[::-1, 0]
    top_texts = [docs[i].text for i in ranked[:k]]
    return paragraph["context"] in top_texts

In [None]:
# Hits out of 1000 random questions, TF-IDF ranking, k=50.
hits = sum(brute_tfidf_trial(50) for _ in tqdm(range(1000)))

hits

### With entities

In [None]:
# Entity labels that describe numbers, dates or amounts; these are skipped.
NUMERICS = {"DATE", "TIME", "PERCENT", "MONEY", "QUANTITY", "ORDINAL", "CARDINAL"}
def doc_entities(doc):
    """Return the entity words of `doc` that the TF-IDF vocabulary lacks.

    Walks every named entity whose label is not in NUMERICS, keeps its
    alphabetic non-stop-word tokens as lower-cased, unidecode-folded lemmas,
    and drops any word already present in ``vectorizer.vocabulary_``.
    """
    words = []
    for ent in doc.ents:
        if ent.label_ in NUMERICS:
            continue
        for tok in ent:
            if not tok.is_alpha or tok.is_stop:
                continue
            word = unidecode(tok.lemma_.lower())
            if word not in vectorizer.vocabulary_:
                words.append(word)
    return words

In [None]:
%%time
# Count entity words per paragraph. max_df=10 is an absolute document count,
# so only words occurring in at most 10 paragraphs are kept.
ent_vecr = CountVectorizer(
    analyzer=doc_entities,
    stop_words='english', max_df=10)
ent_tfidf = ent_vecr.fit_transform(docs)

# Fixed-size hash table: slot (hash(word) % N) holds the ids of every
# paragraph containing a word that hashes there. Collisions are tolerated.
N = 10_001
hashed_ents = [set() for _ in range(N)]
per_doc_words = ent_vecr.inverse_transform(ent_tfidf)
for doc_id, words in enumerate(tqdm(per_doc_words)):
    for word in words:
        slot = hash_string(str(word)) % N
        hashed_ents[slot].add(doc_id)
len(ent_vecr.vocabulary_)

In [None]:
def contexts_by_entities(doc):
    """Return the set of document ids that *might* relate to entities in `doc`.

    `doc` is the pre-processed (spaCy-parsed) question. Each of its entity
    words is hashed into the `hashed_ents` table and the ids stored in the
    matching slots are merged; hash collisions can add unrelated ids.
    """
    doc_ids = set()
    for word in doc_entities(doc):
        doc_ids |= hashed_ents[hash_string(word) % N]
    return doc_ids


In [None]:
def combined_trial(k=20):
    """Run one retrieval trial that tries all three retrievers in turn.

    Samples a random question and returns True as soon as its own paragraph is
    found by, in order: the top-`k` TF-IDF ranking, the `k` nearest neighbours
    in the brute-force embedding index, or the entity hash lookup. Returns
    False if none of them finds it.
    """
    topic = random.choice(doc["data"])
    paragraph = random.choice(topic["paragraphs"])
    qa = random.choice(paragraph["qas"])
    target = paragraph["context"]

    # Parse the question once and share the parsed doc between all three
    # retrievers (embed_sentence accepts an already parsed doc).
    preproc = spen(qa["question"])

    query = vectorizer.transform([preproc])
    scores = (tfidf * query.T).toarray()
    ranked = np.flip(np.argsort(scores, axis=0))
    if any(docs[i].text == target for i in ranked[:k, 0]):
        return True

    with torch.no_grad():
        neighbours, _ = bfidx.knnQuery(embed_sentence(preproc), k=k)
    if any(docs[i].text == target for i in neighbours):
        return True

    return any(docs[i].text == target for i in contexts_by_entities(preproc))

In [None]:
# Hits out of 1000 random questions, all three retrievers combined, k=50.
hits = sum(combined_trial(50) for _ in tqdm(range(1000)))

hits

## All together now

In [None]:
def combined_contexts(question, k=20):
    """Collect candidate paragraph ids for `question` from all three retrievers.

    Args:
        question: the question as a raw string.
        k: number of candidates taken from the TF-IDF ranking and from the
           embedding index each (the entity lookup is not limited by `k`).

    Returns:
        dict mapping paragraph id (plain int) to the tag of the retriever that
        found it: "TFIDF", "EMBED" or "ENTITY". When several retrievers return
        the same id, the later one in that order wins.
    """
    preproc = spen(question)

    query = vectorizer.transform([preproc])
    scores = (tfidf * query.T).toarray()
    ranked = np.flip(np.argsort(scores, axis=0))
    tagged = {i: "TFIDF" for i in ranked[:k, 0].tolist()}

    with torch.no_grad():
        neighbours, _ = bfidx.knnQuery(embed_sentence(preproc), k=k)
    # int() keeps every key a plain Python int, matching the TF-IDF keys.
    tagged.update({int(i): "EMBED" for i in neighbours})

    tagged.update({i: "ENTITY" for i in contexts_by_entities(preproc)})

    return tagged

In [None]:
# Pick the random question that the rest of the notebook answers end-to-end,
# and show it with its topic, paragraph and reference answers.
topic = random.choice(doc["data"])
paragraph = random.choice(topic["paragraphs"])
qa = random.choice(paragraph["qas"])
question = qa["question"]
topic["title"], question, qa["is_impossible"],  paragraph["context"], qa["answers"]

In [None]:
#question = "What did humans hunt during the Paleolithic?"

In [None]:

# Retrieve candidate contexts for the question and lay them out one per row.
contexts = combined_contexts(question, 50)
candidate_records = [
    {'question': question, 'context': docs[i].text, 'tag': tag}
    for (i, tag) in contexts.items()
]
question_df = pd.DataFrame.from_records(candidate_records)
question_df.head()

In [None]:
# Take the ids explicitly from the dict keys (rows were built in the same
# order). Assigning the dict itself leaves it to pandas how a mapping is
# turned into a column, which is implicit and may differ between versions.
question_df["doc_id"] = list(contexts.keys())

# Id of the [SEP] token, looked up instead of hard-coding 102.
SEP_ID = tokenizer.sep_token_id

def _encode_pair(row):
    """Encode "[CLS] question [SEP] context [SEP]" as token ids (max 512)."""
    text = "[CLS] " + row["question"] + " [SEP] " + row["context"] + " [SEP]"
    return tokenizer.encode(text, add_special_tokens=False, max_length=512)

question_df["encoded"] = question_df.apply(_encode_pair, axis=1)

# Position of the first [SEP]: everything up to and including it is the
# question segment (type 0), everything after it the context (type 1).
first_sep = question_df["encoded"].map(lambda ids: ids.index(SEP_ID))
question_df["tok_type"] = [
    [0] * (sep + 1) + [1] * (len(ids) - sep - 1)
    for ids, sep in zip(question_df["encoded"], first_sep)
]
question_df["context_start"] = first_sep + 1
question_df["context_end"] = question_df["encoded"].map(len) - 1

**TODO:** batching, and ranking contexts by the Jaccard index of character-level n-grams

In [None]:
# Character-trigram TF-IDF fitted on the candidate contexts only.
# `foo` holds one row per candidate context, `bar` the question's vector in
# the same feature space; both are used by the re-ranking cell below.
chargram = TfidfVectorizer(analyzer='char_wb', ngram_range=(3,3))
foo = chargram.fit_transform(question_df.context)
bar = chargram.transform([question])
foo

In [None]:
# Maximum number of candidate contexts kept for QA inference; 1/INFER_LIMIT is
# also the softmax cut-off applied to the answers later on.
INFER_LIMIT = 20

In [None]:
# Trigrams of the question that have a non-zero weight in the fitted vocabulary.
chargram.inverse_transform(bar)

In [None]:
# Rank the candidate contexts by their character-trigram TF-IDF product with
# the question (best first) and keep the top INFER_LIMIT rows.
# - argsort() already returns integer indices, so no cast is needed (the old
#   `np.int` alias no longer exists in current NumPy releases).
# - ravel() always yields a 1-D array, also when there is a single candidate.
similarity = (foo * bar.transpose()).toarray().ravel()
rows = np.flip(similarity.argsort())[:INFER_LIMIT]
question_df = question_df.iloc[rows.tolist()]


In [None]:
# Candidate contexts that survived the re-ranking, best first.
question_df

In [None]:
%%time
with torch.no_grad():
    # Pad all candidates to a common length with the tokenizer's pad id.
    pad_id = tokenizer.pad_token_id
    X = torch.nn.utils.rnn.pad_sequence(
        [torch.tensor(row) for row in question_df["encoded"]],
        batch_first=True, padding_value=pad_id).to(device)
    T = torch.nn.utils.rnn.pad_sequence(
        [torch.tensor(row) for row in question_df["tok_type"]],
        batch_first=True).to(device)
    # Tell the model which positions are padding; without the mask the padded
    # positions are attended to like real tokens.
    A = (X != pad_id).long()
    outputs = model(X, attention_mask=A, token_type_ids=T)
    # Index instead of tuple-unpacking: works whether the model returns a
    # plain tuple or an output object that supports integer indexing.
    start_scores, end_scores = outputs[0], outputs[1]
    # Best start position per candidate, then a softmax across candidates.
    max_score, max_start = torch.max(start_scores, dim=1)
    soft_max = F.softmax(max_score, dim=0)

In [None]:
# One softmax weight per candidate context.
soft_max.shape

In [None]:
def decode_answer(row):
    """Decode the predicted answer span of one answer row back to text.

    Reads ``encoded`` (token ids), ``answer_start`` and ``answer_length`` from
    `row`; the length is clipped to the range 0..20 tokens before decoding.
    """
    span_len = np.clip(row.answer_length, 0, 20)
    from_start = row.encoded[row.answer_start:]
    return tokenizer.decode(from_start[:span_len])

In [None]:
# Answer length per candidate: offset of the best end score at or after the
# chosen start position, plus one.
max_len = torch.zeros_like(max_start)
for i, start in enumerate(max_start.tolist()):
    max_len[i] = torch.argmax(end_scores[i, start:]) + 1

# Candidate columns plus the model's scores, as a separate frame.
kept_columns = ["doc_id", "tag", "context", "encoded", "context_start", "context_end"]
answer_df = question_df[kept_columns].copy().assign(
    answer_score=max_score.cpu().numpy(),
    answer_start=max_start.cpu().numpy(),
    answer_softmax=soft_max.cpu().numpy(),
    answer_length=max_len.cpu().numpy(),
)

In [None]:
# Keep answers that start inside the context segment and whose softmax weight
# is at least 1/INFER_LIMIT.
starts_in_context = (
    (answer_df.answer_start >= answer_df.context_start)
    & (answer_df.answer_start <= answer_df.context_end)
)
confident_enough = answer_df.answer_softmax >= 1.0 / INFER_LIMIT
answer_df = answer_df[starts_in_context & confident_enough]

In [None]:
# Best-scoring answers first, then decode each answer span to text.
answer_df = answer_df.sort_values(by="answer_score", ascending=False)
if len(answer_df.index) > 0:
    answer_df["answer"] = answer_df.apply(decode_answer, axis=1)
else:
    # Nothing survived the filters: still create the column so the cells
    # below can select it.
    answer_df["answer"] = ""
# Only the last expression of a cell is displayed, so the discarded
# intermediate `.head()` expression has been dropped.
answer_df

In [None]:
# Top five answers as plain records (one dict per answer).
summary_columns = ["answer_softmax", "answer_score", "answer", "doc_id", "tag", "context"]
answer_df[summary_columns].head(5).to_dict(orient="records")