In [None]:
# Prepare for Paperspace. Manage these via conda or pipenv on your own machine
!pip --quiet install nmslib flask torch transformers sklearn pyarrow seaborn spacy[cuda92]
%run init_container.py

In [None]:
import os
import requests
import random
import pickle
from itertools import islice

import pandas as pd
import json
import sklearn
import spacy
from spacy.tokens import DocBin
from spacy.strings import hash_string
from unidecode import unidecode

import numpy as np
import torch
import torch.nn.functional as F
from itertools import islice
from tqdm.notebook import tqdm

from scipy.special import softmax

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from transformers import *
import seaborn as sns
%matplotlib inline

In [None]:
from qa.constants import *

In [None]:
# Request GPU execution for spaCy; this runs before the pipeline is loaded.
spacy.prefer_gpu()
# `spen` is the small English pipeline used for every parse in this notebook.
spen = spacy.load("en_core_web_sm")

In [None]:
# from_pretrained() downloads and caches the model weights, so the first run
# of this cell takes a while.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForQuestionAnswering.from_pretrained(
    'bert-large-uncased-whole-word-masking-finetuned-squad'
)
# Move the QA model onto the selected device.
model = model.to(device)

In [None]:
# Load the SQuAD training file. JSON is UTF-8 by specification, so name the
# encoding explicitly instead of relying on the platform default.
with open(SQUAD_TRAIN, encoding="utf-8") as f:
    doc = json.load(f)

In [None]:
# Flatten the SQuAD document into three lists: topic titles, paragraph
# contexts, and the questions that are not flagged as impossible.
paragraphs = []
questions = []
topics = []
for topic in doc["data"]:
    topics.append(topic["title"])
    for pgraph in topic["paragraphs"]:
        paragraphs.append(pgraph["context"])
        # SQuAD 1.1 files have no "is_impossible" flag, so a missing flag is
        # treated as False instead of raising KeyError.
        questions.extend(
            qa["question"]
            for qa in pgraph["qas"]
            if not qa.get("is_impossible", False)
        )

In [None]:
# Sanity check: corpus sizes plus a few randomly sampled paragraphs and questions.
len(paragraphs), len(questions), random.sample(paragraphs, 2), random.sample(questions, 10)

In [None]:
# All topic titles as one comma-separated string.
", ".join(topics)

In [None]:
%%time

# Parse every paragraph with spaCy once and cache the result on disk.
# Both branches must leave `docs` defined, because later cells read it.
if os.path.isfile(DOCBIN_CACHE) and os.path.isdir(VOCAB_CACHE):
    # Warm start: restore the vocab first, then rebuild the docs against it.
    spen.vocab.from_disk(VOCAB_CACHE)

    with open(DOCBIN_CACHE, "rb") as f:
        doc_bin = DocBin().from_bytes(f.read())
    docs = list(doc_bin.get_docs(spen.vocab))
else:
    # Cold start: run the pipeline, keep the parsed docs in memory for this
    # session, and write the caches for the next one.
    doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=True)
    docs = []
    # A distinct loop name keeps the SQuAD JSON held in `doc` from being overwritten.
    for parsed in spen.pipe(tqdm(paragraphs)):
        doc_bin.add(parsed)
        docs.append(parsed)
    with open(DOCBIN_CACHE, "wb") as f:
        f.write(doc_bin.to_bytes())
    spen.vocab.to_disk(VOCAB_CACHE)

In [None]:
%%time

def lemmatize_preproc(doc):
    """Turn a parsed document into a list of normalised lemmas.

    Tokens flagged as stop words are skipped. Every other token contributes
    its lemma, lower-cased and then passed through ``unidecode``.
    """
    lemmas = []
    for tok in doc:
        if tok.is_stop:
            continue
        lemmas.append(unidecode(tok.lemma_.lower()))
    return lemmas

# Fit the TF-IDF model over the parsed paragraphs, or reload it from the cache.
if not os.path.isfile(VECTOR_CACHE):
    print('Indexing corpus')
    # With a callable analyzer scikit-learn ignores `stop_words` and
    # `ngram_range`, so they are not passed. Stop words are already removed
    # inside lemmatize_preproc, and the features are single lemmas.
    vectorizer = TfidfVectorizer(analyzer=lemmatize_preproc, min_df=10, max_df=.5)
    tfidf = vectorizer.fit_transform(docs)
    with open(VECTOR_CACHE, "wb") as f:
        pickle.dump(dict(vectorizer=vectorizer, tfidf=tfidf), f)
else:
    print(f'Loading vector cache from {VECTOR_CACHE}')
    # NOTE: pickle.load can execute arbitrary code; only load a cache file
    # that this notebook wrote itself.
    with open(VECTOR_CACHE, "rb") as f:
        cache = pickle.load(f)
    tfidf = cache["tfidf"]
    vectorizer = cache["vectorizer"]

len(vectorizer.vocabulary_), vectorizer

In [None]:
# NOTE(review): this inspects `vocabulary` (no trailing underscore), not the
# `vocabulary_` attribute read in other cells — confirm which one is intended.
type(vectorizer.vocabulary)

In [None]:
# Number of parsed paragraphs.
len(docs)

In [None]:
# Count the paragraphs whose lower-cased text contains 'russes'.
sum(1 for parsed in tqdm(docs) if 'russes' in parsed.text.lower())

In [None]:
# List (index, doc) pairs for the paragraphs whose text contains 'Calafat'.
[ (i,doc) for (i, doc) in enumerate(tqdm(docs)) if 'Calafat' in doc.text]

In [None]:
# Entity labels that doc_entities skips.
NUMERICS = {"DATE", "TIME", "PERCENT", "MONEY", "QUANTITY", "ORDINAL", "CARDINAL"}


def doc_entities(doc):
    """Collect the entity words of ``doc`` that the TF-IDF vocabulary lacks.

    Entities whose label is in ``NUMERICS`` are ignored. From the remaining
    entities only alphabetic, non-stop-word tokens are used; each is reduced
    to its lower-cased lemma passed through ``unidecode``. Words already in
    ``vectorizer.vocabulary_`` are left out.

    Returns the words as a list, in document order, duplicates included.
    """
    words = []
    for ent in doc.ents:
        if ent.label_ in NUMERICS:
            continue
        for tok in ent:
            if not tok.is_alpha or tok.is_stop:
                continue
            word = unidecode(tok.lemma_.lower())
            if word not in vectorizer.vocabulary_:
                words.append(word)
    return words

In [None]:
%%time
# Count the entity words that the main vectorizer does not know.
# max_df=10 is an absolute count: words found in more than 10 paragraphs are dropped.
# `stop_words` is not passed because scikit-learn ignores it when `analyzer`
# is a callable, and doc_entities already skips stop words.
ent_vecr = CountVectorizer(analyzer=doc_entities, max_df=10)
# Despite the name, this matrix holds raw counts, not TF-IDF weights.
ent_tfidf = ent_vecr.fit_transform(docs)

len(ent_vecr.vocabulary_)

In [None]:
%%time
# Fixed-size hash table: bucket `hash(word) % N` holds the ids of every
# paragraph that contains an entity word hashing to that bucket.
N = 10_001
hashed_ents = [set() for _ in range(N)]
doc_words = ent_vecr.inverse_transform(ent_tfidf)
for doc_id, words in enumerate(tqdm(doc_words)):
    for word in words:
        bucket = hash_string(str(word)) % N
        hashed_ents[bucket].add(doc_id)

In [None]:
# Number of paragraph ids per bucket; show the largest bucket and the distribution.
bucket_size = list(map(len, hashed_ents))
max(bucket_size), sns.distplot(bucket_size)

In [None]:
# Ratio of empty buckets
bucket_size.count(0) / N

In [None]:
def contexts_by_entities(doc):
    """Returns a set of document ids that *might* be related to named entities in the pre-processed question"""
    matches = set()
    # Each entity word selects one bucket; the result is the union of those buckets.
    for word in doc_entities(doc):
        matches.update(hashed_ents[hash_string(word) % N])
    return matches


# Parse every question and keep those with at least one usable entity word.
ent_questions = [q for q in spen.pipe(tqdm(questions)) if doc_entities(q)]
# Only the last expression of a cell is displayed, so print the counts explicitly.
print(len(questions), len(ent_questions))

# Draw five such questions and show them next to their entity words.
queries = random.sample(ent_questions, 5)
query_ents = [doc_entities(query) for query in queries]
queries, query_ents

In [None]:
def ranked_contexts_by_entities(query, topk=10, thresh=1e-5):
    """Rank the entity-matched candidate paragraphs against a parsed query.

    Candidate ids come from ``contexts_by_entities``. Each candidate is scored
    by multiplying its ``vectorizer`` row with the query's row.

    Returns a list of ``(doc_id, score)`` pairs, highest score first, limited
    to ``topk`` entries whose score is at least ``thresh``. The list is empty
    when there are no candidates.
    """
    candidate_ids = list(contexts_by_entities(query))
    if not candidate_ids:
        return []

    candidate_docs = [docs[doc_id] for doc_id in candidate_ids]
    candidate_vecs = vectorizer.transform(candidate_docs)
    query_vec = vectorizer.transform([query])
    products = candidate_vecs * query_vec.transpose()
    scores = np.asarray(products.todense()).flatten()

    # Positions into `candidate_ids`, ordered from highest to lowest score.
    best_first = np.asarray(np.flip(scores.argsort()))
    above_thresh = best_first[scores[best_first] >= thresh]
    return [(candidate_ids[pos], scores[pos]) for pos in above_thresh[:topk]]

In [None]:
# Spot check: parse one question, then show its entity words and the ranked candidates.
# thresh=0 lowers the score cut-off from its default of 1e-5 to 0.
query = spen("When did the US send troops to the Philippines to battle terrorists?")
doc_entities(query), ranked_contexts_by_entities(query, thresh=0)

## Nearest Neighbor

In [None]:
import nmslib

In [None]:
# Cast the TF-IDF matrix to 32-bit floats ahead of building the nmslib index.
tfidf = tfidf.astype(np.float32)

In [None]:
# Dimensions of the TF-IDF matrix.
tfidf.shape

In [None]:
%%time
# Build an HNSW index over the sparse TF-IDF rows with the sparse cosine-similarity space.
index = nmslib.init(
    method='hnsw',
    space='cosinesimil_sparse',
    data_type=nmslib.DataType.SPARSE_VECTOR,
)
index.addDataPointBatch(tfidf.astype(np.float32))
index.createIndex(
    {'post': 2, 'efConstruction': 500, 'M': 64},
    print_progress=True,
)

In [None]:
# Write the index to a file under cache/; save_data=True is passed so the data is saved with it.
index.saveIndex('cache/nmsIndex.nms', save_data=True)

In [None]:
%%time
# Read the index back from the same file under cache/, including its data.
index.loadIndex('cache/nmsIndex.nms', load_data=True)

In [None]:
# Set the query-time parameter `ef` to 500.
# NOTE(review): presumably this trades query speed for recall — confirm against the nmslib docs.
index.setQueryTimeParams(dict(ef=500))

In [None]:
# Odd: punctuation really matters for spaCy's NER.
# Some queries trigger a CUDA error.
# `question` is the parsed query; `query` is its row from the fitted vectorizer.
question = spen("Why did Feynman pick Caltech over Princeton?")
query = vectorizer.transform([question])
# Show the query vector and the vocabulary terms it contains.
query, vectorizer.inverse_transform(query)

In [None]:
%%time
# Take the result for the single query row: the neighbour ids are kept and the
# second element (presumably the distances — confirm) is discarded.
knn_ids, _ = index.knnQueryBatch(query, k=10, num_threads=8)[0]
knn_ids

In [None]:
# Show the paragraphs for the first four neighbour ids.
[docs[i] for i in knn_ids[:4]]

In [None]:
# Entity-based candidates for the same question, as (doc_id, score) pairs.
ranked_contexts_by_entities(question)

In [None]:
# Keep only the paragraph ids from the entity ranking; the scores are dropped.
ids_by_ent = [doc_id for doc_id, _score in ranked_contexts_by_entities(question)]
ids_by_ent

In [None]:
# Union of the entity-ranked ids and the nearest-neighbour ids, stored as a
# sorted list of plain ints. A list has a stable order and can be used as a
# DataFrame column, which pandas refuses for an unordered set.
merge_ids = sorted(set(ids_by_ent) | {int(i) for i in knn_ids})
merge_ids

In [None]:
# Parsed paragraphs for every merged candidate id, in the iteration order of merge_ids.
contexts = [docs[i] for i in merge_ids]

In [None]:
# One row per candidate paragraph, each paired with the same question text.
records = [dict(question=question.text, context=ctx.text) for ctx in contexts]
question_df = pd.DataFrame.from_records(records)
question_df

In [None]:
# TODO use stride when context too long
# Look the separator id up from the tokenizer instead of hard-coding 102.
sep_id = tokenizer.sep_token_id


def _encode_pair(row):
    """Encode one row as the token ids of ``[CLS] question [SEP] context [SEP]``."""
    text = "[CLS] " + row["question"] + " [SEP] " + row["context"] + " [SEP]"
    return tokenizer.encode(text, add_special_tokens=False, max_length=512)


def _segment_ids(ids):
    """Return 0 for positions up to and including the first separator, 1 after it."""
    first_sep = ids.index(sep_id)  # looked up once per row, not once per token
    return [0] * (first_sep + 1) + [1] * (len(ids) - first_sep - 1)


# pandas refuses a set as column values, so materialise the ids as a list.
# Iterating `merge_ids` again yields the same order that built `contexts`.
question_df["doc_id"] = list(merge_ids)
question_df["encoded"] = question_df.apply(_encode_pair, axis=1)
# First context token: the position right after the first separator.
question_df["context_start"] = question_df["encoded"].map(lambda ids: ids.index(sep_id) + 1)
# Position of the last token in the encoded sequence.
question_df["context_end"] = question_df["encoded"].map(len) - 1
question_df["tok_type"] = question_df["encoded"].map(_segment_ids)
question_df.iloc[:3]

In [None]:
%%time
# Run the QA model once over the whole batch of encoded (question, context) rows.
with torch.no_grad():
    encoded_rows = list(question_df["encoded"])
    X = torch.nn.utils.rnn.pad_sequence(
        [torch.tensor(row) for row in encoded_rows], batch_first=True).to(device)
    T = torch.nn.utils.rnn.pad_sequence(
        [torch.tensor(row) for row in question_df["tok_type"]], batch_first=True).to(device)
    # pad_sequence fills the shorter rows with 0. Without a mask the model
    # attends to that padding, so mark real tokens with 1 and padding with 0.
    attention_mask = torch.nn.utils.rnn.pad_sequence(
        [torch.ones(len(row), dtype=torch.long) for row in encoded_rows],
        batch_first=True).to(device)
    outputs = model(X, token_type_ids=T, attention_mask=attention_mask)
    # Index instead of unpacking: this works for plain tuples and for output
    # objects that iterate over their field names.
    start_scores, end_scores = outputs[0], outputs[1]
    print(f'start_scores = {start_scores}')
    # Best start position per row, then a softmax across rows to compare candidates.
    max_score, max_start = torch.max(start_scores, dim=1)
    soft_max = F.softmax(max_score, dim=0)

In [None]:
# Copy the columns needed for decoding and attach the per-row start-score results.
answer_df = question_df[
    ["doc_id", "context", "encoded", "context_start", "context_end"]
].assign(
    answer_score=max_score.cpu().numpy(),
    answer_start=max_start.cpu().numpy(),
    answer_softmax=soft_max.cpu().numpy(),
)
answer_df

In [None]:
# For each row, search the end scores from the chosen start position onward.
# The answer length is the offset of the best end position plus one.
max_len = torch.zeros_like(max_start)
for row, start in enumerate(max_start):
    max_len[row] = torch.argmax(end_scores[row, start:]) + 1

answer_df["answer_length"] = max_len.cpu().numpy()

In [None]:
# Keep answers that start inside the context and score above 1.0, best first.
# context_start is the position of the first context token, so the lower bound
# is inclusive; a strict `>` would drop answers that begin on that token.
starts_in_context = (
    (answer_df.answer_start >= answer_df.context_start)
    & (answer_df.answer_start < answer_df.context_end)
)
confident = answer_df.answer_score > 1.0
answer_df = answer_df[starts_in_context & confident]
answer_df = answer_df.sort_values(by="answer_score", ascending=False)
answer_df.head()

In [None]:
def decode_answer(row):
    """Decode the predicted answer span of one row back to text.

    The span begins at ``row.answer_start`` inside ``row.encoded`` and covers
    ``row.answer_length`` tokens, with the length clamped to the range 0..20.
    """
    span_len = np.clip(row.answer_length, 0, 20)
    from_start = row.encoded[row.answer_start:]
    return tokenizer.decode(from_start[:span_len])

In [None]:
# Decode the answer span of every remaining row and show the top rows.
answer_df["answer"] = answer_df.apply(decode_answer, axis=1)
answer_df[["answer_softmax", "answer_score", "answer"]].head()

In [None]:
# First three rows as plain dict records.
answer_df[["answer_softmax", "answer_score", "answer", "doc_id", "context"]].iloc[:3].to_dict(orient="records")