# Question Answering with PyTorch Transformers: Part 2
## A simple vector index with scikit-learn

Read the full article: https://medium.com/@patonw/question-answering-with-pytorch-transformers-part-2-a31900294673

> In Part 1 we briefly examined the problem of question answering in machine learning and how recent breakthroughs have greatly improved the quality of answers produced by computer systems.
>
> Using the pipeline API of the Transformers library, we were able to run a pre-trained model in a few lines of code. In this article we’ll prototype an information retrieval system around it. In later articles we’ll turn that into web services that can be queried by browsers and mobile apps.

In [None]:
# Prepare for Paperspace. Manage these via conda or pipenv on your own machine
!pip --quiet install torch transformers sklearn pyarrow seaborn spacy[cuda92]
%run init_container.py

In [None]:
from qa.constants import *

In [None]:
import os
import requests
import random
import pickle

import pandas as pd
import json
import sklearn
import spacy

import numpy as np
import torch
import torch.nn.functional as F
from itertools import islice
from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import *

In [None]:
# Ask spaCy to run on the GPU if it can; this is requested before the model is loaded.
spacy.prefer_gpu()
# Load spaCy's small English model; `sp` is what lemmatize() calls to tokenize text.
sp = spacy.load("en_core_web_sm")

### Extract Questions from SQuAD 2.0

Download dataset if not present.

In [None]:
# Read the SQuAD training file. JSON text is UTF-8, so the encoding is stated
# explicitly instead of depending on the platform's default (which is not
# UTF-8 everywhere and would fail on the non-ASCII characters in the corpus).
with open(SQUAD_TRAIN, encoding="utf-8") as f:
    doc = json.load(f)
# Quick look at the structure: top-level keys, the type of "data", and its length.
doc.keys(), type(doc["data"]), len(doc["data"])

In [None]:
# Flatten the nested layout (topic -> paragraph -> question/answer entries).
all_pgraphs = [pg for topic in doc["data"] for pg in topic["paragraphs"]]

# Every paragraph's text is kept, in file order.
paragraphs = [pg["context"] for pg in all_pgraphs]

# Only questions that are not flagged as impossible are kept, in file order.
questions = [
    entry["question"]
    for pg in all_pgraphs
    for entry in pg["qas"]
    if not entry["is_impossible"]
]

# Sizes plus a few random samples as a sanity check.
len(paragraphs), len(questions), random.sample(paragraphs, 2), random.sample(questions, 5)

### Map words to lemmas

In [None]:
def lemmatize(phrase):
    """Return `phrase` with every spaCy token replaced by its lemma, joined by single spaces."""
    tokens = sp(phrase)
    return " ".join(token.lemma_ for token in tokens)

In [None]:
%%time

if not os.path.isfile(LEMMA_CACHE):
    lemmas = [lemmatize(par) for par in tqdm(paragraphs)]
    df = pd.DataFrame(data={'context': paragraphs, 'lemmas': lemmas})
    df.to_feather(LEMMA_CACHE)
    
df = pd.read_feather(LEMMA_CACHE)
paragraphs = df.context
lemmas = df.lemmas

In [None]:
# Draw ten random row positions (repeats are possible) to spot-check the lemmatizer.
last_row = len(lemmas) - 1
rand_idx = [random.randint(0, last_row) for _ in range(10)]

# TODO display in left/right columns
# First 80 characters of each sampled paragraph next to its lemmatized form.
[(paragraphs[k][:80], lemmas[k][:80]) for k in rand_idx]

### Vectorize corpus by TF-IDF

In [None]:
# Path of the pickle file that caches the fitted TF-IDF vectorizer and the TF-IDF matrix.
VECTOR_CACHE = 'cache/vectors.pickle'

In [None]:
%%time
if not os.path.isfile(VECTOR_CACHE):
    vectorizer = TfidfVectorizer(
        stop_words='english', min_df=5, max_df=.5, ngram_range=(1,3))
    tfidf = vectorizer.fit_transform(lemmas)
    with open(VECTOR_CACHE, "wb") as f:
        pickle.dump(dict(vectorizer=vectorizer, tfidf=tfidf), f)
else:
    with open(VECTOR_CACHE, "rb") as f:
        cache = pickle.load(f)
        tfidf = cache["tfidf"]
        vectorizer = cache["vectorizer"]
        
len(vectorizer.vocabulary_)

### Fetch contexts related to question

In [None]:
question = "When did the last country to adopt the Gregorian calendar start using it?"

# Lemmatize the question the same way as the corpus, then vectorize it.
question_lemmas = lemmatize(question)
query = vectorizer.transform([question_lemmas])

# How many vocabulary terms the question hit, and which ones.
matched_terms = vectorizer.inverse_transform(query)
(query > 0).sum(), matched_terms

In [None]:
%%time
scores = (tfidf * query.T).toarray()
results = (np.flip(np.argsort(scores, axis=0)))
[paragraphs[i] for i in results[:3, 0]]

### Extract answers from contexts

In [None]:
# Question-answering pipeline: a DistilBERT SQuAD checkpoint paired with the
# uncased BERT tokenizer.
qa_settings = {
    "model": 'distilbert-base-uncased-distilled-squad',
    "tokenizer": 'bert-base-uncased',
}
qapipe = pipeline('question-answering', **qa_settings)

In [None]:
%%time
THRESH = 0.01
candidate_idxs = [ (i, scores[i]) for i in results[0:10, 0] ]
contexts = [ (paragraphs[i],s)
    for (i,s) in candidate_idxs if s > THRESH ]

question_df = pd.DataFrame.from_records([ {
    'question': question,
    'context':  ctx
} for (ctx,s) in contexts ])

question_df.to_feather("cache/question_context.feather")

In [None]:
%%time
preds = qapipe(question_df.to_dict(orient="records"))
answer_df = pd.DataFrame.from_records(preds)
answer_df["context"] = question_df["context"]
answer_df = answer_df.sort_values(by="score", ascending=False)
answer_df.head()

In [None]:
# Show the top answers again as a list of plain dicts (one per row).
answer_df.head().to_dict(orient="records")