In [1]:
!pip install -qU datasets pinecone-client sentence-transformers torch

[0m

In [2]:
from datasets import load_dataset

# load the dataset from huggingface in streaming mode and shuffle it
wiki_data = load_dataset(
    'vblagoje/wikipedia_snippets_streamed',
    split='train',
    streaming=True
).shuffle(seed=960)

Downloading builder script:   0%|          | 0.00/4.58k [00:00<?, ?B/s]

In [3]:
# show the contents of a single document in the dataset
next(iter(wiki_data))

{'wiki_id': 'Q7649565',
 'start_paragraph': 20,
 'start_character': 272,
 'end_paragraph': 24,
 'end_character': 380,
 'article_title': 'Sustainable Agriculture Research and Education',
 'section_title': "2000s & Evaluation of the program's effectiveness",
 'passage_text': "preserving the surrounding prairies. It ran until March 31, 2001.\nIn 2008, SARE celebrated its 20th anniversary. To that date, the program had funded 3,700 projects and was operating with an annual budget of approximately $19 million. Evaluation of the program's effectiveness As of 2008, 64% of farmers who had received SARE grants stated that they had been able to earn increased profits as a result of the funding they received and utilization of sustainable agriculture methods. Additionally, 79% of grantees said that they had experienced a significant improvement in soil quality though the environmentally friendly, sustainable methods that they were"}

In [4]:
# filter only documents with History as section_title
history = wiki_data.filter(
    lambda d: d['section_title'].startswith('History')
)

In [5]:
from tqdm.auto import tqdm  # progress bar

total_doc_count = 50000

counter = 0
docs = []
# iterate through the dataset and apply our filter
for d in tqdm(history, total=total_doc_count):
    # extract the fields we need
    doc = {
        "article_title": d["article_title"],
        "section_title": d["section_title"],
        "passage_text": d["passage_text"]
    }
    # add the dict containing fields we need to docs list
    docs.append(doc)

    # stop iteration once we reach 50k
    if counter == total_doc_count:
        break

    # increase the counter on every iteration
    counter += 1

  0%|          | 0/50000 [00:00<?, ?it/s]

In [6]:
import pandas as pd

# create a pandas dataframe with the documents we extracted
df = pd.DataFrame(docs)
df.head()

Unnamed: 0,article_title,section_title,passage_text
0,Taupo District,History,was not until the 1950s that the region starte...
1,Sutarfeni,History & Western asian analogues,Sutarfeni History strand-like pheni were Phena...
2,The Bishop Wand Church of England School,History,The Bishop Wand Church of England School Histo...
3,Teufelsmoor,History & Situation today,"made to preserve the original landscape, altho..."
4,Surface Hill Uniting Church,History,in perpetual reminder that work and worship go...


In [8]:
import pinecone

# connect to pinecone environment
pinecone.init(
    api_key="API_KEY",
    environment="us-west1-gcp"  # find next to API key in console
)

In [9]:
index_name = "abstractive-question-answering"

# check if the abstractive-question-answering index exists
if index_name not in pinecone.list_indexes():
    # create the index if it does not exist
    pinecone.create_index(
        index_name,
        dimension=768,
        metric="cosine"
    )

# connect to abstractive-question-answering index we created
index = pinecone.Index(index_name)

In [10]:
import torch
from sentence_transformers import SentenceTransformer

# set device to GPU if available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# load the retriever model from huggingface model hub
retriever = SentenceTransformer("flax-sentence-embeddings/all_datasets_v3_mpnet-base", device=device)
retriever

Downloading:   0%|          | 0.00/737 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.85k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/591 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/15.7k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/383 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

In [11]:
# we will use batches of 64
batch_size = 64

for i in tqdm(range(0, len(df), batch_size)):
    # find end of batch
    i_end = min(i+batch_size, len(df))
    # extract batch
    batch = df.iloc[i:i_end]
    # generate embeddings for batch
    emb = retriever.encode(batch["passage_text"].tolist()).tolist()
    # get metadata
    meta = batch.to_dict(orient="records")
    # create unique IDs
    ids = [f"{idx}" for idx in range(i, i_end)]
    # add all to upsert list
    to_upsert = list(zip(ids, emb, meta))
    # upsert/insert these records to pinecone
    _ = index.upsert(vectors=to_upsert)

# check that we have all vectors in index
index.describe_index_stats()

  0%|          | 0/782 [00:00<?, ?it/s]

{'dimension': 768,
 'index_fullness': 0.1,
 'namespaces': {'': {'vector_count': 50001}},
 'total_vector_count': 50001}

In [40]:
from transformers import BartTokenizer, BartForConditionalGeneration

# load bart tokenizer and model from huggingface
tokenizer = BartTokenizer.from_pretrained('vblagoje/bart_lfqa')
generator = BartForConditionalGeneration.from_pretrained('vblagoje/bart_lfqa').to(device)

In [41]:
def query_pinecone(query, top_k):
    # generate embeddings for the query
    xq = retriever.encode([query]).tolist()
    # search pinecone index for context passage with the answer
    xc = index.query(xq, top_k=top_k, include_metadata=True)
    return xc

In [42]:
def format_query(query, context):
    # extract passage_text from Pinecone search result and add the <P> tag
    context = [f"<P> {m['metadata']['passage_text']}" for m in context]
    # concatinate all context passages
    context = " ".join(context)
    # contcatinate the query and context passages
    query = f"question: {query} context: {context}"
    return query

In [43]:
query = "when was the first electric power system built?"
result = query_pinecone(query, top_k=1)
result

{'matches': [{'id': '3593',
              'metadata': {'article_title': 'Electric power system',
                           'passage_text': 'Electric power system History In '
                                           '1881, two electricians built the '
                                           "world's first power system at "
                                           'Godalming in England. It was '
                                           'powered by two waterwheels and '
                                           'produced an alternating current '
                                           'that in turn supplied seven '
                                           'Siemens arc lamps at 250 volts and '
                                           '34 incandescent lamps at 40 volts. '
                                           'However, supply to the lamps was '
                                           'intermittent and in 1882 Thomas '
                                           'Ed

In [44]:
from pprint import pprint

In [45]:
# format the query in the form generator expects the input
query = format_query(query, result["matches"])
pprint(query)

('question: when was the first electric power system built? context: <P> '
 "Electric power system History In 1881, two electricians built the world's "
 'first power system at Godalming in England. It was powered by two '
 'waterwheels and produced an alternating current that in turn supplied seven '
 'Siemens arc lamps at 250 volts and 34 incandescent lamps at 40 volts. '
 'However, supply to the lamps was intermittent and in 1882 Thomas Edison and '
 'his company, The Edison Electric Light Company, developed the first '
 'steam-powered electric power station on Pearl Street in New York City. The '
 'Pearl Street Station initially powered around 3,000 lamps for 59 customers. '
 'The power station generated direct current and')


In [48]:
def generate_answer(query):
    # tokenize the query to get input_ids
    inputs = tokenizer([query], max_length=1024, return_tensors="pt", truncation=True).to(device)
    # use generator to predict output ids
    ids = generator.generate(inputs["input_ids"], num_beams=2, min_length=20, max_length=40)
    # use tokenizer to decode the output ids
    answer = tokenizer.batch_decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    return pprint(answer)

In [49]:
generate_answer(query)

('The first electric power system was built in 1881 at Godalming in England. '
 'It was powered by two waterwheels and produced alternating current that in '
 'turn supplied seven Siemens arc lamps')


In [50]:
query = "How was the first wireless message sent?"
context = query_pinecone(query, top_k=5)
query = format_query(query, context["matches"])
generate_answer(query)

('The first wireless message was sent in 1866 by Mahlon Loomis, who had a kite '
 'on a mountaintop 14 miles apart. The kite was connected to a cable')


In [51]:
for doc in context["matches"]:
    print(doc["metadata"]["passage_text"], end='\n---\n')

by electrostatic induction or electromagnetic induction, which had too short a range to be practical. In 1866 Mahlon Loomis claimed to have transmitted an electrical signal through the atmosphere between two 600 foot wires held aloft by kites on mountaintops 14 miles apart. Thomas Edison had come close to discovering radio in 1875; he had generated and detected radio waves which he called "etheric currents" experimenting with high-voltage spark circuits, but due to lack of time did not pursue the matter. David Edward Hughes in 1879 had also stumbled on radio wave transmission which he received with his carbon microphone
---
the east coast of India, then on to Penang, Malacca, Singapore, Batavia (current Jakarta), to finally reach Darwin, Australia. It was the first direct link between Australia and Great Britain. The company that laid the first part of the cable took the name of Falmouth, Gibraltar and Malta Telegraph Company and had been founded in 1869. This company later operated as

In [52]:
query = "where did COVID-19 originate?"
context = query_pinecone(query, top_k=3)
query = format_query(query, context["matches"])
generate_answer(query)

('COVID-19 is a zoonotic disease, which means that it is a virus that is '
 'transmitted from one animal to another. It is not a virus that can be '
 'transmitted from person')


In [53]:
for doc in context["matches"]:
    print(doc["metadata"]["passage_text"], end='\n---\n')

to establish with certainty which diseases jumped from other animals to humans, but there is increasing evidence from DNA and RNA sequencing, that measles, smallpox, influenza, HIV, and diphtheria came to humans this way. Various forms of the common cold and tuberculosis also are adaptations of strains originating in other species.
Zoonoses are of interest because they are often previously unrecognized diseases or have increased virulence in populations lacking immunity. The West Nile virus appeared in the United States in 1999 in the New York City area, and moved through the country in the summer of 2002, causing much distress. Bubonic
---
plague is a zoonotic disease, as are salmonellosis, Rocky Mountain spotted fever, and Lyme disease.
A major factor contributing to the appearance of new zoonotic pathogens in human populations is increased contact between humans and wildlife. This can be caused either by encroachment of human activity into wilderness areas or by movement of wild ani

In [54]:
query = "who was the first person on the moon?"
context = query_pinecone(query, top_k=10)
query = format_query(query, context["matches"])
generate_answer(query)

('The first person to walk on the moon was Neil Armstrong in 1969. He walked '
 'on the moon in 1969. He was the first person to walk on the moon.')


In [55]:
query = "what was NASAs most expensive project?"
context = query_pinecone(query, top_k=3)
query = format_query(query, context["matches"])
generate_answer(query)

('The Space Shuttle was the most expensive project in the history of NASA. It '
 'cost about $10 billion to build.')
