# Retrieval Augmentation

In [14]:
def format_text(text):
    """Return ``text`` trimmed, with a line break after each sentence.

    Leading and trailing whitespace is removed, then every occurrence of
    a period followed by a space is turned into a period followed by a newline.
    """
    trimmed = text.strip()
    sentences = trimmed.split(". ")
    return ".\n".join(sentences)

In [15]:
from datasets import load_dataset

# Simple English Wikipedia snapshot; only the first 10,000 training rows are loaded
corpus = load_dataset(
    path="wikipedia",
    name="20220301.simple",
    split="train[:10000]",
)
corpus

Found cached dataset wikipedia (C:/Users/matte/.cache/huggingface/datasets/wikipedia/20220301.simple/2.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559)


Dataset({
    features: ['id', 'url', 'title', 'text'],
    num_rows: 10000
})

In [16]:
# look at one full record to see the fields each article carries
corpus[6]

{'id': '13',
 'url': 'https://simple.wikipedia.org/wiki/Alan%20Turing',
 'title': 'Alan Turing',
 'text': 'Alan Mathison Turing OBE FRS (London, 23 June 1912 – Wilmslow, Cheshire, 7 June 1954) was an English mathematician and computer scientist. He was born in Maida Vale, London.\n\nEarly life and family \nAlan Turing was born in Maida Vale, London on 23 June 1912. His father was part of a family of merchants from Scotland. His mother, Ethel Sara, was the daughter of an engineer.\n\nEducation \nTuring went to St. Michael\'s, a school at 20 Charles Road, St Leonards-on-sea, when he was five years old.\n"This is only a foretaste of what is to come, and only the shadow of what is going to be.” – Alan Turing.\n\nThe Stoney family were once prominent landlords, here in North Tipperary. His mother Ethel Sara Stoney (1881–1976) was daughter of Edward Waller Stoney (Borrisokane, North Tipperary) and Sarah Crawford (Cartron Abbey, Co. Longford); Protestant Anglo-Irish gentry.\n\nEducated in Dub

In [17]:
import tiktoken

# NOTE(review): p50k_base may not be the encoding used by the embedding/chat
# models configured later in this notebook; token counts would then only be
# approximate for those models — confirm against tiktoken's model mapping.
tokenizer = tiktoken.get_encoding("p50k_base")

def tiktoken_len(text):
    """Return the number of tokens ``text`` encodes to with ``tokenizer``.

    ``disallowed_special=()`` lets text that happens to contain special-token
    strings be encoded as ordinary text instead of raising.
    """
    return len(tokenizer.encode(text, disallowed_special=()))

In [18]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Separators are tried in order, so they must go from coarsest to finest:
# paragraph break, line break, space, and finally individual characters.
# Listing "\n" before "\n\n" would make the paragraph separator unreachable,
# because any text containing "\n\n" also contains "\n".
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,  # counted in tokens, since length_function is tiktoken_len
    chunk_overlap=20,
    length_function=tiktoken_len,
    separators=["\n\n", "\n", " ", ""]
)

In [19]:
# Chunk size and overlap are counted with tiktoken_len, i.e. in tokens rather
# than characters. Separators go from coarsest (paragraph break) to finest.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=20,
    length_function=tiktoken_len,
    separators=["\n\n", "\n", " ", ""]
)

In [20]:
# try the splitter on a single article before processing the whole corpus
chunks = text_splitter.split_text(corpus[6]["text"])

In [21]:
for chunk in chunks:
    # space-separated word count next to the token count; tiktoken_len is reused
    # so the count is computed exactly as during splitting and chunks containing
    # special-token text do not raise
    print(len(chunk.split(" ")), tiktoken_len(chunk))

275 397
225 304
308 399
25 50


In [22]:
# show the chunk texts produced for the sample article
chunks

['Alan Mathison Turing OBE FRS (London, 23 June 1912 – Wilmslow, Cheshire, 7 June 1954) was an English mathematician and computer scientist. He was born in Maida Vale, London.\n\nEarly life and family \nAlan Turing was born in Maida Vale, London on 23 June 1912. His father was part of a family of merchants from Scotland. His mother, Ethel Sara, was the daughter of an engineer.\n\nEducation \nTuring went to St. Michael\'s, a school at 20 Charles Road, St Leonards-on-sea, when he was five years old.\n"This is only a foretaste of what is to come, and only the shadow of what is going to be.” – Alan Turing.\n\nThe Stoney family were once prominent landlords, here in North Tipperary. His mother Ethel Sara Stoney (1881–1976) was daughter of Edward Waller Stoney (Borrisokane, North Tipperary) and Sarah Crawford (Cartron Abbey, Co. Longford); Protestant Anglo-Irish gentry.\n\nEducated in Dublin at Alexandra School and College; on October 1st 1907 she married Julius Mathison Turing, latter son o

## Embeddings

In [23]:
from getpass import getpass
import os

# prompt for the key without echoing it, so it is never written into the notebook
os.environ["OPENAI_API_KEY"] = getpass("OpenAI API Key: ")

In [13]:
from langchain.embeddings.openai import OpenAIEmbeddings

model_name = "text-embedding-ada-002"

# the same model is used for documents and for queries
embedding_model = OpenAIEmbeddings(
    document_model_name=model_name,
    query_model_name=model_name,
    openai_api_key=os.environ["OPENAI_API_KEY"]
)

In [24]:
# two toy strings to check that the embedding call works
texts = [
    "this is the first chunk of text",
    "then another chunk of text is here"
]

embeddings = embedding_model.embed_documents(texts)

In [25]:
# number of vectors returned, followed by the length of each vector
len(embeddings), len(embeddings[0]), len(embeddings[1])

(2, 1536, 1536)

In [26]:
import pinecone

# the API key is read without echo; the region is not secret, so plain input is used
os.environ["PINECONE_API_KEY"] = getpass("Pinecone API Key: ")
os.environ["PINECONE_REGION"] = input("Pinecone region: ")

In [27]:
index_name = "langchain-retrieval-augmentation-v2"

pinecone.init(
    api_key=os.environ["PINECONE_API_KEY"],
    environment=os.environ["PINECONE_REGION"]
)

# Create the index only when it is missing, so the notebook runs both against a
# fresh Pinecone project and against an index that was already populated.
if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        name=index_name,
        metric="dotproduct",
        # must equal the length of the vectors produced by embedding_model
        dimension=len(embeddings[0])
    )

In [28]:
# GRPC client for the index; the stats call shows how many vectors it already holds
index = pinecone.GRPCIndex(index_name)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.1,
 'namespaces': {'': {'vector_count': 27437}},
 'total_vector_count': 27437}

In [30]:
# first record of the corpus, as a reminder of the fields used for metadata below
corpus[0]

{'id': '1',
 'url': 'https://simple.wikipedia.org/wiki/April',
 'title': 'April',
 'text': 'April is the fourth month of the year in the Julian and Gregorian calendars, and comes between March and May. It is one of four months to have 30 days.\n\nApril always begins on the same day of week as July, and additionally, January in leap years. April always ends on the same day of the week as December.\n\nApril\'s flowers are the Sweet Pea and Daisy. Its birthstone is the diamond. The meaning of the diamond is innocence.\n\nThe Month \n\nApril comes between March and May, making it the fourth month of the year. It also comes first in the year out of the four months that have 30 days, as June, September and November are later in the year.\n\nApril begins on the same day of the week as July every year and on the same day of the week as January in leap years. April ends on the same day of the week as December every year, as each other\'s last days are exactly 35 weeks (245 days) apart.\n\nIn co

In [None]:
from tqdm.auto import tqdm
from uuid import uuid4

batch_size = 100


def upsert_batch(text_chunks, chunk_metadata):
    """Embed a batch of text chunks and upsert them into the Pinecone index.

    Args:
        text_chunks: list of chunk strings to embed.
        chunk_metadata: list of metadata dicts, aligned one-to-one with
            ``text_chunks``.

    An empty batch is a no-op, so this can be called unconditionally for the
    remainder left over after the main loop.
    """
    if not text_chunks:
        return

    # IDs are random, so re-running the cell stores the same chunks again
    # under new IDs rather than overwriting the previous ones.
    ids = [str(uuid4()) for _ in text_chunks]
    vectors = embedding_model.embed_documents(text_chunks)
    index.upsert(vectors=list(zip(ids, vectors, chunk_metadata)))


text_chunk_collector = []
metadata_collector = []

for example in tqdm(corpus):

    # gather the article-level metadata shared by all of its chunks
    metadata = {
        "wiki-id": str(example["id"]),
        "source": example["url"],
        "title": example["title"]
    }

    # split text into smaller chunks
    text_chunks = text_splitter.split_text(example["text"])

    # collect text chunks
    text_chunk_collector.extend(text_chunks)

    # collect one metadata record per chunk: its position in the article,
    # its text, and the article-level metadata
    metadata_collector.extend(
        {"chunk": j, "text": text_chunk, **metadata}
        for j, text_chunk in enumerate(text_chunks)
    )

    # once at least batch_size chunks are pending, embed them and add them to
    # pinecone's index; a batch can exceed batch_size because all chunks of an
    # article are appended together
    if len(text_chunk_collector) >= batch_size:
        upsert_batch(text_chunk_collector, metadata_collector)
        text_chunk_collector = []
        metadata_collector = []

# flush the final partial batch, which never reaches batch_size inside the loop
upsert_batch(text_chunk_collector, metadata_collector)
        

In [31]:
# check the vector count in the index
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.1,
 'namespaces': {'': {'vector_count': 27437}},
 'total_vector_count': 27437}

In [32]:
from langchain.vectorstores import Pinecone

# metadata key under which each chunk's text was stored when upserting
text_field = "text"

# switch back to normal index for langchain
index = pinecone.Index(index_name)

vectorstore = Pinecone(
    index, # Pinecone index object backing the vectorstore (not its name)
    embedding_model.embed_query, # embedding function to embed the query
    text_field # metadata field in the index containing the text
)

In [33]:
query = "Who was Benito Mussolini?"

# retrieve the 3 chunks most similar to the query, without any generation step
docs_retrieved = vectorstore.similarity_search(
    query=query,
    k=3
)

In [41]:
# inspect the retrieved documents
docs_retrieved

[Document(page_content='Benito Amilcare Andrea Mussolini KSMOM GCTE (29 July 1883 – 28 April 1945) was an Italian politician and journalist. He was also the Prime Minister of Italy from 1922 until 1943. He was the leader of the National Fascist Party.\n\nBiography\n\nEarly life\nBenito Mussolini was named after Benito Juarez, a Mexican opponent of the political power of the Roman Catholic Church, by his anticlerical (a person who opposes the political interference of the Roman Catholic Church in secular affairs) father. Mussolini\'s father was a blacksmith. Before being involved in politics, Mussolini was a newspaper editor (where he learned all his propaganda skills) and elementary school teacher.\n\nAt first, Mussolini was a socialist, but when he wanted Italy to join the First World War, he was thrown out of the socialist party. He \'invented\' a new ideology, Fascism, much out of Nationalist\xa0and Conservative views.\n\nRise to power and becoming dictator\nIn 1922, he took power b

## Generative Question-Answering

In [36]:
from langchain import OpenAI, VectorDBQA

# NOTE(review): gpt-3.5-turbo is a chat model; depending on the installed
# langchain version a chat-model wrapper may be the intended class — confirm.
llm = OpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0.0,
    openai_api_key=os.environ["OPENAI_API_KEY"]
)

# question answering over the vectorstore; the "stuff" chain type presumably
# places all retrieved chunks into a single prompt — confirm in the langchain docs
qa = VectorDBQA.from_chain_type(
    llm=llm,
    vectorstore=vectorstore,
    chain_type="stuff"
)


In [37]:
query = "Who was Benito Mussolini?"

# run the question through the QA chain and keep its return value
response = qa.run(query)

In [39]:
# one sentence per line for readability
print(format_text(response))

Benito Mussolini was an Italian politician and journalist who served as the Prime Minister of Italy from 1922 until 1943.
He was the leader of the National Fascist Party and became a dictator of Italy by the end of 1927.
He was executed by partisans in 1945.


In [40]:
from langchain.chains import VectorDBQAWithSourcesChain


# same LLM, vectorstore and chain type as the plain QA chain, using the
# with-sources chain class
qa_with_sources = VectorDBQAWithSourcesChain.from_chain_type(
    llm=llm,
    vectorstore=vectorstore,
    chain_type="stuff"
)

In [42]:
# the chain is called directly here; the result is indexed by key further down
response = qa_with_sources(query)

In [43]:
# show the full result of the with-sources chain
response

{'question': 'Who was Benito Mussolini?',
 'answer': 'Benito Mussolini was an Italian politician and journalist who was the Prime Minister of Italy from 1922 until 1943. He was the leader of the National Fascist Party and became dictator of Italy by the end of 1927. He was removed by the Great Council of Fascism in 1943 and executed by a partisan in 1945. After the war, several Neo-Fascist movements have had success in Italy, the most important being the Movimento Sociale Italiano. His granddaughter Alessandra Mussolini has outspoken views similar to Fascism.\n',
 'sources': 'https://simple.wikipedia.org/wiki/Benito%20Mussolini'}

In [44]:
# print only the answer text, one sentence per line
print(format_text(response["answer"]))

Benito Mussolini was an Italian politician and journalist who was the Prime Minister of Italy from 1922 until 1943.
He was the leader of the National Fascist Party and became dictator of Italy by the end of 1927.
He was removed by the Great Council of Fascism in 1943 and executed by a partisan in 1945.
After the war, several Neo-Fascist movements have had success in Italy, the most important being the Movimento Sociale Italiano.
His granddaughter Alessandra Mussolini has outspoken views similar to Fascism.
