### Data Load

In [17]:
from datasets import load_dataset 
data = load_dataset('squad', split='train')
df = data.to_pandas()
df.drop_duplicates(subset='context', keep='first', inplace=True)

In [18]:
df

Unnamed: 0,id,title,context,question,answers
0,5733be284776f41900661182,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,"{'text': ['Saint Bernadette Soubirous'], 'answ..."
5,5733bf84d058e614000b61be,University_of_Notre_Dame,"As at most other universities, Notre Dame's st...",When did the Scholastic Magazine of Notre dame...,"{'text': ['September 1876'], 'answer_start': [..."
10,5733bed24776f41900661188,University_of_Notre_Dame,The university is the major seat of the Congre...,Where is the headquarters of the Congregation ...,"{'text': ['Rome'], 'answer_start': [119]}"
15,5733a6424776f41900660f51,University_of_Notre_Dame,The College of Engineering was established in ...,How many BS level degrees are offered in the C...,"{'text': ['eight'], 'answer_start': [487]}"
20,5733a70c4776f41900660f64,University_of_Notre_Dame,All of Notre Dame's undergraduate students are...,What entity provides help with the management ...,"{'text': ['Learning Resource Center'], 'answer..."
...,...,...,...,...,...
87574,5735d0026c16ec1900b92815,Kathmandu,"Institute of Medicine, the central college of ...",Of what university is the Institute of Medicin...,"{'text': ['Tribhuwan'], 'answer_start': [46]}"
87579,5735d07d012e2f140011a087,Kathmandu,Football and Cricket are the most popular spor...,"Along with cricket, what sport is highly popul...","{'text': ['Football'], 'answer_start': [0]}"
87584,5735d0f46c16ec1900b92823,Kathmandu,The total length of roads in Nepal is recorded...,"As of 2004, how many kilometers of road existe...","{'text': ['17,182'], 'answer_start': [54]}"
87589,5735d1a86c16ec1900b92831,Kathmandu,The main international airport serving Kathman...,What is Nepal's primary airport for internatio...,"{'text': ['Tribhuvan International Airport'], ..."


In [19]:
df.iloc[0]['context']

'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.'

In [20]:
df.iloc[0]['question']

'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?'

In [21]:
df.iloc[0]['answers']

{'text': array(['Saint Bernadette Soubirous'], dtype=object),
 'answer_start': array([515], dtype=int32)}

In [24]:
df.shape

(18891, 5)

In [25]:
df.head(2)

Unnamed: 0,id,title,context,question,answers
0,5733be284776f41900661182,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,"{'text': ['Saint Bernadette Soubirous'], 'answ..."
5,5733bf84d058e614000b61be,University_of_Notre_Dame,"As at most other universities, Notre Dame's st...",When did the Scholastic Magazine of Notre dame...,"{'text': ['September 1876'], 'answer_start': [..."


### Embedding API

In [26]:
import os
from openai import OpenAI
MODEL = "text-embedding-ada-002"

In [27]:
api_key = os.getenv('OPENAI_API_KEY')
client = OpenAI()

In [28]:
from openai import OpenAI

# For text embeddings
res = client.embeddings.create(
    input="I love openai", 
    model="text-embedding-ada-002"
)

In [29]:
res

CreateEmbeddingResponse(data=[Embedding(embedding=[-0.010811165906488895, -0.015025688335299492, -0.019898518919944763, -0.019599905237555504, 0.012154926545917988, 0.003408605232834816, -0.003242331789806485, 0.01315256766974926, -0.0015125791542232037, -0.014102701097726822, 0.020712917670607567, 0.017957530915737152, 0.009772805497050285, -0.03339041769504547, -0.009772805497050285, 0.012915033847093582, 0.027798201888799667, 0.014347021467983723, 0.020427878946065903, -0.009012698195874691, -0.00526645639911294, 0.005996023304760456, -0.025422867387533188, -0.02782534994184971, 0.006932584103196859, -0.0004040189669467509, 0.008530844934284687, -0.028829775750637054, -0.019884943962097168, -0.03368903324007988, 0.03982418403029442, 0.009548845700919628, -0.009650645777583122, -0.004475809168070555, -0.020414305850863457, -0.002981045050546527, -0.0029267517384141684, -0.0040312823839485645, 0.024418441578745842, 0.006525383796542883, 0.006983484141528606, 0.0061046103946864605, -0.

In [32]:
def get_embedding(text, model="text-embedding-ada-002"):
    text = text.replace("\n", " ")
    return client.embeddings.create(input = [text], model=model).data[0].embedding

In [33]:
vec = get_embedding("I am trying a new text \n And see what happens")

In [34]:
len(vec) # 

1536

### Vector DB Setup

In [46]:
# db of 1536 dimension

# API_KEY = "your key"
# ENV = "your env"

import pinecone
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key="your key")

# pinecone.init(api_key = API_KEY, environment = ENV)
# pinecone.create_index("ai-agent", dimension=1536, metric='dotproduct')
index = pc.Index("squad-data")

### this one is for create index in pinecone

In [47]:
# index_name = "squad-data"
# pc.create_index(
#     name=index_name,
#     dimension=1536, 
#     metric="cosine", 
#     spec=ServerlessSpec(
#         cloud="aws",
#         region="us-east-1"
#     ) 
# )

In [48]:
# index.delete(delete_all=True)


### Indexing

In [49]:
df_sample = df.sample(10000, random_state=45)
batch_size = 20 # free tier limit 20 RPM

In [50]:
# embedding function from OpenAI
from langchain.embeddings.openai import OpenAIEmbeddings

model_name = "text-embedding-ada-002"

embed = OpenAIEmbeddings(
    model = model_name,
    openai_api_key= api_key)

In [51]:
from tqdm.auto import tqdm
import time

In [52]:
%%time
for i in tqdm(range(0, len(df_sample), batch_size)):
    i_end = min(i+batch_size, len(df_sample))
#     print(i, i_end)
    batch = df_sample.iloc[i:i_end]
    meta_data = [{"titile" : row['title'], 
              "context": row['context']} 
             for i, row in batch.iterrows()]
    
    # embedding  
    docs = batch['context'].tolist()  # pd.Series to python list
#     emb_vectors = [get_embedding(doc, MODEL) for doc in docs] 
    emb_vectors = embed.embed_documents(docs) # list of list

    ids = batch['id'].tolist()
    
    # upsert
    to_upsert = zip(ids, emb_vectors, meta_data)    
    index.upsert(vectors=to_upsert)
    
    time.sleep(20) # 8s for 50 data points

    


  0%|          | 0/500 [00:00<?, ?it/s]

CPU times: user 59 s, sys: 926 ms, total: 59.9 s
Wall time: 3h 9min 36s


In [54]:
df.shape[0]/3600 # 5 hrs to load , free tier will take 15hrs
# 14000 records/dollar

5.2475

### Using

In [80]:
# Langchain vector store defination
from langchain.vectorstores import Pinecone
from langchain_pinecone import PineconeVectorStore
vectorstore = Pinecone(index=index, embedding=embed.embed_query, text_key="titile")
vector_store = PineconeVectorStore(index=index, embedding=get_embedding, text_key="context")

In [81]:
query = "Virgin Mary"

In [83]:
# pure semantic, non generative, non agent based
vectorstore.similarity_search(query, k=10)

[Document(metadata={'context': "The Perpetual Virginity of Mary asserts Mary's real and perpetual virginity even in the act of giving birth to the Son of God made Man. The term Ever-Virgin (Greek ἀειπάρθενος) is applied in this case, stating that Mary remained a virgin for the remainder of her life, making Jesus her biological and only son, whose conception and birth are held to be miraculous. While the Orthodox Churches hold the position articulated in the Protoevangelium of James that Jesus' brothers and sisters are older children of Joseph the Betrothed, step-siblings from an earlier marriage that left him widowed, Roman Catholic teaching follows the Latin father Jerome in considering them Jesus' cousins."}, page_content='Mary_(mother_of_Jesus)'),
 Document(metadata={'context': 'The popularity of this particular representation of The Immaculate Conception spread across the rest of Europe, and has since remained the best known artistic depiction of the concept: in a heavenly realm, m

### Define QA Agent

In [85]:

from langchain.chat_models import ChatOpenAI
from langchain.chains.conversation.memory \
import ConversationBufferWindowMemory

from langchain.chains import RetrievalQA

# OpenAI LLM
llm = ChatOpenAI(openai_api_key = api_key,
                model_name = 'gpt-3.5-turbo',
                temperature = 0.0)

# conversational memory
conv_mem = ConversationBufferWindowMemory(
    memory_key = 'chat_history',
    k = 5,
    return_messages =True)

# retrieval qa
qa = RetrievalQA.from_chain_type(
    llm = llm,
    chain_type = "stuff",
    retriever = vectorstore.as_retriever())


# https://python.langchain.com/en/latest/modules/chains/index_examples/question_answering.html
# https://docs.langchain.com/docs/components/chains/index_related_chains

  llm = ChatOpenAI(openai_api_key = api_key,
  conv_mem = ConversationBufferWindowMemory(


### Invoking Retrieval QA

In [86]:
query = "When was university of notredame establish"
qa.run(query) # retrieving the info

  qa.run(query) # retrieving the info


'The University of Notre Dame was established on November 26, 1842.'

In [87]:
query = "who established the university of notredame"
qa.run(query)

'The University of Notre Dame was established by the Congregation of Holy Cross, a Catholic religious order also known as the Holy Cross Fathers.'

In [88]:
from langchain.agents import Tool

tools = [
    Tool(
    name = 'Knowledge Base',
    func = qa.run,
    description = ('use this when answering based on knwowledge')
    )
]

In [89]:
from langchain.agents import initialize_agent
from langchain.agents import AgentType

agent = initialize_agent(
    agent=AgentType.CHAT_CONVERSATIONAL_REACT_DESCRIPTION,
    tools=tools,
    llm=llm,
    verbose=True,
    max_iterations=3,
    early_stopping_method='generate',
    memory=conv_mem 
)

  agent = initialize_agent(


In [90]:
agent("when was university of notredame established") # chat gpt kind

  agent("when was university of notredame established") # chat gpt kind




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
    "action": "Knowledge Base",
    "action_input": "University of Notre Dame establishment date"
}
```[0m
Observation: [36;1m[1;3mThe University of Notre Dame was established on November 26, 1842.[0m
Thought:[32;1m[1;3m```json
{
    "action": "Final Answer",
    "action_input": "The University of Notre Dame was established on November 26, 1842."
}
```[0m

[1m> Finished chain.[0m


{'input': 'when was university of notredame established',
 'chat_history': [],
 'output': 'The University of Notre Dame was established on November 26, 1842.'}

In [91]:
agent("who founded the university")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
    "action": "Knowledge Base",
    "action_input": "University of Notre Dame founding"
}
```[0m
Observation: [36;1m[1;3mThe University of Notre Dame was founded on November 26, 1842, by Rev. Edward Sorin, a French priest of the Congregation of Holy Cross. The university was established in Notre Dame, Indiana, USA.[0m
Thought:[32;1m[1;3m```json
{
    "action": "Final Answer",
    "action_input": "The University of Notre Dame was founded on November 26, 1842, by Rev. Edward Sorin, a French priest of the Congregation of Holy Cross. The university was established in Notre Dame, Indiana, USA."
}
```[0m

[1m> Finished chain.[0m


{'input': 'who founded the university',
 'chat_history': [HumanMessage(content='when was university of notredame established', additional_kwargs={}, response_metadata={}),
  AIMessage(content='The University of Notre Dame was established on November 26, 1842.', additional_kwargs={}, response_metadata={})],
 'output': 'The University of Notre Dame was founded on November 26, 1842, by Rev. Edward Sorin, a French priest of the Congregation of Holy Cross. The university was established in Notre Dame, Indiana, USA.'}

In [92]:
agent("20+6")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
    "action": "Final Answer",
    "action_input": "26"
}
```[0m

[1m> Finished chain.[0m


{'input': '20+6',
 'chat_history': [HumanMessage(content='when was university of notredame established', additional_kwargs={}, response_metadata={}),
  AIMessage(content='The University of Notre Dame was established on November 26, 1842.', additional_kwargs={}, response_metadata={}),
  HumanMessage(content='who founded the university', additional_kwargs={}, response_metadata={}),
  AIMessage(content='The University of Notre Dame was founded on November 26, 1842, by Rev. Edward Sorin, a French priest of the Congregation of Holy Cross. The university was established in Notre Dame, Indiana, USA.', additional_kwargs={}, response_metadata={})],
 'output': '26'}