In [None]:
%pip install -qU openai pinecone-client langchain
%pip install --upgrade tiktoken
%pip install wikipedia-api

In [12]:
# Chat model identifiers.
MODEL3, MODEL4 = "gpt-3.5-turbo", "gpt-4"

# Run switches; LOAD gates the Wikipedia ingestion cell further down.
REFRESH, LOAD = False, True

In [2]:
import os
from langchain.embeddings.openai import OpenAIEmbeddings
# Name of the OpenAI embedding model handed to the embeddings client.
model_name = 'text-embedding-ada-002'
# The key is read from the environment; it is None when the variable is unset.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

embed = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY, model=model_name)

In [6]:
import pinecone

# Pinecone credentials: the API key is read from the environment
# (find it in the console at app.pinecone.io).
YOUR_API_KEY = os.environ.get("PINECONE_API_KEY")
YOUR_ENV = "us-west1-gcp-free"

index_name = 'retrieval-index'

# Initialise the client a single time.
pinecone.init(api_key=YOUR_API_KEY, environment=YOUR_ENV)

# Create the index only when it is missing, so the cell can be re-run.
if index_name not in pinecone.list_indexes():
    print ("we create a new index")
    pinecone.create_index(
        name=index_name,
        metric='dotproduct',
        dimension=1536  # 1536 dim of text-embedding-ada-002
    )

  from tqdm.autonotebook import tqdm


In [7]:
# Open a handle to the index, then print its configuration and its stats.
index = pinecone.Index(index_name)

index_description = pinecone.describe_index(index_name)
print(index_description)

index_stats = index.describe_index_stats()
print(index_stats)

IndexDescription(name='retrieval-index', metric='dotproduct', replicas=1, dimension=1536.0, shards=1, pods=1, pod_type='p1', status={'ready': True, 'state': 'Ready'}, metadata_config=None, source_collection='')
{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 123}},
 'total_vector_count': 123}


In [8]:
# Scrape some text data (Wikipedia scraper), part 1 of 2
import wikipediaapi

def get_page_text(page_title, language):
    """Fetch the text of a Wikipedia page.

    Args:
        page_title: Title of the page to look up.
        language: Language code handed to ``wikipediaapi.Wikipedia``.

    Returns:
        The page's ``text`` when the page exists, otherwise ``None``.
    """
    page = wikipediaapi.Wikipedia(language).page(page_title)
    return page.text if page.exists() else None
    


In [9]:
import tiktoken
def num_tokens_from_string(string: str, model: str = "gpt-3.5-turbo-0301") -> int:
    """Count the tokens ``string`` encodes to with the tiktoken encoding for ``model``."""
    return len(tiktoken.encoding_for_model(model).encode(string))

In [None]:
# try to split the text into smaller chunks
# https://python.langchain.com/en/latest/modules/indexes/text_splitters/getting_started.html
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Sizes are measured with num_tokens_from_string, i.e. in tokens rather than
# characters.
text_splitter = RecursiveCharacterTextSplitter(
    length_function=num_tokens_from_string,
    chunk_size=1024,
    chunk_overlap=100,
)

In [10]:
def convert_string_helper(input_str):
    """Build a short ASCII-only prefix from ``input_str``.

    The input is lower-cased, cut to its first 8 characters, and every
    non-ASCII character in that prefix is replaced with ``'x'``.
    """
    prefix = input_str.lower()[:8]
    return ''.join('x' if ord(symbol) >= 128 else symbol for symbol in prefix)

In [13]:
if LOAD:
    # Scrape the example pages (part 2 of 2) and load them into Pinecone as
    # the knowledge base.
    language = "pl"
    for page_title in ["Warszawa", "Kraków", "Mikołaj_Kopernik"]:
        page_txt = get_page_text(page_title, language)
        if not page_txt:
            print(f"Page {page_title} does not exist.")
            continue

        print(f"Processing {page_title} page...")
        texts = text_splitter.create_documents([page_txt])
        # Embed every chunk and upsert it, one vector per request.
        for chunk_no, chunk in enumerate(texts):
            # embed_documents returns one vector per input text; the upsert
            # tuple needs that single flat vector, not the enclosing list.
            embeds = embed.embed_documents([chunk.page_content])[0]
            # Ids are derived from the title prefix plus a running number, so
            # a re-run produces the same ids for the same chunks.
            chunk_id = f"{convert_string_helper(page_title)}_{chunk_no:06d}"
            metadatas = {
                'title': page_title,
                'text': chunk.page_content
            }
            index.upsert(vectors=[(chunk_id, embeds, metadatas)])
            print(f"Processed chunk {chunk_id} for {page_title}")


In [14]:
from langchain.vectorstores import Pinecone

# Metadata key handed to the vector store as the location of the document text.
text_field = "text"

# Re-open the index handle and wrap it in LangChain's Pinecone vector store.
index = pinecone.Index(index_name)
vectorstore = Pinecone(index, embed.embed_query, text_field)

In [None]:
# Smoke-test the vector store with a similarity search.
# Alternative queries to try:
#   "Pierwszymi osadami powstałymi w obecnych granicach administracyjnych Warszawy były"
#   "XXXX"
query = "Jakie osady powstały w granicach Warszawy?"

# Fetch the 3 most relevant documents and print each one's text and metadata.
results = vectorstore.similarity_search(query, k=3)
for doc in results:
    print(doc.page_content)
    print(doc.metadata)


In [None]:
# test query
for query_txt in [
    "Pierwszymi osadami powstałymi w obecnych granicach administracyjnych Warszawy były",
    "Jakie osady powstały w granicach Warszawy?",
    "XXXX"
]:
    # index.query takes a single flat vector; embed_query returns exactly
    # that, whereas embed_documents returns a list of vectors.
    embeds = embed.embed_query(query_txt)
    query_response = index.query(
        top_k=3,
        include_values=False,  # only id and score are printed below
        include_metadata=False,
        vector=embeds
    )["matches"]
    print(f"text: {query_txt}")
    for item in query_response:
        print(f"id: {item.id}, score: {item.score}")


In [15]:
from langchain.chat_models import ChatOpenAI
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain.chains import RetrievalQA

# Chat completion LLM; the model name comes from the notebook-level MODEL3
# constant rather than a repeated string literal.
llm = ChatOpenAI(
    openai_api_key=OPENAI_API_KEY,
    model_name=MODEL3,
    temperature=0.0
)
# Conversational memory: a window of k=5 stored under the 'chat_history' key.
conversational_memory = ConversationBufferWindowMemory(
    memory_key='chat_history',
    k=5,
    return_messages=True
)
# Retrieval QA chain that uses the vector store as its retriever.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever()
)

In [16]:
# Send a question straight to the retrieval QA chain (no agent involved).
query = "Jakie osady powstały w granicach Warszawy?"
qa.run(query)

'Pierwszymi osadami powstałymi w granicach administracyjnych Warszawy były: Bródno (IX/X wiek), Kamion (II połowa XI wieku), Solec (XI wiek) i Jazdów (XIII wiek).'

In [17]:
# Polish for "What did Mikolaj do?", sent straight to the QA chain.
qa.run("Co robil mikolaj?")

'Nie jest jasne, o który okres w życiu Mikołaja Kopernika pytasz. W tekście znajdują się informacje na temat różnych dziedzin, w których działał, takich jak astronomia, ekonomia, medycyna, a także jego edukacja i młodość. Proszę sprecyzować swoje pytanie.'

In [19]:
from langchain.agents import Tool

# Expose the retrieval QA chain to the agent as its only tool.
knowledge_base_tool = Tool(
    name='Knowledge Base',
    description=(
        'use this tool when answering general knowledge queries to get '
        'more information about the topic'
    ),
    func=qa.run,
)
tools = [knowledge_base_tool]

In [20]:
from langchain.agents import initialize_agent

# Build the conversational agent from the chat LLM, the tool list and the
# shared conversation memory; it is capped at 3 iterations.
agent = initialize_agent(
    tools=tools,
    llm=llm,
    agent='chat-conversational-react-description',
    memory=conversational_memory,
    max_iterations=3,
    early_stopping_method='generate',
    verbose=True,
)

In [21]:
# Polish for "Do Warsaw and Krakow have anything in common?"
agent("Czy Warszawa i Krakow maja cos wspolnego")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m{
    "action": "Knowledge Base",
    "action_input": "What do Warsaw and Krakow have in common?"
}[0m
Observation: [36;1m[1;3mWarsaw and Krakow are both cities in Poland.[0m
Thought:[32;1m[1;3m{
    "action": "Final Answer",
    "action_input": "Warsaw and Krakow are both cities in Poland."
}[0m

[1m> Finished chain.[0m


{'input': 'Czy Warszawa i Krakow maja cos wspolnego',
 'chat_history': [],
 'output': 'Warsaw and Krakow are both cities in Poland.'}

In [22]:
# Polish for "Where did Mikolaj live?"
agent("Gdzie mieszkal Mikolaj?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m{
    "action": "Knowledge Base",
    "action_input": "Mikołaj Kopernik place of residence"
}[0m

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID d5cb59605111c3124eb2b5af4c16e218 in your message.).



Observation: [36;1m[1;3mMikołaj Kopernik was born and lived in Toruń, Poland. Later in life, he spent a significant amount of time in Lidzbark Warmiński and Frombork.[0m
Thought:[32;1m[1;3m{
    "action": "Final Answer",
    "action_input": "Mikołaj Kopernik was born and lived in Toruń, Poland. Later in life, he spent a significant amount of time in Lidzbark Warmiński and Frombork."
}[0m

[1m> Finished chain.[0m


{'input': 'Gdzie mieszkal Mikolaj?',
 'chat_history': [HumanMessage(content='Czy Warszawa i Krakow maja cos wspolnego', additional_kwargs={}, example=False),
  AIMessage(content='Warsaw and Krakow are both cities in Poland.', additional_kwargs={}, example=False)],
 'output': 'Mikołaj Kopernik was born and lived in Toruń, Poland. Later in life, he spent a significant amount of time in Lidzbark Warmiński and Frombork.'}

In [23]:
# Polish for "Count to 5 in French".
agent("Licz do 5 po francusku")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m{
    "action": "Final Answer",
    "action_input": "Un, deux, trois, quatre, cinq"
}[0m

[1m> Finished chain.[0m


{'input': 'Licz do 5 po francusku',
 'chat_history': [HumanMessage(content='Czy Warszawa i Krakow maja cos wspolnego', additional_kwargs={}, example=False),
  AIMessage(content='Warsaw and Krakow are both cities in Poland.', additional_kwargs={}, example=False),
  HumanMessage(content='Gdzie mieszkal Mikolaj?', additional_kwargs={}, example=False),
  AIMessage(content='Mikołaj Kopernik was born and lived in Toruń, Poland. Later in life, he spent a significant amount of time in Lidzbark Warmiński and Frombork.', additional_kwargs={}, example=False)],
 'output': 'Un, deux, trois, quatre, cinq'}