In [1]:
from langchain.chat_models import AzureChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain import PromptTemplate, LLMChain
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import AIMessage, HumanMessage, SystemMessage
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())
import tiktoken
deployment_name1 = "gptTurbo"
deployment_name2 = "gpt-4-32k"
chat3 = AzureChatOpenAI(deployment_name=deployment_name1, model_name="gpt-35-turbo", temperature=0)
chat4 = AzureChatOpenAI(deployment_name=deployment_name2, model_name="gpt-4-32k", temperature=0)

In [2]:
with open("data/churchill_speech.txt") as f:
   churchill_speech = f.read() 

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,    
)

chunks = text_splitter.create_documents([churchill_speech])
print(chunks[0])
print(chunks[10].page_content)
print(f'There are {len(chunks)} chunks')

page_content='We Shall Fight on the Beaches\nJune 4, 1940\nHouse of Commons' metadata={}
command in place of General Gamelin, an effort was made by the French and British Armies in
There are 299 chunks


Embedding Cost Function

In [3]:
def embedding_cost(texts):
    enc = tiktoken.encoding_for_model("text-embedding-ada-002")
    total_tokens = sum([len(enc.encode(page.page_content)) for page in texts])
    print(f'Total tokens: {total_tokens}')
    print(f'Cost in US Dollars: {total_tokens / 1000 * 0.0004:.6f}')

cost_tokens = embedding_cost(chunks)
print(cost_tokens)

Total tokens: 4800
Cost in US Dollars: 0.001920
None


In [4]:
#embeddings = OpenAIEmbeddings()

embeddings = OpenAIEmbeddings(
            
            deployment="text-embedding-ada-002", model="text-embedding-ada-002", chunk_size=16, max_retries=10)
vector=embeddings.embed_query(chunks[0].page_content)
print(vector)

[-0.04369995005768468, -0.024461628912782914, -0.019690835883552915, -0.01418309009368367, 0.012127382702392853, 0.019057316739557427, -0.012928979494373974, -0.007324266726169341, 0.0002937302638890567, -0.01098963247708562, 0.002548624998776607, 0.023104086559474612, 0.001521579225466117, -0.022561069618151294, -0.007026900549838674, 0.004088789697574882, 0.013206952275037565, -0.015475988415650099, 0.015075190019659538, 0.0035554694540851006, -0.000587864547153234, -0.023336807307479543, 0.02678884570605797, 0.00658731489685267, 0.0034455731572539216, -0.022625714580815744, 0.014260664607674558, -0.020259710064883956, 0.005190985286548925, -0.009657947363785037, 0.013756434457685391, -0.019910628011553987, -0.02150089297551237, -0.0013793604473027131, -0.028004169514033513, -0.025017574474110096, -0.024681421040783986, -0.001393097571718102, 0.0208932310715246, -0.00861716458247448, 0.0076862783308257465, 0.004670593895893646, 0.013872794831687855, 0.002886394742433844, -0.030227953

In [5]:
import os
import pinecone
from langchain.vectorstores import Pinecone

import pinecone
pinecone.init(
    api_key=os.environ["PINECONE_API_KEY"],
    environment=os.environ["PINECONE_ENV"],
)

  from tqdm.autonotebook import tqdm


Deleting existing indexes

In [6]:
indexes = pinecone.list_indexes()
print(indexes)
for i in indexes:
    print(f"Deleting index: {i}", end=" ")
    pinecone.delete_index(i) 
    print("Done") 

[]


In [7]:
index_name = "churchill-speech"

if index_name not in pinecone.list_indexes():
    print(f"Creating index: {index_name}...")
    pinecone.create_index(index_name, metric="cosine", dimension=1536)
    print("Done")

Creating index: churchill-speech...
Done


In [13]:
vector_store= Pinecone.from_documents(chunks, embeddings, index_name=index_name)

In [14]:
index_name = "churchill-speech"
index=pinecone.Index(index_name)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 299}},
 'total_vector_count': 299}

In [15]:
query = "where should we fight"
result = vector_store.similarity_search(query, k=5)
print(result)

[Document(page_content='shall fight on the beaches, we shall fight on the landing grounds, we shall fight in the fields and', metadata={}), Document(page_content='front, now on that, fighting', metadata={}), Document(page_content='end, we shall fight in France, we shall fight on the seas and oceans, we shall fight with growing', metadata={}), Document(page_content='We Shall Fight on the Beaches\nJune 4, 1940\nHouse of Commons', metadata={}), Document(page_content='When we consider how much greater would be our advantage in defending the air above this Island', metadata={})]


In [16]:
for r in result:
    print(r.page_content)
    print('-'* 50)

shall fight on the beaches, we shall fight on the landing grounds, we shall fight in the fields and
--------------------------------------------------
front, now on that, fighting
--------------------------------------------------
end, we shall fight in France, we shall fight on the seas and oceans, we shall fight with growing
--------------------------------------------------
We Shall Fight on the Beaches
June 4, 1940
House of Commons
--------------------------------------------------
When we consider how much greater would be our advantage in defending the air above this Island
--------------------------------------------------


In [21]:
from langchain.chains import RetrievalQA
from langchain.chat_models import AzureChatOpenAI
deployment_name1 = "gpt-35-turbo-16k"
deployment_name2 = "gpt-4-32k"
chat3 = AzureChatOpenAI(deployment_name=deployment_name1, model_name="gpt-35-turbo-16k", temperature=0)
chat4 = AzureChatOpenAI(deployment_name=deployment_name2, model_name="gpt-4-32k", temperature=0)

retriever  = vector_store.as_retriever(search_type= 'similarity', search_kwargs={'k': 5})

chain = RetrievalQA.from_chain_type(llm=chat4, chain_type="stuff", retriever=retriever)

In [23]:

query = "What about the French Armies?"
response = chain.run(query)
print(response)

The French Armies were involved in a conflict with the British. They held certain territories and had plans to advance across the Somme. However, communication was lost between the British and two out of the three corps forming the First French Army. They were trying to reopen their communications to Amiens.
