In [1]:
from dotenv import load_dotenv, find_dotenv
# Locate a .env file and load its variables into the process environment.
# override=True is passed so that, per python-dotenv, values from the file take
# precedence over variables that are already set.
# NOTE(review): presumably this is where the OpenAI / Pinecone API keys come
# from, since later cells build those clients without explicit keys — confirm.
load_dotenv(find_dotenv(), override=True)

True

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Read the whole speech into memory. The encoding is pinned so decoding does not
# depend on the platform's locale default (assumes the file is UTF-8 — TODO confirm).
with open('./data/churchill_speech.txt', encoding='utf-8') as f:
    churchill_speech = f.read()

# Splitter producing small overlapping chunks. Sizes are measured in characters
# because length_function is the built-in len.
# The name `text_spliter` (sic) is kept because a later cell references it.
text_spliter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len
)

In [7]:
# Split the speech into document objects; each one exposes its text via
# `.page_content` (used by the cells below).
chunks = text_spliter.create_documents([churchill_speech])

In [10]:
# Print the text of every chunk, one per line, to inspect where the splitter
# placed the chunk boundaries.
for chunk in chunks:
    print(chunk.page_content)

Winston Churchill Speech - We Shall Fight on the Beaches
We Shall Fight on the Beaches
June 4, 1940
June 4, 1940
House of Commons
From the moment that the French defenses at Sedan and on the Meuse were broken at the end of the
second week of May, only a rapid retreat to Amiens and the south could have saved the British and
French Armies who had entered Belgium at the appeal of the Belgian King; but this strategic fact
this strategic fact was
not immediately realized. The French High Command hoped they would be able to close the gap, and
the Armies of the north were under their orders. Moreover, a retirement of this kind would have
involved almost certainly the destruction of the fine Belgian Army of over 20 divisions and the
abandonment of the whole of Belgium. Therefore, when the force and scope of the German
penetration were realized and when a new French Generalissimo, General Weygand, assumed
command in place of General Gamelin, an effort was made by the French and British Armies i

## Embedding cost

In [11]:
def print_embedding_cost(texts, model='text-embedding-3-large', price_per_1k_tokens=0.0004):
    """Print the total token count of ``texts`` and an estimated embedding cost.

    Args:
        texts: Iterable of objects exposing a ``page_content`` string.
        model: Model name passed to ``tiktoken.encoding_for_model`` to select
            the tokenizer. Defaults to the value that was previously
            hard-coded.
        price_per_1k_tokens: Price in USD per 1,000 tokens used for the
            estimate. Defaults to the value that was previously hard-coded.
            NOTE(review): confirm this rate against current pricing for the
            model that is actually used to embed the documents.

    Returns:
        None. The two summary lines are written to standard output.
    """
    # Imported locally so the function stays self-contained within its cell.
    import tiktoken

    enc = tiktoken.encoding_for_model(model)

    # Generator expression: no need to materialise a list just to sum it.
    total_tokens = sum(len(enc.encode(page.page_content)) for page in texts)

    print(f'Total tokens: {total_tokens}')

    # Output label is reproduced exactly as before (including its spelling).
    print(f'Emedding Cost in USD: {total_tokens / 1000 * price_per_1k_tokens:.6f}')

# Report the token count and estimated cost for all chunks of the speech.
print_embedding_cost(chunks)

Total tokens: 4820
Emedding Cost in USD: 0.001928


In [12]:
from langchain.embeddings import OpenAIEmbeddings
# NOTE(review): the name imported on the next line is immediately shadowed by
# the assignment below, so `openai.embeddings` is never used in this cell.
from openai import embeddings

# Embedding client reused by later cells (single-query embedding and the
# vector-store insert). No model is passed, so the library default applies.
embeddings = OpenAIEmbeddings()

In [14]:
# Embed the text of the first chunk to check that the embedding call works.
vector = embeddings.embed_query(chunks[0].page_content)

# Bare expression: the notebook displays the whole embedding (a list of floats).
vector

[-0.044567573656022125,
 -0.0378875395789288,
 -0.0029496059181736843,
 -0.007993097388439452,
 0.015743980127830393,
 0.022589743056483196,
 -0.028581378879143218,
 -0.009650358895679658,
 0.0010493331318405562,
 0.007336567129750246,
 0.007789127035119314,
 0.0327882728296835,
 0.00741305618686828,
 -0.011696439194526596,
 0.006374081108250191,
 -0.005386098268716172,
 0.013168851797818924,
 -0.0024986397385547204,
 0.013589540447814895,
 -0.01096341941305807,
 -0.008171572320709483,
 -0.026847628780446265,
 0.029626728860761727,
 -0.0038658801793480554,
 -0.014456415497163373,
 -0.018523080208145856,
 0.010835938116855967,
 -0.018612315811635728,
 0.00305477831350332,
 -0.014341682144316964,
 0.007081604071684753,
 -0.008560391112316214,
 -0.01650886976768816,
 0.005150257591345508,
 -0.01833185733252013,
 -0.023851811800438823,
 -0.022373024294146077,
 -0.008745239084941521,
 0.02267898052261821,
 -0.012671672693721062,
 0.013615037265848859,
 0.004605273699663786,
 0.0087516130566

## Inserting embeddings into a Pinecone index

In [15]:
import pinecone
from langchain_community.vectorstores import Pinecone

# Pinecone client used by the index management cells below. No arguments are
# passed here; presumably the API key is read from the environment — confirm.
pc = pinecone.Pinecone()

  from tqdm.autonotebook import tqdm


In [24]:
# Start from a clean slate by deleting every index the client lists.
# NOTE(review): this removes ALL indexes returned by pc.list_indexes(), not only
# the one this notebook creates.
existing_indexes = pc.list_indexes().names()
for stale_index in existing_indexes:
    print('Deleting all indexes ...', end='')
    pc.delete_index(stale_index)
    print('Done')

Deleting all indexes ...Done


In [25]:
index_name = 'churchill-speech'

# Create the index only when it is not already present, so re-running the cell
# does not attempt to create a duplicate.
existing_names = pc.list_indexes().names()
if index_name not in existing_names:
    print(f'Creating index: {index_name} ...')
    starter_spec = pinecone.PodSpec(environment='gcp-starter')
    # NOTE(review): dimension must match the length of the vectors produced by
    # the embedding model used for the inserts — confirm 1536 is correct for it.
    pc.create_index(name=index_name, dimension=1536, metric='cosine', spec=starter_spec)
    print('done')

Creating index: churchill-speech ...
done


In [26]:
# Build the vector store from the chunks, using the `embeddings` client and the
# index named by `index_name`.
# NOTE(review): presumably this embeds every chunk and upserts it into the
# index (network calls to both services) — confirm against the library docs.
vector_store = Pinecone.from_documents(
    documents=chunks,
    embedding=embeddings,
    index_name=index_name
)

In [27]:
# Bare expression: display the object's repr to confirm the store was created.
vector_store

<langchain_community.vectorstores.pinecone.Pinecone at 0x7fe9507cea40>

## Asking Questions (Similarity Search)

In [33]:
# Retrieve the chunks most similar to the question directly from the vector
# store (no LLM involved) and print each one followed by a divider line.
query = 'Where we should fight?'
result = vector_store.similarity_search(query)

divider = '-' * 50
for item in result:
    print(item.page_content)
    print(divider)

shall fight on the beaches, we shall fight on the landing grounds, we shall fight in the fields and
--------------------------------------------------
front, now on that, fighting
--------------------------------------------------
end, we shall fight in France, we shall fight on the seas and oceans, we shall fight with growing
--------------------------------------------------
When we consider how much greater would be our advantage in defending the air above this Island
--------------------------------------------------


In [35]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

# Chat model that produces the final answer.
llm = ChatOpenAI(
    model='gpt-3.5-turbo',
    temperature=1,
)

# Retriever over the vector store: similarity search with k=3.
retriever = vector_store.as_retriever(
    search_type='similarity',
    search_kwargs={'k': 3},
)

# Question-answering chain that combines the retriever with the chat model.
chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='stuff',
    retriever=retriever,
)

In [37]:
# Ask the retrieval QA chain a question and print the generated answer.
query = 'Where should we fight?'

answer = chain.run(query)

print(answer)

We shall fight on the beaches, landing grounds, fields, front, in France, on the seas, and oceans.


In [38]:
# Ask a factual question through the same chain and print the answer.
# NOTE(review): the answer may rely on what the retrieved chunks contain or on
# the model's own knowledge — cannot tell which from this cell alone.
query = 'Who was the king of Belgium at that time?'

answer = chain.run(query)

print(answer)

The king of Belgium at that time was King Leopold.


In [40]:
# Each chain.run call here receives only the new query string; nothing from
# the earlier questions is passed along with it.
query = 'What about the French Armies?'

answer = chain.run(query)

print(answer)

The context provided indicates that the French Army was involved in the fighting and held territory during the conflict. Additionally, there is a mention of the French Army advancing across the Somme in great strength.
