# RAG and VectorDBs With JFK Speeches: Part 1 

### 1. Introduction

In [4]:
# LangChain
from langchain_google_community.gcs_file import GCSFileLoader
from langchain_google_community.gcs_directory import GCSDirectoryLoader
from langchain.document_loaders import JSONLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_pinecone import PineconeVectorStore
from langchain.chains.retrieval import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain import hub

# Google Cloud
import os
from google.cloud import storage
from google.oauth2 import service_account
credentials = service_account.Credentials.from_service_account_file('../credentials.json')
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "../credentials.json"


# Pinecone VectorDB
from pinecone import Pinecone
from pinecone import ServerlessSpec

# API Keys
from dotenv import load_dotenv
load_dotenv()


True

### 2. Loading and Embedding Single File

Moving json documents from Google Cloud Storage to Pinecone requires the use of a JSONLoader function that can take in a file path as well as a function to process the metadata about where the speech's name and where it came from

In [6]:
from typing import Dict

def metadata_func(record: Dict[str, str], metadata: Dict[str, str]) -> Dict[str, str]:
    metadata["title"] = record.get("title")
    metadata["source"] = record.get("source")
    metadata["url"] = record.get("url")
    metadata["filename"] = record.get("filename")

    return metadata

    
def load_json(file_path: str, jq_schema: str="."):
    return JSONLoader(
                file_path, 
                jq_schema=jq_schema, 
                text_content=False,
                content_key="text",
                metadata_func=metadata_func
)

An easy example of just loading one file from a bucket can be shown,

In [8]:
loader = GCSFileLoader(project_name=credentials.project_id,
                       bucket="kennedyskis",
                       blob="1st-nixon-kennedy-debate-19600926.json",
                       loader_func=load_json)

Then can load and see the content of the speech/document as text

In [10]:
text = loader.load()
print(text[0].page_content[:1000])


[Text, format, and style are as published in Freedom of Communications: Final Report of the Committee on Commerce, United States Senate..., Part III: The Joint Appearances of Senator John F. Kennedy and Vice President Richard M. Nixon and Other 1960 Campaign Presentations. 87th Congress, 1st Session, Senate Report No. 994, Part 3. Washington: U.S. Government Printing Office, 1961.]
Monday, September 26, 1960
Originating CBS, Chicago, Ill., All Networks carried.
Moderator, Howard K. Smith.
MR. SMITH: Good evening.
The television and radio stations of the United States and their affiliated stations are proud to provide facilities for a discussion of issues in the current political campaign by the two major candidates for the presidency.
The candidates need no introduction. The Republican candidate, Vice President Richard M. Nixon, and the Democratic candidate, Senator John F. Kennedy.
According to rules set by the candidates themselves, each man shall make an opening statement of approx

Also see the metadata that was processed,

In [92]:
text[0].metadata

{'source': 'gs://kennedyskis/1st-nixon-kennedy-debate-19600926.json',
 'seq_num': 1,
 'title': 'Senator John F. Kennedy and Vice President Richard M. Nixon First Joint Radio-Television Broadcast, September 26, 1960',
 'url': 'https://www.jfklibrary.org//archives/other-resources/john-f-kennedy-speeches/1st-nixon-kennedy-debate-19600926',
 'filename': '1st-nixon-kennedy-debate-19600926'}

Now want to chuck out the text into "documents"

In [13]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=100)

documents = text_splitter.split_documents(text)

print("Number of documents: ", len(documents))

Number of documents:  500


Now can look at the documents,

In [16]:
for n, doc in enumerate(documents[:3]):
    print(f"Doc {n}: ", doc.page_content, "\n", "\tMetadata:", doc.metadata, "\n")

Doc 0:  [Text, format, and style are as published in Freedom of Communications: Final Report of the Committee on Commerce, United States Senate..., Part III: The Joint Appearances of Senator John F. Kennedy 
 	Metadata: {'source': 'gs://kennedyskis/1st-nixon-kennedy-debate-19600926.json', 'seq_num': 1, 'title': 'Senator John F. Kennedy and Vice President Richard M. Nixon First Joint Radio-Television Broadcast, September 26, 1960', 'url': 'https://www.jfklibrary.org//archives/other-resources/john-f-kennedy-speeches/1st-nixon-kennedy-debate-19600926', 'filename': '1st-nixon-kennedy-debate-19600926'} 

Doc 1:  on Commerce, United States Senate..., Part III: The Joint Appearances of Senator John F. Kennedy and Vice President Richard M. Nixon and Other 1960 Campaign Presentations. 87th Congress, 1st Session, 
 	Metadata: {'source': 'gs://kennedyskis/1st-nixon-kennedy-debate-19600926.json', 'seq_num': 1, 'title': 'Senator John F. Kennedy and Vice President Richard M. Nixon First Joint Radio-

In [28]:
embedding = OpenAIEmbeddings()# t")


In [39]:
embedding.model

'text-embedding-ada-002'

In [31]:
query = embedding.embed_query(documents[0].page_content)

In [38]:
print("Vector size:", len(query))
print("First 5 entries in embedded document:", query[:5])

Vector size: 1536
First 5 entries in embedded document: [-0.012023020535707474, 0.0033119581639766693, -0.005604343023151159, -0.03061368130147457, 0.013492794707417488]


### 3. Ingesting All The Speeches Into Pinecone Vector Database

Now load all of the speeches using the [GCSDirectoryLoader]() and split using the [lazy_load]()

In [57]:
loader = GCSDirectoryLoader(
                project_name=credentials.project_id,
                bucket="prezkennedyspeches",
                loader_func=load_json
)

text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)


Now get the bucket and count the number of speeches in it

In [58]:
client = storage.Client(project=credentials.project_id,
                        credentials=credentials)

bucket = client.get_bucket("prezkennedyspeches")

speeches = [blob.name for blob in bucket.list_blobs()]
print(f"JFK had {len(speeches)} speeches in his presidency.")

JFK had 22 speeches in his presidency.


The speeches were:

In [59]:
speeches

['american-newspaper-publishers-association-19610427.json',
 'american-society-of-newspaper-editors-19610420.json',
 'american-university-19630610.json',
 'americas-cup-dinner-19620914.json',
 'berlin-crisis-19610725.json',
 'berlin-w-germany-rudolph-wilde-platz-19630626.json',
 'civil-rights-radio-and-television-report-19630611.json',
 'cuba-radio-and-television-report-19621022.json',
 'inaugural-address-19610120.json',
 'inaugural-anniversary-19620120.json',
 'irish-parliament-19630628.json',
 'latin-american-diplomats-washington-dc-19610313.json',
 'massachusetts-general-court-19610109.json',
 'peace-corps-establishment-19610301.json',
 'philadelphia-pa-19620704.json',
 'rice-university-19620912.json',
 'united-nations-19610925.json',
 'united-states-congress-special-message-19610525.json',
 'university-of-california-berkeley-19620323.json',
 'university-of-mississippi-19620930.json',
 'vanderbilt-university-19630518.json',
 'yale-university-19620611.json']

Now load them all and split them into documents

In [60]:
documents = loader.load_and_split(text_splitter)
print(f"There are {len(documents)} documents")

There are 180 documents


Create the index using the Pinecone API to load them into documents, this is basically a collection of embedded documents:

In [61]:
# delete the index if it exists
if pc.has_index(index_name):
    pc.delete_index(index_name)

Create the connection and list out the indices

In [62]:
index_name = "prez-speeches"
dim = 1536

pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
pc.list_indexes()

[]

Now create the index:

In [63]:
# delete the index if it exists
if pc.has_index(index_name):
    pc.delete_index(index_name)

# create the index
pc.create_index(
        name=index_name,
        dimension=dim,
        metric="cosine",
        spec=ServerlessSpec(
                  cloud="aws",
                  region="us-east-1"
        )
)

Relist the available indices

In [64]:
pc.list_indexes()

[
    {
        "name": "prez-speeches",
        "dimension": 1536,
        "metric": "cosine",
        "host": "prez-speeches-2307pwa.svc.aped-4627-b74a.pinecone.io",
        "spec": {
            "serverless": {
                "cloud": "aws",
                "region": "us-east-1"
            }
        },
        "status": {
            "ready": true,
            "state": "Ready"
        },
        "deletion_protection": "disabled"
    }
]

Get the statistics on the index

In [65]:
print(pc.Index(index_name).describe_index_stats())

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}


Create the initial connection to the Vector database:

In [67]:
vectordb = PineconeVectorStore(
                    pinecone_api_key=os.getenv("PINECONE_API_KEY"),
                    embedding=embedding,
                    index_name=index_name
)

Now load the documents into the index:

In [68]:
vectordb = vectordb.from_documents(
                            documents=documents, 
                            embedding=embedding, 
                            index_name=index_name
)

Now get the stats on the index again:

In [69]:
pc.describe_index(index_name)

{
    "name": "prez-speeches",
    "dimension": 1536,
    "metric": "cosine",
    "host": "prez-speeches-2307pwa.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "deletion_protection": "disabled"
}

In [71]:
print(pc.Index(index_name).describe_index_stats())

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}


In [72]:
index = pc.Index(index_name)

In [73]:
question = "How did Kennedy feel about the Berlin Wall?"

In [74]:
matches = index.query(vector=query, top_k=5)

In [76]:
matches

{'matches': [{'id': 'a48ee926-4c6c-4614-aebf-bc8ea77d9cd3',
              'score': 0.839695573,
              'values': []},
             {'id': '3150a237-c987-4332-adf3-92282f838222',
              'score': 0.839501321,
              'values': []},
             {'id': '2fd52eb5-397a-42f1-a8f3-f2e04471dc20',
              'score': 0.835366726,
              'values': []},
             {'id': '05385d3c-bf94-4f70-84e0-54b56254915e',
              'score': 0.829040587,
              'values': []},
             {'id': 'f3f7962b-790e-402a-9827-0234de0b28ef',
              'score': 0.827951968,
              'values': []}],
 'namespace': '',
 'usage': {'read_units': 5}}

In [77]:
id = matches["matches"][0].get('id')

In [78]:
id

'a48ee926-4c6c-4614-aebf-bc8ea77d9cd3'

In [79]:
index.fetch(id)

{'namespace': '', 'usage': {'read_units': 4}, 'vectors': {}}

In [80]:
results = vectordb.search(query=question, search_type="similarity")

In [81]:
for doc in results:
    print()

 Document(id='7dc20458-f082-490f-ae4f-032b36123f57', metadata={'filename': 'berlin-w-germany-rudolph-wilde-platz-19630626', 'seq_num': 1.0, 'source': 'gs://prezkennedyspeches/berlin-w-germany-rudolph-wilde-platz-19630626.json', 'title': 'Remarks of President John F. Kennedy at the Rudolph Wilde Platz, Berlin, June 26, 1963', 'url': 'https://www.jfklibrary.org//archives/other-resources/john-f-kennedy-speeches/berlin-w-germany-rudolph-wilde-platz-19630626'}, page_content='Listen to speech. \xa0\xa0 View related documents. \nPresident John F. Kennedy\nWest Berlin\nJune 26, 1963\n[This version is published in the Public Papers of the Presidents: John F. Kennedy, 1963. Both the text and the audio versions omit the words of the German translator. The audio file was edited by the White House Signal Agency (WHSA) shortly after the speech was recorded. The WHSA was charged with recording only the words of the President. The Kennedy Library has an audiotape of a network broadcast of the full spe

In [181]:
template = hub.pull("langchain-ai/retrieval-qa-chat")

In [212]:
print(template.messages[0].prompt)

input_variables=['context'] input_types={} partial_variables={} template='Answer any use questions based solely on the context below:\n\n<context>\n{context}\n</context>'


In [214]:
print(template.messages[1])

variable_name='chat_history' optional=True


In [211]:
print(template.messages[2].prompt)

input_variables=['input'] input_types={} partial_variables={} template='{input}'


In [215]:
print(template.input_variables)

['context', 'input']


In [182]:
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

In [183]:
stuff_documents_chain = create_stuff_documents_chain(llm, template)

In [184]:
rag_chain = create_retrieval_chain(
                    vectordb.as_retriever(), 
                    stuff_documents_chain
)

In [217]:
rag_chain.get_prompts()

[PromptTemplate(input_variables=['page_content'], input_types={}, partial_variables={}, template='{page_content}'),
 ChatPromptTemplate(input_variables=['context', 'input'], optional_variables=['chat_history'], input_types={'chat_history': list[typing.Annotated[typing.Union[typing.Annotated[langchain_core.messages.ai.AIMessage, Tag(tag='ai')], typing.Annotated[langchain_core.messages.human.HumanMessage, Tag(tag='human')], typing.Annotated[langchain_core.messages.chat.ChatMessage, Tag(tag='chat')], typing.Annotated[langchain_core.messages.system.SystemMessage, Tag(tag='system')], typing.Annotated[langchain_core.messages.function.FunctionMessage, Tag(tag='function')], typing.Annotated[langchain_core.messages.tool.ToolMessage, Tag(tag='tool')], typing.Annotated[langchain_core.messages.ai.AIMessageChunk, Tag(tag='AIMessageChunk')], typing.Annotated[langchain_core.messages.human.HumanMessageChunk, Tag(tag='HumanMessageChunk')], typing.Annotated[langchain_core.messages.chat.ChatMessageChunk,

In [218]:
response = rag_chain.invoke({"input": question})

In [219]:
print(response["answer"])

Kennedy viewed the Berlin Wall as a vivid demonstration of the failures of the Communist system and an offense against humanity. He expressed no satisfaction in its existence, recognizing it as a barrier that separated families and divided people who wished to be united. He emphasized the vitality, hope, and determination of the people of West Berlin despite the wall's presence.


In [234]:
response

{'input': 'How did Kennedy feel about the Berlin Wall?',
 'context': [Document(id='6b164251-8eec-4fd8-8ca9-00b94671d288', metadata={'filename': 'berlin-w-germany-rudolph-wilde-platz-19630626', 'seq_num': 1.0, 'source': 'gs://prezkennedyspeches/berlin-w-germany-rudolph-wilde-platz-19630626.json', 'title': 'Remarks of President John F. Kennedy at the Rudolph Wilde Platz, Berlin, June 26, 1963', 'url': 'https://www.jfklibrary.org//archives/other-resources/john-f-kennedy-speeches/berlin-w-germany-rudolph-wilde-platz-19630626'}, page_content='Freedom has many difficulties and democracy is not perfect, but we have never had to put a wall up to keep our people in, to prevent them from leaving us. I want to say, on behalf of my countrymen, who live many miles away on the other side of the Atlantic, who are far distant from you, that they take the greatest pride that they have been able to share with you, even from a distance, the story of the last 18 years. I know of no town, no city, that has

In [232]:
references = [(doc.page_content,doc.metadata["url"]) 
              for doc in response['context']]

In [233]:
references

[('Freedom has many difficulties and democracy is not perfect, but we have never had to put a wall up to keep our people in, to prevent them from leaving us. I want to say, on behalf of my countrymen, who live many miles away on the other side of the Atlantic, who are far distant from you, that they take the greatest pride that they have been able to share with you, even from a distance, the story of the last 18 years. I know of no town, no city, that has been besieged for 18 years that still lives with the vitality and the force, and the hope and the determination of the city of West Berlin. While the wall is the most obvious and vivid demonstration of the failures of the Communist system, for all the world to see, we take no satisfaction in it, for it is, as your Mayor has said, an offense not only against history but an offense against humanity, separating families, dividing husbands and wives and brothers and sisters, and dividing a people who wish to be joined together.',
  'https