In [29]:
# Build a sample vectorDB
from langchain.vectorstores import Chroma
from langchain.document_loaders import WikipediaLoader
# from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter

from langchain_google_genai import GoogleGenerativeAIEmbeddings

gemini_embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001") # type: ignore

In [11]:
# db_connection = Chroma(persist_directory='./mk_ultra',embedding_function=gemini_embeddings)

In [20]:
# db_connection.similarity_search("When was this declassified?")

[Document(metadata={'source': 'https://en.wikipedia.org/wiki/United_States_President%27s_Commission_on_CIA_Activities_within_the_United_States', 'summary': 'The United States President\'s Commission on CIA Activities within the United States was ordained by President Gerald Ford in 1975 to investigate the activities of the Central Intelligence Agency and other intelligence agencies within the United States. The Presidential Commission was led by Vice President Nelson Rockefeller, from whom it gained the nickname the Rockefeller Commission.\nThe commission was created in response to a December 1974 report in The New York Times that the CIA had conducted illegal domestic activities, including experiments on US citizens, during the 1960s. The commission issued a single report in 1975, touching upon certain CIA abuses including mail opening and surveillance of domestic dissident groups. It also publicized Project MKUltra, a CIA mind control research program.\nSeveral weeks later, committee

In [30]:
from langchain_community.document_loaders import TextLoader
documents = TextLoader("./mkultra.wiki.txt").load()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(documents)
db = Chroma.from_documents(texts, gemini_embeddings, persist_directory='./MKUltra2')
db.similarity_search("When was this declassified?")

Created a chunk of size 1024, which is longer than the specified 1000
Created a chunk of size 1565, which is longer than the specified 1000
Created a chunk of size 1048, which is longer than the specified 1000
Created a chunk of size 1425, which is longer than the specified 1000
Created a chunk of size 1001, which is longer than the specified 1000
Created a chunk of size 1380, which is longer than the specified 1000
Created a chunk of size 2028, which is longer than the specified 1000
Created a chunk of size 1421, which is longer than the specified 1000
Created a chunk of size 1665, which is longer than the specified 1000
Created a chunk of size 22832, which is longer than the specified 1000


[Document(metadata={'source': './mkultra.wiki.txt'}, page_content="MKUltra was revealed to the public in 1975 by the Church Committee of the United States Congress and Gerald Ford's United States President's Commission on CIA activities within the United States (the Rockefeller Commission). Investigative efforts were hampered by CIA Director Richard Helms's order that all MKUltra files be destroyed in 1973; the Church Committee and Rockefeller Commission investigations relied on the sworn testimony of direct participants and on the small number of documents that survived Helms's order.[19] In 1977, a Freedom of Information Act request uncovered a cache of 20,000 documents relating to MKUltra, which led to Senate hearings.[11][20] Some surviving information about MKUltra was declassified in 2001.\n\nBackground"),
 Document(metadata={'source': './mkultra.wiki.txt'}, page_content="1977 United States Senate report on MKUltra\nIn 1977, during a hearing held by the Senate Select Committee on

### Contextual Compression

In [31]:
# from langchain_openai import ChatOpenAI
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

In [32]:
# llm = ChatOpenAI(temperature=0)
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature=0) #,convert_system_message_to_human=True,temperature=0)
# llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro") #,convert_system_message_to_human=True,temperature=0)
compressor = LLMChainExtractor.from_llm(llm)

In [33]:
compression_retriever = ContextualCompressionRetriever(base_compressor=compressor, base_retriever=db.as_retriever())
# compression_retriever = ContextualCompressionRetriever(base_compressor=compressor, base_retriever=GoogleGenerativeAIEmbeddings.as_retriever())

In [25]:
# docs = db_connection.similarity_search('When was this declassified?')

In [26]:
# docs[0]

Document(metadata={'source': 'https://en.wikipedia.org/wiki/United_States_President%27s_Commission_on_CIA_Activities_within_the_United_States', 'summary': 'The United States President\'s Commission on CIA Activities within the United States was ordained by President Gerald Ford in 1975 to investigate the activities of the Central Intelligence Agency and other intelligence agencies within the United States. The Presidential Commission was led by Vice President Nelson Rockefeller, from whom it gained the nickname the Rockefeller Commission.\nThe commission was created in response to a December 1974 report in The New York Times that the CIA had conducted illegal domestic activities, including experiments on US citizens, during the 1960s. The commission issued a single report in 1975, touching upon certain CIA abuses including mail opening and surveillance of domestic dissident groups. It also publicized Project MKUltra, a CIA mind control research program.\nSeveral weeks later, committees

In [34]:
compressed_docs = compression_retriever.invoke("When was this declassified?")

In [36]:
print(compressed_docs)

[Document(metadata={'source': './mkultra.wiki.txt'}, page_content='Some surviving information about MKUltra was declassified in 2001.')]


In [37]:
compressed_docs[0].page_content

'Some surviving information about MKUltra was declassified in 2001.'