In [14]:
import os
import logging
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import WikipediaLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.retrievers.multi_query import MultiQueryRetriever

In [3]:
# Setting the environment path and autheticating the OpenAI API

os.environ["OPENAI_API_KEY"] = "Your API key"

In [4]:
# I haven't mentioned the maximum page content, so it will puck everything there about MKUltra.

document_loader  = WikipediaLoader('MKUltra')
data = document_loader.load()
data



  lis = BeautifulSoup(html).find_all('li')


[Document(page_content='Project MKUltra was an illegal human experiments program designed and undertaken by the U.S. Central Intelligence Agency (CIA) to develop procedures and identify drugs that could be used during interrogations to weaken people and force confessions through brainwashing and psychological torture. It began in 1953 and was halted in 1973. MKUltra used numerous methods to manipulate its subjects\' mental states and brain functions, such as the covert administration of high doses of psychoactive drugs (especially LSD) and other chemicals without the subjects\' consent, electroshocks, hypnosis, sensory deprivation, isolation, verbal and sexual abuse, and other forms of torture.\nMKUltra was preceded by Project Artichoke. It was organized through the CIA\'s Office of Scientific Intelligence and coordinated with the United States Army Biological Warfare Laboratories. The program engaged in illegal activities, including the use of U.S. and Canadian citizens as unwitting t

In [5]:
len(data)

24

In [6]:
# Now we have to split the big document into smaller chunks

text_splitter = CharacterTextSplitter(chunk_size = 500)
docs = text_splitter.split_documents(data)
docs

Created a chunk of size 2073, which is longer than the specified 500
Created a chunk of size 1879, which is longer than the specified 500
Created a chunk of size 1409, which is longer than the specified 500
Created a chunk of size 2057, which is longer than the specified 500
Created a chunk of size 1191, which is longer than the specified 500
Created a chunk of size 964, which is longer than the specified 500
Created a chunk of size 526, which is longer than the specified 500
Created a chunk of size 781, which is longer than the specified 500
Created a chunk of size 600, which is longer than the specified 500
Created a chunk of size 1079, which is longer than the specified 500
Created a chunk of size 640, which is longer than the specified 500
Created a chunk of size 955, which is longer than the specified 500
Created a chunk of size 1150, which is longer than the specified 500
Created a chunk of size 2466, which is longer than the specified 500
Created a chunk of size 1951, which is l

[Document(page_content="Project MKUltra was an illegal human experiments program designed and undertaken by the U.S. Central Intelligence Agency (CIA) to develop procedures and identify drugs that could be used during interrogations to weaken people and force confessions through brainwashing and psychological torture. It began in 1953 and was halted in 1973. MKUltra used numerous methods to manipulate its subjects' mental states and brain functions, such as the covert administration of high doses of psychoactive drugs (especially LSD) and other chemicals without the subjects' consent, electroshocks, hypnosis, sensory deprivation, isolation, verbal and sexual abuse, and other forms of torture.\nMKUltra was preceded by Project Artichoke. It was organized through the CIA's Office of Scientific Intelligence and coordinated with the United States Army Biological Warfare Laboratories. The program engaged in illegal activities, including the use of U.S. and Canadian citizens as unwitting test

In [7]:
len(docs)

83

In [8]:
# Embedding the texts and storing it in Chroma

embedding_function = OpenAIEmbeddings()

db = Chroma.from_documents(docs, embedding=embedding_function, persist_directory='./Wiki_MKUltra')
db.persist()

In [10]:
question = "When was this declassified"
chat_model = ChatOpenAI()

In [13]:
# We need to supply the chroma db instance and llm instance to Multiquery retriever
# It is LLM that generate queries that are similar to our question and that suits better with docs in chroma db

retriver_from_llm = MultiQueryRetriever.from_llm(retriever=db.as_retriever(), llm=chat_model)

In [15]:
logging.basicConfig()
logging.getLogger('langchain.retrievers.multi_query').setLevel(logging.INFO)

In [17]:
# So the chat model will retrive the most relevant documents based on the queries (Original query and Genarated queries)
# In the Output displaying part we can see the queries generated by LLM (for this purpose we have used logging functionality)

unique_docs = retriver_from_llm.get_relevant_documents(query=question)

INFO:langchain.retrievers.multi_query:Generated queries: ['1. What is the date of the declassification of this information?', '2. Can you provide the timeline for when this was declassified?', '3. Do you have any information on the specific date when this was declassified?']


In [18]:
unique_docs

[Document(page_content='== Footnotes ==\n\n\n== References ==\nMaret, Susan (2018-02-16). "Murky Projects and Uneven Information Policies: A Case Study of the Psychological Strategy Board and CIA". Secrecy and Society. 1 (2). doi:10.31979/2377-6188.2018.010206. ISSN 2377-6188.\nRonson, Jon (2004). The Men Who Stare at Goats. London: Picador. ISBN 0-330-37547-4. OCLC 56653467.\n\n\n== External links ==\nTranscriptions of Declassified Project Artichoke Documents at the Wayback Machine (archived May 1, 2011)', metadata={'source': 'https://en.wikipedia.org/wiki/Project_Artichoke', 'summary': "Project Artichoke (also referred to as Operation Artichoke) was a project developed and enacted by the United States Central Intelligence Agency (CIA) for the purpose of researching methods of interrogation.\nInitially known as Project Bluebird, Project Artichoke officially arose on August 20, 1951, and was operated by the CIA's Office of Scientific Intelligence. The primary goal of Project Artichoke 

In [20]:
print(unique_docs[0].page_content)

== Footnotes ==


== References ==
Maret, Susan (2018-02-16). "Murky Projects and Uneven Information Policies: A Case Study of the Psychological Strategy Board and CIA". Secrecy and Society. 1 (2). doi:10.31979/2377-6188.2018.010206. ISSN 2377-6188.
Ronson, Jon (2004). The Men Who Stare at Goats. London: Picador. ISBN 0-330-37547-4. OCLC 56653467.


== External links ==
Transcriptions of Declassified Project Artichoke Documents at the Wayback Machine (archived May 1, 2011)


In [21]:
print(unique_docs[1].page_content)

== Background ==
In 1974, a New York Times article was published that accused the CIA of illegal operations committed against US citizens. Authored by Seymour M. Hersh, it documented an intelligence operation against the anti-war movement, as well as "break-ins, wiretapping and the surreptitious inspection of mail" conducted since the 1950s. According to former CIA Official Cord Meyer, these disclosures "Convinced large sections of the American public that the CIA had become a domestic Gestapo and stimulated an overwhelming demand for the wide-ranging congressional investigations that were to follow."
Hersh had been tipped off to the possibility of an "in house operation" by an unidentified member of the CIA in spring of 1974. He embarked on an investigation, speaking to sources that included CIA Chief of Counterintelligence James Angleton. Although he was not aware of its existence, Hersh uncovered much information that had been documented in the "Family Jewels", a report ordered by D