# LangChain: Q&A over Documents

An example might be a tool that would allow you to query a product catalog for items of interest.

In [1]:
#pip install --upgrade langchain

In [1]:
import os

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

Note: LLM's do not always produce the same results. When executing the code in your notebook, you may get slightly different answers that those in the video.

In [2]:
# account for deprecation of LLM model
import datetime
# Get the current date
current_date = datetime.datetime.now().date()

# Define the date after which the model should be set to "gpt-3.5-turbo"
target_date = datetime.date(2024, 6, 12)

# Set the model variable based on the current date
if current_date > target_date:
    llm_model = "gpt-3.5-turbo"
else:
    llm_model = "gpt-3.5-turbo-0301"

In [8]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import CSVLoader
from langchain.vectorstores import DocArrayInMemorySearch
from IPython.display import display, Markdown
from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings



In [4]:
file = 'titanic.csv'
loader = CSVLoader(file_path=file)

In [5]:
from langchain.indexes import VectorstoreIndexCreator

In [6]:
#pip install docarray

Collecting docarray
  Downloading docarray-0.40.0-py3-none-any.whl (270 kB)
[K     |████████████████████████████████| 270 kB 2.2 MB/s eta 0:00:01
Collecting types-requests>=2.28.11.6
  Downloading types_requests-2.32.0.20241016-py3-none-any.whl (15 kB)
Collecting rich>=13.1.0
  Downloading rich-13.9.4-py3-none-any.whl (242 kB)
[K     |████████████████████████████████| 242 kB 5.0 MB/s eta 0:00:01
Collecting markdown-it-py>=2.2.0
  Downloading markdown_it_py-3.0.0-py3-none-any.whl (87 kB)
[K     |████████████████████████████████| 87 kB 3.0 MB/s eta 0:00:01
Collecting mdurl~=0.1
  Downloading mdurl-0.1.2-py3-none-any.whl (10.0 kB)
Installing collected packages: types-requests, mdurl, markdown-it-py, rich, docarray
Successfully installed docarray-0.40.0 markdown-it-py-3.0.0 mdurl-0.1.2 rich-13.9.4 types-requests-2.32.0.20241016
Note: you may need to restart the kernel to use updated packages.


In [9]:
index = VectorstoreIndexCreator(
    vectorstore_cls=DocArrayInMemorySearch
).from_loaders([loader])

SyntaxError: invalid syntax (1858301364.py, line 3)

In [None]:
query ="Please list all your shirts with sun protection \
in a table in markdown and summarize each one."

In [12]:
import getpass
import os

os.environ["OPENAI_API_KEY"] = getpass.getpass()

from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4o-mini")

**Note**:
- The notebook uses `langchain==0.0.179` and `openai==0.27.7`
- For these library versions, `VectorstoreIndexCreator` uses `text-davinci-003` as the base model, which has been deprecated since 1 January 2024.
- The replacement model, `gpt-3.5-turbo-instruct` will be used instead for the `query`.
- The `response` format might be different than the video because of this replacement model.

In [13]:
llm_replacement_model = OpenAI(temperature=0, 
                               model='gpt-3.5-turbo-instruct')

response = index.query(query, 
                       llm = llm_replacement_model)

NameError: name 'index' is not defined

In [None]:
display(Markdown(response))

## Step By Step

In [14]:
from langchain.document_loaders import CSVLoader
loader = CSVLoader(file_path=file)

In [15]:
docs = loader.load()

In [16]:
docs[0]

Document(metadata={'source': 'titanic.csv', 'row': 0}, page_content='Survived: 0\nPclass: 3\nName: Mr. Owen Harris Braund\nSex: male\nAge: 22\nSiblings/Spouses Aboard: 1\nParents/Children Aboard: 0\nFare: 7.25')

In [17]:
from langchain.embeddings import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()

  embeddings = OpenAIEmbeddings()


In [18]:
embed = embeddings.embed_query("Hi my name is Manuel")

In [19]:
print(len(embed))

1536


In [20]:
print(embed[:5])

[-0.028231139666453162, 0.013487356447522143, -0.013936079762269567, -0.027154201103356188, -0.019846414753737034]


In [21]:
db = DocArrayInMemorySearch.from_documents(
    docs, 
    embeddings
)



In [22]:
import pandas as pd

# Load the CSV file into a DataFrame
df = pd.read_csv(file)

# Display the first few rows of the DataFrame to see the main columns
df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.925
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.05


In [23]:
query = "Please tell me the oldest passenger's name who died in class 1."

In [24]:
docs = db.similarity_search(query)

In [25]:
len(docs)

4

In [26]:
docs[0]

Document(metadata={'source': 'titanic.csv', 'row': 778}, page_content='Survived: 0\nPclass: 1\nName: Mr. Milton Clyde Long\nSex: male\nAge: 29\nSiblings/Spouses Aboard: 0\nParents/Children Aboard: 0\nFare: 30')

In [27]:
retriever = db.as_retriever()

In [28]:
llm = ChatOpenAI(temperature = 0.0, model=llm_model)

In [29]:
qdocs = "".join([docs[i].page_content for i in range(len(docs))])


In [30]:
response = llm.call_as_llm(f"{qdocs} Question: tell me the oldest passenger who died in class 1") 


  response = llm.call_as_llm(f"{qdocs} Question: tell me the oldest passenger who died in class 1")


In [31]:
display(Markdown(response))

The oldest passenger who died in class 1 was Mr. Milton Clyde Long, who was 29 years old.

In [None]:
qa_stuff = RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff", 
    retriever=retriever, 
    verbose=True
)

In [None]:
query =  "Please list all your shirts with sun protection in a table \
in markdown and summarize each one."

In [None]:
response = qa_stuff.run(query)

In [None]:
display(Markdown(response))

In [None]:
response = index.query(query, llm=llm)

In [None]:
index = VectorstoreIndexCreator(
    vectorstore_cls=DocArrayInMemorySearch,
    embedding=embeddings,
).from_loaders([loader])

Reminder: Download your notebook to you local computer to save your work.