# Generation: Stuffing Documents

In [None]:
# Run the line of code below to check the version of langchain in the current environment.
# Substitute "langchain" with any other package name to check their version.

In [1]:
# Confirm langchain version
!pip show langchain

Name: langchain
Version: 0.3.26
Summary: Building applications with LLMs through composability
Home-page: 
Author: 
Author-email: 
License: MIT
Location: C:\Users\Marcus\anaconda3\envs\langchain_env_py312\Lib\site-packages
Requires: langchain-core, langchain-text-splitters, langsmith, pydantic, PyYAML, requests, SQLAlchemy
Required-by: langchain-community


In [2]:
# Load environment variable
%load_ext dotenv
%dotenv

In [3]:
# Import langchain classes
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnablePassthrough
from langchain_core.runnables import RunnableParallel
from langchain_core.output_parsers import StrOutputParser

In [6]:
# Create vectorstore object
vectorstore = Chroma(persist_directory = "./intro-to-ds-lectures", 
                     embedding_function = OpenAIEmbeddings(model='text-embedding-ada-002'))

In [7]:
# Display quantity of documents
len(vectorstore.get()['documents'])

61

In [8]:
# Create vector store backed retriever by applying the 
# as_retriever method to the vectorstore
retriever = vectorstore.as_retriever(search_type = 'mmr', 
                                     search_kwargs = {'k':3, 
                                                      'lambda_mult':0.7})

In [9]:
# Define prompt template from string template
TEMPLATE = '''
Answer the following question:
{question}

To answer the question, use only the following context:
{context}

At the end of the response, specify the name of the lecture this context is taken from in the format:
Resources: *Lecture Title*
where *Lecture Title* should be substituted with the title of all resource lectures.
'''
# Create instance of prompt template class, apply from template method and pass string as an argument
prompt_template = PromptTemplate.from_template(TEMPLATE)

In [12]:
# Define chat object
chat = ChatOpenAI(model_name = 'gpt-4', 
                  seed = 365,
                  max_tokens = 250)

In [13]:
# Create question varible to store input
question = "What software do data scientists use?"

In [14]:
# Create chain invoking elements
chain = {'context': retriever, 
         'question': RunnablePassthrough()} | prompt_template

In [15]:
# Invoke chain
chain.invoke(question)

StringPromptValue(text="\nAnswer the following question:\nWhat software do data scientists use?\n\nTo answer the question, use only the following context:\n[Document(id='4848e646-4039-41ab-a086-178d73847673', metadata={'Course Title': 'Introduction to Data and Data Science', 'Lecture Title': 'Programming Languages & Software Employed in Data Science - All the Tools You Need'}, page_content='As you can see from the infographic, R, and Python are the two most popular tools across all columns. Their biggest advantage is that they can manipulate data and are integrated within multiple data and data science software platforms. They are not just suitable for mathematical and statistical computations. In other words, R, and Python are adaptable. They can solve a wide variety of business and data-related problems from beginning to the end'), Document(id='8182c318-ff76-4c5f-bf41-95e9645905cb', metadata={'Course Title': 'Introduction to Data and Data Science', 'Lecture Title': 'Programming Langu

In [16]:
# Print to inspect answer
print("\nAnswer the following question:\nWhat software do data scientists use?\n\nTo answer the question, use only the following context:\n[Document(page_content='As you can see from the infographic, R, and Python are the two most popular tools across all columns. Their biggest advantage is that they can manipulate data and are integrated within multiple data and data science software platforms. They are not just suitable for mathematical and statistical computations. In other words, R, and Python are adaptable. They can solve a wide variety of business and data-related problems from beginning to the end', metadata={'Course Title': 'Introduction to Data and Data Science', 'Lecture Title': 'Programming Languages & Software Employed in Data Science - All the Tools You Need'}), Document(page_content='It’s actually a software framework which was designed to address the complexity of big data and its computational intensity. Most notably, Hadoop distributes the computational tasks on multiple computers which is basically the way to handle big data nowadays. Power BI, SaS, Qlik, and especially Tableau are top-notch examples of software designed for business intelligence visualizations', metadata={'Course Title': 'Introduction to Data and Data Science', 'Lecture Title': 'Programming Languages & Software Employed in Data Science - All the Tools You Need'}), Document(page_content='Great! We hope we gave you a good idea about the level of applicability of the most frequently used programming and software tools in the field of data science. Thank you for watching!', metadata={'Course Title': 'Introduction to Data and Data Science', 'Lecture Title': 'Programming Languages & Software Employed in Data Science - All the Tools You Need'})]\n\nAt the end of the response, specify the name of the lecture this context is taken from in the format:\nResources: *Lecture Title*\nwhere *Lecture Title* should be substituted with the title of all resource lectures.\n")


Answer the following question:
What software do data scientists use?

To answer the question, use only the following context:
[Document(page_content='As you can see from the infographic, R, and Python are the two most popular tools across all columns. Their biggest advantage is that they can manipulate data and are integrated within multiple data and data science software platforms. They are not just suitable for mathematical and statistical computations. In other words, R, and Python are adaptable. They can solve a wide variety of business and data-related problems from beginning to the end', metadata={'Course Title': 'Introduction to Data and Data Science', 'Lecture Title': 'Programming Languages & Software Employed in Data Science - All the Tools You Need'}), Document(page_content='It’s actually a software framework which was designed to address the complexity of big data and its computational intensity. Most notably, Hadoop distributes the computational tasks on multiple computers