In [1]:
from langchain.document_loaders import HuggingFaceDatasetLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

#### Load in dataset for retrieval

In [2]:
# Specify the dataset name and the column containing the content
dataset_name = "databricks/databricks-dolly-15k"
page_content_column = "context"  # or any other column you're interested in

# Create a loader instance
loader = HuggingFaceDatasetLoader(dataset_name, page_content_column)

# Load the data
data = loader.load()

# Display the first 15 entries
data[:2]



[Document(page_content='"Virgin Australia, the trading name of Virgin Australia Airlines Pty Ltd, is an Australian-based airline. It is the largest airline by fleet size to use the Virgin brand. It commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route. It suddenly found itself as a major airline in Australia\'s domestic market after the collapse of Ansett Australia in September 2001. The airline has since grown to directly serve 32 cities in Australia, from hubs in Brisbane, Melbourne and Sydney."', metadata={'instruction': 'When did Virgin Australia start operating?', 'response': 'Virgin Australia commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route.', 'category': 'closed_qa'}),
 Document(page_content='""', metadata={'instruction': 'Which is a species of fish? Tope or Rope', 'response': 'Tope', 'category': 'classification'})]

#### Chunk the data into docs

In [3]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
docs = text_splitter.split_documents(data)
docs[0]

Document(page_content='"Virgin Australia, the trading name of Virgin Australia Airlines Pty Ltd, is an Australian-based airline. It is the largest airline by fleet size to use the Virgin brand. It commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route. It suddenly found itself as a major airline in Australia\'s domestic market after the collapse of Ansett Australia in September 2001. The airline has since grown to directly serve 32 cities in Australia, from hubs in Brisbane, Melbourne and Sydney."', metadata={'instruction': 'When did Virgin Australia start operating?', 'response': 'Virgin Australia commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route.', 'category': 'closed_qa'})

#### Vectorise the docs

In [4]:
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-l6-v2", 
    model_kwargs={'device':'cpu'},
    encode_kwargs={'normalize_embeddings': False},
)

#### Put the vectors and docs in a vector store

In [5]:
db = FAISS.from_documents(docs, embeddings)

In [6]:
search_docs = db.similarity_search("what is cheesemaking?")
print(search_docs[0])

page_content='"The goal of cheese making is to control the spoiling of milk into cheese. The milk is traditionally from a cow, goat, sheep or buffalo, although, in theory, cheese could be made from the milk of any mammal. Cow\'s milk is most commonly used worldwide. The cheesemaker\'s goal is a consistent product with specific characteristics (appearance, aroma, taste, texture). The process used to make a Camembert will be similar to, but not quite the same as, that used to make Cheddar.\\n\\nSome cheeses may be deliberately left to ferment from naturally airborne spores and bacteria; this approach generally leads to a less consistent product but one that is valuable in a niche market.\\n\\nCulturing\\nCheese is made by bringing milk (possibly pasteurised) in the cheese vat to a temperature required to promote the growth of the bacteria that feed on lactose and thus ferment the lactose into lactic acid. These bacteria in the milk may be wild, as is the case with unpasteurised milk, add

In [7]:
retriever = db.as_retriever(search_kwargs={"k": 4})

In [8]:
print(retriever.get_relevant_documents("What is Cheesemaking?")[0])

page_content='"The goal of cheese making is to control the spoiling of milk into cheese. The milk is traditionally from a cow, goat, sheep or buffalo, although, in theory, cheese could be made from the milk of any mammal. Cow\'s milk is most commonly used worldwide. The cheesemaker\'s goal is a consistent product with specific characteristics (appearance, aroma, taste, texture). The process used to make a Camembert will be similar to, but not quite the same as, that used to make Cheddar.\\n\\nSome cheeses may be deliberately left to ferment from naturally airborne spores and bacteria; this approach generally leads to a less consistent product but one that is valuable in a niche market.\\n\\nCulturing\\nCheese is made by bringing milk (possibly pasteurised) in the cheese vat to a temperature required to promote the growth of the bacteria that feed on lactose and thus ferment the lactose into lactic acid. These bacteria in the milk may be wild, as is the case with unpasteurised milk, add

In [9]:
# model_name = "Intel/dynamic_tinybert"
# max_length = 512
# temperature = 0.1

# question_answerer = pipeline(
#     "question-answering", 
#     model=model_name, 
#     tokenizer=AutoTokenizer.from_pretrained(
#         model_name, 
#         padding=True, 
#         truncation=True, 
#         max_length=max_length
#     ),
#     return_tensors='pt',
#     device_map="cpu",
# )

# llm = HuggingFacePipeline(
#     pipeline=question_answerer,
#     model_kwargs={
#         "temperature": temperature, 
#         "max_length": max_length,
#         "device_map": "cpu",
#     },
# )

In [10]:
# qa = RetrievalQA.from_chain_type(llm=llm, chain_type="refine", retriever=retriever, return_source_documents=False)

In [11]:
# result = qa.invoke({"query": "who is thomas jefferson?"})
# print(result["result"])

#### Define LLM for text generation

The LLM will be used to draft an answer from the supplied prompt and RAG. The LLM can be queried directly without a RAG.

In [15]:
model_name = "mistralai/Mistral-7B-v0.1"
model_kwargs = {
    "do_sample": True,
    "temperature": 0.01,
}

text_generator = pipeline(
    "text-generation",
    model=model_name,
    tokenizer=AutoTokenizer.from_pretrained(
         model_name, 
         padding=True, 
         truncation=True, 
    ),
    device_map="cpu",
    model_kwargs=model_kwargs,
    max_new_tokens=300,
)

llm = HuggingFacePipeline(
    pipeline=text_generator,
)

# llm = HuggingFacePipeline.from_model_id(
#     model_id="mistralai/Mistral-7B-v0.1",
#     task="text-generation",
#     pipeline_kwargs={
#         "temperature": 0, 
#         "max_new_tokens": 300,
#     }
# )

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [16]:
query = """What were the trends in median household income across different states in the United States between 2021 and 2022."""  # Sample question, change to other questions you are interested in.
print(llm.invoke("""What were the trends in median household income across different states in the United States between 2021 and 2022."""))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


What were the trends in median household income across different states in the United States between 2021 and 2022.

## Answer (1)

The data is available here.

The data is in the form of a table, so you can't use `ggplot2` directly. You can use `tidyverse` to convert the table to a data frame.

```
library(tidyverse)

df <- read_table("https://www.census.gov/data/tables/time-series/demo/income-poverty/historical-income-households/cps-historical-income-households.html")

df %>%
  mutate(year = as.numeric(year),
         state = as.factor(state)) %>%
  ggplot(aes(x = year, y = median_income, color = state)) +
  geom_line() +
  labs(x = "Year", y = "Median Household Income")
```

Comment: Thank you so much! I'm still learning R and I'm not sure how to use the data frame. I'm trying to plot the data for each state separately. I'm not sure how to do that.

Comment: @Jessica You can use `facet_wrap` to plot the data for each state separately. See the updated answer.


#### Create a question-answer chain

This will allow for the query to first be passed to the vector database for similarity search, and then the best matching docs in the vector store being folded into the prompt. The prompt will then be passed to the LLM.

In [17]:
prompt_template = """Use the following pieces of context to answer the question at the end. Please follow the following rules:
1. If you don't know the answer, don't try to make up an answer. Just say "I can't find the final answer but you may want to check the following links".
2. If you find the answer, write the answer in a concise way with five sentences maximum.

{context}

Question: {question}

Helpful Answer:
"""

PROMPT = PromptTemplate(
 template=prompt_template, input_variables=["context", "question"]
)

In [18]:
retrievalQA = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": PROMPT}
)

In [19]:
result = retrievalQA.invoke({"query": query})
print(result['result'])

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Use the following pieces of context to answer the question at the end. Please follow the following rules:
1. If you don't know the answer, don't try to make up an answer. Just say "I can't find the final answer but you may want to check the following links".
2. If you find the answer, write the answer in a concise way with five sentences maximum.

earning households are more likely to be dual earner households.\nstock ownership is tilted towards households at higher income and education levels, resulting in disparate investment income.\nHigher income households are disproportionately likely to prosper when economic times are good, and to suffer losses during downturns. More of their income comes from relatively volatile capital income. For example, in 2011 the top 1% of income earners derived 37% of their income from labor, versus 62% for the middle quintile. The top 1% derived 58% of their income from capital as opposed to 4% for the middle quintile. Government transfers represented o