# Installations, imports, utils

In [1]:
%%time
from IPython.display import clear_output

! pip install -qq -U pypdf

! pip install -qq -U transformers
! pip install -qq -U accelerate
! pip install -qq -U bitsandbytes

! pip install sentence_transformers==2.2.2
! pip install -qq -U langchain
! pip install chromadb==0.4.12

clear_output()

CPU times: user 847 ms, sys: 131 ms, total: 977 ms
Wall time: 2min 22s


In [2]:
%%time

import warnings
warnings.filterwarnings("ignore")

import sys
from time import time

from torch import cuda, bfloat16
import torch
import accelerate
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline



from langchain.llms import HuggingFacePipeline
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain import PromptTemplate, LLMChain
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser



clear_output()

CPU times: user 6.8 s, sys: 1.07 s, total: 7.86 s
Wall time: 12.5 s


# Initialize model, tokenizer, query pipeline

In [3]:
# device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
model_id = 'daryl149/llama-2-7b-chat-hf'
# model_id = "TheBloke/Chinese-Llama-2-7B-GPTQ"
# FlagAlpha/Llama2-Chinese-13b-Chat-4bit

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)

# set quantization configuration to load large model with less GPU memory
# this requires the `bitsandbytes` library
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)


model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config = bnb_config,
    device_map = 'auto',
    low_cpu_mem_usage = True,
    trust_remote_code = True,
)

# model.to(device)

tokenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/507 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [6]:
### hugging face pipeline
pipe = pipeline(
    task = "text-generation",
    model = model,
    tokenizer = tokenizer,
    pad_token_id = tokenizer.eos_token_id,
    # torch_dtype=torch.float16,
    device_map="auto",
    # max_length = 2048,
    max_new_tokens = 500,
    temperature = 0,
    top_p = 0.95,
    repetition_penalty = 1.15,
)

### langchain pipeline
llm = HuggingFacePipeline(pipeline = pipe)
llm

HuggingFacePipeline(pipeline=<transformers.pipelines.text_generation.TextGenerationPipeline object at 0x7f92c276b850>)

In [7]:

### testing model
query = "Please explain what is the Geverative AI. Give just a definition. Keep it in 100 words."
llm.invoke(query)

'Please explain what is the Geverative AI. Give just a definition. Keep it in 100 words.\nGeverative AI refers to the use of artificial intelligence (AI) techniques to generate new and original content, such as images, videos, music, or text. This can involve training machine learning models on large datasets of existing content, then using these models to create new content that resembles the originals but is not simply a copy. Geverative AI has applications in areas like entertainment, advertising, and media production.'

# RAG without chat history

## Loader

Ingestion of data using Text loder

In [8]:
# loader = TextLoader("/president-bidens-state-of-the-union-2023/biden-sotu-2023-planned-official.txt",
#                     encoding="utf8")
# documents = loader.load()

from langchain_community.document_loaders import PyPDFLoader

# file_path = "MachineLearning-Lecture01.pdf"
loader = PyPDFLoader("/content/MachineLearning-Lecture01.pdf")
docs = loader.load()

len(docs)


22

In [9]:
len(docs[0].page_content)

3131

In [10]:
print(docs[0].page_content[:500])
print(docs[0].metadata)


MachineLearning-Lecture01  
Instructor (Andrew Ng):  Okay. Good morning. Welcome to CS229, the machine 
learning class. So what I wanna do today is ju st spend a little time going over the logistics 
of the class, and then we'll start to  talk a bit about machine learning.  
By way of introduction, my name's  Andrew Ng and I'll be instru ctor for this class. And so 
I personally work in machine learning, and I' ve worked on it for about 15 years now, and 
I actually think that machine learning i
{'source': '/content/MachineLearning-Lecture01.pdf', 'page': 0}



## Split data in chunks

We split data in chunks using a recursive character text splitter.


In [11]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

print(f'We have created {len(splits)} chunks from {len(docs)} pages')

We have created 82 chunks from 22 pages


In [12]:
len(splits)


82

In [13]:
len(splits[0].page_content)

985


## Creating Embeddings and Storing in Vector Store¶

Create the embeddings using Sentence Transformer and HuggingFace embeddings.


In [14]:
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {"device": "cuda"}

embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

clear_output()

In [15]:
vectordb = Chroma.from_documents(documents=splits, embedding=embeddings, persist_directory="docs/chroma/")
print(vectordb._collection.count())

82


In [16]:
### test if vector DB was loaded correctly
question = "Is probability a class topic?"
docs = vectordb.similarity_search(question, k=2)

print(docs[0].page_content)

sections will be taught by the TAs, and atte ndance at discussion sections is optional, 
although they'll also be recorded and televi sed. And we'll use the discussion sections 
mainly for two things. For the next two or th ree weeks, we'll use the discussion sections 
to go over the prerequisites to this class or if some of you haven't seen probability or 
statistics for a while or maybe algebra, we'll go over those in the discussion sections as a 
refresher for those of you that want one.  
Later in this quarter, we'll also use the disc ussion sections to go over extensions for the 
material that I'm teaching in the main lectur es. So machine learning is a huge field, and 
there are a few extensions that we really want  to teach but didn't have time in the main 
lectures for.


In [17]:
docs = vectordb.max_marginal_relevance_search(question, k=2, fetch_k=3) # MMR Search
print(docs[0].page_content)

sections will be taught by the TAs, and atte ndance at discussion sections is optional, 
although they'll also be recorded and televi sed. And we'll use the discussion sections 
mainly for two things. For the next two or th ree weeks, we'll use the discussion sections 
to go over the prerequisites to this class or if some of you haven't seen probability or 
statistics for a while or maybe algebra, we'll go over those in the discussion sections as a 
refresher for those of you that want one.  
Later in this quarter, we'll also use the disc ussion sections to go over extensions for the 
material that I'm teaching in the main lectur es. So machine learning is a huge field, and 
there are a few extensions that we really want  to teach but didn't have time in the main 
lectures for.


In [18]:
retriever = vectordb.as_retriever() #search_type = "mmr"

## Prompt Template

In [27]:

# Build prompt
template = """Use the following pieces of context to answer the question at the end. \
If you don't know the answer, just say that you don't know, don't try to make up an answer. \
Use three sentences maximum and keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer.\
Use only the following pieces of context to answer the question at the end.
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)



## RetrievalQA Chain

In [40]:
# Option 1
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff", # map_reduce, map_rerank, stuff, refine
    retriever=retriever,
    return_source_documents = True,
    chain_type_kwargs = {"prompt": QA_CHAIN_PROMPT},
    verbose=False
)


question = "Is probability a class topic?"
response = qa_chain({"query": question})
response

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


{'query': 'Is probability a class topic?',
 'result': 'Use the following pieces of context to answer the question at the end. If you don\'t know the answer, just say that you don\'t know, don\'t try to make up an answer. Use three sentences maximum and keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer.Use only the following pieces of context to answer the question at the end.  and don\'t show the content of context and question,just show the answer\nsections will be taught by the TAs, and atte ndance at discussion sections is optional, \nalthough they\'ll also be recorded and televi sed. And we\'ll use the discussion sections \nmainly for two things. For the next two or th ree weeks, we\'ll use the discussion sections \nto go over the prerequisites to this class or if some of you haven\'t seen probability or \nstatistics for a while or maybe algebra, we\'ll go over those in the discussion sections as a \nrefresher for those of you that wan

In [47]:
# print(response["result"])
response["result"].split("\n")[-1].split("Helpful Answer:")[-1].strip()

'Thanks for asking! Probability is indeed a class topic, and we will cover many fundamental concepts in probability theory throughout the quarter.'

In [42]:
print(response["source_documents"][0])

page_content="sections will be taught by the TAs, and atte ndance at discussion sections is optional, \nalthough they'll also be recorded and televi sed. And we'll use the discussion sections \nmainly for two things. For the next two or th ree weeks, we'll use the discussion sections \nto go over the prerequisites to this class or if some of you haven't seen probability or \nstatistics for a while or maybe algebra, we'll go over those in the discussion sections as a \nrefresher for those of you that want one.  \nLater in this quarter, we'll also use the disc ussion sections to go over extensions for the \nmaterial that I'm teaching in the main lectur es. So machine learning is a huge field, and \nthere are a few extensions that we really want  to teach but didn't have time in the main \nlectures for." metadata={'page': 8, 'source': '/content/MachineLearning-Lecture01.pdf'}


## Post-Processing

In [23]:
import os
import glob
import textwrap

def wrap_text_preserve_newlines(text, width=700):
    # Split the input text into lines based on newline characters
    lines = text.split('\n')

    # Wrap each line individually
    wrapped_lines = [textwrap.fill(line, width=width) for line in lines]

    # Join the wrapped lines back together using newline characters
    wrapped_text = '\n'.join(wrapped_lines)

    return wrapped_text


def process_llm_response(llm_response):
    ans = wrap_text_preserve_newlines(llm_response['result'])

    sources_used = ' \n'.join(
        [
            source.metadata['source'].split('/')[-1][:-4]
            + ' - page: '
            + str(source.metadata['page'])
            for source in llm_response['source_documents']
        ]
    )

    ans = ans + '\n\nSources: \n' + sources_used
    return ans

In [24]:
def llm_ans(query):

    llm_response = qa_chain.invoke(query)
    ans = process_llm_response(llm_response)

    return ans

In [43]:


query = "Is probability a class topic?"
print(llm_ans(query))


Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum and keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer.Use only the following pieces of context to answer the question at the end.  and don't show the content of context and question,just show the answer
sections will be taught by the TAs, and atte ndance at discussion sections is optional,
although they'll also be recorded and televi sed. And we'll use the discussion sections
mainly for two things. For the next two or th ree weeks, we'll use the discussion sections
to go over the prerequisites to this class or if some of you haven't seen probability or
statistics for a while or maybe algebra, we'll go over those in the discussion sections as a
refresher for those of you that want one.
Later in this quarter, we'll also use the disc ussion sections to go

In [28]:
# Option 2 - LCEL
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | QA_CHAIN_PROMPT
    | llm
    | StrOutputParser()
)

print(rag_chain.invoke("Is probability a class topic?"))

Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum and keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer.Use only the following pieces of context to answer the question at the end.  and don't show the content of context and question,just show the answer
sections will be taught by the TAs, and atte ndance at discussion sections is optional, 
although they'll also be recorded and televi sed. And we'll use the discussion sections 
mainly for two things. For the next two or th ree weeks, we'll use the discussion sections 
to go over the prerequisites to this class or if some of you haven't seen probability or 
statistics for a while or maybe algebra, we'll go over those in the discussion sections as a 
refresher for those of you that want one.  
Later in this quarter, we'll also use the disc ussion section

In [29]:
print(rag_chain.invoke("why are those prerequesites needed?"))

Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum and keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer.Use only the following pieces of context to answer the question at the end.  and don't show the content of context and question,just show the answer
Student : [Inaudible] what language [inaudible]?  
Instructor (Andrew Ng): So let's see. There is no C programming in this class other 
than any that you may choose to do yourself in your project. So all the homeworks can be 
done in MATLAB or Octave, and let's see. A nd I guess the program prerequisites is more 
the ability to understand big?O notation and know ledge of what a data structure, like a 
linked list or a queue or bina ry treatments, more so than  your knowledge of C or Java 
specifically. Yeah?  
Student : Looking at the end semester proje

# Add Chat History

In many Q&A applications we want to allow the user to have a back-and-forth conversation, meaning the application needs some sort of "memory" of past questions and answers, and some logic for incorporating those into its current thinking.

## Memory

In [30]:
from langchain.memory import ConversationBufferMemory
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True
)

## ConversationalRetrievalChain

In [31]:
from langchain.chains import ConversationalRetrievalChain
retriever=vectordb.as_retriever()
conversation = ConversationalRetrievalChain.from_llm(
    llm,
    retriever=retriever,
    memory=memory
)

In [44]:
question = "Is probability a class topic?"
result = conversation({"question": question})

print(result['answer'])

Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

sections will be taught by the TAs, and atte ndance at discussion sections is optional, 
although they'll also be recorded and televi sed. And we'll use the discussion sections 
mainly for two things. For the next two or th ree weeks, we'll use the discussion sections 
to go over the prerequisites to this class or if some of you haven't seen probability or 
statistics for a while or maybe algebra, we'll go over those in the discussion sections as a 
refresher for those of you that want one.  
Later in this quarter, we'll also use the disc ussion sections to go over extensions for the 
material that I'm teaching in the main lectur es. So machine learning is a huge field, and 
there are a few extensions that we really want  to teach but didn't have time in the main 
lectures for.

So later this quarter, we'll use the discussio

In [45]:
question = "why are those prerequesites needed?"
result = conversation({"question": question})

print(result['answer'])

Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

sections will be taught by the TAs, and atte ndance at discussion sections is optional, 
although they'll also be recorded and televi sed. And we'll use the discussion sections 
mainly for two things. For the next two or th ree weeks, we'll use the discussion sections 
to go over the prerequisites to this class or if some of you haven't seen probability or 
statistics for a while or maybe algebra, we'll go over those in the discussion sections as a 
refresher for those of you that want one.  
Later in this quarter, we'll also use the disc ussion sections to go over extensions for the 
material that I'm teaching in the main lectur es. So machine learning is a huge field, and 
there are a few extensions that we really want  to teach but didn't have time in the main 
lectures for.

So later this quarter, we'll use the discussio

## End to End

In [None]:

from langchain_core.prompts import (
    ChatPromptTemplate,
    MessagesPlaceholder,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.chains import LLMChain
from langchain.memory import ConversationBufferMemory


llm = HuggingFacePipeline(pipeline = pipe)
prompt = ChatPromptTemplate(
    messages=[
        SystemMessagePromptTemplate.from_template(
            "You are a nice chatbot having a conversation with a human."
        ),
        # The `variable_name` here is what must align with memory
        MessagesPlaceholder(variable_name="chat_history"),
        HumanMessagePromptTemplate.from_template("{question}")
    ]
)
# Notice that we `return_messages=True` to fit into the MessagesPlaceholder
# Notice that `"chat_history"` aligns with the MessagesPlaceholder name.
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
conversation = LLMChain(
    llm=llm,
    prompt=prompt,
    verbose=True,
    memory=memory
)

# Notice that we just pass in the `question` variables - `chat_history` gets populated by memory
conversation({"question": "hi"})

## Wrap all into a Function

In [None]:
embedding_model_name = "sentence-transformers/all-mpnet-base-v2"

def load_db(file, embedding_model_name, chain_type, k, llm):
    # load documents
    loader = PyPDFLoader(file)
    documents = loader.load()
    # split documents
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    docs = text_splitter.split_documents(documents)
    # define embedding
    embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name, model_kwargs={"device": "cuda"})
    # create vector database from data
    vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings, persist_directory="docs/chroma/" )
    # define retriever
    retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": k})
    # define memory
    memory = ConversationBufferMemory(memory_key="chat_history",return_messages=True)

    # create a chatbot chain.
    qa = ConversationalRetrievalChain.from_llm(
        llm=llm,
        chain_type=chain_type,
        retriever=retriever,
        memory=memory,
        return_source_documents=True,
        return_generated_question=True,
    )
    return qa


qa = load_db(file_path,embedding_model_name, "stuff", 3, llm)

# Optional : LCEL

In [32]:

from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
# from langchain_chroma import Chroma
from langchain_community.chat_message_histories import ChatMessageHistory
# from langchain_community.document_loaders import WebBaseLoader
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_text_splitters import RecursiveCharacterTextSplitter





In [33]:
### Construct retriever ###
loader = PyPDFLoader("/content/MachineLearning-Lecture01.pdf")
documents = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)
vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings, persist_directory="docs/chroma/" )
retriever = vectorstore.as_retriever(search_type = "mmr")

In [34]:
### Contextualize question ###

from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

contextualize_q_system_prompt = """Given a chat history and the latest user question \
which might reference context in the chat history, formulate a standalone question \
which can be understood without the chat history. Do NOT answer the question, \
just reformulate it if needed and otherwise return it as is."""
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
history_aware_retriever = create_history_aware_retriever(
    llm, retriever, contextualize_q_prompt
)



In [35]:
### Answer question ###

from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

qa_system_prompt = """You are an assistant for question-answering tasks. \
Use the following pieces of retrieved context to answer the question. \
If you don't know the answer, just say that you don't know. \
Use three sentences maximum and keep the answer concise.\

{context}"""
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)


question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)

rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

In [36]:
### Statefully manage chat history ###
store = {}


def get_session_history(session_id: str) -> BaseChatMessageHistory:
    if session_id not in store:
        store[session_id] = ChatMessageHistory()
    return store[session_id]


conversational_rag_chain = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,
    input_messages_key="input",
    history_messages_key="chat_history",
    output_messages_key="answer",
)

In [37]:
conversational_rag_chain.invoke(
    {"input": "Is probability a class topic?"},
    config={
        "configurable": {"session_id": "abc123"}
    },  # constructs a key "abc123" in `store`.
)["answer"]



"System: You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nsections will be taught by the TAs, and atte ndance at discussion sections is optional, \nalthough they'll also be recorded and televi sed. And we'll use the discussion sections \nmainly for two things. For the next two or th ree weeks, we'll use the discussion sections \nto go over the prerequisites to this class or if some of you haven't seen probability or \nstatistics for a while or maybe algebra, we'll go over those in the discussion sections as a \nrefresher for those of you that want one.  \nLater in this quarter, we'll also use the disc ussion sections to go over extensions for the \nmaterial that I'm teaching in the main lectur es. So machine learning is a huge field, and \nthere are a few extensions that we really want  to teach but

In [38]:
conversational_rag_chain.invoke(
    {"input": "why are those prerequesites needed?"},
    config={"configurable": {"session_id": "abc123"}},
)["answer"]

This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (2048). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.


'System: You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don\'t know the answer, just say that you don\'t know. Use three sentences maximum and keep the answer concise.\nsections will be taught by the TAs, and atte ndance at discussion sections is optional, \nalthough they\'ll also be recorded and televi sed. And we\'ll use the discussion sections \nmainly for two things. For the next two or th ree weeks, we\'ll use the discussion sections \nto go over the prerequisites to this class or if some of you haven\'t seen probability or \nstatistics for a while or maybe algebra, we\'ll go over those in the discussion sections as a \nrefresher for those of you that want one.  \nLater in this quarter, we\'ll also use the disc ussion sections to go over extensions for the \nmaterial that I\'m teaching in the main lectur es. So machine learning is a huge field, and \nthere are a few extensions that we really want  to 