In [1]:
%load_ext autoreload
%autoreload 2

### Necessary imports

In [2]:
!pip install -q -U torch datasets transformers
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 trl==0.4.7

### Dependencies

In [1]:
import os
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline
)
from datasets import load_dataset
from peft import LoraConfig, PeftModel

from langchain.text_splitter import CharacterTextSplitter
from langchain.document_transformers import Html2TextTransformer
from langchain.document_loaders import AsyncChromiumLoader

from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.llms import HuggingFacePipeline
from langchain.chains import LLMChain

2023-12-30 20:17:10.097815: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-12-30 20:17:10.151997: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-30 20:17:10.152038: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-30 20:17:10.153422: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-30 20:17:10.162219: I tensorflow/core/platform/cpu_feature_guar

### Load quantized Mistal 7B

In [2]:
#################################################################
# Tokenizer
#################################################################

model_name='mistralai/Mistral-7B-Instruct-v0.1'

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

#################################################################
# bitsandbytes parameters
#################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

#################################################################
# Set up quantization config
#################################################################
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

#################################################################
# Load pre-trained config
#################################################################
mistral_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
)

Your GPU supports bfloat16: accelerate training with bf16=True


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

### Count number of trainable parameters

In [3]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

print(print_number_of_trainable_model_parameters(mistral_model))

trainable model parameters: 262410240
all model parameters: 3752071168
percentage of trainable model parameters: 6.99%


### Build Mistral text generation pipeline

In [4]:
standalone_query_generation_pipeline = pipeline(
 model=mistral_model,
 tokenizer=tokenizer,
 task="text-generation",
 temperature=0.0,
 repetition_penalty=1.1,
 return_full_text=True,
 max_new_tokens=1000,
)
standalone_query_generation_llm = HuggingFacePipeline(pipeline=standalone_query_generation_pipeline)

response_generation_pipeline = pipeline(
 model=mistral_model,
 tokenizer=tokenizer,
 task="text-generation",
 temperature=0.2,
 repetition_penalty=1.1,
 return_full_text=True,
 max_new_tokens=1000,
)
response_generation_llm = HuggingFacePipeline(pipeline=response_generation_pipeline)

### Load and chunk documents. Load chunked documents into FAISS index 

In [5]:
import nest_asyncio
nest_asyncio.apply()

# Articles to index
articles = ["https://www.fantasypros.com/2023/12/fantasy-football-panic-meter-patrick-mahomes-austin-ekeler-stefon-diggs-travis-etienne/",]

# Scrapes the blogs above
loader = AsyncChromiumLoader(articles)
docs = loader.load()

In [6]:
# Converts HTML to plain text 
html2text = Html2TextTransformer()
docs_transformed = html2text.transform_documents(docs)

# Chunk text
text_splitter = CharacterTextSplitter(chunk_size=800, 
                                      chunk_overlap=0)
chunked_documents = text_splitter.split_documents(docs_transformed)

# Load chunked documents into the FAISS index
db = FAISS.from_documents(chunked_documents, 
                          HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2'))

retriever = db.as_retriever(k = 1)

Created a chunk of size 4148, which is longer than the specified 800
Created a chunk of size 898, which is longer than the specified 800


### Create PromptTemplate and LLMChain

In [14]:
from langchain.schema import format_document
from langchain_core.messages import AIMessage, HumanMessage, get_buffer_string
from langchain_core.runnables import RunnableParallel
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain.memory import ConversationBufferMemory

In [15]:
from operator import itemgetter

In [16]:
from langchain.prompts.prompt import PromptTemplate
from langchain_core.prompts.chat import ChatPromptTemplate

_template = """
[INST] 
Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language, that can be used to query a FAISS index. This query will be used to retrieve documents with additional context. 

Let me share a couple examples that will be important. 

If you do not see any chat history, you MUST return the "Follow Up Input" as is:

```
Chat History:

Follow Up Input: How is Lawrence doing?
Standalone Question:
How is Lawrence doing?
```

If this is the second question onwards, you should properly rephrase the question like this:

```
Chat History:
Human: How is Lawrence doing?
AI: 
Lawrence is injured and out for the season.

Follow Up Input: What was his injurt?
Standalone Question:
What was Lawrence's injury?
```

Now, with those examples, here is the actual chat history and input question.

Chat History:
{chat_history}

Follow Up Input: {question}
Standalone question:
[your response here]
[/INST] 
"""
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)

In [17]:
template = """
[INST] 
Answer the question based only on the following context:
{context}

Question: {question}
[/INST] 
"""
ANSWER_PROMPT = ChatPromptTemplate.from_template(template)

In [18]:
DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="{page_content}")

def _combine_documents(
    docs, document_prompt=DEFAULT_DOCUMENT_PROMPT, document_separator="\n\n"
):
    doc_strings = [format_document(doc, document_prompt) for doc in docs]
    return document_separator.join(doc_strings)

In [20]:
# Instantiate ConversationBufferMemory
memory = ConversationBufferMemory(
 return_messages=True, output_key="answer", input_key="question"
)

# First we add a step to load memory
# This adds a "memory" key to the input object
loaded_memory = RunnablePassthrough.assign(
    chat_history=RunnableLambda(memory.load_memory_variables) | itemgetter("history"),
)
# Now we calculate the standalone question
standalone_question = {
    "standalone_question": {
        "question": lambda x: x["question"],
        "chat_history": lambda x: get_buffer_string(x["chat_history"]),
    }
    | CONDENSE_QUESTION_PROMPT
    | standalone_query_generation_llm,
}
# Now we retrieve the documents
retrieved_documents = {
    "docs": itemgetter("standalone_question") | retriever,
    "question": lambda x: x["standalone_question"],
}
# Now we construct the inputs for the final prompt
final_inputs = {
    "context": lambda x: _combine_documents(x["docs"]),
    "question": itemgetter("question"),
}
# And finally, we do the part that returns the answers
answer = {
    "answer": final_inputs | ANSWER_PROMPT | response_generation_llm,
    "question": itemgetter("question"),
    "context": final_inputs["context"],
    "docs": itemgetter("docs"),
}
# And now we put it all together!
final_chain = loaded_memory | standalone_question | retrieved_documents | answer

In [24]:
def call_conversational_rag(question, chain, memory):
    
    inputs = {"question": question}
    result = chain.invoke(inputs)
    
    memory.save_context(inputs, {"answer": result["answer"]})
    
    return result

In [25]:
question = "how is maholmes doing?"
call_conversational_rag(question, final_chain, memory)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


{'answer': '\nAccording to the provided context, Patrick Mahomes\' current performance is rated as "Panic Meter: 3" which indicates that he is officially panicked and it is recommended to consider a Plan B, explore trade options or bench him for a more reliable option.',
 'question': 'Standalone question:\nWhat is the current performance of Patrick Mahomes according to the panic meter grade?',
 'context': 'Fantasy Football Panic Meter: Patrick Mahomes, Austin Ekeler, Stefon Diggs,\nTravis Etienne | FantasyPros **PANIC METER GRADE** | **STRATEGY/PLAN OF\nACTION**  \n---|---  \n0 | This past week WAS not ideal, but it can be chalked up as an anomaly.\nPanic is not necessary.  \n1 | Panic is creeping up. It’s not time to sound the alarm yet, but it is\nsomething to be aware of. Said player should still be considered a starter but\nis now under surveillance.  \n2 | Officially panicked, taking things week by week, considering a Plan B,\nexploring trade options or benching for a more reliabl

In [26]:
question = "Who are good alternatives right now?"
call_conversational_rag(question, final_chain, memory)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


{'answer': '\nBased on the provided context, some good alternatives to Patrick Mahomes at quarterback right now include Baker Mayfield and Joe Flacco.',
 'question': 'Standalone question:\nWho are some good alternatives to Patrick Mahomes at quarterback right now?',
 'context': '**Patrick Mahomes (QB – KC)| Panic Meter: 3 **\n\nSooner or later the Chiefs have to figure it out, right? We’ve been asking\nourselves this question for 8 weeks now and nothing has changed. In week 16,\nMahomes finished with less than 17 points for the seventh time in his last\neight games. He is QB10 on the season. No QB will offer you the same upside as\nMahomes, but there are several likely-available players that have displayed a\nmuch higher floor as of late. Consider these options:\n\n  * Baker Mayfield\n\n(QB – TB): averaging 22.9 PPG over his last three games.\n\n  * Joe Flacco\n\n(QB – CLE): averaging 351 pass YPG and 20.6 PPG over his last three games.\n\n**Austin Ekeler (RB – LAC) | Panic Meter: 3**\

In [27]:
question = "How many PPG are both averaging?"
call_conversational_rag(question, final_chain, memory)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


{'answer': '\nThe average points per game (PPG) for Baker Mayfield over his last three games is 22.9. The average points per game (PPG) for Joe Flacco over his last three games is 351 pass yards and 20.6 PPG.',
 'question': 'Standalone question:\nWhat is the average points per game (PPG) for both Baker Mayfield and Joe Flacco?',
 'context': '**Patrick Mahomes (QB – KC)| Panic Meter: 3 **\n\nSooner or later the Chiefs have to figure it out, right? We’ve been asking\nourselves this question for 8 weeks now and nothing has changed. In week 16,\nMahomes finished with less than 17 points for the seventh time in his last\neight games. He is QB10 on the season. No QB will offer you the same upside as\nMahomes, but there are several likely-available players that have displayed a\nmuch higher floor as of late. Consider these options:\n\n  * Baker Mayfield\n\n(QB – TB): averaging 22.9 PPG over his last three games.\n\n  * Joe Flacco\n\n(QB – CLE): averaging 351 pass YPG and 20.6 PPG over his las