In [1]:
!pip install -qqq -U transformers datasets accelerate peft trl bitsandbytes wandb langchain sentence_transformers --progress-bar off

In [16]:
!pip install -qqq -U pypdf pinecone-client langchain-pinecone langchain-cohere lark --progress-bar off

In [42]:
import os, gc, sys, random, string, re, time
import pandas as pd
from uuid import uuid4
from tqdm.autonotebook import tqdm

In [27]:
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline
)
import torch

In [7]:
from langchain.schema import AIMessage, HumanMessage, SystemMessage
from langchain import PromptTemplate
from langchain.chains import LLMChain
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_pinecone.vectorstores import PineconeVectorStore
from langchain.chains import RetrievalQA
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain_cohere import CohereRerank
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.llms import HuggingFacePipeline
from pinecone import Pinecone, ServerlessSpec, PodSpec


In [5]:
# load keys from secrets

from google.colab import userdata

HF_TOKEN = userdata.get('HF_TOKEN')
PINECONE_API_ENV = userdata.get('PINECONE_API_ENV')
PINECONE_API_KEY = userdata.get('PINECONE_API_KEY')
COHERE_API_KEY = userdata.get('HF_TOKEN')

In [8]:
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

print(f"Device: {DEVICE}")
print(f"CUDA Version: {torch.version.cuda}")
print(f"Pytorch {torch.__version__}")


Device: cuda
CUDA Version: 12.1
Pytorch 2.2.1+cu121


In [9]:
# Check the type and quantity of GPUs

if torch.cuda.is_available():
    print('Num CPUs:', os.cpu_count())
    print('Num GPUs:', torch.cuda.device_count())
    print('GPU Type:', torch.cuda.get_device_name(0))

Num CPUs: 2
Num GPUs: 1
GPU Type: Tesla T4


### Load Sample Data

In [21]:
# Load documennts

loader = PyPDFLoader('sample_data/Bigbook_MLOps.pdf')

In [22]:
pages = loader.load_and_split()

In [23]:
# Clean Text data

def clean_text(sent):

    cleaned_data = []
    sent_cleaned = ""
    sent = re.sub(r'\b(https?:\/\/[^ ,\n\r]*)\b', '', sent)
    sent = re.sub(r'\s*(?:<br\s*\/?>)+\s*', ' ', sent)
    sent = re.sub(r'[^\x00-\x7F]', '', sent)
    for word in sent.split():
        sent_cleaned += word
        sent_cleaned += " "

    cleaned_data.append(sent_cleaned)

    return cleaned_data[0]

In [24]:
page_list = []
for page in pages[7:]:
    page_list.append(clean_text(str(page.page_content))[:-34])

In [28]:
df_book = pd.DataFrame(page_list, columns=['text'])

In [29]:
df_book.head()

Unnamed: 0,text
0,MLOps = DataOps + DevOps + ModelOpsMachine lea...
1,CHAPTER 2 Big Book of MLOps V1 RecapWe begin w...
2,"Semantics of development, staging and producti..."
3,ML deployment patterns Code and models often p...
4,Model training is executed in the development ...


### Chunking Text

In [32]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # the maximum number of characters in a chunk: we selected this value arbitrarily
    chunk_overlap=100,  # the number of characters to overlap between chunks
    add_start_index=True,  # If `True`, includes chunk's start index in metadata
    strip_whitespace=True,  # If `True`, strips whitespace from the start and end of every document
    separators=[
        "\n\n",
        "\n",
        " ",
        ".",
        ",",
        "\u200B",  # Zero-width space
        "\uff0c",  # Fullwidth comma
        "\u3001",  # Ideographic comma
        "\uff0e",  # Fullwidth full stop
        "\u3002",  # Ideographic full stop
        "",
    ],
)

In [33]:
chunks_load = df_book.to_dict(orient='records')

batch_limit = 100

texts = []
metadatas = []

for i, record in enumerate(tqdm(chunks_load)):
    record_texts = text_splitter.split_text(record['text'])
    texts.extend(record_texts)

  0%|          | 0/74 [00:00<?, ?it/s]

In [34]:
df_chunk = pd.DataFrame(texts, columns=['text'])

In [35]:
df_chunk.shape

(164, 1)

In [37]:
# adding synthetic metadata that will be used for filtering

df_chunk.loc[:60, 'update_month'] = 'April'
df_chunk.loc[60:120, 'update_month'] = 'May'
df_chunk.loc[120:, 'update_month'] = 'June'


In [38]:
df_chunk.head()

Unnamed: 0,text,update_month
0,MLOps = DataOps + DevOps + ModelOpsMachine lea...,April
1,be replaced and easily scaled with Databricks ...,April
2,"features such as Models in Unity Catalog, Mode...",April
3,CHAPTER 2 Big Book of MLOps V1 RecapWe begin w...,April
4,time to business value is accelerated. This ef...,April


### Embedding Text content

In [39]:
embed_name = "BAAI/bge-large-en-v1.5"
model_kwargs = {'device': 'cuda'}
encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity

embed_model = HuggingFaceBgeEmbeddings(
    model_name=embed_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/779 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

In [40]:
# Embedding text - 1024 dimensions

df_chunk.loc[:, 'embeddings'] = df_chunk.loc[: , 'text'].apply(lambda x : embed_model.embed_query(x))

In [43]:
df_chunk['id'] = [str(uuid4()) for _ in range(len(df_chunk))]

In [44]:
df_chunk.head()

Unnamed: 0,text,update_month,embeddings,id
0,MLOps = DataOps + DevOps + ModelOpsMachine lea...,April,"[-0.010729925706982613, -0.03555789962410927, ...",2f0fcd08-6ef4-496d-beb5-ecc7b39b2271
1,be replaced and easily scaled with Databricks ...,April,"[0.01224721409380436, -0.01093058381229639, -0...",0feb8350-0ee4-4b4b-ae92-827a26c93ff7
2,"features such as Models in Unity Catalog, Mode...",April,"[0.0281523410230875, -0.002673514885827899, -0...",01980b0a-c83c-46eb-b297-e1ef2dd7db76
3,CHAPTER 2 Big Book of MLOps V1 RecapWe begin w...,April,"[0.030428269878029823, -0.011662106961011887, ...",c8cb99ee-9759-443a-aee3-9c6dec015862
4,time to business value is accelerated. This ef...,April,"[0.01924646832048893, 0.01792166195809841, -0....",385724fa-9332-4953-8cef-148791a13a83


### Indexing and loading to Vector DB (Pinecone)

In [45]:
pc = Pinecone(api_key=PINECONE_API_KEY)

In [46]:
# Define index name

index_name = "llm-rag-gemma-gpu"

# Check if index already exists, create it if it doesn't
if index_name not in pc.list_indexes():
    spec = ServerlessSpec(cloud='aws', region='us-east-1')
    pc.create_index(index_name, dimension=1024, metric='cosine', spec=spec)

In [47]:
# Connect to the index and view index stats
index = pc.Index(index_name)
index.describe_index_stats()

{'dimension': 1024,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [48]:
%%time
# Upsert embeddings into Pinecone in batches of 50
batch_size = 50
# Convert the DataFrame to a list of dictionaries
chunks = df_chunk.to_dict(orient='records')

for i in tqdm(range(0, len(chunks), batch_size)):
    i_end = min(len(chunks), i+batch_size)
    mini_batch = chunks[i:i_end]
    ids_batch = [x['id'] for x in mini_batch]
    embeds = [x['embeddings'] for x in mini_batch]
    mini_batch = [{
        'text': x['text'],
        'update_month': x['update_month']
    } for x in mini_batch]
    to_upsert = list(zip(ids_batch, embeds, mini_batch))
    index.upsert(vectors=to_upsert)

  0%|          | 0/4 [00:00<?, ?it/s]

CPU times: user 723 ms, sys: 16.2 ms, total: 739 ms
Wall time: 2.65 s


In [49]:
index.describe_index_stats()

{'dimension': 1024,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 164}},
 'total_vector_count': 164}

In [None]:
# spotcheck an index row
index.fetch(["2f0fcd08-6ef4-496d-beb5-ecc7b39b2271"], namespace='')

### LLM - Gemma-7B-Instruct

In [51]:
MODEL_NAME = "google/gemma-7b-it"

In [None]:
# Un-quantized version - for a quick test

model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, token=HF_TOKEN, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=HF_TOKEN)

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/2.11G [00:00<?, ?B/s]

Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

In [65]:
# Quantized version of the model

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, token=HF_TOKEN, quantization_config=bnb_config, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=HF_TOKEN)


config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/2.11G [00:00<?, ?B/s]

Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

In [66]:
model.config.use_cache = False

# Set the pretraining throughput to 1.
model.config.pretraining_tp = 1

In [67]:
def format_text(text):

    # Create a list
    answer_list = text.split('\n')

    for i, item in enumerate(answer_list):

        # Replace * with nothing
        new_item = item.replace('*','')

        # Remove leading and trailing spaces
        new_item = new_item.strip()

        # Create the output string
        if i == 0:
            fin_string = new_item + '\n'
        else:
            fin_string = fin_string + new_item + '\n'

    return fin_string

In [None]:
%%time
question = 'When is the American Independence day'

# Create the prompt
prompt = f"""<start_of_turn>user
{question}<end_of_turn>
<start_of_turn>model
"""

# Tokenize the prompt
inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
# Generate the outputs from prompt
generate_ids = model.generate(**inputs, max_new_tokens=768)
# Decode the generated output
generated_text = tokenizer.batch_decode(generate_ids,
                                    skip_special_tokens=True,
                                    clean_up_tokenization_spaces=False)[0]

print('generated_text: ', generated_text)

# Extract the answer

# Split and select the last item in the list
response = generated_text.split('<start_of_turn>model')[-1]
# Remove leading and trailing spaces
response = response.strip()
# Remove the '<end_of_turn> token
response = response.replace('<end_of_turn>', "")

# Remove markdown '*' symbols
# The deafult Markdown that Gemma outputs
# doesn't always display well.
response = format_text(response)


print()
print('User:\n',question)
print()
print('Gemma:\n', response)

generated_text:  user
When is the American Independence day
model
The American Independence Day is celebrated on the 4th of July. It is a national holiday in the United States of America.

User:
 When is the American Independence day

Gemma:
 user
When is the American Independence day
model
The American Independence Day is celebrated on the 4th of July. It is a national holiday in the United States of America.

CPU times: user 3.64 s, sys: 237 ms, total: 3.88 s
Wall time: 4.26 s


In [68]:
READER_LLM = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    do_sample=True,
    temperature=0.2,
    repetition_penalty=1.1,
    return_full_text=False,
    max_new_tokens=256,
)

* **Gemma-7b-it**

In [None]:
%%time
READER_LLM('''<start_of_turn>user
When is the American independence day<end_of_turn>
<start_of_turn>model''')

CPU times: user 1min 41s, sys: 1min 48s, total: 3min 29s
Wall time: 10min 29s


[{'generated_text': ' answer:\n\nThe America Independence Day was declared on July  Declaration, and it occurred in a year.'}]

* **Quantized Gemma-7b-it**

In [83]:
%%time
READER_LLM('''<start_of_turn>user
When is the American independence day<end_of_turn>
<start_of_turn>model''')

CPU times: user 3.65 s, sys: 90 ms, total: 3.74 s
Wall time: 7.5 s


[{'generated_text': '\n\nThe America Independence Day was declared on July  Declaration, in. The declaration of freedom states that "America colonies are free from taxation."'}]

In [69]:
# Using LangChain wrapper for HuggingFace pipeline
from langchain_community.llms import HuggingFacePipeline

llm = HuggingFacePipeline(pipeline=READER_LLM)

### RAG - Naive

#### Base Retriever

* Similarity search with a threshold and top_k

In [70]:
vectorstore = PineconeVectorStore(
        index, embed_model, "text"
    )

In [71]:
retriever = vectorstore.as_retriever(search_type="similarity_score_threshold", search_kwargs={
                                         "score_threshold": 0.6, "k": 3})

#### Test RAG - Naive

In [76]:
def pretty_print_docs(docs):
    print(
        f"\n{'-' * 100}\n".join(
            [f"Document {i+1}:\n\n" + d.page_content for i, d in enumerate(docs)]
        )
    )

In [73]:
# query = 'what is the Databricks unity catalog useful for'
query = 'Explain in brief 5 new features in Databricks Unity'

In [77]:
base_docs = retriever.invoke(query)
pretty_print_docs(base_docs)

Document 1:

CHAPTER 3 Whats New? In this section we outline the key features and product updates introduced into our updated MLOps reference architecture. For each of these, we highlight the benefits they bring and how they impact our end-to-end MLOps workflow. Unity Catalog The Lakehouse forms the foundation of a data-centric AI platform. Key to this is the ability to manage both data and AI assets from a unified governance solution on the Lakehouse. Databricks Unity Catalog enables this by providing centralized access control, auditing, lineage, and data discovery capabilities across Databricks workspaces. These benefits are now extended to MLflow models with the introduction of Models in Unity Catalog . By providing a hosted version of the MLflow Model Registry in Unity Catalog, the full lifecycle of an ML model can be managed while leveraging Unity Catalogs capability to share assets across Databricks workspaces and trace lineage across both data and models. In addition to managin

In [72]:
def get_response_naive(query, vectorstore, llm_model):

    prompt_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

    {context}

    Question: {question}
    Answer in no more than 6 sentences"""

    PROMPT = PromptTemplate(
        template=prompt_template, input_variables=["context", "question"]
    )

    retriever = vectorstore.as_retriever(search_type="similarity_score_threshold", search_kwargs={
                                         "score_threshold": 0.6, "k": 3})
    if len(retriever.get_relevant_documents(query)) > 0:
        chain_type_kwargs = {"prompt": PROMPT}
        qa = RetrievalQA.from_chain_type(
            llm=llm_model,
            chain_type="stuff",
            retriever=retriever,
            chain_type_kwargs=chain_type_kwargs
        )

        return qa.invoke(query)
    else:
        return {"result": "I'm sorry, the context provided does not have information related to the query"}


In [85]:
prompt = f"""<start_of_turn>user
{query}<end_of_turn>
<start_of_turn>model
"""

resp = get_response_naive(query, vectorstore, llm)

# resp['result'].replace('\n', ' ').strip()
print(resp)

{'query': 'Explain in brief 5 new features in Databricks Unity', 'result': ' explaining newly added feature'}


### RAG - Advanced

#### Pre-Retrieval

* Metadata filtering with SelfQuery Retriever

In [52]:
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.retrievers.self_query.pinecone import PineconeTranslator
from langchain.chains.query_constructor.base import (
    AttributeInfo,
    StructuredQueryOutputParser,
    get_query_constructor_prompt,
)

In [54]:
metadata_field_info = [
    AttributeInfo(
        name="update_month",
        description="The month indicating when features and updates have been made by Databricks. One of ['April', 'June']",
        type="string",
    )
]

document_content_description = "Handbook by Databricks summarizing tools and features in MLOps, LLMOps"


In [55]:
allowed_comparators = [
    "$eq",  # Equal to (number, string, boolean)
    "$ne",  # Not equal to (number, string, boolean)
    "$gt",  # Greater than (number)
    "$gte",  # Greater than or equal to (number)
    "$lt",  # Less than (number)
    "$lte",  # Less than or equal to (number)
]

constructor_prompt = get_query_constructor_prompt(
    document_content_description,
    metadata_field_info,
    allowed_comparators=allowed_comparators,
)


output_parser = StructuredQueryOutputParser.from_components()
query_constructor = constructor_prompt | llm | output_parser

retriever = SelfQueryRetriever(
    query_constructor=query_constructor,
    vectorstore=vectorstore,
    structured_query_translator=PineconeTranslator(),
)

NameError: name 'llm' is not defined

#### Re-Ranking

* Re-ranking the retrieved documents using Cohere re-ranker

In [None]:
compressor = CohereRerank()
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=retriever
)

compressed_docs = compression_retriever.invoke(query)

pretty_print_docs(compressed_docs)

#### Test  Advanced RAG

In [None]:
def get_response_adv(query, base_retriever, rank_retriever, llm_model):

    prompt_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

    {context}

    Question: {question}
    Answer in no more than 6 sentences"""

    PROMPT = PromptTemplate(
        template=prompt_template, input_variables=["context", "question"]
    )

    if len(base_retriever.get_relevant_documents(query)) > 0:
        chain_type_kwargs = {"prompt": PROMPT}
        qa = RetrievalQA.from_chain_type(
            llm=llm_model,
            chain_type="stuff",
            retriever=rank_retriever,
            chain_type_kwargs=chain_type_kwargs
        )

        return qa.invoke(query)
    else:
        return {"result": "I'm sorry, the context provided does not have information related to the query"}


In [None]:
prompt = f"""<start_of_turn>user
{query}<end_of_turn>
<start_of_turn>model
"""

resp = get_response_adv(prompt, vectorstore, llm)

# resp['result'].replace('\n', ' ').strip()
print(resp)

### Conclusion

In [None]:
del tokenizer, model
gc.collect()
gc.collect()
torch.cuda.empty_cache()