In [None]:
# INSPIRED BY https://github.com/madhavthaker1/llm/blob/main/rag/simple_rag.ipynb
# Environment setup cell: every command's stdout is redirected to /dev/null to keep the cell output short.
# Upgrade (-U) the core libraries used by the rest of the notebook; versions are NOT pinned here.
!pip install -q -U torch datasets transformers tensorflow langchain playwright html2text sentence_transformers faiss-cpu > /dev/null
# These three packages are pinned to exact versions.
!pip install -q accelerate==0.21.0 bitsandbytes==0.40.2 trl==0.4.7 > /dev/null
!pip install -q -U peft > /dev/null
# Playwright setup (presumably the browser binaries and system libraries needed by the Chromium-based loader used later — confirm).
!playwright install > /dev/null
!playwright install-deps > /dev/null

In [None]:
# prompt: reboot colab
# Kill the kernel's own process (signal 9) so that Colab starts a fresh runtime;
# presumably needed so the packages installed above are picked up.
import os

kernel_pid = os.getpid()
os.kill(kernel_pid, 9)

In [None]:
# required packages
import os

import torch
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline
)
from datasets import load_dataset
from peft import LoraConfig, PeftModel

from langchain.text_splitter import CharacterTextSplitter
from langchain.document_transformers import Html2TextTransformer
from langchain.document_loaders import AsyncChromiumLoader

from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.llms import HuggingFacePipeline
from langchain.chains import LLMChain

In [None]:
# Load Mistral-7B-Instruct locally in the notebook, quantized to 4 bits.
# Alternatively, you can use a Mistral model deployed on a Vertex AI API endpoint
# (refer to the chapter 2 notebooks).

model_name_or_path = "mistralai/Mistral-7B-Instruct-v0.1"

# Start from the published model config and override the maximum context length.
# NOTE(review): 8096 looks like a typo for 8192 — confirm before changing.
config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=True)
config.max_position_embeddings = 8096

# 4-bit NF4 quantization with double quantization and bfloat16 compute;
# the fp32 CPU offload flag is enabled alongside device_map="auto" below.
quantization_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    llm_int8_enable_fp32_cpu_offload=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    config=config,
    trust_remote_code=True,
    quantization_config=quantization_config,
    device_map="auto",
    offload_folder="./offload",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [None]:
# Inference pipeline for the base model.
# `do_sample=True` is required for `temperature` to have any effect: with the
# default greedy decoding the temperature setting is ignored.
text_generation_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    do_sample=True,
    temperature=0.2,
    repetition_penalty=1.1,
    return_full_text=True,
    max_new_tokens=1000,
)

In [None]:
# Expose the Hugging Face text-generation pipeline as a LangChain LLM
M7B = HuggingFacePipeline(
    pipeline=text_generation_pipeline,
)

In [None]:
# Packages needed for the web scraping: nest_asyncio is applied before the
# asynchronous loader runs (the notebook already has a running event loop).
import nest_asyncio
nest_asyncio.apply()

# Articles to index: in this example, pieces of the Google Cloud technical documentation.
# Each URL appears exactly once so that no page is scraped and indexed twice
# (duplicate chunks would otherwise take up retrieval slots).
articles = ["https://cloud.google.com/vertex-ai/docs/generative-ai/learn/overview",
            "https://cloud.google.com/vertex-ai/docs/generative-ai/learn/models",
            "https://cloud.google.com/vertex-ai/docs/generative-ai/start/quickstarts/quickstart-multimodal",
            "https://cloud.google.com/vertex-ai/docs/generative-ai/start/quickstarts/quickstart-chat",
            "https://cloud.google.com/vertex-ai/docs/generative-ai/start/quickstarts/quickstart-text",
            "https://cloud.google.com/vertex-ai/docs/generative-ai/start/quickstarts/quickstart-text-embeddings",
            "https://cloud.google.com/vertex-ai/docs/generative-ai/start/quickstarts/quickstart",
            "https://cloud.google.com/vertex-ai/docs/generative-ai/start/quickstarts/quickstart-tuning"
            ]

# Scrape the pages listed above
loader = AsyncChromiumLoader(articles)
docs = loader.load()

In [None]:
# Turn the scraped HTML pages into plain text
html2text = Html2TextTransformer()
docs_transformed = html2text.transform_documents(docs)

# Cut the plain-text documents into chunks (no overlap between chunks)
text_splitter = CharacterTextSplitter(chunk_overlap=0, chunk_size=100)
chunked_documents = text_splitter.split_documents(docs_transformed)

# Embed the chunks with a pre-built embedding model from the HF hub
# and load them into a FAISS vector database
embedding_model = HuggingFaceEmbeddings(
    model_name='sentence-transformers/all-mpnet-base-v2'
)
FaissVectordb = FAISS.from_documents(chunked_documents, embedding_model)

# Retriever used to fetch the chunks closest to a query from FAISS
retriever = FaissVectordb.as_retriever()



.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


In [None]:
# Prompt template with two placeholders: the retrieved context and the user question
prompt_template = """
### [INST] Instruction: Answer the question on Google Cloud generative AI features with the help of this context:
{context}
### QUESTION:
{question} [/INST]
 """
prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"],
)

# Chain for the base model alone: fills the prompt and sends it to the LLM
llm_chain = LLMChain(prompt=prompt, llm=M7B)

In [None]:
# Let's get the answer from the base model first (empty context), so that we can
# compare it with the answer from the base model + RAG chain. The question is
# worded identically to the one sent to the RAG chain so the comparison is like-for-like.
llm_chain.invoke({"context": "", "question": "What is the best Google Cloud LLM for text generation?"})

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


{'context': '',
 'question': 'What the best Google Cloud LLM for text generation?',
 'text': '\nGoogle Cloud offers several Language Models (LMs) that can be used for text generation. The choice of which one to use depends on your specific needs and requirements. Here are some of the most popular LMs available on Google Cloud:\n\n1. BERT: BERT (Bidirectional Encoder Representations from Transformers) is a powerful pre-trained language model that can be fine-tuned for various NLP tasks, including text generation. It uses a transformer architecture and has been shown to achieve state-of-the-art results on many benchmarks.\n2. T5: T5 (Text-to-Text Transfer Transformer) is another pre-trained language model that can be fine-tuned for text generation. It uses a fill-in-the-blank approach and has been shown to generate high-quality text in various domains.\n3. Pegasus: Pegasus is a denoising autoencoder-based language model that can be fine-tuned for text generation. It uses a masked languag

In [None]:
# Base model + RAG: the retriever fills "context" from the incoming question,
# while the question itself is passed through unchanged
rag_inputs = {"context": retriever, "question": RunnablePassthrough()}
rag_chain = rag_inputs | llm_chain

result = rag_chain.invoke("What is the best Google Cloud LLM for text generation?")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [None]:
# Let's see which pieces of documentation are embedded closest to our prompt:
# these are the ones retrieved with FAISS
result["context"]

[Document(page_content="Use Vertex AI to send text embedding requests to Google's PaLM 2 Large\nLanguage Model (LLM) and to receive a response. Test and customize prompts to\nmeet the needs of your application.", metadata={'source': 'https://cloud.google.com/vertex-ai/docs/generative-ai/start/quickstarts/quickstart-text-embeddings'}),
 Document(page_content="Use Vertex AI to send a text prompt to Google's PaLM 2 Large Language Model\n(LLM) and to receive a response. Test and customize prompts to meet the needs\nof your application.", metadata={'source': 'https://cloud.google.com/vertex-ai/docs/generative-ai/start/quickstarts/quickstart-text'}),
 Document(page_content="Use Vertex AI Studio to design, test, and customize your prompts sent to\nGoogle's PaLM 2 Large Language Model (LLM). After the LLM processes the\nprompt, a response is received.", metadata={'source': 'https://cloud.google.com/vertex-ai/docs/generative-ai/start/quickstarts/quickstart'}),
 Document(page_content="Use Vertex

In [None]:
# Print the output of the RAG chain; compared with the base model's answer,
# the output is more accurate
print(result["text"])


The best Google Cloud LLM for text generation would be Google's PaLM 2 Large Language Model (LLM). It can be used with Vertex AI to send text embedding requests or prompts and receive responses. The model can be tested and customized to meet the needs of your application. Additionally, Vertex AI Studio can be used to design, test, and customize prompts sent to the LLM.
