In [None]:
!pip install -q \
    transformers==4.31.0 \
    accelerate==0.21.0 \
    bitsandbytes==0.41.0 \
    sentence-transformers==2.2.2 \
    xformers==0.0.20 \

!pip install -q \
    langchain==0.1.0 \
    langchain-community==0.0.12 \
    langchainhub==0.1.14 \
    faiss-gpu \
    faiss-cpu

!pip install -q pandas
!pip install -q colab-xterm

In [16]:
import pandas as pd
from torch import cuda

from langchain.prompts import (
    PromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
    ChatPromptTemplate,)
from langchain_core.output_parsers import StrOutputParser
from langchain_community.vectorstores import FAISS
from langchain.schema.runnable import RunnablePassthrough


from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders.dataframe import DataFrameLoader
from langchain.storage import LocalFileStore
from langchain.embeddings import CacheBackedEmbeddings
from langchain.llms import HuggingFacePipeline

from transformers import pipeline
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig


ModuleNotFoundError: No module named 'torch'

In [None]:
from google.colab import userdata
import huggingface_hub as hf_hub

HF_TOKEN = userdata.get('HF_TOKEN')
hf_hub.login(HF_TOKEN)

## Constants

In [None]:
# Dataset files
PLACES_PATH = "places.csv"
REVIEWS_PATH = "reviews.csv"

# Models
MODEL_NAME =  "meta-llama/Llama-2-7b-hf" # "meta-llama/Llama-2-7b-chat-hf"
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"

# Embeddings
EMBEDDINGS_CACHE_STORE="./cache/"

# Faiss
FAISS_REVIEWS_PATH = "faiss_index"
FAISS_INDEX_NAME = "index"
FAISS_DISTANCE_STRATEGY='EUCLIDEAN_DISTANCE'

## Load Dataset

Here we are using 2 csv files containing places (restuarants, bars, ...) info and reviews for each of them.

In [None]:
def get_documents(content_func=lambda row:row['review'],
                  source_func=lambda row:row['place_id'],
                  metadata_fields=[]):

  # Load both data files
  places_df = pd.read_csv(PLACES_PATH)
  reviews_df = pd.read_csv(REVIEWS_PATH)

  # merge them on 'place_id'
  merged_df = pd.merge(places_df, reviews_df, on='place_id', how='inner')

  # add page_content and source columns using their corresponing functions
  merged_df['page_content'] = merged_df.apply(content_func, axis=1)
  merged_df['source'] = merged_df.apply(source_func, axis=1)

  # update metadata_fields with 'page_content', 'source'
  metadata_fields = list(set(metadata_fields + ['page_content', 'source']))

  loader = DataFrameLoader(merged_df[metadata_fields],page_content_column='page_content')
  return loader.load()

In [None]:
documents = get_documents()

In [15]:
documents[0]

NameError: name 'documents' is not defined

## Load Embeddings model

In [None]:
def get_hf_embedding_model(embedding_model_name,
                           cache_embeddings_store,
                           device='cpu',
                           normalize_embeddings=False,
                           ):
  model_kwargs = {'device': device}
  encode_kwargs = {'normalize_embeddings': normalize_embeddings} # Set `True` for cosine similarity
  embedding_model = HuggingFaceEmbeddings(
      model_name=embedding_model_name,
      model_kwargs=model_kwargs,
      encode_kwargs=encode_kwargs
      )
  store = LocalFileStore(cache_embeddings_store)
  embedding_model = CacheBackedEmbeddings.from_bytes_store(
                    embedding_model, store)
  return embedding_model



In [None]:
embedding_model = get_hf_embedding_model(EMBEDDING_MODEL_NAME,
                                         EMBEDDINGS_CACHE_STORE,
                                         device='cuda' if cuda.is_available() else 'cpu',
                                         normalize_embeddings=False)

## Load FAISS (Vector Database)

In [None]:
def get_vector_database(documents, embedding_model):

  vector_database = FAISS.from_documents(
      documents, embedding_model,
      distance_strategy= FAISS_DISTANCE_STRATEGY
      )
  return vector_database

In [None]:
vector_db = get_vector_database(documents, embedding_model)

In [None]:
## if you want to save the db and use the files to load it again later.
vector_db.save_local(folder_path=FAISS_REVIEWS_PATH, index_name=FAISS_INDEX_NAME)
vector_db = FAISS.load_local(folder_path=FAISS_REVIEWS_PATH,
                             embeddings=embedding_model,
                             index_name=FAISS_INDEX_NAME)


In [None]:
docs = vector_db.similarity_search("which one is the best pizza restaurant in the city?", k = 5)

In [None]:
docs[0]

### Load LLM

In [None]:
def get_hf_llm(model_name):

  bnb_config = BitsAndBytesConfig(
      load_in_4bit=True,
      bnb_4bit_use_double_quant=True,
      bnb_4bit_quant_type="nf4",
      bnb_4bit_compute_dtype=torch.bfloat16,
  )
  model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config, )
  tokenizer = AutoTokenizer.from_pretrained(model_name)

  pipe = pipeline(
      model=model,
      tokenizer=tokenizer,
      return_full_text=True,  # langchain expects the full text
      task='text-generation',
      # we pass model parameters here too
      temperature=0.0001,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
      max_new_tokens=512,  # mex number of tokens to generate in the output
      repetition_penalty=1.1  # without this output begins repeating
  )

  llm = HuggingFacePipeline(pipeline=pipe)
  return llm


In [17]:
llm =  get_hf_llm(MODEL_NAME)

NameError: name 'BitsAndBytesConfig' is not defined

## Create LangChain pipeline

In [None]:
review_template_str = """Your job is to use Google Map
reviews to answer questions about their experience at a restaurant. Use
the following context to answer questions. Be as detailed as possible, but
don't make up any information that's not from the context. If you don't know
an answer based on the context, say you don't know.
context:
{context}
"""
## """
# If you don't know an answer based on the context, say you don't know, and
# if the context is not about restaurants, then kindly tell them that  you can
# only provide assistance and answer questions related to restaurants.
##"""

review_system_prompt = SystemMessagePromptTemplate(
    prompt=PromptTemplate(
        input_variables=["context"], template=review_template_str
    )
)

review_human_prompt = HumanMessagePromptTemplate(
    prompt=PromptTemplate(input_variables=["question"], template="{question}")
)
messages = [review_system_prompt, review_human_prompt]

review_prompt_template = ChatPromptTemplate(
    input_variables=["context", "question"], messages=messages
)



output_parser = StrOutputParser()


reviews_retriever = vector_db.as_retriever(k=10,)

# review_chain = (
#     {"context": reviews_retriever, "question": RunnablePassthrough()}
#     | review_prompt_template
#     | llm
#     | StrOutputParser()
# )

In [None]:
review_chain = (
    {"context": reviews_retriever, "question": RunnablePassthrough()}
    | review_prompt_template
)

question = """What are the pros and cons of the best pizza restaurant in the city?"""
review_chain.invoke(question).to_string()

System: Your job is to use Google Map
reviews to answer questions about their experience at a restaurant. Use
the following context to answer questions. Be as detailed as possible, but
don't make up any information that's not from the context. If you don't know
an answer based on the context, say you don't know.
context:
[Document(page_content="name: places/ChIJtaHsbrrafkcRdQQTi3yHS8Q/reviews/ChZDSUhNMG9nS0VJQ0FnSURtNExXdFpnEAE\ntext.languageCode: en\ntext.text: We had dinner at this place and it is the best pizza I've had in 5 years. The place is beautiful and it feels very classy. The food is superb, great service.", metadata={'source': "We had dinner at this place and it is the best pizza I've had in 5 years. The place is beautiful and it feels very classy. The food is superb, great service.", 'row': 294, 'place_id': 'ChIJtaHsbrrafkcRdQQTi3yHS8Q', 'rating': '5'}), Document(page_content="name: places/ChIJmShy2QTbfkcRoPeMDtwZ6pE/reviews/ChZDSUhNMG9nS0VJQ0FnSURhN3B1d0pBEAE\ntext.languageCode: en\ntext.text: Great pizza, cool retro design and lovely staff. It's topped off with a nice selection of beers which is a big plus and not a given in this area of Italy.\r\nThe location isn't too exciting, which takes away from the otherwise amazing restaurant.", metadata={'source': "Great pizza, cool retro design and lovely staff. It's topped off with a nice selection of beers which is a big plus and not a given in this area of Italy.\r\nThe location isn't too exciting, which takes away from the otherwise amazing restaurant.", 'row': 132, 'place_id': 'ChIJmShy2QTbfkcRoPeMDtwZ6pE', 'rating': '4'}), Document(page_content="name: places/ChIJWdpV5_7RfkcRI93ap-aXK90/reviews/ChZDSUhNMG9nS0VJQ0FnSUNsbm92R2NBEAE\ntext.languageCode: en\ntext.text: I have been several times already and every time the experience was great. Awesome food, good atmosphere and great service! Every pizza I tried was excellent and the quality of the ingredients was outstanding!! One of the best pizzeria in Padua.\r\nThe only thing I didn't like honestly was the limited choice about toppings if you wanna customize your pizza.", metadata={'source': "I have been several times already and every time the experience was great. Awesome food, good atmosphere and great service! Every pizza I tried was excellent and the quality of the ingredients was outstanding!! One of the best pizzeria in Padua.\r\nThe only thing I didn't like honestly was the limited choice about toppings if you wanna customize your pizza.", 'row': 578, 'place_id': 'ChIJWdpV5_7RfkcRI93ap-aXK90', 'rating': '5'}), Document(page_content="name: places/ChIJWdpV5_7RfkcRI93ap-aXK90/reviews/ChRDSUhNMG9nS0VJQ0FnSURqaVpjWBAB\ntext.languageCode: en\ntext.text: Me and my family went there to have dinner, based on the reviews that I red about this place and it was defenetely a very good choice, because pizza is just great and, if you are in the neighborhood, you don't want to miss it.", metadata={'source': "Me and my family went there to have dinner, based on the reviews that I red about this place and it was defenetely a very good choice, because pizza is just great and, if you are in the neighborhood, you don't want to miss it.", 'row': 576, 'place_id': 'ChIJWdpV5_7RfkcRI93ap-aXK90', 'rating': '5'})]

Human: What are the pros and cons of the best pizza restaurant in the city?

In [None]:
question = """What are the pros and cons of the best pizza restaurant in the city?"""
reviews_retriever.invoke(question)

# {"context": reviews_retriever, "question": RunnablePassthrough()}

# review_prompt_template

In [None]:
question = """What are the pros and cons of the best pizza restaurant in the city?"""
review_chain.invoke(question)