## Installing packages and Imports

In [5]:
!pip install -q ollama
!pip install -q langchain==0.1.0 langchain-community==0.0.12 langchainhub==0.1.14
!pip install -q faiss-gpu faiss-cpu
!pip install -q colab-xterm
# !pip install langchain-core

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/115.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/115.6 kB[0m [31m735.7 kB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━[0m [32m71.7/115.6 kB[0m [31m863.5 kB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.6/115.6 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [43]:
import pandas as pd
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.document_loaders.dataframe import DataFrameLoader
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.llms import Ollama

from langchain.embeddings import CacheBackedEmbeddings
from langchain.storage import LocalFileStore
from google.colab import userdata
import huggingface_hub as hf_hub

%load_ext colabxterm

The colabxterm extension is already loaded. To reload it, use:
  %reload_ext colabxterm


## Constants

In [14]:
# Read the Hugging Face token from the Colab secret store instead of embedding it
# in the notebook.
# SECURITY: a literal token used to be hard-coded on this line. It must be treated
# as leaked and revoked in the Hugging Face account settings.
HF_TOKEN = userdata.get('HF_TOKEN')

# Input CSV files, relative to the notebook's working directory.
PLACES_PATH = "places.csv"
REVIEWS_PATH = "reviews.csv"

hf_hub.login(HF_TOKEN)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


## Load Dataset

Here we use two CSV files: one with information about places (restaurants, bars, ...) and one with the reviews for each of them.

In [63]:
def get_documents(content_func=lambda row:row['review'],
                  source_func=lambda row:row['place_id'],
                  metadata_fields=None):
  """Build LangChain documents from the places and reviews CSV files.

  The two files at ``PLACES_PATH`` and ``REVIEWS_PATH`` are read, inner-joined on
  ``place_id``, and every merged row is turned into one document through
  ``DataFrameLoader``.

  Args:
    content_func: Called with each merged row; its result is stored in the
      ``page_content`` column used as the document text.
    source_func: Called with each merged row; its result is stored in the
      ``source`` column.
    metadata_fields: Optional iterable of extra merged-frame columns to hand to
      the loader alongside ``page_content`` and ``source``. ``None`` (default)
      means no extra columns.

  Returns:
    The list returned by ``DataFrameLoader.load()``.
  """
  # Load both data files
  places_df = pd.read_csv(PLACES_PATH)
  reviews_df = pd.read_csv(REVIEWS_PATH)

  # Merge them on 'place_id'; the inner join keeps only reviews whose place is known
  merged_df = pd.merge(places_df, reviews_df, on='place_id', how='inner')

  # Add page_content and source columns using their corresponding functions
  merged_df['page_content'] = merged_df.apply(content_func, axis=1)
  merged_df['source'] = merged_df.apply(source_func, axis=1)

  # De-duplicate while keeping a stable, caller-defined order (a plain set would
  # shuffle the columns between runs).
  columns = list(dict.fromkeys([*(metadata_fields or []), 'page_content', 'source']))

  loader = DataFrameLoader(merged_df[columns], page_content_column='page_content')
  return loader.load()

In [60]:
# Peek at the first document produced with the default content/source functions.
get_documents()[0]

Document(page_content='Huge "eating palace" which is lacking the fine accents of the italian food although the service was top. I would recommend for lunch, less for dinner', metadata={'source': 'ChIJpXSgrsDbfkcRzf_5kCMmrZI'})

## Defining functions to get different models

In [None]:
def get_review_data(file_path, source_column, metadata_columns):
  """Load a UTF-8 CSV file into LangChain documents with ``CSVLoader``.

  Args:
    file_path: Path of the CSV file to read.
    source_column: Column passed to the loader as ``source_column``.
    metadata_columns: Columns passed to the loader as ``metadata_columns``.

  Returns:
    Whatever ``CSVLoader.load()`` returns for the file.
  """
  csv_loader = CSVLoader(
      file_path=file_path,
      source_column=source_column,
      metadata_columns=metadata_columns,
      encoding='utf-8',
  )
  return csv_loader.load()

def get_hf_embedding_model(embedding_model_name,
                           cache_embeddings_store,
                           normalize_embeddings=False,):
  """Create a disk-cached Hugging Face embedding model.

  Args:
    embedding_model_name: Name passed to ``HuggingFaceEmbeddings`` as ``model_name``.
    cache_embeddings_store: Directory handed to ``LocalFileStore`` for the cache.
    normalize_embeddings: Forwarded as the ``normalize_embeddings`` encode
      option. Set ``True`` for cosine similarity.

  Returns:
    A ``CacheBackedEmbeddings`` wrapping the Hugging Face model.
  """
  # `device` is a notebook-level variable; it must be defined before this
  # function is called.
  base_model = HuggingFaceEmbeddings(
      model_name=embedding_model_name,
      model_kwargs={'device': device},
      encode_kwargs={'normalize_embeddings': normalize_embeddings},
  )
  store = LocalFileStore(cache_embeddings_store)
  # Namespace the cache by model name, as get_ollama_embedding_model does, so
  # different models sharing one cache directory never reuse each other's vectors.
  return CacheBackedEmbeddings.from_bytes_store(
      base_model, store, namespace=embedding_model_name)

def get_ollama_embedding_model(embedding_model_name,
                               num_ctx,
                               temperature,
                               cache_embeddings_store):
  """Create a disk-cached Ollama embedding model.

  Args:
    embedding_model_name: Ollama model name, passed as ``model``.
    num_ctx: Forwarded to ``OllamaEmbeddings`` as ``num_ctx``.
    temperature: Forwarded to ``OllamaEmbeddings`` as ``temperature``.
    cache_embeddings_store: Directory handed to ``LocalFileStore`` for the cache.

  Returns:
    A ``CacheBackedEmbeddings`` whose cache namespace is the model name.
  """
  byte_store = LocalFileStore(cache_embeddings_store)
  # Original author's note: num_ctx can be increased up to 4,096 tokens.
  ollama_embedder = OllamaEmbeddings(
      model=embedding_model_name,
      num_ctx=num_ctx,
      temperature=temperature,
  )
  return CacheBackedEmbeddings.from_bytes_store(
      ollama_embedder,
      byte_store,
      namespace=ollama_embedder.model,
  )


def get_vector_database(documents, embedding_model,
                        distance_strategy='EUCLIDEAN_DISTANCE'):
  """Index ``documents`` in a FAISS vector store.

  Args:
    documents: Documents to embed and index.
    embedding_model: Embedding model used by ``FAISS.from_documents``.
    distance_strategy: Forwarded unchanged as ``distance_strategy``.

  Returns:
    The FAISS vector store built from the documents.
  """
  return FAISS.from_documents(
      documents,
      embedding_model,
      distance_strategy=distance_strategy,
  )


def get_ollama_llm(model_name, temperature):
  """Return an ``Ollama`` LLM for ``model_name`` with the given ``temperature``."""
  ollama_llm = Ollama(
      model=model_name,
      temperature=temperature,
  )
  return ollama_llm


def get_hf_llm(model_name):
  """Load a causal LM from Hugging Face and wrap it as a LangChain LLM.

  The model is loaded with a 4-bit ``BitsAndBytesConfig`` (NF4 quant type, double
  quantization, bfloat16 compute dtype) and wrapped in a ``text-generation``
  pipeline.

  Args:
    model_name: Identifier passed to ``from_pretrained`` for both the model and
      the tokenizer.

  Returns:
    A ``HuggingFacePipeline`` built around the generation pipeline.
  """
  quantization = BitsAndBytesConfig(
      load_in_4bit=True,
      bnb_4bit_quant_type="nf4",
      bnb_4bit_use_double_quant=True,
      bnb_4bit_compute_dtype=torch.bfloat16,
  )
  causal_lm = AutoModelForCausalLM.from_pretrained(
      model_name, quantization_config=quantization)
  tok = AutoTokenizer.from_pretrained(model_name)

  # Generation settings are passed to the pipeline together with the model.
  generation_kwargs = {
      'temperature': 0.0001,       # very low value chosen by the original author
      'max_new_tokens': 512,       # max number of tokens to generate in the output
      'repetition_penalty': 1.1,   # without this the output begins repeating
  }
  text_generator = pipeline(
      task='text-generation',
      model=causal_lm,
      tokenizer=tok,
      return_full_text=True,  # langchain expects the full text
      **generation_kwargs,
  )
  return HuggingFacePipeline(pipeline=text_generator)


## OLLAMA - LLAMA2-7b

In [None]:
# curl -fsSL https://ollama.com/install.sh | sh
# ollama serve & ollama pull llama2
# ollama pull llama2

In [None]:
# Open the colab-xterm terminal (the extension is loaded in the imports cell);
# the shell commands listed in the previous cell are meant to be typed there.
%xterm

In [64]:
# Previous approach, kept for reference:
# loader = CSVLoader(file_path='reviews.csv', source_column="text.text",
#                     encoding = 'utf-8', metadata_columns= ["place_id", "rating"])
# documents = loader.load()

# NOTE(review): file_path, source_column and metadata_columns are assigned here
# but are not passed to get_documents(), which is called with its defaults.
file_path='reviews.csv'
source_column="text.text"
metadata_columns= ["place_id", "rating"]
documents = get_documents()

In [None]:
# @title Get some insight on a sample document
def get_insight(docs, max_length=2048, anomaly_length=2040):
  """Print a sample document and report how long the review sources are.

  The length of ``doc.metadata["source"]`` is measured for every document.

  Args:
    docs: Sequence of documents exposing ``page_content`` and ``metadata``.
    max_length: Limit the longest review is compared against in the final message.
    anomaly_length: Reviews at least this long are reported as anomalies.

  Returns:
    A list with one entry per anomalous document: its ``metadata["row"]`` when
    that key exists, otherwise its position in ``docs``. Empty for empty input.
  """
  anomalies = []
  if not docs:
    # Nothing to sample or measure; avoid indexing docs[0] on an empty list.
    print("No documents to inspect.")
    return anomalies

  print(f"docs.page_content:\n{docs[0].page_content}")
  print(f"\ndocs.metadata:\n{docs[0].metadata}")

  longest = 0
  for position, doc in enumerate(docs):
    review = doc.metadata["source"]
    length = len(review)
    longest = max(longest, length)
    # Debugging hook kept from the original: dump the review whose length is
    # exactly 3427 characters.
    if length == 3427:
      print(review)
    if length >= anomaly_length:
      # Not every loader adds a "row" entry; fall back to the list position.
      anomalies.append(doc.metadata.get("row", position))

  print(f"\nMaximum length of reviews in the dataset: {longest}")
  if longest < max_length:
    print(f"Maximum length of review < {max_length}, so we're good!")
  else:
    print(f"Maximum length of review >= {max_length}, so we're not good!")
  return anomalies

In [None]:
# Inspect the loaded documents and collect the entries flagged as over-long.
anomalies = get_insight(documents)

docs.page_content:
name: places/ChIJpXSgrsDbfkcRzf_5kCMmrZI/reviews/ChdDSUhNMG9nS0VJQ0FnSUNUdnYyMnBBRRAB
text.languageCode: en
text.text: Huge "eating palace" which is lacking the fine accents of the italian food although the service was top. I would recommend for lunch, less for dinner

docs.metadata:
{'source': 'Huge "eating palace" which is lacking the fine accents of the italian food although the service was top. I would recommend for lunch, less for dinner', 'row': 0, 'place_id': 'ChIJpXSgrsDbfkcRzf_5kCMmrZI', 'rating': '4'}
Sincerely shocked. I asked for information first by telephone and then in person about the gym and the various single, package or subscription entrances and I was given obviously incorrect information. This in itself wasn't even a big problem, the problem was the way the guys at reception handled it, truly disgraceful. Instead of calmly explaining the situation and trying to find a solution (to what was their mistake resulting from incorrect information given 

In [None]:
# Drop the last flagged document. The guard keeps the cell from raising
# IndexError when nothing was flagged; deleting by index removes exactly that
# entry instead of the first document that compares equal to it.
if anomalies:
  del documents[anomalies[-1]]

In [None]:
# Re-run the inspection after the removal to confirm the maximum length dropped.
anomalies = get_insight(documents)

docs.page_content:
name: places/ChIJpXSgrsDbfkcRzf_5kCMmrZI/reviews/ChdDSUhNMG9nS0VJQ0FnSUNUdnYyMnBBRRAB
text.languageCode: en
text.text: Huge "eating palace" which is lacking the fine accents of the italian food although the service was top. I would recommend for lunch, less for dinner

docs.metadata:
{'source': 'Huge "eating palace" which is lacking the fine accents of the italian food although the service was top. I would recommend for lunch, less for dinner', 'row': 0, 'place_id': 'ChIJpXSgrsDbfkcRzf_5kCMmrZI', 'rating': '4'}

Maximum length of reviews in the dataset: 1707
Maximum length of review < 2048, so we're good!


Take a look at this [link](https://api.python.langchain.com/en/latest/embeddings/langchain_community.embeddings.ollama.OllamaEmbeddings.html#langchain-community-embeddings-ollama-ollamaembeddings) and this [link](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.ollama.Ollama.html) for details of the model parameters.

In [None]:
# store = LocalFileStore("./cache/")
# # can increase num_ctx up to 4,096 tokens!
# embedding_model = OllamaEmbeddings(model="llama2", num_ctx=2048, temperature=0)

# embedder = CacheBackedEmbeddings.from_bytes_store(
#     embedding_model, store, namespace=embedding_model.model,)

In [None]:
# Settings for the Ollama-served embedding model and its on-disk cache.
embedding_model_name = "llama2"
num_ctx = 2048
temperature = 0
cache_embeddings_store = "./cache/"

embedding_model = get_ollama_embedding_model(
    embedding_model_name=embedding_model_name,
    num_ctx=num_ctx,
    temperature=temperature,
    cache_embeddings_store=cache_embeddings_store,
)

In [None]:
# Sample embedding: embed one sentence and show the first five components
# together with the vector length.
text = "This is a test document."

query_result = embedding_model.embed_query(text)
query_result[:5], len(query_result)

([-0.10165729373693466,
  0.00975171010941267,
  0.1753971129655838,
  0.16434210538864136,
  0.21161003410816193],
 4096)

API reference [FAISS](https://api.python.langchain.com/en/latest/vectorstores/langchain_community.vectorstores.faiss.FAISS.html#langchain_community.vectorstores.faiss.FAISS.similarity_search)

In [None]:
# Previous inline version, kept for reference:
# vector_db = FAISS.from_documents(documents, embedder,
#                                  distance_strategy="EUCLIDEAN_DISTANCE") # "COSINE"

# Build the FAISS index over `documents` with the current embedding model.
distance_strategy='EUCLIDEAN_DISTANCE'
vector_db= get_vector_database(documents, embedding_model,
                        distance_strategy=distance_strategy)

In [None]:
# Retrieve the 5 documents closest to the query.
docs = vector_db.similarity_search("which one is the best pizza restaurant in the city?",
                                      k = 5)

In [None]:
# Show the top-ranked document.
docs[0]

Document(page_content='name: places/ChIJpXSgrsDbfkcRzf_5kCMmrZI/reviews/ChZDSUhNMG9nS0VJQ0FnSUM2dzlpOVZREAE\ntext.languageCode: en\ntext.text: Unexpectedly 5 stars ⭐\r\nWe received a warm welcome in a highly restyled location where the quality of the food was outperformed only by the wonderful service of the employees.\r\nSpecial thanks to Alessandra that served is with great personality. 👍🏻', metadata={'source': 'Unexpectedly 5 stars ⭐\r\nWe received a warm welcome in a highly restyled location where the quality of the food was outperformed only by the wonderful service of the employees.\r\nSpecial thanks to Alessandra that served is with great personality. 👍🏻', 'row': 1, 'place_id': 'ChIJpXSgrsDbfkcRzf_5kCMmrZI', 'rating': '5'})

In [None]:
# Same query, but each result is now a (Document, score) tuple.
# NOTE: this rebinds `docs`, which held plain documents in the earlier cell.
docs = vector_db.similarity_search_with_score("which one is the best pizza restaurant in the city?",
                                      k = 5, )

In [None]:
# First (Document, score) pair of the scored search.
docs[0]

(Document(page_content='name: places/ChIJ4aOff1rafkcRLpeSS2KAsPI/reviews/ChZDSUhNMG9nS0VJQ0FnSURqeU92Q0lBEAE\ntext.languageCode: en\ntext.text: Pizza was very and home made. Staff was fast, efficient and organized. I recommend this place!', metadata={'source': 'Pizza was very and home made. Staff was fast, efficient and organized. I recommend this place!', 'row': 607, 'place_id': 'ChIJ4aOff1rafkcRLpeSS2KAsPI', 'rating': '4'}),
 8951.218)

In [None]:
# Fifth (last requested) (Document, score) pair of the scored search.
docs[4]

(Document(page_content="name: places/ChIJu9bZt0_afkcRsYoU1BCI1SI/reviews/ChZDSUhNMG9nS0VJQ0FnSUNUdUltVEl3EAE\ntext.languageCode: en\ntext.text: After a third visit to Padova and several fAfter three visits to Padova and several unsuccessful attempts, we finally managed to find a table in this wonderful osteria. The place has a charming, family-run feel with a lot of heart. It's cosy, quite and comfortable.\r\nThe food was brilliant, the wine was great, and the grappa was so tasty!\r\nWhen you travel to Padova, make sure to visit this place!\r\n(hard to find a free table, so make a reservation).", metadata={'source': "After a third visit to Padova and several fAfter three visits to Padova and several unsuccessful attempts, we finally managed to find a table in this wonderful osteria. The place has a charming, family-run feel with a lot of heart. It's cosy, quite and comfortable.\r\nThe food was brilliant, the wine was great, and the grappa was so tasty!\r\nWhen you travel to Padova, mak

In [None]:
# Location and file stem used to persist the FAISS index.
REVIEWS_FAISS_PATH = "faiss_index"
FAISS_INDEX_NAME = "index"

# Write the index to disk, then load it back through the same embedding model.
vector_db.save_local(
    folder_path=REVIEWS_FAISS_PATH,
    index_name=FAISS_INDEX_NAME,
)
vector_db = FAISS.load_local(
    folder_path=REVIEWS_FAISS_PATH,
    embeddings=embedding_model,
    index_name=FAISS_INDEX_NAME,
)

`db.as_retriever` has some cool options!

In [None]:
# Examples of `as_retriever` configurations. None of the retrievers created
# below is assigned to a variable, so this cell is illustrative only.

# Retrieve more documents with higher diversity
# Useful if your dataset has many similar documents
vector_db.as_retriever(
    search_type="mmr",
    search_kwargs={'k': 6, 'lambda_mult': 0.25}
)

# Fetch more documents for the MMR algorithm to consider
# But only return the top 5
vector_db.as_retriever(
    search_type="mmr",
    search_kwargs={'k': 5, 'fetch_k': 50}
)

# Only retrieve documents that have a relevance score
# Above a certain threshold
vector_db.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={'score_threshold': 0.8}
)

# Only get the single most similar document from the dataset
vector_db.as_retriever(search_kwargs={'k': 1})

# Use a filter to only retrieve documents from a specific paper
# NOTE(review): 'paper_title' does not appear in the document metadata shown
# elsewhere in this notebook — presumably copied from a generic example; confirm
# or replace it with a field that exists (e.g. 'place_id').
vector_db.as_retriever(
    search_kwargs={'filter': {'paper_title':'GPT-4 Technical Report'}}
)

## Creating chatbot

In [None]:
# old method
# chat_model = Ollama(model="llama2", temperature=0.)

# new method: go through the helper so model creation lives in one place.
# NOTE: this rebinds the notebook-level `temperature` used by the embedding setup.
model_name = "llama2"
temperature = 0.0
chat_model = get_ollama_llm(model_name, temperature)

In [None]:
from langchain.prompts import (
    PromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
    ChatPromptTemplate,
)

from langchain_core.output_parsers import StrOutputParser
# from langchain_community.vectorstores import Chroma
from langchain_community.vectorstores import FAISS
from langchain.schema.runnable import RunnablePassthrough

# System prompt: answer strictly from the retrieved reviews ({context}).
review_template_str = """Your job is to use Google Map
reviews to answer questions about their experience at a restaurant. Use
the following context to answer questions. Be as detailed as possible, but
don't make up any information that's not from the context. If you don't know
an answer based on the context, say you don't know.
context:
{context}
"""
# Alternative closing instruction (kept for reference, not part of the prompt):
# If you don't know an answer based on the context, say you don't know, and
# if the context is not about restaurants, then kindly tell them that  you can
# only provide assistance and answer questions related to restaurants.

review_system_prompt = SystemMessagePromptTemplate(
    prompt=PromptTemplate(
        input_variables=["context"], template=review_template_str
    )
)

# The user's question is passed through verbatim as the human message.
review_human_prompt = HumanMessagePromptTemplate(
    prompt=PromptTemplate(input_variables=["question"], template="{question}")
)
messages = [review_system_prompt, review_human_prompt]

review_prompt_template = ChatPromptTemplate(
    input_variables=["context", "question"], messages=messages
)

output_parser = StrOutputParser()

# Reload the persisted index; `vector_db` is rebound to the same object.
reviews_vector_db = vector_db = FAISS.load_local(folder_path=REVIEWS_FAISS_PATH,
                                         embeddings=embedding_model,
                                         index_name=FAISS_INDEX_NAME)

# The number of retrieved documents has to be given through `search_kwargs`;
# a bare `k=10` keyword to `as_retriever` is not forwarded to the search.
reviews_retriever = reviews_vector_db.as_retriever(search_kwargs={"k": 10})

# Retrieval -> prompt -> LLM -> plain string.
review_chain = (
    {"context": reviews_retriever, "question": RunnablePassthrough()}
    | review_prompt_template
    | chat_model
    | output_parser
)

In [None]:
# Ask the retrieval chain a question; the cell displays the generated answer.
question = """What are the pros and cons of the best pizza restaurant in the city?"""
review_chain.invoke(question)

'Based on the Google Map reviews provided, here are some pros and cons of the best pizza restaurant in the city:\n\nPros:\n\n1. Delicious food: The restaurant serves delicious pizzas that are well-liked by customers.\n2. Good portions: The restaurant offers good portions of food, including their pizzas.\n3. Organized staff: The staff is well-organized and efficient, ensuring a smooth dining experience.\n4. Recommendation: Many reviewers have recommended this restaurant to others.\n\nCons:\n\n1. Crowded during lunchtime: The restaurant can get crowded during lunchtime, which may be inconvenient for some customers.\n2. Limited seating: The restaurant has limited seating capacity, which can lead to long wait times for a table.\n3. Difficulty finding a free table: It can be challenging to find a free table at the restaurant, especially during peak hours.\n4. No reservation system: The restaurant does not have a reservation system in place, which can make it difficult for customers to secur

In [None]:
# Copy of a previously generated answer, stored as a plain string so it can be
# read without the escaped newlines.
s = """Based on the Google Map reviews provided, here are some pros and cons of the best pizza restaurant in the city:

Pros:

1. Delicious food: The restaurant serves delicious pizzas that are well-liked by customers.
2. Good portions: The restaurant offers good portions of food, including their pizzas.
3. Organized staff: The staff is well-organized and efficient, ensuring a smooth dining experience.
4. Recommendation: Many reviewers have recommended this restaurant to others.

Cons:

1. Crowded during lunchtime: The restaurant can get crowded during lunchtime, which may be inconvenient for some customers.
2. Limited seating: The restaurant has limited seating capacity, which can lead to long wait times for a table.
3. Difficulty finding a free table: It can be challenging to find a free table at the restaurant, especially during peak hours.
4. No reservation system: The restaurant does not have a reservation system in place, which can make it difficult for customers to secure a table without waiting.

Overall, the best pizza restaurant in the city seems to have both positive and negative aspects. While the food is delicious and the staff is well-organized, the crowded atmosphere during lunchtime and limited seating capacity may be drawbacks for some customers."""

In [None]:
# NOTE(review): the imports below are commented out, and `ChatOpenAI` is not
# imported in any visible cell, so `Tool`, `get_current_wait_time`, `hub`,
# `ChatOpenAI`, `create_openai_functions_agent` and `AgentExecutor` are unresolved
# here unless they are defined elsewhere. As written this cell is expected to
# raise NameError on a fresh kernel.
# NOTE(review): the tool descriptions and variable names talk about hospitals and
# patients while the rest of the notebook works with restaurant reviews —
# presumably template text that still needs adapting; confirm before use.
# from langchain.agents import create_openai_functions_agent, Tool, AgentExecutor
# from langchain import hub
# from langchain_intro.tools import get_current_wait_time

# Tools the agent may call: the review chain and a wait-time lookup.
tools = [
    Tool(
        name="Reviews",
        func=review_chain.invoke,
        description="""Useful when you need to answer questions
        about patient reviews or experiences at the hospital.
        Not useful for answering questions about specific visit
        details such as payer, billing, treatment, diagnosis,
        chief complaint, hospital, or physician information.
        Pass the entire question as input to the tool. For instance,
        if the question is "What do patients think about the triage system?",
        the input should be "What do patients think about the triage system?"
        """,
    ),
    Tool(
        name="Waits",
        func=get_current_wait_time,
        description="""Use when asked about current wait times
        at a specific hospital. This tool can only get the current
        wait time at a hospital and does not have any information about
        aggregate or historical wait times. This tool returns wait times in
        minutes. Do not pass the word "hospital" as input,
        only the hospital name itself. For instance, if the question is
        "What is the wait time at hospital A?", the input should be "A".
        """,
    ),
]

# Agent prompt pulled from the LangChain hub.
hospital_agent_prompt = hub.pull("hwchase17/openai-functions-agent")

# Chat model that drives the agent (separate from the Ollama `chat_model`).
agent_chat_model = ChatOpenAI(
    model="gpt-3.5-turbo-1106",
    temperature=0,
)

hospital_agent = create_openai_functions_agent(
    llm=agent_chat_model,
    prompt=hospital_agent_prompt,
    tools=tools,
)

# Executor configured to return intermediate steps and log verbosely.
hospital_agent_executor = AgentExecutor(
    agent=hospital_agent,
    tools=tools,
    return_intermediate_steps=True,
    verbose=True,
)

## LLAMA2 from HuggingFace

In [None]:
# Pinned packages for the Hugging Face variant of the pipeline.
!pip install -q \
    transformers==4.31.0 \
    accelerate==0.21.0 \
    bitsandbytes==0.41.0 \
    sentence-transformers==2.2.2 \
    xformers==0.0.20 \

# Same LangChain / FAISS packages as the install cell at the top of the notebook.
# NOTE(review): both `faiss-gpu` and `faiss-cpu` are requested — confirm that both
# are needed.
!pip install -q \
    langchain==0.1.0 \
    langchain-community==0.0.12 \
    langchainhub==0.1.14 \
    faiss-gpu \
    faiss-cpu


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m109.1/109.1 MB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m619.9/619.9 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.0/21.0 MB[0m [31m37.7 MB/s

In [None]:
from torch import cuda
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.vectorstores import FAISS
from langchain.storage import LocalFileStore
from langchain.embeddings import CacheBackedEmbeddings
from langchain.llms import HuggingFacePipeline

from transformers import pipeline
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

In [None]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = 'cuda' if cuda.is_available() else 'cpu'

# Model identifiers for the Hugging Face pipeline.
MODEL_NAME = "meta-llama/Llama-2-7b-hf"  # alternative: "meta-llama/Llama-2-7b-chat-hf"
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"

device

'cpu'

In [None]:
# Load the raw reviews CSV; this rebinds `documents`.
file_path = 'reviews.csv'
source_column = "text.text"
metadata_columns = ["place_id", "rating"]

documents = get_review_data(
    file_path=file_path,
    source_column=source_column,
    metadata_columns=metadata_columns,
)

In [None]:
# Build the cached Hugging Face embedding model.
# NOTE: this rebinds `embedding_model`, which held the Ollama embedder in the
# previous section, and reuses the same "./cache/" directory.
cache_embeddings_store="./cache/"
embedding_model = get_hf_embedding_model(EMBEDDING_MODEL_NAME,
                                         cache_embeddings_store=cache_embeddings_store,
                                         normalize_embeddings=False)

.gitattributes:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [None]:
# Sample embedding: embed one sentence and show the first five components
# together with the vector length.
text = "This is a test document."

query_result = embedding_model.embed_query(text)
query_result[:5], len(query_result)

([-0.03833850473165512,
  0.1234646886587143,
  -0.028642967343330383,
  0.053652726113796234,
  0.00884536374360323],
 384)

In [None]:
# Build a FAISS index over `documents` with the Hugging Face embedding model.
# The result is bound to `vector_database`; the Ollama section above binds its
# index to `vector_db`.
# NOTE(review): `cache_embeddings_store` is reassigned here but not used in this cell.
cache_embeddings_store="./cache/"
distance_strategy='EUCLIDEAN_DISTANCE'

vector_database = get_vector_database(documents, embedding_model,
                        distance_strategy=distance_strategy)

In [None]:
REVIEWS_FAISS_PATH = "faiss_index"
FAISS_INDEX_NAME = "index"

# Persist the index that was just built with the Hugging Face embeddings
# (`vector_database`). Saving `vector_db` here would write the older index from
# the Ollama section and then pair it with a different embedding model on reload.
# NOTE: the folder and index name are the same as in the Ollama section, so the
# previously saved index is overwritten.
vector_database.save_local(folder_path=REVIEWS_FAISS_PATH, index_name=FAISS_INDEX_NAME)
vector_db = FAISS.load_local(folder_path=REVIEWS_FAISS_PATH, embeddings=embedding_model, index_name=FAISS_INDEX_NAME)

In [None]:
# Model to load for generation (same value as assigned earlier in this section).
MODEL_NAME =  "meta-llama/Llama-2-7b-hf" # "meta-llama/Llama-2-7b-chat-hf"

In [None]:
# old method

# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.bfloat16,
# )
# model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, quantization_config=bnb_config)
# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# pipe = pipeline(
#     model=model,
#     tokenizer=tokenizer,
#     return_full_text=True,  # langchain expects the full text
#     task='text-generation',
#     # we pass model parameters here too
#     temperature=0.0001,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
#     max_new_tokens=512,  # mex number of tokens to generate in the output
#     repetition_penalty=1.1  # without this output begins repeating
# )

# llama2 = HuggingFacePipeline(pipeline=pipe)

In [None]:
# new method: build the quantized pipeline through the helper.
# NOTE: the saved output of this cell shows "RuntimeError: No GPU found. A GPU is
# needed for quantization." — it was last run on a CPU runtime.
llama2 =  get_hf_llm(MODEL_NAME)



config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

RuntimeError: No GPU found. A GPU is needed for quantization.

In [None]:
# Send a prompt straight to the Hugging Face-backed LLM (no retrieval involved).
llama2(prompt="Tell me how can I use quantization for a LangChain Embedding model.")

' Unterscheidung zwischen „Quantile“ und „Quantization“. The Quantization of the Difference between Two Numbers. The quantization of the difference between two numbers is a technique used to reduce the number of bits required to represent a value in a computer system. The quantization of the difference between two numbers is a technique used to reduce the number of bits required to represent a value in a computer system. The quantization of the difference between two numbers is a technique used to reduce the number of bits required to represent a value in a computer system. The quantization of the difference between two numbers is a technique used to reduce the number of bits required to represent a value in a computer system. The quantization of the difference between two numbers is a technique used to reduce the number of bits required to represent a value in a computer system. The quantization of the difference between two numbers is a technique used to reduce the number of bits req