## RAG Using Llama2 With Hugging Face

In [None]:
# Install the PyPDF library, which provides tools for working with PDF documents
!pip install pypdf

In [10]:
# Install the necessary libraries for the project:
# - transformers: for using pre-trained models from the Hugging Face library
# - einops: for flexible tensor operations
# - accelerate: for optimizing model training and inference
# - langchain: for building language models and chains
# - bitsandbytes: for efficient model training on GPUs
!pip install -q transformers einops accelerate langchain bitsandbytes

In [None]:
# Install the llama-index library, which is used for indexing and querying large datasets
!pip install llama-index

In [None]:
# Install the following libraries:
# - llama-index-llms-llama-cpp: for integrating LLaMA models with llama-index using the llama-cpp backend
# - llama-index-embeddings-huggingface: for utilizing Hugging Face embeddings within llama-index
# - llama-index-llms-huggingface: for integrating Hugging Face language models with llama-index
!pip install llama-index-llms-llama-cpp llama-index-embeddings-huggingface llama-index-llms-huggingface

In [None]:
# Upgrade to the latest version of the langchain-community library.
# This library includes community-contributed tools and extensions for enhancing LangChain functionalities.
!pip install -U langchain-community

In [None]:
# Log in to the Hugging Face Hub so gated models (such as Llama 2) can be
# downloaded. notebook_login() asks for the access token inside the notebook
# itself, so it does not depend on the front end forwarding stdin to a shell
# command the way `!huggingface-cli login` does.
from huggingface_hub import notebook_login

notebook_login()

In [None]:
# Display detailed information about the installed llama-index package,
# including its version, dependencies, and other metadata.
!pip show llama-index

In [None]:
# Display detailed information about the installed langchain package,
# including its version, dependencies, and other metadata.
!pip show langchain

In [None]:
# Install the Streamlit library, which is used for building interactive web applications
# for data science and machine learning projects.
!pip install streamlit

In [None]:
import torch
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core.prompts.prompts import SimpleInputPrompt
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index.core import ServiceContext, set_global_service_context, VectorStoreIndex, SimpleDirectoryReader
from llama_index.legacy.embeddings.langchain import LangchainEmbedding
import os

# Make the Hugging Face access token available through the HF_TOKEN
# environment variable. A token that is already set is left untouched, and a
# missing one is requested interactively so the secret never has to be pasted
# into (and saved with) the notebook source.
from getpass import getpass

if not os.environ.get("HF_TOKEN"):
    os.environ["HF_TOKEN"] = getpass("Hugging Face access token: ")


# Pass-through query template: the user's question is substituted for
# {query_str} with nothing added around it.
query_wrapper_prompt = SimpleInputPrompt("{query_str}")

# Instructions handed to the model as its system prompt. The leading and
# trailing newlines are part of the prompt text.
system_prompt = (
    "\n"
    "You are a Q&A assistant. Your goal is to answer questions as\n"
    "accurately as possible based on the instructions and context provided.\n"
)

# Load the Llama-2-7b-chat model and its tokenizer through llama-index's
# Hugging Face wrapper, using the prompts defined above.
llm = HuggingFaceLLM(
    model_name="meta-llama/Llama-2-7b-chat-hf",
    tokenizer_name="meta-llama/Llama-2-7b-chat-hf",
    context_window=4096,
    max_new_tokens=256,
    # Greedy decoding. A temperature is only consulted when sampling, so the
    # previously passed (and ignored) temperature of 0.0 is no longer set.
    generate_kwargs={"do_sample": False},
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    device_map="cpu",
    # Uncomment this if using CUDA to reduce memory usage
    # model_kwargs={"torch_dtype": torch.float16, "load_in_4bit": True}
)

# Build the sentence-transformers embedder via LangChain, then wrap it in
# LangchainEmbedding so llama-index can use it as its embedding model.
sentence_embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
embed_model = LangchainEmbedding(sentence_embedder)

# Register the LLM, the embedding model, and the chunk size on llama-index's
# global Settings object. Settings replaces ServiceContext, which llama-index
# has deprecated; the index and query engine below pick these values up
# without being handed a service context explicitly.
from llama_index.core import Settings

Settings.llm = llm
Settings.embed_model = embed_model
Settings.chunk_size = 1024

# Load every document found in the data directory
documents = SimpleDirectoryReader("/content/drive/MyDrive/Colab Notebooks/Langchain_RAG/data").load_data()

# Embed the documents into a vector store index
index = VectorStoreIndex.from_documents(documents)

# Create a query engine from the vector store index
query_engine = index.as_query_engine()

# Ask a sample question and print the answer
response = query_engine.query("which is the most ordered item?")
print(response)

## Streamlit UI

In [None]:
import streamlit as st
import torch
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core.prompts.prompts import SimpleInputPrompt
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index.core import ServiceContext, set_global_service_context, VectorStoreIndex, SimpleDirectoryReader
from llama_index.legacy.embeddings.langchain import LangchainEmbedding
import os
from getpass import getpass

# Ask for the Hugging Face access token only when HF_TOKEN is not already set.
# Streamlit re-executes this script on every user interaction, so an
# unconditional getpass() would block on the server's terminal each time.
if not os.environ.get("HF_TOKEN"):
    os.environ["HF_TOKEN"] = getpass()

# Pass-through query template: the user's question is substituted for
# {query_str} with nothing added around it.
query_wrapper_prompt = SimpleInputPrompt("{query_str}")

# Instructions handed to the model as its system prompt. The leading and
# trailing newlines are part of the prompt text.
system_prompt = (
    "\n"
    "You are a Q&A assistant. Your goal is to answer questions as\n"
    "accurately as possible based on the instructions and context provided.\n"
)

@st.cache_resource
def _load_models():
    """Build the Llama-2 LLM and the embedding model once per server process.

    Streamlit re-executes the script on every interaction. Caching the two
    objects as resources means the model weights are loaded a single time and
    then reused, instead of being reloaded on each rerun.

    Returns:
        A ``(llm, embed_model)`` tuple.
    """
    chat_llm = HuggingFaceLLM(
        context_window=4096,
        max_new_tokens=256,
        # Greedy decoding. A temperature is only consulted when sampling, so
        # the previously passed (and ignored) temperature of 0.0 is not set.
        generate_kwargs={"do_sample": False},
        system_prompt=system_prompt,
        query_wrapper_prompt=query_wrapper_prompt,
        tokenizer_name="meta-llama/Llama-2-7b-chat-hf",
        model_name="meta-llama/Llama-2-7b-chat-hf",
        device_map="cpu",
    )
    # LangChain's Hugging Face embedder, wrapped for use by llama-index
    embedder = LangchainEmbedding(
        HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
    )
    return chat_llm, embedder


# Initialize (or fetch from the cache) the LLM and the embedding model
llm, embed_model = _load_models()

import tempfile

from llama_index.core import Settings

# Register the LLM, the embedding model, and the chunk size on llama-index's
# global Settings object. Settings replaces ServiceContext, which llama-index
# has deprecated; the index and query engine below pick these values up
# without being handed a service context explicitly.
Settings.llm = llm
Settings.embed_model = embed_model
Settings.chunk_size = 1024

# Streamlit UI setup
st.title("Document Q&A with LLama-2")

# File upload interface
uploaded_files = st.file_uploader(
    "Upload your documents",
    accept_multiple_files=True,
    type=["txt", "pdf", "docx"],
)

if uploaded_files:
    # A private temporary directory keeps this run's uploads apart from any
    # other run's, and it is removed even if loading raises.
    with tempfile.TemporaryDirectory() as temp_dir:
        for uploaded_file in uploaded_files:
            # The file name is supplied by the client: keep only its final
            # component so it cannot point outside temp_dir.
            safe_name = os.path.basename(uploaded_file.name)
            with open(os.path.join(temp_dir, safe_name), "wb") as f:
                f.write(uploaded_file.getbuffer())

        # Read the documents while the temporary files still exist
        documents = SimpleDirectoryReader(temp_dir).load_data()

    # Create a vector store index from the loaded documents
    index = VectorStoreIndex.from_documents(documents)

    # Create a query engine from the vector store index
    query_engine = index.as_query_engine()

    # Input field for the user to ask a question
    question = st.text_input("Ask a question about your documents")

    if question:
        # Query the index and show the answer as plain text
        response = query_engine.query(question)
        st.write("Response:", str(response))
else:
    st.write("Please upload documents to proceed.")

# Launch the Streamlit app from the file app.py via a shell command.
# NOTE(review): nothing visible in this notebook saves the code above to
# app.py (for example with a %%writefile cell) — confirm the file exists
# before running this. Presumably this command belongs in its own notebook
# cell, since a line starting with "!" is not valid Python in a script file.
!streamlit run app.py