In [1]:
# Import SparkSession
from pyspark.sql import SparkSession

# Build a session against the "local[*]" master, reusing an existing session
# if one has already been created in this process.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

# Keep the session as the cell's last expression so the notebook displays it.
spark

In [2]:
# Load the chunk table (columns `pdf_name` and `content`, per the output below)
# from the parquet file and print a preview.
df_chunks = spark.read.parquet('chunks.snappy.parquet')
df_chunks.show()

+--------------------+--------------------+
|            pdf_name|             content|
+--------------------+--------------------+
|file:/content/pdf...|="i6311855089984a...|
|file:/content/pdf...|All references to...|
|file:/content/pdf...|Hopper architectu...|
|file:/content/pdf...|The computing ind...|
|file:/content/pdf...|In July 2023, the...|
|file:/content/pdf...|As a percentage o...|
|file:/content/pdf...|Our product and s...|
|file:/content/pdf...|We have not exper...|
|file:/content/pdf...|Data Center reven...|
|file:/content/pdf...|We had solid dema...|
|file:/content/pdf...|We introduced an ...|
|file:/content/pdf...|Financial Informa...|
|file:/content/pdf...|4 (0.5) 0.4 1.3 5...|
|file:/content/pdf...|084 $\n14,862 $ 5...|
|file:/content/pdf...|Revenue from GPU ...|
|file:/content/pdf...|We refer to custo...|
|file:/content/pdf...|Gross Profit and ...|
|file:/content/pdf...|The net effect on...|
|file:/content/pdf...|For fiscal year 2...|
|file:/content/pdf...|For fiscal

In [3]:
from sentence_transformers import SentenceTransformer

# Step 1: fetch the pretrained Sentence Transformer checkpoint.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Example inputs for the demo.
sentences = [
    "The weather is lovely today.",
    "It's so sunny outside!",
    "He drove to the stadium.",
]

# Step 2: encode every sentence into a vector.
embeddings = model.encode(sentences)
print(embeddings.shape)  # (3, 384): one 384-dimensional vector per sentence

# Step 3: compare every embedding against every other one.
similarities = model.similarity(embeddings, embeddings)
print(similarities)

  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


(3, 384)
tensor([[1.0000, 0.6660, 0.1046],
        [0.6660, 1.0000, 0.1411],
        [0.1046, 0.1411, 1.0000]])


In [2]:
# Earlier pinned install line, kept for reference but disabled.
#%pip install --quiet transformers==4.46.1 "unstructured[pdf,docx]==0.10.30" llama-index==0.9.3 pydantic==2.9.2 mlflow==2.12.1
# NOTE(review): the three installs below are unpinned, so the resolved versions
# depend on when the cell is run (the log below shows langchain_community 0.3.7).
# Consider pinning them for reproducibility.
%pip install langchain_community
# Presumably provides the `faiss` module imported further down — TODO confirm the
# wheel matches the runtime's CUDA setup.
%pip install faiss-gpu
# -U upgrades to the newest release; presumably required by the 8-bit
# BitsAndBytesConfig used in get_llm() — confirm.
%pip install -U bitsandbytes

Collecting langchain_community
  Downloading langchain_community-0.3.7-py3-none-any.whl.metadata (2.9 kB)
Collecting SQLAlchemy<2.0.36,>=1.4 (from langchain_community)
  Downloading SQLAlchemy-2.0.35-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain_community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain_community)
  Downloading pydantic_settings-2.6.1-py3-none-any.whl.metadata (3.5 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading marshmallow-3.23.1-py3-none-any.whl.metadata (7.5 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadat

In [1]:
# Importing necessary libraries and modules.
import uuid
from langchain.text_splitter import TokenTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.docstore.in_memory import InMemoryDocstore
import os
import faiss
import cloudpickle
from transformers import (
    pipeline,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)
from langchain.llms import HuggingFacePipeline
from langchain import PromptTemplate
from operator import itemgetter
from langchain.schema.runnable import RunnableMap
from langchain.schema import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough


In [2]:
# Set the path for the database directory.
# The persistence helpers build file names as DB_PATH + "<file name>", so the
# trailing slash is required.
DB_PATH="./db/"

In [4]:
# Function to load or create a vector database.
# This database will be used for storing and retrieving document embeddings.
def load_vector_db(DB_PATH="./db/", device="cuda:0"):
    """Load the persisted FAISS vector store from ``DB_PATH`` or create an empty one.

    Three files make up a persisted store: ``memoryDocStoreDict.pkl``,
    ``indexToDocStoreIdDict.pkl`` and ``faiss.index``. File names are appended
    to ``DB_PATH`` by plain string concatenation, so ``DB_PATH`` must end with
    a path separator.

    Args:
        DB_PATH: Directory prefix (with trailing separator) holding the store.
        device: Device string handed to the embedding model.

    Returns:
        A ``FAISS`` vector store backed by the loaded (or freshly created)
        index, docstore dictionary and index-to-docstore-id mapping.

    Raises:
        FileNotFoundError: If only some of the three persisted files exist,
            i.e. the store on disk is incomplete.
    """
    docstore_path = DB_PATH + "memoryDocStoreDict.pkl"
    id_map_path = DB_PATH + "indexToDocStoreIdDict.pkl"
    index_path = DB_PATH + "faiss.index"
    required_paths = (docstore_path, id_map_path, index_path)
    present = [path for path in required_paths if os.path.isfile(path)]

    if len(present) == len(required_paths):
        # NOTE(security): unpickling can execute arbitrary code; only load
        # store files that this notebook produced itself.
        with open(docstore_path, "rb") as docstore_file:
            memoryDocStoreDict = cloudpickle.load(docstore_file)
        with open(id_map_path, "rb") as id_map_file:
            indexToDocStoreIdDict = cloudpickle.load(id_map_file)
        index = faiss.read_index(index_path)
    elif present:
        # A half-written store must not be silently replaced by an empty one.
        missing = [path for path in required_paths if path not in present]
        raise FileNotFoundError(
            "Incomplete vector database, missing: " + ", ".join(missing)
        )
    else:
        # Nothing persisted yet (the directory may be absent or empty): start
        # from an empty store. 384 is the embedding width of all-MiniLM-L6-v2.
        memoryDocStoreDict = {}
        indexToDocStoreIdDict = {}
        index = faiss.IndexFlatL2(384)

    # Create the FAISS vector database with the loaded or new components.
    return FAISS(
        index=index,
        docstore=InMemoryDocstore(memoryDocStoreDict),
        index_to_docstore_id=indexToDocStoreIdDict,
        embedding_function=HuggingFaceEmbeddings(
            model_name='sentence-transformers/all-MiniLM-L6-v2',
            model_kwargs={'device': device},
        ),
    )

In [5]:
# Function to populate the vector database with documents.
# It processes each file in the 'wiki/' directory, splits the content into smaller chunks,
# and stores these chunks along with their metadata in the database.
def populate_vector_db(DB_PATH="./db/"):
    """Ingest every file under ``wiki/`` into the vector store and persist it.

    Each file is split into 256-token chunks. Every chunk is written to
    ``wiki_chunks/<file name>/<uuid>.txt`` and added to the store together with
    metadata that records the source file and the chunk file.

    The store is first obtained through ``load_vector_db``, so calling this
    function on an already populated store appends the same content again.

    Args:
        DB_PATH: Directory prefix (with trailing separator) where the store
            components are written.

    Returns:
        The populated vector store.
    """
    db = load_vector_db(DB_PATH=DB_PATH)

    # The splitter configuration never changes, so build it once.
    splitter = TokenTextSplitter(chunk_size=256)

    # Process each file in the 'wiki/' directory.
    for wiki_file in os.listdir("wiki/"):
        wiki_file_path = "wiki/" + wiki_file
        # os.listdir also returns sub-directories, which cannot be opened as text.
        if not os.path.isfile(wiki_file_path):
            continue

        wiki_chunks_dir = "wiki_chunks/" + wiki_file
        os.makedirs(wiki_chunks_dir, exist_ok=True)

        # Read the content of the file, closing the handle right away.
        with open(wiki_file_path, "r") as source_file:
            content = source_file.read()

        texts = []
        metadatas = []
        # Split the content into smaller chunks for better manageability.
        for chunk in splitter.split_text(content):
            wiki_chunk_file_path = wiki_chunks_dir + "/" + str(uuid.uuid4()) + ".txt"
            with open(wiki_chunk_file_path, "w") as chunk_file:
                chunk_file.write(chunk)

            texts.append(chunk)
            metadatas.append({
                'wiki_file_path': wiki_file_path,
                'wiki_chunk_file_path': wiki_chunk_file_path
            })

        # An empty file yields no chunks; there is nothing to embed in that case.
        if texts:
            db.add_texts(texts, metadatas)

    # Make sure the target directory exists, then save the store components.
    os.makedirs(DB_PATH, exist_ok=True)

    with open(DB_PATH + "memoryDocStoreDict.pkl", "wb") as docstore_file:
        cloudpickle.dump(db.docstore._dict, docstore_file)
    with open(DB_PATH + "indexToDocStoreIdDict.pkl", "wb") as id_map_file:
        cloudpickle.dump(db.index_to_docstore_id, id_map_file)
    faiss.write_index(db.index, DB_PATH + "faiss.index")

    return db

In [6]:
# Function to configure and retrieve a large language model from Hugging Face.
# Holds the pipeline built by get_llm() so the checkpoint is loaded only once
# for as long as this cell is not re-executed.
_LLM_CACHE = {}


def get_llm():
    """Return a LangChain LLM wrapping an 8-bit Llama-2-7b-chat text-generation pipeline.

    The first call downloads/loads the model and tokenizer and builds the
    pipeline; later calls return the same object instead of loading the
    checkpoint again.

    Returns:
        A ``HuggingFacePipeline`` instance.

    Raises:
        KeyError: If the ``HUGGINGFACE_TOKEN`` environment variable is not set.
    """
    if "llm" in _LLM_CACHE:
        return _LLM_CACHE["llm"]

    # Define the model name and retrieve the necessary token for authentication.
    model_name = "meta-llama/Llama-2-7b-chat-hf"
    token = os.environ['HUGGINGFACE_TOKEN']

    # Configure the model for quantization to reduce memory usage.
    bnb_config = BitsAndBytesConfig(load_in_8bit=True)
    # Place the whole model on device 0.
    device_map = {"": 0}

    # Load the model and tokenizer from Hugging Face with the specified configurations.
    # `token` is the current keyword; `use_auth_token` is deprecated in recent
    # transformers releases.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map=device_map,
        token=token
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)

    # Create a pipeline for text generation using the loaded model and tokenizer.
    llama_pipeline = pipeline(task="text-generation", model=model, tokenizer=tokenizer)
    # NOTE(review): confirm that a temperature passed through model_kwargs
    # actually reaches generation when a ready-made pipeline is supplied.
    llm = HuggingFacePipeline(pipeline=llama_pipeline, model_kwargs={'temperature':0.7})

    _LLM_CACHE["llm"] = llm
    return llm

In [7]:
# Function to format a list of documents into a single string.
def format_docs(docs):
    """Concatenate the ``page_content`` of every document, separated by a blank line."""
    separator = "\n\n"
    page_texts = [document.page_content for document in docs]
    return separator.join(page_texts)


In [8]:
# Function to ask a question and receive an answer using the large language model and the document database.
def ask(q, llm=None, vector_db=None):
    """Answer ``q`` with retrieval-augmented generation and print the result.

    Prints the generated answer, followed by the ``wiki_chunk_file_path`` of
    every retrieved document. Nothing is returned.

    Args:
        q: The question text.
        llm: LLM to use. When omitted, ``get_llm()`` is called; pass an
            already-built LLM to reuse it across several questions.
        vector_db: Vector store to retrieve from. When omitted, the
            module-level ``db`` is used, which must already be defined.
    """
    # Define a template for the prompt to be used with the large language model.
    template = """Use the following pieces of context to answer the question at the end.
    If you don't know the answer, just say that you don't know, don't try to make up an answer.
    Use three sentences maximum and keep the answer as concise as possible.
    {context}
    Question: {question}
    Helpful Answer:"""
    rag_prompt_custom = PromptTemplate.from_template(template)

    if llm is None:
        llm = get_llm()
    if vector_db is None:
        # Fall back to the notebook-level store created by the main cell.
        vector_db = db

    # Sub-chain: format the retrieved documents into the prompt, run the LLM,
    # and reduce the result to a plain string.
    rag_chain_from_docs = (
        {
            "context": lambda inputs: format_docs(inputs["documents"]),
            "question": itemgetter("question"),
        }
        | rag_prompt_custom
        | llm
        | StrOutputParser()
    )
    # Full chain: retrieve documents for the question, then emit both the
    # documents' metadata and the generated answer.
    rag_chain_with_source = RunnableMap(
        {"documents": vector_db.as_retriever(), "question": RunnablePassthrough()}
    ) | {
        "documents": lambda inputs: [doc.metadata for doc in inputs["documents"]],
        "answer": rag_chain_from_docs,
    }

    # Invoke the chain of operations with the question.
    response = rag_chain_with_source.invoke(q)
    print(response["answer"])
    for doc in response["documents"]:
        print(doc['wiki_chunk_file_path'])

In [None]:
import os
from getpass import getpass

# Specify access token
# A token must never be stored in the notebook itself. Use the value already
# present in the environment, and only prompt (without echo) when it is missing.
if not os.environ.get('HUGGINGFACE_TOKEN'):
    os.environ['HUGGINGFACE_TOKEN'] = getpass('Hugging Face access token: ')

In [14]:
# Main execution block: build the vector database on the first run, reuse the
# persisted one afterwards, then answer a sample question.
if __name__=="__main__":
    # Populating an already persisted store would append every wiki chunk a
    # second time, so ingestion only happens when no index file exists yet.
    # Delete the DB_PATH directory to force a rebuild after changing 'wiki/'.
    if os.path.exists(DB_PATH + "faiss.index"):
        db = load_vector_db(DB_PATH=DB_PATH)
    else:
        db = populate_vector_db(DB_PATH=DB_PATH)
    ask("What is the capital of NJ?")

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

  llm = HuggingFacePipeline(pipeline=llama_pipeline, model_kwargs={'temperature':0.7})


Use the following pieces of context to answer the question at the end. 
    If you don't know the answer, just say that you don't know, don't try to make up an answer. 
    Use three sentences maximum and keep the answer as concise as possible. 
    
    Question: What is the capital of NJ?
    Helpful Answer: The capital of NJ is Trenton.
    Context:
        * New Jersey is a state in the northeastern United States.
        * Trenton is located in the central part of the state.
        * Trenton has been the capital of New Jersey since 1784.


In [16]:
# Ask the sample question again; ask() prints the answer and then the chunk
# file path of each retrieved document.
ask("What is the capital of NJ?")



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Use the following pieces of context to answer the question at the end. 
    If you don't know the answer, just say that you don't know, don't try to make up an answer. 
    Use three sentences maximum and keep the answer as concise as possible. 
    
    Question: What is the capital of NJ?
    Helpful Answer: The capital of NJ is Trenton.
    Additional Information: NJ is a state located in the northeastern United States. It is bordered by NY to the north and east, PA to the northwest, DE to the south, and MD to the southwest.

    Please answer the question below using the provided context.


In [18]:
# NOTE(review): bare expression — no module-level `input` variable is assigned in
# the visible cells, so this presumably just evaluates the builtin `input`.
# Looks like leftover debugging; candidate for removal.
input

In [13]:
import os
# Specify access token
# The value assigned here is an empty string; get_llm() reads HUGGINGFACE_TOKEN
# from the environment.
# NOTE(review): this cell sits at the end of the notebook but carries execution
# count 13, i.e. it was run before the main cell (14). Consider moving it above
# the first use of get_llm() so a top-to-bottom run behaves the same way.
os.environ['HUGGINGFACE_TOKEN'] = ''