Simple Pipeline that
- read all the .txt from a folder
- chunk data
- apply embedding
- save in Pgvector and qdrant preprod
- retrieve data from pgvector
- query a question using mistral 


In [2]:
import requests
import json
import os
from langchain_community.document_loaders import DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Qdrant
from dotenv import load_dotenv



In [3]:
load_dotenv()
local_path=os.getenv("LOCAL_PATH")
collection_name=os.getenv('COLLECTION_NAME')
embedding_model=os.getenv('EMBEDDING_MODEL')
chunk_size=int(os.getenv('CHUNK_SIZE'))
chunk_overlap=int(os.getenv('CHUNK_OVERLAP'))
pgddisconnection=os.getenv('PGDDISCONNECTION')
qdrant_url = os.getenv("QDRANT_URL", "")
qdrant_api_key = os.getenv("QDRANT_API_KEY", "")



In [4]:
question = "What is Retrieval-Augmented Generation (RAG), and why is it useful?"

In [5]:
## here we are using OpenAI embeddings but in future we will swap out to local embeddings
embeddings = HuggingFaceEmbeddings(
            model_name=embedding_model,
            model_kwargs = {'device': 'cpu'})


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
print(local_path)

In [6]:
loader = DirectoryLoader(f'{local_path}', glob="./*.pdf")

documents = loader.load()
len(documents)

2

In [7]:
text_splitter = RecursiveCharacterTextSplitter(separators=['##'], chunk_size=chunk_size, chunk_overlap=chunk_overlap)
texts = text_splitter.split_documents(documents)

In [8]:
texts

[Document(metadata={'source': 'data/pdfs/India Associate Handbook V 1.1(23 June) (1).pdf'}, page_content="TABLE OF CONTENTS\n\nS. No.\n\nINDEX\n\n01\n\nRED HAT’s VALUE PROPOSITION, WHY, MISSION, VISION, AND VALUES\n\n02\n\nPURPOSE\n\n03\n\nRECRUITMENT/ PRE-EMPLOYMENT\n\n04\n\nBACKGROUND SCREENING\n\n05\n\nTYPES OF EMPLOYMENT\n\n06\n\nPROBATIONARY PERIOD\n\n07\n\nRECOGNITION OF SENIORITY\n\n08\n\nEMPLOYEE NUMBERS; IDENTITY/ACCESS CARD\n\n09\n\nWAY OF WORKING AND WORK PROFILES\n\n10\n\nCHANGE OF PERSONAL INFORMATION\n\n11\n\nWORKING HOURS & RECORDING OF TIME WORKED\n\n01\n\nNormal Working Hours\n\n02\n\nWork in Excess of Normal Working Hours\n\n03\n\nWork on a Weekly-Off or Public Holiday\n\n04\n\nRecording of Time Worked\n\n05\n\nShift-Work\n\n06\n\nOn-Call Program\n\n07\n\nPlanned Weekend Program\n\n12\n\nSTATUTORY BENEFITS\n\n01\n\nEmployee Provident Fund\n\n02\n\nEmployee State Insurance Act (ESI)\n\n03\n\nGratuity\n\n04\n\nBonus\n\n13\n\nTOTAL REWARDS AT RED HAT\n\n01\n\nPay\n\n02\n

In [None]:
len(documents)

In [None]:
texts[0:3]

In [11]:
db = PGVector.from_documents(
    documents= texts,
    embedding = embeddings,
    collection_name= collection_name,
    distance_strategy = DistanceStrategy.COSINE,
    pre_delete_collection = True,
    connection=pgddisconnection)

In [None]:

Qdrant.from_documents(
    texts,
    embeddings,
    url=qdrant_url,
    api_key=qdrant_api_key,
    port=None,
    collection_name=collection_name,
    force_recreate=True
)

In [None]:
vector_store = PGVector(
        connection=pgddisconnection, 
        collection_name=collection_name, 
        embeddings=embeddings,
    )


In [None]:
test = vector_store.similarity_search(question, k=2)
test

In [None]:
# Query for which we want to find semantically similar documents


#Fetch the k=2 most similar documents
docs =  db.similarity_search(question, k=1)

In [None]:
docs

In [None]:
context = '\n'.join([x.page_content for x in docs])


prompt = f"""[INST]You are a helpful chatbot that can answer questions based on the provided context. 
You need not make use of the entire context provided to you.
Try to interpret the question. If it is a general question asking for definitions, you can rephrase the content without changing the meaning of it.
If the asked question demands steps or process or procedure, do not change the content and stick to the original form as possible. Also if context has Red Hat specific knowledge add that in answer.
Also provide the source from which you took the answer under source: tag

Context: {context} [\INST]
Question: {question}"""

In [None]:
url = 'https://ddis-mistral-7b.apps.int.stc.ai.preprod.us-east-1.aws.paas.redhat.com/v1/chat/completions'
headers = {
    'accept': 'application/json',
    'Content-Type': 'application/json',
}
data = {
    "messages": [
        {
            "role": "user",
            "content": prompt
        }
    ],
    "model": "mistral-7b",
    "stream": False
}

try:
    response = requests.post(url, headers=headers, data=json.dumps(data), verify=False, timeout=30)
    response.raise_for_status()  # Check for HTTP errors

except requests.exceptions.RequestException as e:
    print(f"An error occurred: {e}")

In [None]:
print(response.json()['choices'][0]['message']['content']) 

In [None]:
# Intializing llm variables
# openai_api_key = constants.OPEN_AI_KEY
# openai_api_base = "{llm_url}/v1".format(llm_url=constants.LLM_URL)
# Fetch model information
try:
    response = requests.get("https://ddis-mistral-7b.apps.int.stc.ai.preprod.us-east-1.aws.paas.redhat.com/v1/models")
    response.raise_for_status()
    model = response.json()["data"][0]["id"]
    print(f"Model ID: {model}")
except requests.RequestException as e:
    print(f"Failed to fetch model information: {e}")
    raise
model

In [None]:
from openai import OpenAI


client = OpenAI(
    api_key="EMPTY",
    base_url="https://ddis-mistral-7b.apps.int.stc.ai.preprod.us-east-1.aws.paas.redhat.com/v1",
)

stream = client.chat.completions.create(
    model=model,
    messages=[{'role': 'user', 'content': prompt}],
    stream=True,
    user='user_identifier',
)

response = ""
for chunk in stream:
    if chunk.choices[0].delta.content is not None:
        print(chunk.choices[0].delta.content, end="")
        response += chunk.choices[0].delta.content
response
