# PostgreSQL as Vector Database with LangChain

In this lab you will use the built-in document loading capabilities of LangChain to embed documents into PostgreSQL.

You will then run a similarity search against the indexed and embedded documents, and finally use a retriever to add context to an Azure OpenAI request.

In [None]:
!pip install langchain
!pip install langchain-openai
!pip install psycopg2
!pip install tiktoken
!pip install pypdf

Import the necessary modules and types, create a database connection string.

In [None]:
import os
import psycopg2
from pgvector.psycopg2 import register_vector

from typing import List, Tuple
from langchain.docstore.document import Document
from langchain.text_splitter import CharacterTextSplitter

#depending on your langchain version...
#from langchain_community.document_loaders import TextLoader
#from langchain_community.document_loaders import PdfLoader
#from langchain_community.vectorstores import PGEmbedding
#from langchain_openai import OpenAIEmbeddings

from langchain.document_loaders import UnstructuredXMLLoader
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores.pgvector import PGVector
from langchain.vectorstores.pgvector import DistanceStrategy

from langchain.embeddings.azure_openai import AzureOpenAIEmbeddings

Connect to the database and register the vector extension. Be sure to replace the host name below with your own server name (including your `SUFFIX`).

In [None]:
from urllib.parse import quote

# Connection settings for the lab's PostgreSQL server.
# Each value can be overridden with an environment variable so that real
# credentials do not have to be typed into (and saved with) the notebook.
# The fallbacks are the values this lab was written with.
username = os.environ.get('AILABS_PG_USER', 'wsuser')
password = os.environ.get('AILABS_PG_PASSWORD', 'Solliance123')
host = os.environ.get('AILABS_PG_HOST', 'pgsqldevuguksx7rflex14.postgres.database.azure.com')
port = os.environ.get('AILABS_PG_PORT', '5432')
dbname = os.environ.get('AILABS_PG_DATABASE', 'ailabs')

# URL used by the LangChain PGVector store. The user name and password are
# percent-encoded so characters such as '@', ':' or '/' cannot be mistaken
# for URL delimiters; plain alphanumeric values are left unchanged.
connection_string = (
    f'postgresql+psycopg2://{quote(username, safe="")}:{quote(password, safe="")}'
    f'@{host}:{port}/{dbname}'
)

In [None]:
# Open a direct psycopg2 connection using the settings defined above.
dbconn = psycopg2.connect(
    host=host,
    user=username,
    password=password,
    port=port,
    database=dbname,
    connect_timeout=10,
)

# Autocommit: every statement takes effect without an explicit commit.
dbconn.set_session(autocommit=True)

# Make sure the vector extension exists, then register the vector type
# on this connection.
cur = dbconn.cursor()
cur.execute("CREATE EXTENSION IF NOT EXISTS vector;")
register_vector(dbconn)

## Azure OpenAI Studio

## GPT Text Embeddings

Now that you have deployed a simple model via Azure Machine Learning Studio, let's look at redoing our embeddings using a different embedding model called `text-embedding-ada-002`.

This model is not deployed via Azure Machine Learning Studio, but is a part of Azure OpenAI.

- Open the Azure Portal
- Search for **Azure OpenAI**
- Under **Resource Management**, select **Keys and Endpoint**
- Record the endpoint and the key
- Under **Resource Management**, select **Model deployments**
- Select **Manage Deployments**
- Select **Create new deployment**
- Select the **text-embedding-ada-002** model
- For the deployment name, type **embeddings**
- Select **Create**
- Once the model is deployed, run the following cells to regenerate your embeddings. Be sure to replace the endpoint and key with the ones you just recorded.

In [None]:
#get the openai embeddings

# Name of the model *deployment* in Azure OpenAI (not the type of model).
embedding_model = "embeddings"

# Replace the placeholder with the endpoint you recorded from
# "Keys and Endpoint", e.g. https://your-endpoint.openai.azure.com/
azure_endpoint = "YOUR_API_URL"

# Do not save a real key in the notebook. The key is read from the
# AZURE_OPENAI_API_KEY environment variable when it is set; otherwise the
# placeholder below must be replaced by hand before running the cell.
azure_key = os.environ.get("AZURE_OPENAI_API_KEY", "YOUR_API_KEY")

embeddings = AzureOpenAIEmbeddings(
    deployment=embedding_model,
    openai_api_base=azure_endpoint,
    openai_api_key=azure_key,
    openai_api_type="azure",
)

Set up logging handlers to output LangChain logs.

In [None]:
import logging

# One format shared by the file and console handlers.
logFormatter = logging.Formatter("%(asctime)s [%(threadName)-12.12s] [%(levelname)-5.5s]  %(message)s")
rootLogger = logging.getLogger()

logPath = "./logs"
fileName = "langchain"

# exist_ok=True makes re-running the cell harmless when the directory is
# already there. Any other failure (e.g. missing permissions) is raised here
# instead of being silently swallowed.
os.makedirs(logPath, exist_ok=True)

# Write log records to ./logs/langchain.log ...
fileHandler = logging.FileHandler("{0}/{1}.log".format(logPath, fileName))
fileHandler.setFormatter(logFormatter)
rootLogger.addHandler(fileHandler)

# ... and echo them to the console stream as well.
consoleHandler = logging.StreamHandler()
consoleHandler.setFormatter(logFormatter)
rootLogger.addHandler(consoleHandler)

# DEBUG on the root logger lets records of every level through.
rootLogger.setLevel(logging.DEBUG)

Create a function to insert documents into PostgreSQL.

In [None]:
def insert_document(db, doc, collection_id):
    """Embed one document and insert it into ``langchain_pg_embedding``.

    Uses the notebook-level ``embeddings`` model to embed the text and the
    notebook-level ``cur`` cursor to run the INSERT.

    Args:
        db: Unused; kept so existing callers do not have to change.
        doc: Object whose ``page_content`` string is embedded and stored.
        collection_id: Value written to the ``collection_id`` column.
    """
    import uuid

    # embed_documents expects a list of texts; wrap the single text and take
    # the single resulting vector.
    doc_embedding = embeddings.embed_documents([doc.page_content])[0]

    # Parameterized SQL: the driver quotes every value, so document text
    # containing quotes (or SQL) cannot break or alter the statement.
    sql = (
        "INSERT INTO langchain_pg_embedding "
        "(collection_id, embedding, document, cmetadata, custom_id, uuid) "
        "VALUES (%s, %s::vector, %s, %s, %s, %s)"
    )
    params = (
        # UUID objects are sent as text; None stays NULL.
        str(collection_id) if collection_id is not None else None,
        doc_embedding,
        doc.page_content,
        None,  # cmetadata
        None,  # custom_id
        str(uuid.uuid4()),
    )

    cur.execute(sql, params)

Use LangChain and PyPDF to split a document apart, and then use PGVector to insert the embeddings into PostgreSQL.

In [None]:
# Collection that will hold the PDF chunks.
collection_name = "open_ai_pdf"

# Load the PDF and split it into chunks of up to 1000 characters with no
# overlap between chunks.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
loader = PyPDFLoader("./data/azure_openai_infographic.pdf")
documents = loader.load()
docs = text_splitter.split_documents(documents)

# Vector store handle bound to the collection and the embedding model.
db = PGVector(
    embedding_function=embeddings,
    collection_name=collection_name,
    connection_string=connection_string,
)

Create a new vector database object from the previous objects.

In [None]:
# Embed the PDF chunks and store them in the collection, using cosine
# distance and the root logger configured earlier.
pdf_db = PGVector.from_documents(
    documents=docs,
    embedding=embeddings,
    collection_name=collection_name,
    connection_string=connection_string,
    distance_strategy=DistanceStrategy.COSINE,
    logger=rootLogger,
)

Perform a query against the database using a simple query. This will return matching results, but will not perform any type of OpenAI completion.

In [None]:
#utilize a simple similarity search
query = "Azure OpenAI"

# Each result is a (document, score) pair.
docs_with_score: List[Tuple[Document, float]] = pdf_db.similarity_search_with_score(query)

# Print every hit between two 80-character rules.
for doc, score in docs_with_score:
    print(
        "-" * 80,
        "Score:  " + str(score),
        doc.page_content,
        "-" * 80,
        sep="\n",
    )

Load the earnings transcript.

In [None]:
#use the text loader and splitter to break apart the document into chunks
collection_name = "msft_earnings"

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
loader = TextLoader("./data/msft_earnings_call_transcript.txt")
documents = loader.load()
docs = text_splitter.split_documents(documents)

# Vector store handle bound to the earnings collection.
db = PGVector(
    embedding_function=embeddings,
    collection_name=collection_name,
    connection_string=connection_string,
)

#Full database load with the creation of the collection and the embedding table.
msft_db = PGVector.from_documents(
    documents=docs,
    embedding=embeddings,
    collection_name=collection_name,
    connection_string=connection_string,
    distance_strategy=DistanceStrategy.COSINE,
)

#You can also manually import the documents into a target collection
#for i in range(0, len(docs), 1):
#    temp_docs = docs[i:i+1]
#    msft_db.add_documents( documents=temp_docs, collection_name=collection_name, connection_string=connection_string)

Import the needed classes.

In [None]:
from langchain.prompts import PromptTemplate
from langchain.chat_models import AzureChatOpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.memory.buffer import ConversationBufferMemory

Configure a retriever, set up a prompt and the LLM, and then create the LLM chain. Then execute the chain.

In [None]:
#utilize a query retriever
prompt_prefix = (
    "You are a question and answering system. "
    "You are given a question and a context. "
    "You must answer the question based on the context provided. "
    "{context} Question: {question}"
)

# Replace these placeholders with the values of your chat model deployment.
deployment_name = "your_deployment_name" #completions4
api_version = "your_api_version" #2024-02-15-preview
model_version = "your_model_version"  #0125-Preview

# Retriever over the earnings-call collection loaded above.
retriever = msft_db.as_retriever()

#create a simple QA chain
prompt = PromptTemplate(
    input_variables=["context", "question",  "chat_history"], #"summaries", "question"
    template=prompt_prefix,
)

# Chat model; reuses the endpoint and key from the embeddings cell.
llm = AzureChatOpenAI(
    deployment_name=deployment_name,
    model_version=model_version,
    temperature=0,
    openai_api_type="azure",
    openai_api_version=api_version,
    openai_api_base=azure_endpoint,
    openai_api_key=azure_key,
)

# Conversation history is kept under "chat_history"; the user input is read
# from the "question" key.
memory = ConversationBufferMemory(
    input_key="question",
    memory_key="chat_history",
)

llm_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    chain_type="stuff",
    combine_docs_chain_kwargs={"prompt": prompt},
    return_source_documents=False,
    #callbacks = [self.handler],
    verbose=True,
)


Ask the GPT model a question and display the answer.

In [None]:
question = "Which analysts were on the call?"

# Run the chain and keep only the 'answer' entry of its result.
answer = llm_chain.invoke(
    question,
    return_only_outputs=True,
)["answer"]

print(answer)