In [None]:
from urllib.parse import quote

import yaml, os
import psycopg2
from sqlalchemy import make_url
from llama_index.llms import AzureOpenAI
from llama_index.llm_predictor import LLMPredictor
from llama_index import set_global_service_context
from llama_index.vector_stores import PGVectorStore
from llama_index.node_parser import SimpleNodeParser
from llama_index.embeddings import HuggingFaceEmbedding
from llama_index import SimpleDirectoryReader, ServiceContext, StorageContext, VectorStoreIndex

In [None]:
# Uncomment the two commands below on a first run to download the sample essay into ./data.
# !mkdir -p 'data/'
# !wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham_essay.txt'

In [None]:
# Location of the YAML credentials file. Set the CREDENTIALS_PATH environment
# variable to point somewhere else; the fallback is the original hard-coded path.
CREDENTIALS_PATH = os.environ.get(
    'CREDENTIALS_PATH',
    '/Users/1zuu/Desktop/LLM RESEARCH/LLMPro/cadentials.yaml',
)

# safe_load only constructs plain Python types (dict, list, str, ...), which is
# all a credentials file needs, and refuses arbitrary Python object tags.
with open(CREDENTIALS_PATH) as f:
    credentials = yaml.safe_load(f)

# Also expose the API keys as environment variables for this process.
os.environ['AD_OPENAI_API_KEY'] = credentials['AD_OPENAI_API_KEY']
os.environ['HUGGINGFACEHUB_API_TOKEN'] = credentials['HUGGINGFACEHUB_API_TOKEN']

## Configure LLMs

In [None]:
# Embedding model used to turn text into vectors.
embedding_llm = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

# Azure OpenAI chat model; every connection setting is read from `credentials`.
llm = AzureOpenAI(
    deployment_name=credentials['AD_DEPLOYMENT_ID'],
    model=credentials['AD_ENGINE'],
    api_key=credentials['AD_OPENAI_API_KEY'],
    api_version=credentials['AD_OPENAI_API_VERSION'],
    azure_endpoint=credentials['AD_OPENAI_API_BASE'],
)
chat_llm = LLMPredictor(llm)

# Bundle the embedding model and the LLM predictor into one service context
# and register it as the global default.
service_context = ServiceContext.from_defaults(
    embed_model=embedding_llm,
    llm_predictor=chat_llm,
)
set_global_service_context(service_context)

In [None]:
# Read every file found under ./data (decoded as UTF-8) into a list of documents.
documents = SimpleDirectoryReader('./data', encoding='utf-8').load_data()

# Display only the count: echoing the list itself would render the repr of
# every loaded document in the cell output.
len(documents)

In [None]:
# Split the loaded documents into nodes using the parser's default settings.
simple_node_parser = SimpleNodeParser.from_defaults()
nodes = simple_node_parser.get_nodes_from_documents(documents)

# Display only the count: echoing the list itself would render the repr of
# every node in the cell output.
len(nodes)

## pgvector for Storing Embeddings
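1. Pull the image with `docker pull ankane/pgvector`.
2. Start a container with `docker run --name pgvector-demo -e POSTGRES_PASSWORD=mysecretpassword -p 5432:5432 -d ankane/pgvector`. The `POSTGRES_PASSWORD` value passed here must match the `POSTGRES_PASSWORD` entry in the credentials file, which the cells below use to connect.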

In [None]:

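# NOTE: unused alternative kept for reference only (appears to be a LangChain-style PGVector setup);
# `PGVector`, `embeddings` and `texts` are not defined in this notebook.
# CONNECTION_STRING = "postgresql+psycopg2://postgres:{}@localhost:5432/pgvector_rag".format(credentials['POSTGRES_PASSWORD'])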
# COLLECTION_NAME = 'state_of_union_vectors'

# db = PGVector.from_documents(
#                         embedding=embeddings,
#                         documents=texts,
#                         collection_name=COLLECTION_NAME,
#                         connection_string=CONNECTION_STRING,
#                     )

### Create the database

In [None]:
# Percent-encode the password so characters such as '@', ':', '/' or '#' cannot
# corrupt the URI. str() covers the case where YAML parsed a purely numeric
# password as an int. Passwords made only of unreserved characters are unchanged.
connection_string = "postgresql://postgres:{}@localhost:5432".format(
    quote(str(credentials['POSTGRES_PASSWORD']), safe="")
)
db_name = "pgvector_rag"

conn = psycopg2.connect(connection_string)
# PostgreSQL refuses CREATE/DROP DATABASE inside a transaction block, so each
# statement has to be committed on its own.
conn.autocommit = True

# WARNING: destructive. Any existing database with this name is dropped and
# recreated empty every time this cell runs.
with conn.cursor() as c:
    c.execute(f"DROP DATABASE IF EXISTS {db_name}")
    c.execute(f"CREATE DATABASE {db_name}")

In [None]:
# Parse the connection URI so its parts can be passed individually.
url = make_url(connection_string)

# Vector store pointing at the freshly created database.
vector_store = PGVectorStore.from_params(
    database=db_name,
    host=url.host,
    port=url.port,
    user=url.username,
    password=url.password,
    table_name="vectors",
    # Needs to equal the embedding model's output dimension
    # (presumably 384 for the bge-small model configured earlier; confirm if the model changes).
    embed_dim=384,
)

In [None]:
# Storage context that uses the Postgres vector store created above.
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Build the index from the loaded documents, showing a progress bar.
index = VectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context,
    show_progress=True,
)

In [None]:
# Create a query engine from the index; no arguments are passed, so the library defaults apply.
query_engine = index.as_query_engine()