# 0.1 Install ACS SDK

In [None]:
!pip install azure-search-documents==11.4.0b8
!pip install azure-identity

# 0.2 Import libraries

In [3]:
import dotenv
import openai
import os
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.azuresearch import AzureSearch

# 0.3.1 Configure OpenAI settings (API key based)

In [4]:
# Load variables from a .env file (if any) before the os.getenv() lookups below.
dotenv.load_dotenv()

# Endpoint and key come from the environment; os.getenv() yields None when a
# variable is not set.
openai.api_key = os.getenv("AZURE_OPENAI_API_KEY")
openai.api_base = os.getenv("AZURE_OPENAI_API_BASE")

# Select the Azure flavour of the API (key-based auth) and the API version.
openai.api_version = "2023-05-15"
openai.api_type = "azure"

# 0.3.2 Configure OpenAI settings (role based)

In [None]:
import os

import openai
from azure.identity import DefaultAzureCredential

# Obtain an Azure AD access token for the Cognitive Services scope.
# NOTE(review): access tokens expire; re-run this cell to refresh the token in
# a long-lived session.
credential = DefaultAzureCredential()
token = credential.get_token("https://cognitiveservices.azure.com/.default")

openai.api_type = "azure_ad"
openai.api_key = token.token  # the AD token is passed where an API key would go

# Prefer the endpoint configured through AZURE_OPENAI_API_BASE (the same
# variable the API-key-based configuration reads); fall back to the previously
# hard-coded endpoint when the variable is unset or empty.
openai.api_base = (
    os.getenv("AZURE_OPENAI_API_BASE")
    or "https://dev-qia-design-review-automation-oai-eus.openai.azure.com/"
)
openai.api_version = "2023-05-15"  # subject to change

# 1.1 Create embeddings and ACS instance (single file)

In [4]:
# Name of the embedding deployment to call.
deployment_id = "text-embedding-ada-002"
embeddings = OpenAIEmbeddings(
    openai_api_key=openai.api_key,
    deployment=deployment_id,
    # Reuse the auth mode selected in the configuration cell ("azure" for the
    # API-key setup, "azure_ad" for the role-based setup). Hard-coding "azure"
    # here would send an Azure AD token as if it were an API key.
    openai_api_type=openai.api_type,
    # NOTE(review): presumably 1 because the Azure endpoint accepted a single
    # input per embedding request - confirm before raising.
    chunk_size=1,
)

# Azure Cognitive Search endpoint and key are read from the environment.
vector_store_address = os.getenv("AZURE_COGNITIVE_SEARCH_URL")
vector_store_password = os.getenv("AZURE_COGNITIVE_SEARCH_KEY")
index_name = "acs-demo"
vector_store = AzureSearch(
    azure_search_endpoint=vector_store_address,
    azure_search_key=vector_store_password,
    index_name=index_name,
    embedding_function=embeddings.embed_query,
)

# 1.2 Upload embeddings to ACS

In [None]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter

# Splitter configured for a chunk size of 1000 with no overlap between chunks.
text_splitter = CharacterTextSplitter(chunk_overlap=0, chunk_size=1000)

# Read the source text file as UTF-8.
loader = TextLoader("state_of_the_union.txt", encoding="utf-8")
documents = loader.load()

# `docs` holds the chunks that would be embedded and uploaded.
docs = text_splitter.split_documents(documents)

# The upload is left disabled; uncomment the next line to embed the chunks and
# upload them to ACS.
# vector_store.add_documents(documents=docs)

# 1.3 Perform a vector similarity search

In [None]:
# Ask for the three chunks most similar to the question.
docs = vector_store.similarity_search(
    query="What did the president say about Ketanji Brown Jackson",
    k=3,
    search_type="similarity",
)

# The search can come back empty (for example when nothing has been uploaded to
# the index yet), so check before indexing instead of raising IndexError.
if docs:
    print(docs[0].page_content)
else:
    print("No matching documents found - has the index been populated?")

# 2.1 Create embeddings and ACS instance (multi-file)

In [None]:
# Name of the embedding deployment to call.
deployment_id = "text-embedding-ada-002"
embeddings = OpenAIEmbeddings(
    openai_api_key=openai.api_key,
    deployment=deployment_id,
    # Reuse the auth mode selected in the configuration cell ("azure" for the
    # API-key setup, "azure_ad" for the role-based setup). Hard-coding "azure"
    # here would send an Azure AD token as if it were an API key.
    openai_api_type=openai.api_type,
    # NOTE(review): presumably 1 because the Azure endpoint accepted a single
    # input per embedding request - confirm before raising.
    chunk_size=1,
)

# Azure Cognitive Search endpoint and key are read from the environment.
vector_store_address = os.getenv("AZURE_COGNITIVE_SEARCH_URL")
vector_store_password = os.getenv("AZURE_COGNITIVE_SEARCH_KEY")
index_name = "acs-demo-multi-file"
vector_store = AzureSearch(
    azure_search_endpoint=vector_store_address,
    azure_search_key=vector_store_password,
    index_name=index_name,
    embedding_function=embeddings.embed_query,
)

# 2.2 Combine and upload multiple PDFs to ACS

In [None]:
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import CharacterTextSplitter

path = "contents/"

# Build one loader per PDF. Only regular files with a .pdf extension are
# picked up, so sub-directories and other file types in the folder are not
# handed to the PDF loader. The listing is sorted for a reproducible order.
loaders = [
    PyMuPDFLoader(os.path.join(path, fn))
    for fn in sorted(os.listdir(path))
    if fn.lower().endswith(".pdf") and os.path.isfile(os.path.join(path, fn))
]

# The splitter settings do not depend on the file, so create it once.
text_splitter = CharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=200,
    length_function=len,
)

# Collect the chunks of every PDF into a single list.
all_documents = []
for loader in loaders:
    raw_documents = loader.load()
    documents = text_splitter.split_documents(raw_documents)
    all_documents.extend(documents)

if all_documents:
    vector_store.add_documents(documents=all_documents)  # this step will upload embeddings to ACS
else:
    # Nothing was loaded, so skip the upload call entirely.
    print(f"No PDF content found in {path!r}; nothing uploaded.")

# 2.3 Perform a query

In [None]:
from langchain import OpenAI
from langchain.chains import RetrievalQA

# Completion model used to answer the question; temperature 0 is requested.
llm = OpenAI(engine="gpt-35-turbo", openai_api_key=openai.api_key, temperature=0)

# Question-answering chain of type "stuff" that retrieves its context from the
# vector store.
qa = RetrievalQA.from_chain_type(
    chain_type="stuff",
    llm=llm,
    retriever=vector_store.as_retriever(),
)

# Last expression of the cell, so the answer is displayed as the cell output.
qa.run("What is the scope of this project?")

# 3.1 Create embeddings and ACS instance (unstructured files)

In [None]:
# Name of the embedding deployment to call.
deployment_id = "text-embedding-ada-002"
embeddings = OpenAIEmbeddings(
    openai_api_key=openai.api_key,
    deployment=deployment_id,
    # Reuse the auth mode selected in the configuration cell ("azure" for the
    # API-key setup, "azure_ad" for the role-based setup). Hard-coding "azure"
    # here would send an Azure AD token as if it were an API key.
    openai_api_type=openai.api_type,
    # NOTE(review): presumably 1 because the Azure endpoint accepted a single
    # input per embedding request - confirm before raising.
    chunk_size=1,
)

# Azure Cognitive Search endpoint and key are read from the environment.
vector_store_address = os.getenv("AZURE_COGNITIVE_SEARCH_URL")
vector_store_password = os.getenv("AZURE_COGNITIVE_SEARCH_KEY")
index_name = "acs-demo-unstructured-file"
vector_store = AzureSearch(
    azure_search_endpoint=vector_store_address,
    azure_search_key=vector_store_password,
    index_name=index_name,
    embedding_function=embeddings.embed_query,
)

# 3.2 Combine and upload multiple files to ACS

In [None]:
!pip install "unstructured[all-docs]"

In [None]:
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter

directory = "contents/"
# File types handed to the unstructured loader (matched case-insensitively).
supported_extensions = (".pdf", ".xlsx", ".xlsm")

# Collect matching regular files; the listing is sorted for a reproducible order.
files = []
for filename in sorted(os.listdir(directory)):
    filepath = os.path.join(directory, filename)
    if os.path.isfile(filepath) and filename.lower().endswith(supported_extensions):
        files.append(filepath)

loaders = [UnstructuredFileLoader(file) for file in files]

# The splitter settings do not depend on the file, so create it once.
text_splitter = CharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=200,
    length_function=len,
)

all_documents = []
for loader in loaders:
    raw_documents = loader.load()
    if not raw_documents:
        # Nothing was extracted from this file; skip it rather than fail on
        # the [0] lookup below.
        continue

    # Preview the first 100 characters of the file's first document.
    print(raw_documents[0].page_content[:100])

    documents = text_splitter.split_documents(raw_documents)
    all_documents.extend(documents)

# The upload is left disabled; uncomment the next line to embed the chunks and
# upload them to ACS.
# vector_store.add_documents(documents=all_documents)