# Query your PDFs with Azure OpenAI and Azure AI Search using LangChain

### Read and clean the PDF document


In [None]:
from pypdf import PdfReader

# Open the PDF whose text will be chunked and embedded.
doc_reader = PdfReader('./IMF.pdf')

# Pull the text out of every page; pages whose extraction yields nothing are
# skipped. The pieces are joined once instead of growing the string with `+=`
# on every page.
page_texts = (page.extract_text() for page in doc_reader.pages)
raw_text = ''.join(text for text in page_texts if text)

# Total number of characters extracted from the document.
print(len(raw_text))



In [None]:
from langchain.text_splitter import CharacterTextSplitter
# Splitter settings: break on newlines, chunk_size 1000 with an overlap of 200,
# lengths measured with len() (i.e. in characters).
splitter_options = dict(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)
text_splitter = CharacterTextSplitter(**splitter_options)

# Cut the extracted document text into a list of chunk strings.
texts = text_splitter.split_text(raw_text)

### Normalize and clean the text for embeddings 

In [None]:
import re 

def normalize_text(s, sep_token = "\n"):
    """Clean a text chunk before it is sent for embedding.

    Steps, in order:
      1. collapse every run of whitespace (newlines included) to one space
         and trim both ends;
      2. remove literal ". ," sequences;
      3. collapse any run of two or more dots into a single dot;
      4. trim both ends again.

    Args:
        s: The text to clean.
        sep_token: Unused; kept so existing callers that pass it keep working.

    Returns:
        The cleaned string (``""`` for empty or whitespace-only input).
    """
    # Newlines are whitespace, so they are already gone after this step and no
    # separate "\n" removal is needed.
    s = re.sub(r'\s+', ' ', s).strip()

    # The dot must be escaped: an unescaped "." matches ANY character, which
    # deleted the last character of every word followed by " ,".
    s = re.sub(r"\. ,", "", s)

    # One pass handles dot runs of any length (e.g. table-of-contents leaders);
    # two chained replace("..", ".") calls left ".." behind for 5+ dots.
    s = re.sub(r"\.{2,}", ".", s)

    return s.strip()

# Replace every chunk with its normalized form.
texts = [normalize_text(chunk) for chunk in texts]


### Create embeddings 
%pip install --upgrade --quiet  azure-search-documents
%pip install --upgrade --quiet  azure-identity


In [None]:
import os

from langchain_community.vectorstores.azuresearch import AzureSearch
from langchain_openai import OpenAIEmbeddings

In [None]:
# Azure OpenAI settings, published through process environment variables.
# NOTE(review): the "XXX" values look like placeholders to be replaced with the
# real endpoint and key - confirm before running.
os.environ.update({
    "OPENAI_API_TYPE": "azure",
    "AZURE_OPENAI_ENDPOINT": "XXX",
    "AZURE_OPENAI_API_KEY": "XXX",
})
#os.environ["OPENAI_API_VERSION"] = "2022-12-01"

# Name of the embedding model.
model: str = "text-embedding-ada-002"

In [None]:
# Azure AI Search connection settings.
# The admin key is a secret, so it is read from the environment instead of
# being stored in the notebook; both values fall back to "" when unset.
# NOTE(review): the key that used to be hard-coded here should be treated as
# leaked and rotated.
vector_store_address: str = os.environ.get("AZURE_SEARCH_ENDPOINT", "")
vector_store_password: str = os.environ.get("AZURE_SEARCH_ADMIN_KEY", "")

In [None]:
!pip install azure-search-documents



In [None]:
import os
from openai import AzureOpenAI

# Azure OpenAI client. The key and endpoint come from the environment variables
# AZURE_OPENAI_API_KEY and AZURE_OPENAI_ENDPOINT; the API version is pinned.
azure_openai_settings = {
    "api_key": os.environ.get("AZURE_OPENAI_API_KEY"),
    "api_version": "2023-05-15",
    "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),
}
client = AzureOpenAI(**azure_openai_settings)

def generate_embeddings(text, model="text-embedding-ada-002"):
    """Return the embedding for a single piece of text.

    ``text`` is sent as a one-element input list through the module-level
    ``client``; the ``embedding`` of the first returned item is handed back.

    Args:
        text: The text to embed.
        model: Model name passed to the embeddings request.
    """
    response = client.embeddings.create(input=[text], model=model)
    first_item = response.data[0]
    return first_item.embedding



# One embedding per chunk, in the same order as `texts`.
embeddings = [
    generate_embeddings(chunk, model='text-embedding-ada-002')
    for chunk in texts
]

# Pair each chunk with its embedding so both can be handled as one record.
text_embeddings = [
    dict(text=chunk, embedding=vector)
    for chunk, vector in zip(texts, embeddings)
]


In [None]:
# Sanity check: how many records there are, then the length of the first
# record's embedding vector.
print(len(text_embeddings))
first_vector = text_embeddings[0]["embedding"]
print(len(first_vector))

In [None]:
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import SearchIndex, SimpleField, SearchableField, ComplexField

import json
import urllib.error
import urllib.request

# Your service details
search_service_name = "ai-search-service-01"
search_index_name = "imf0x"
api_key = "XXX"  # supply the admin key here; never commit the real value

# Index schema sent to the "create or update index" REST endpoint: a string
# key, two text fields and two 1536-dimension vector fields that share one
# HNSW (cosine) vector-search profile.
index_definition = {
    "name": search_index_name,
    "fields": [
        {
            "name": "id",
            "type": "Edm.String",
            "key": True,
            "filterable": True,
        },
        {
            "name": "title",
            "type": "Edm.String",
            "searchable": True,
            "filterable": True,
            "sortable": True,
            "retrievable": True,
        },
        {
            "name": "titleVector",
            "type": "Collection(Edm.Single)",
            "searchable": True,
            "retrievable": True,
            "dimensions": 1536,
            "vectorSearchProfile": "my-default-vector-profile",
        },
        {
            "name": "content",
            "type": "Edm.String",
            "searchable": True,
            "retrievable": True,
        },
        {
            "name": "contentVector",
            "type": "Collection(Edm.Single)",
            "searchable": True,
            "retrievable": True,
            "dimensions": 1536,
            "vectorSearchProfile": "my-default-vector-profile",
        },
    ],
    "vectorSearch": {
        "algorithms": [
            {
                "name": "my-hnsw-config-1",
                "kind": "hnsw",
                "hnswParameters": {
                    "m": 4,
                    "efConstruction": 400,
                    "efSearch": 500,
                    "metric": "cosine",
                },
            },
        ],
        "profiles": [
            {
                "name": "my-default-vector-profile",
                "algorithm": "my-hnsw-config-1",
            },
        ],
    },
}

# PUT https://<service>.search.windows.net/indexes/<index>?api-version=...
index_url = (
    f"https://{search_service_name}.search.windows.net"
    f"/indexes/{search_index_name}"
    "?api-version=2023-11-01&allowIndexDowntime=true"
)
index_request = urllib.request.Request(
    index_url,
    data=json.dumps(index_definition).encode("utf-8"),
    headers={"Content-Type": "application/json", "api-key": api_key},
    method="PUT",
)

try:
    with urllib.request.urlopen(index_request) as response:
        print(f"Index '{search_index_name}' created or updated (HTTP {response.status})")
except urllib.error.HTTPError as err:
    # Surface the response body, which carries the service's error details.
    details = err.read().decode("utf-8", errors="replace")
    raise RuntimeError(
        f"Creating index '{search_index_name}' failed with HTTP {err.code}: {details}"
    ) from err


In [None]:
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient

# Target search service and index.
search_service_name = "ai-search-service-01"
search_index_name = "imf0x"
api_key = "XXX"  # supply the service key here; never commit the real value

# Derive the endpoint from the service name so the two cannot drift apart
# (previously an f-string with no fields, leaving search_service_name unused).
search_endpoint = f"https://{search_service_name}.search.windows.net"

# Client bound to the single index that documents are uploaded to.
search_client = SearchClient(
    endpoint=search_endpoint,
    index_name=search_index_name,
    credential=AzureKeyCredential(api_key),
)


In [None]:
# Show the first embedding vector.
first_embedding = embeddings[0]
print(first_embedding)

In [None]:
# Show the first record: its text, then its embedding.
first_record = text_embeddings[0]
print(first_record["text"])
print(first_record["embedding"])


### Upload the document 

In [None]:
# Expected input shape:
# text_embeddings = [{"text": "sample text 1", "embedding": [0.1, 0.2, ..., 0.x]}, ...]

# Build one index document per record. Ids are the 1-based position rendered
# as a string; "content" carries the text and "contentVector" the embedding,
# so both keys must match field names in the index.
documents = [
    {
        "id": str(position),
        "content": record["text"],
        "contentVector": record["embedding"],
    }
    for position, record in enumerate(text_embeddings, start=1)
]

# Upload documents to the index.
# A call that returns without raising is not proof that every document was
# stored: the service reports one result per document, so those are inspected
# before success is announced.
try:
    result = search_client.upload_documents(documents=documents)
except Exception as e:
    print(f"Failed to upload documents: {e}")
else:
    rejected = [item for item in result if not item.succeeded]
    if rejected:
        print(f"Upload finished, but {len(rejected)} of {len(result)} documents were rejected:")
        for item in rejected:
            print(f"  id={item.key}: {item.error_message}")
    else:
        print("Upload successful")
