# Load Files from Jira into CogSearch
### The issues from Jira are loaded into CogSearch by following the steps below
- Establish a connection with Jira using the Python SDK.
- Retrieve the required issues from Jira using the JQL query and Search method.
- Use Azure OpenAI to index the issue content.
- Index the parsed chunks into Azure Cognitive Search.
- Repeat the process for all the required files.

#### Use the Azure Storage Python SDK to fetch the file stream, then use PyPDF and Document Intelligence to chunk the pages in memory and create a vector index
- https://developer.atlassian.com/cloud/jira/platform/rest/v3/api-group-issue-search/#api-rest-api-3-search-post
- Refer to https://github.com/MSUSAzureAccelerators/Azure-Cognitive-Search-Azure-OpenAI-Accelerator/blob/main/04-Complex-Docs.ipynb for loading large documents using PyPDF and Document Intelligence.


In [None]:
# Install the packages listed in requirements.txt (IPython automagic for %pip)
pip install -r requirements.txt

#### Imports and credentials

In [None]:

import shutil
from PyPDF2 import PdfFileReader, PdfFileWriter,PdfReader 
import os
import json
from dotenv import load_dotenv  
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential
import html
from azure.storage.blob.aio import BlobClient
from azure.storage.blob import ContainerClient
from tqdm import tqdm
import base64
from azure.storage.blob import BlobClient
import io  
import requests
import openai
from tenacity import retry, wait_random_exponential, stop_after_attempt 
import langchain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma, FAISS
from langchain import OpenAI, VectorDBQA
from langchain.chat_models import AzureChatOpenAI
from langchain.chat_models import ChatOpenAI

# Load variables from a .env file (when one is found) into the process environment
load_dotenv()

# Point the openai SDK at the Azure OpenAI resource described by those variables
openai.api_base = os.environ.get("AZURE_OPENAI_ENDPOINT")
openai.api_key = os.environ.get("AZURE_OPENAI_API_KEY")
openai.api_version = os.environ.get("AZURE_OPENAI_API_VERSION")
openai.api_type = "azure"



In [None]:
# Request headers sent with every Azure Cognitive Search REST call
headers = {
    'Content-Type': 'application/json',
    'api-key': os.getenv('AZURE_SEARCH_KEY'),
}

# LangChain connects to Azure OpenAI through OPENAI_* variables, so mirror the
# AZURE_OPENAI_* settings under those names. An unset source variable makes
# the assignment fail with TypeError, because os.environ only accepts strings.
os.environ.update(
    {
        "OPENAI_API_BASE": os.getenv("AZURE_OPENAI_ENDPOINT"),
        "OPENAI_API_KEY": os.getenv("AZURE_OPENAI_API_KEY"),
        "OPENAI_API_VERSION": os.getenv("AZURE_OPENAI_API_VERSION"),
        "OPENAI_API_TYPE": "azure",
    }
)

In [None]:
# Embedding client bound to the Azure OpenAI embedding deployment named in the
# environment. chunk_size=1 presumably keeps each request to a single text to
# satisfy Azure OpenAI batch limits - confirm against the deployment's limits.
embedder = OpenAIEmbeddings(
    chunk_size=1,
    deployment=os.environ.get("AZURE_OPENAI_EMBEDDEPLOY_NAME"),
)

In [None]:
import requests
from requests.auth import HTTPBasicAuth
import json

# Load the records that will be embedded and indexed.
# Elecitems.json is resolved relative to the current working directory.
with open('Elecitems.json', mode='r', encoding='utf-8') as source_file:
    input_data = json.loads(source_file.read())

@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def generate_embeddings(text, engine="text-embedding-ada-002"):
    """Return the embedding vector for ``text``.

    Used for title and content fields, and equally for query embeddings.
    The call is retried with randomized exponential backoff
    (``min=1``, ``max=20``) and abandoned after 6 attempts.

    Args:
        text: The text to embed.
        engine: Name passed to ``openai.Embedding.create`` as ``engine``.
            With ``openai.api_type = "azure"`` this is the deployment name,
            which need not equal the model name, so it can be overridden.
            Defaults to ``"text-embedding-ada-002"``.

    Returns:
        The ``embedding`` entry of the first item in the response's ``data``.

    Raises:
        tenacity.RetryError: When every attempt fails (tenacity's behaviour
            when ``reraise`` is not set).
    """
    response = openai.Embedding.create(input=text, engine=engine)
    return response['data'][0]['embedding']

extracted_fields = []
index_list = []

# Turn every input record into a search document: the record's 'shortitem'
# becomes the id, 'itemDesc' the title, and the whole record serialized as
# JSON becomes the content that is later embedded and uploaded.
for item in input_data:
    print(item)
    content = json.dumps(item)
    index_list.append(
        {
            'id': item['shortitem'],
            'title': item['itemDesc'],
            'content': content,
        }
    )

#### build the issue list

In [None]:
# Settings shared by the Azure Cognitive Search REST calls: the target index
# name, the api-version query parameter and the authentication headers.
str_index_name = os.environ.get("AZURE_SEARCH_INDEX_NAME")
params = {'api-version': os.environ.get('AZURE_SEARCH_API_VERSION')}
headers = {
    'Content-Type': 'application/json',
    'api-key': os.environ.get('AZURE_SEARCH_KEY'),
}

Let's create the vector-based index in our Azure Search engine, where this content is going to land.

In [None]:
# Definition of the search index: plain text fields, one vector field, the
# HNSW vector-search configuration and a semantic configuration.
# Field attributes are real booleans so json.dumps emits JSON true/false as
# the REST schema specifies, rather than the strings "true"/"false".
index_payload = {
    "name": str_index_name,
    "fields": [
        # Document key
        {"name": "id", "type": "Edm.String", "key": True, "filterable": True},
        {"name": "title", "type": "Edm.String", "searchable": True, "retrievable": True},
        {"name": "chunk", "type": "Edm.String", "searchable": True, "retrievable": True},
        # Embedding of `chunk`; `dimensions` must match the output size of the
        # embedding deployment (1536 for text-embedding-ada-002).
        {
            "name": "chunkVector",
            "type": "Collection(Edm.Single)",
            "searchable": True,
            "retrievable": True,
            "dimensions": 1536,
            "vectorSearchProfile": "my-default-vector-profile",
        },
        {
            "name": "name",
            "type": "Edm.String",
            "searchable": True,
            "retrievable": True,
            "sortable": False,
            "filterable": False,
            "facetable": False,
        },
        {
            "name": "location",
            "type": "Edm.String",
            "searchable": False,
            "retrievable": True,
            "sortable": False,
            "filterable": False,
            "facetable": False,
        },
    ],
    "vectorSearch": {
        "algorithms": [
            {
                "name": "my-hnsw-config-1",
                "kind": "hnsw",
                "hnswParameters": {
                    "m": 4,
                    "efConstruction": 400,
                    "efSearch": 500,
                    "metric": "cosine",
                },
            }
        ],
        # The profile referenced by chunkVector's "vectorSearchProfile"
        "profiles": [
            {
                "name": "my-default-vector-profile",
                "algorithm": "my-hnsw-config-1",
            }
        ],
    },
    "semantic": {
        "configurations": [
            {
                "name": "my-semantic-config",
                "prioritizedFields": {
                    "titleField": {
                        "fieldName": "title"
                    },
                    "prioritizedContentFields": [
                        {
                            "fieldName": "chunk"
                        }
                    ],
                    "prioritizedKeywordsFields": [],
                },
            }
        ]
    },
}

# Send the index definition to the search service with a PUT on /indexes/<name>.
r = requests.put(
    os.getenv('AZURE_SEARCH_ENDPOINT') + "/indexes/" + str_index_name,
    data=json.dumps(index_payload),
    headers=headers,
    params=params,
)
print(r.status_code)
print(r.ok)
if not r.ok:
    # Show the service's error body so a rejected definition can be diagnosed,
    # the same way the document upload step reports its failures.
    print(r.text)

### Push data to index

In [None]:
%%time
# Upload the documents one at a time: embed the content, then POST a
# single-document batch to the index's /docs/index endpoint.
for item in index_list:
    print("Uploading chunks from", item["id"])

    try:
        upload_payload = {
            "value": [
                {
                    "id": item["id"],
                    "title": item["title"],
                    "chunk": item["content"],
                    # Embed a placeholder instead of an empty string
                    "chunkVector": embedder.embed_query(
                        item["content"] if item["content"] != "" else "-------"
                    ),
                    "name": item["id"],
                    "location": item["id"],
                    "@search.action": "upload",
                },
            ]
        }

        r = requests.post(
            os.environ['AZURE_SEARCH_ENDPOINT'] + "/indexes/" + str_index_name + "/docs/index",
            data=json.dumps(upload_payload),
            headers=headers,
            params=params,
        )
        if r.status_code != 200:
            print(r.status_code)
            print(r.text)
    except Exception as e:
        # Best effort: report the failure and carry on with the next item.
        print("Exception:", e)
        # Print the content of the item that failed, not the stale global
        # `content` left over from the loop that built index_list.
        print(item.get("content"))