### Install Python Packages

In [1]:
### Create or update your local Python environment
# %pip (rather than !pip) installs into the environment of the running kernel.
# Only azure-search-documents and langchain are pinned below; every other
# package is unpinned, so a fresh install pulls whatever release is current.

%pip install azure-functions
%pip install azure-core
%pip install azure-cosmos
%pip install openai
%pip install numpy
%pip install requests
%pip install pandas
%pip install azure-storage-blob 
%pip install azure-identity
%pip install smart_open
%pip install tenacity
%pip install pinecone-client
%pip install redis
%pip install tiktoken
%pip install azure-storage-file-share
%pip install python-dotenv
# Pinned to a pre-release (beta) build of the search SDK.
%pip install azure-search-documents==11.4.0b3
%pip install azure-ai-formrecognizer
%pip install beautifulsoup4
%pip install lxml
%pip install openpyxl
%pip install azure-ai-textanalytics
# Pinned to a specific langchain release.
%pip install langchain==0.0.139
%pip install flask
%pip install flask-socketio



DEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621
Collecting azure-functions
  Downloading azure_functions-1.17.0-py3-none-any.whl (165 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 165.8/165.8 kB 3.1 MB/s eta 0:00:00
Installing collected packages: azure-functions
  DEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621
DEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Li… [output truncated]

### Import Packages

In [2]:
import os
from dotenv import load_dotenv
# Load settings from the .env file. override=True lets values from .env replace
# variables that are already set in the process environment.
load_dotenv(override=True)

import shutil
import sys
# Add ./utils to the module search path.
# NOTE(review): presumably needed so modules inside utils can import each other
# by bare name -- confirm against the utils package.
sys.path.append('./utils')

### Uncomment below imports as needed -- make sure that all relevant values and keys in the .env file are properly populated
# from utils import redis_helpers
# from utils import helpers
# from utils import language
# from utils import openai_helpers
# from utils import storage
# from utils import bot_helpers

# Star import: pulls every public name of utils.env_vars into the notebook's
# global namespace. Constants used by later cells (e.g. KB_BLOB_CONTAINER) are
# not defined anywhere else in this notebook, so they presumably come from here.
from utils.env_vars import *

# autoreload mode 2: re-import edited modules before each cell execution.
# Re-running this cell makes %load_ext print an "already loaded" notice,
# which is harmless.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Display the configured search endpoint as a quick check that the settings were loaded.
# NOTE(review): presumably provided by `from utils.env_vars import *` -- confirm.
COG_SEARCH_ENDPOINT

'https://searchseh000000.search.windows.net'

### Activate Cognitive Search Ingestion 
#### First Run - Create Index and Indexer 
##### Caution: this deletes and recreates the search indexes (`vec-index` and `km-openai`), so any data you already have in them will be lost

In [3]:
#### Ingest all knowledge base documents
# WARNING: destructive. The recorded output of this cell shows the indexes
# (vec-index, km-openai), the skillset, the data source and the indexer being
# deleted and recreated before the indexer is started.
from utils import cogsearch_helpers

# KB_BLOB_CONTAINER is not defined in this notebook; presumably it comes from
# the star import of utils.env_vars.
cogsearch_helpers.ingest_kb(container = KB_BLOB_CONTAINER)


Index vec-index Deleted
Index vec-index created
Index km-openai Deleted
Index km-openai created
Deleted Skillset - km-openai-skills
Created new Skillset - km-openai-skills
Deleted Indexer - km-openai-indexer
Deleted Data Source - km-openai-skills
Created new Data Source Connection - km-openai-docs
Created new Indexer - km-openai-indexer
Running Indexer km-openai-indexer


#### Additional Runs - Re-indexing with delta documents

In [2]:
### Re-index additional documents
# Use this for follow-up runs instead of the first-run cell: the recorded
# output shows only the existing indexer being started, nothing is recreated.
from utils import cogsearch_helpers

cogsearch_helpers.run_indexer()

Running Indexer km-openai-indexer


### Activate Form Recognizer Ingestion

In [None]:
#### Ingest all form documents

# `storage` is imported here but not referenced in this cell.
from utils import storage
from utils import fr_helpers

# Reads forms from FR_CONTAINER and writes results to OUTPUT_BLOB_CONTAINER.
# Both names are not defined in this notebook; presumably they come from the
# star import of utils.env_vars.
fr_helpers.process_forms(in_container = FR_CONTAINER, out_container = OUTPUT_BLOB_CONTAINER)



In [39]:
### Other containers can be used as the source of documents to index

# Imported in this cell (as the other indexing cells do) so that it does not
# depend on an earlier cell having been run in the same kernel session.
from utils import cogsearch_helpers

# 'kmoaidemo2' is an example container name -- replace it with the container to index.
# Per the recorded output, this deletes and recreates the indexer and its data
# source connection before the indexer is started.
cogsearch_helpers.create_indexer('kmoaidemo2')
cogsearch_helpers.run_indexer()

Deleted Indexer - km-openai-indexer
Deleted Data Source - km-openai-skills
Created new Data Source Connection - km-openai-docs
Created new Indexer - km-openai-indexer
Running Indexer km-openai-indexer


### Scrape Web Pages

In [8]:
### Scrape data from web pages
### This saves the text from the web pages into the processed folder, and any files which are found are saved
### to the demo folder. For the files to be processed, you need to run the cognitive search re-indexer

# TODO: Some files may be better parsed by Form Recognizer. Need to decide how to tell which is more suitable,
# and place them in the correct folder.

# Enter the URLs that you want to scrape in the `urls` list, each as a string.
# Only enter the root URL: the crawler traverses the hierarchy of web pages below the root.
# NOTE: the recorded run against this example site printed "HTTP Error 403: Forbidden".
urls = ['https://www.lufthansa.com']

from utils import web_crawler

# The loop variable `url` stays defined after the loop and is read by the next cell.
for url in urls:
    web_crawler.crawl(url, KB_BLOB_CONN_STR, KB_BLOB_CONTAINER, OUTPUT_BLOB_CONTAINER)


https://www.lufthansa.com
HTTP Error 403: Forbidden
HTTP Error 403: Forbidden


In [None]:
from urllib.parse import urlparse

# Host name (netloc) of the last root URL in `urls`.
# The list is indexed explicitly instead of reading the `url` loop variable left
# behind by the crawl cell: after a completed crawl both give the same value,
# but this no longer depends on the loop having run to completion.
# Requires `urls` to be defined and non-empty (see the crawl cell).
local_domain = urlparse(urls[-1]).netloc

### Interrogate the APIs with the sample Knowledge Base

In [None]:
### Use this cell to query Redis with the below queries
import os
import json
from utils import bot_helpers
from dotenv import load_dotenv
# Re-read the .env file so that edits made since the kernel started are picked up.
load_dotenv(override=True)
from utils import km_agents

from utils import redis_helpers
from utils import openai_helpers
# One Redis connection is shared by all queries below.
redis_conn = redis_helpers.get_new_conn() 

# Agent options: only Redis search is switched on, every other flag is off.
params_dict = {
    'enable_unified_search': False,
    'enable_redis_search': True,
    'enable_cognitive_search': False,
    'evaluate_step': False,
    'check_adequacy': False,
    'check_intent': False,
    'use_calendar': False,
    'use_bing': False
    }

# Passed to the agent constructor as `stream`.
stream = True

# Sample questions about the demo knowledge base.
queries = [
    "what is mentioned about the Lost City hotel?",
    "what is mentioned about the Strip in Vegas?",
    "what is mentioned about the Fountain Hotel?"
]

agent_name = 'ccr' # or 'os' or 'zs'

# A new agent is constructed for every query; the answer, its sources, the
# likely sources and the id returned as `s_id` are printed for each one.
for q in queries:
    agent = km_agents.KMOAI_Agent(agent_name = agent_name, params_dict=params_dict, verbose = False, stream=stream)
    answer, sources, likely_sources, s_id = agent.run(q, redis_conn)
    print("\n\n", answer, '\n', sources, '\n', likely_sources, '\n', s_id, '\n\n\n###############################')
    


### Try out the summarization

In [None]:
from utils import summarization

# Folder that holds the documents to summarize.
folder = './docs_to_summarize/'

# Summarize the same folder once per mode so the two results can be compared.
ref_summ_df = summarization.summarize_folder(
    folder,
    mode='refine',
    verbose=False,
)
mp_summ_df = summarization.summarize_folder(
    folder,
    mode='map_reduce',
    verbose=False,
)

## Experimentation Code Below - NO NEED TO RUN 
### For your reference only

### Reset Redis Index

In [3]:
#### Reset Index in Redis
from utils import redis_helpers

# Safety switch: set to False to keep this cell from resetting the index.
reset_index = True

if reset_index:
    # Opens a new connection (rebinding the notebook-level `redis_conn`) and
    # resets the index through it.
    redis_conn = redis_helpers.get_new_conn()
    redis_helpers.redis_reset_index(redis_conn)

### Generate Embeddings and Load into Redis

In [None]:
### Use this cell to load embeddings directly into Redis from this notebook

import json
import os

# `helpers` is commented out in the "Import Packages" cell, so it is imported
# here; without it this cell raises NameError on a fresh kernel.
from utils import helpers

emb_documents = []

# os.listdir returns entries in arbitrary order; sorting makes repeated runs
# process the dump files in the same order.
for item in sorted(os.listdir("dump")):
    path = os.path.join("dump", item)

    # Skip sub-directories: only regular files can be opened and parsed.
    if not os.path.isfile(path):
        continue

    with open(path, 'r') as openfile:
        data = json.load(openfile)

    # The 'S' pass always runs; the 'M' and 'L' passes are skipped when their
    # token count is configured as 0.
    emb_documents += helpers.generate_embeddings(data, CHOSEN_EMB_MODEL, SMALL_EMB_TOKEN_NUM, text_suffix = 'S')

    if MEDIUM_EMB_TOKEN_NUM != 0:
        emb_documents += helpers.generate_embeddings(data, CHOSEN_EMB_MODEL, MEDIUM_EMB_TOKEN_NUM, text_suffix = 'M')

    if LARGE_EMB_TOKEN_NUM != 0:
        emb_documents += helpers.generate_embeddings(data, CHOSEN_EMB_MODEL, LARGE_EMB_TOKEN_NUM, text_suffix = 'L')


# Push everything collected above into Redis in one call.
helpers.load_embedding_docs_in_redis(emb_documents)

In [None]:
# `helpers` is commented out in the "Import Packages" cell, so it is imported
# here; without it this cell raises NameError on a fresh kernel.
from utils import helpers

emb_documents = []

# Build embeddings from the JSON documents in the 'dump' folder.
# NOTE(review): limit=-1 presumably means "no limit" -- confirm in utils/helpers.py.
emb_documents += helpers.generate_embeddings_from_json_docs('dump', ADA_002_EMBEDDING_MODEL, ADA_002_MODEL_MAX_TOKENS, text_suffix='XL', limit=-1)

print(f"Generated {len(emb_documents)} embeddings.")
# NOTE(review): "embdding" looks like a typo, but the call has to match the
# helper's actual name -- check utils/helpers.py before renaming.
helpers.save_embdding_docs_to_pkl(emb_documents, "test.pkl")

In [24]:
# `helpers` is commented out in the "Import Packages" cell, so it is imported
# here; without it this cell raises NameError on a fresh kernel.
from utils import helpers

# SECURITY NOTE(review): this presumably unpickles "test.pkl". Unpickling can
# execute arbitrary code, so only load pickle files that you created yourself.
emb_documents = helpers.load_embedding_docs_from_pkl("test.pkl")
helpers.load_embedding_docs_in_redis(emb_documents)

Loading 141 embeddings into Redis
