Skip to content

Commit

Permalink
Merge branch 'vNext-Dev' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
dayland committed Dec 12, 2023
2 parents 6641eed + 98cf8cd commit c5ac17a
Show file tree
Hide file tree
Showing 11 changed files with 86 additions and 26 deletions.
5 changes: 3 additions & 2 deletions app/enrichment/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,7 +320,8 @@ def poll_queue() -> None:
i = 0
for chunk in chunks:

statusLog.update_document_state( blob_path, f"Indexing {i+1}/{len(chunks)}")
statusLog.update_document_state( blob_path, f"Indexing {i+1}/{len(chunks)}", State.INDEXING)
# statusLog.update_document_state( blob_path, f"Indexing {i+1}/{len(chunks)}", State.PROCESSING
# open the file and extract the content
blob_path_plus_sas = utilities_helper.get_blob_and_sas(
ENV["AZURE_BLOB_STORAGE_CONTAINER"] + '/' + chunk.name)
Expand Down Expand Up @@ -402,7 +403,7 @@ def poll_queue() -> None:
backoff = random.randint(
int(ENV["EMBEDDING_REQUEUE_BACKOFF"]) * requeue_count, max_seconds)
queue_client.send_message(message_string, visibility_timeout=backoff)
statusLog.upsert_document(blob_path, f'Message requed to embeddings queue, attempt {str(requeue_count)}. Visible in {str(backoff)} seconds. Error: {str(error)}.',
statusLog.upsert_document(blob_path, f'Message requeued to embeddings queue, attempt {str(requeue_count)}. Visible in {str(backoff)} seconds. Error: {str(error)}.',
StatusClassification.ERROR,
State.QUEUED)
else:
Expand Down
6 changes: 5 additions & 1 deletion app/frontend/src/api/models.ts
Original file line number Diff line number Diff line change
Expand Up @@ -84,10 +84,13 @@ export type GetUploadStatusRequest = {
export const enum FileState {
All = "ALL",
Processing = "PROCESSING",
Indexing = "INDEXING",
Skipped = "SKIPPED",
Queued = "QUEUED",
Complete = "COMPLETE",
Error = "ERROR"
Error = "ERROR",
THROTTLED = "THROTTLED",
UPLOADED = "UPLOADED"
}


Expand Down Expand Up @@ -135,6 +138,7 @@ export const enum StatusLogClassification {
// shared code (functions/shared_code/status_log.py)
export const enum StatusLogState {
Processing = "Processing",
Indexing = "Indexing",
Skipped = "Skipped",
Queued = "Queued",
Complete = "Complete",
Expand Down
30 changes: 21 additions & 9 deletions app/frontend/src/components/FileStatus/DocumentsDetailList.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -100,19 +100,19 @@ export const DocumentsDetailList = ({ items, onFilesSorted}: Props) => {
ariaLabel: 'Column operations for state, Press to sort by states',
onColumnClick: onColumnClick,
data: 'string',
onRender: (item: IDocument) => (
<TooltipHost content={`${item.state_description} `}>
<span>{item.state}</span>
</TooltipHost>
),
// onRender: (item: IDocument) => (
// <TooltipHost content={`${item.state_description} `}>
// <span>{item.state}</span>
// </TooltipHost>
// ),
isPadded: true,
},
{
key: 'column4',
name: 'Submitted On',
fieldName: 'upload_timestamp',
minWidth: 70,
maxWidth: 90,
minWidth: 90,
maxWidth: 120,
isResizable: true,
isCollapsible: true,
ariaLabel: 'Column operations for submitted on date, Press to sort by submitted date',
Expand All @@ -127,8 +127,8 @@ export const DocumentsDetailList = ({ items, onFilesSorted}: Props) => {
key: 'column5',
name: 'Last Updated',
fieldName: 'modified_timestamp',
minWidth: 70,
maxWidth: 90,
minWidth: 90,
maxWidth: 120,
isResizable: true,
isSorted: true,
isSortedDescending: false,
Expand All @@ -142,6 +142,18 @@ export const DocumentsDetailList = ({ items, onFilesSorted}: Props) => {
return <span>{item.modified_timestamp}</span>;
},
},
{
key: 'column6',
name: 'Status Detail',
fieldName: 'state_description',
minWidth: 90,
maxWidth: 200,
isResizable: true,
isCollapsible: true,
ariaLabel: 'Column operations for status detail',
data: 'string',
onColumnClick: onColumnClick
}
]);

return (
Expand Down
3 changes: 3 additions & 0 deletions app/frontend/src/components/FileStatus/FileStatus.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,11 @@ const dropdownFileStateOptions = [
{ key: FileState.Complete, text: 'Completed' },
{ key: FileState.Error, text: 'Error' },
{ key: FileState.Processing, text: 'Processing' },
{ key: FileState.Indexing, text: 'Indexing' },
{ key: FileState.Queued, text: 'Queued' },
{ key: FileState.Skipped, text: 'Skipped'},
{ key: FileState.UPLOADED, text: 'Uploaded'},
{ key: FileState.THROTTLED, text: 'Throttled'},
];

interface Props {
Expand Down
Binary file added docs/images/frontend-watch.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/images/vite-debug.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/images/webapp-backend.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
25 changes: 24 additions & 1 deletion docs/knownissues.md
Original file line number Diff line number Diff line change
Expand Up @@ -101,10 +101,33 @@ InvalidApiSetId - The account type 'OpenAI' is either invalid or unavailable in
### Solution:
Deploy Azure OpenAI Service only in the supported regions. Review the local.env file and update the location as per supported models and [region availability](https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models#model-summary-table-and-region-availability)
## Error: jq parse error: Expected value before ','
If you see a jq parse error while doing deployments, it means one of the makefile scripts used to extract environment variables is failing to find a value it expects to be there. The related files would be the main.parameters.json file, which contains the variables from the bicep output of the infrastructure create step, and the env file used at build and deploy time.
### Solution:
To resolve this, carefully check your deployment .env file for any missing but required values. There are rare times when ARM has issues and output values are not written. In that case, simply double-check your configuration and rerun the ```make deploy``` and/or ```make extract-env``` command so that the bicep outputs can be written again
## Error: Creation of new Media Service accounts are not allowed as the resource has been deprecated
### Solution:
Media Services is scheduled for retirement on 30th June 2024. This is the [guide](https://learn.microsoft.com/en-us/azure/media-services/latest/azure-media-services-retirement). On deeper investigation, Video Indexer, which is the service we use that sits on top of Media Services, will switch away from this before the end date....
```
Is Azure Video Indexer being retired?
No, Azure Video Indexer isn't part of the Media Services retirement. Although Video Indexer currently relies on a Media Services account as part of its workflow, this dependency will be eliminated before Media Services is retired on June 30, 2024. See the following for more [impact of Media Services retirement for Video Indexer](https://aka.ms/vi-ams-retirement-announcement)
```
As of today, Video Indexer still requires a Media Services service to be created, and so we can't remove it from bicep deployment. We will need to assess closer to the date if VI is working without the service and we can then remove the dependency.
The error is interesting as it seems to indicate the media service cannot be created. This is not the case, it does work in regions where VI and Media Services are available. I have updated this to an enhancement and we will add a ticket to the board to action this when VI can be deployed without this supporting service.
## Error: Token limit often exceeded with PDF files
### Solution:
The root of this is table processing. If a table is greater than our target token count for a chunk, this is not respected. Essentially, tables are not chunked, but treated as units. We have added a task to our board to split tables by chunk size and repeat the table header rows in each chunk.
When we switched to using unstructured.io for non-PDF documents, we were aware of the same issue there. They were planning on adding this feature. So, we need to make the change in our code, and follow up with unstructured to confirm if this has been fixed and update that path also.
This issue has been updated to an enhancement.
18 changes: 13 additions & 5 deletions docs/webapp_debug.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,23 @@ The app consists of two layers, namely the frontend user interface components an

To debug the webapp, both frontend and backend, first set breakpoints in your code under the frontend and/or backend. Select the 'Run & Debug' tab from the sidebar in VS Code. Select Python: Flask from the dropdown and hit run. This will initiate local debugging of the backend code.

![backend debugging](/docs/images/webapp_debug_1.png)
Next, verify you have a virtual environment created, which should be seen as a folder called .venv under the root of your workspace. If this doesn't exist, you can create one by following these steps:

Next, you will need to initiate debugging of the frontend code. To do this select 'Vite: Debug' from the drop down and hit run.

![frontend debugging](/docs/images/webapp_debug_2.png)
1. Opening the command palette (Ctrl+Shift+P)
1. Select the command Python: Create Environment
1. Next select Venv
1. Now select the latest version of Python from the list
1. Finally enter check marks next to all requirements.txt files listed and hit OK

This will initiate frontend running and debugging. A browser will open and show the web app running under localhost:5000. Next, proceed to interact with the web app by asking a question. In the VS Code interface, your code will hit the breakpoints, frontend or backend, and you will be able to view variables, trace logic, etc. You can switch between the two running debuggers by selecting frontend or backend (flask or vite) from the debug dropdown.

![frontend debugging](/docs/images/webapp_debug_3.png)
Now initiate debugging of the frontend code by selecting 'Frontend: watch' and then hitting run.
![backend debugging](/docs/images/frontend-watch.png)

Finally hit Vite: Debug
![backend debugging](/docs/images/vite-debug.png)

A browser will open and show the web app running under localhost:5000. Next, proceed to interact with the web app by asking a question. In the VS Code interface, your code will hit the breakpoints, frontend or backend, and you will be able to view variables, trace logic, etc. You can switch between the two running debuggers by selecting frontend or backend (flask or vite) from the debug dropdown.

## Known Issues

Expand Down
2 changes: 1 addition & 1 deletion functions/TextEnrichment/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ def main(msg: func.QueueMessage) -> None:

statusLog.upsert_document(
blob_path,
f"{FUNCTION_NAME} - Text enrichment is complete",
f"{FUNCTION_NAME} - Text enrichment is complete, message sent to embeddings queue",
StatusClassification.DEBUG,
State.QUEUED,
)
Expand Down
23 changes: 16 additions & 7 deletions functions/shared_code/status_log.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
class State(Enum):
""" Enum for state of a process """
PROCESSING = "Processing"
INDEXING = "Indexing"
SKIPPED = "Skipped"
QUEUED = "Queued"
COMPLETE = "Complete"
Expand Down Expand Up @@ -155,6 +156,9 @@ def upsert_document(self, document_path, status, status_classification: StatusCl
if json_document['state'] != state.value:
json_document['state'] = state.value
json_document['state_timestamp'] = str(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

# Update state description with latest status
json_document['state_description'] = status

# Append a new item to the array
status_updates = json_document["status_updates"]
Expand All @@ -176,7 +180,7 @@ def upsert_document(self, document_path, status, status_classification: StatusCl
"file_name": base_name,
"state": str(state.value),
"start_timestamp": str(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
"state_description": "",
"state_description": status,
"state_timestamp": str(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
"status_updates": [
{
Expand All @@ -194,7 +198,7 @@ def upsert_document(self, document_path, status, status_classification: StatusCl
"file_name": base_name,
"state": str(state.value),
"start_timestamp": str(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
"state_description": "",
"state_description": status,
"state_timestamp": str(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
"status_updates": [
{
Expand All @@ -208,29 +212,34 @@ def upsert_document(self, document_path, status, status_classification: StatusCl

#self.container.upsert_item(body=json_document)
self._log_document[document_id] = json_document

def update_document_state(self, document_path, state_str):


def update_document_state(self, document_path, status, state=State.PROCESSING):
"""Updates the state of the document in the storage"""
try:
document_id = self.encode_document_id(document_path)
logging.info(f"{state_str} DocumentID - {document_id}")
document_id = self.encode_document_id(document_path)
logging.info(f"{status} DocumentID - {document_id}")
if self._log_document.get(document_id, "") != "":
json_document = self._log_document[document_id]
json_document['state'] = state_str

json_document['state'] = state.value
json_document['state_description'] = status
json_document['state_timestamp'] = str(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
self.save_document(document_path)
self._log_document[document_id] = json_document
else:
logging.warning(f"Document with ID {document_id} not found.")
except Exception as err:
logging.error(f"An error occurred while updating the document state: {str(err)}")
logging.error(f"An error occurred while updating the document state: {str(err)}")


def save_document(self, document_path):
"""Saves the document in the storage"""
document_id = self.encode_document_id(document_path)
self.container.upsert_item(body=self._log_document[document_id])
self._log_document[document_id] = ""


def get_stack_trace(self):
""" Returns the stack trace of the current exception"""
Expand Down

0 comments on commit c5ac17a

Please sign in to comment.