In [None]:
!pip install -qU --upgrade azure-cognitiveservices-vision-computervision
!pip install -qU pillow
!pip install -qU papermill

In [None]:
import ast
import os
import sys
import time
from array import array
from getpass import getpass
from xml.etree import ElementTree as ET

import numpy as np
import pandas as pd
from azure.cognitiveservices.vision.computervision import ComputerVisionClient
from azure.cognitiveservices.vision.computervision.models import OperationStatusCodes
from azure.cognitiveservices.vision.computervision.models import VisualFeatureTypes
from google.colab import drive
from msrest.authentication import CognitiveServicesCredentials
from PIL import Image

In [None]:
# Mount Google Drive at /content/drive; the qa.csv inputs and the Excel outputs
# used further down live under its MyDrive folder.
drive.mount("/content/drive")

This notebook runs OCR processing over all historical data from a single cell. List ALL of the semester names you want to process in the cell below (in the same format as the example) so that every semester is processed in one run.

In [None]:
# Semesters to process. Each name is the folder under data_(phase_4)/ from which
# qa.csv is read and into which excel_simple_ocr.xlsx is written.
class_names = [
    'data8_sp24_multiturn',
    'data8_fa23_multiturn',
]

Please supply your own Azure Computer Vision subscription key in the next cell before running it.

In [None]:
# Credentials: the key is never written into the notebook. It is taken from the
# VISION_KEY environment variable when set (useful for unattended runs) and
# otherwise requested through a hidden prompt.
subscription_key = os.environ.get("VISION_KEY") or getpass("Azure Computer Vision subscription key: ")
endpoint = "https://edllm-ocr-v1.cognitiveservices.azure.com/"

computervision_client = ComputerVisionClient(endpoint, CognitiveServicesCredentials(subscription_key))

# Smoke test: run the Read (OCR) operation on a public sample image so that a
# bad key or endpoint is detected here rather than during the long batch run.
print("===== Read File - remote =====")
read_image_url = "https://learn.microsoft.com/azure/ai-services/computer-vision/media/quickstarts/presentation.png"

# read() only submits the job; the operation id needed to fetch the result is the
# last path segment of the Operation-Location response header.
read_response = computervision_client.read(read_image_url,  raw=True)
read_operation_location = read_response.headers["Operation-Location"]
operation_id = read_operation_location.split("/")[-1]

# Poll until the operation leaves the notStarted/running states, pausing between
# polls instead of busy-waiting against the service.
while True:
    read_result = computervision_client.get_read_result(operation_id)
    if read_result.status not in ['notStarted', 'running']:
        break
    time.sleep(1)

if read_result.status == OperationStatusCodes.succeeded:
    for text_result in read_result.analyze_result.read_results:
        for line in text_result.lines:
            print(line.text)
            print(line.bounding_box)
else:
    # Make a failed smoke test visible instead of ending silently.
    print(f"Read operation did not succeed (status: {read_result.status})")

In [None]:
# Cache shared across my_python_tool calls: image URL -> OCR text stored for that
# URL, so an image that appears again is not sent to the OCR service a second time.
# Re-running this cell empties the cache.
memoized_dict = dict()

In [None]:
def my_python_tool(question: str, xml: str) -> tuple:
    """Return the question text together with the OCR text of the images in ``xml``.

    Every ``<image>`` element found in ``xml`` has its ``src`` URL sent to the
    Azure Read (OCR) operation through the notebook-level ``computervision_client``
    (no key is stored in this function). The recognised lines of all images are
    concatenated, in document order and without a separator.

    Per-image results are cached in the notebook-level ``memoized_dict`` keyed by
    image URL. Only the text of that one image is cached, so a cache hit never
    replays text that belonged to other images.

    Args:
        question: The question text; it is converted with ``str()``.
        xml: XML document string that is searched for ``<image src="...">`` elements.

    Returns:
        A ``(question, extracted_text)`` tuple. ``extracted_text`` is ``''`` when
        the document has no images or none of them could be read.

    Raises:
        xml.etree.ElementTree.ParseError: If ``xml`` is not well-formed XML.
        Errors raised while polling for a result are not caught here.
    """
    question = str(question)
    root = ET.fromstring(xml)
    image_links = [image.get('src') for image in root.iter('image')]
    extracted_text = ''

    for img_link in image_links:
        # Cache hit: reuse the text previously extracted for this image.
        if img_link in memoized_dict:
            extracted_text += memoized_dict[img_link]
            continue

        try:
            read_response = computervision_client.read(img_link,  raw=True)
        except Exception:
            # Best effort: an image that cannot be submitted is reported and skipped.
            # Catching Exception (not a bare except) keeps Ctrl-C / kernel interrupt
            # working during a long batch run.
            print(f"Failed question: {question}")
            continue

        # The operation id is the last path segment of the Operation-Location header.
        read_operation_location = read_response.headers["Operation-Location"]
        operation_id = read_operation_location.split("/")[-1]

        # Poll until the operation leaves the notStarted/running states, pausing
        # between polls instead of busy-waiting against the service.
        while True:
            read_result = computervision_client.get_read_result(operation_id)
            if read_result.status not in ['notStarted', 'running']:
                break
            time.sleep(1)

        if read_result.status == OperationStatusCodes.succeeded:
            image_text = ''.join(
                str(line.text)
                for text_result in read_result.analyze_result.read_results
                for line in text_result.lines
            )
            # Cache this image's own text only (not the running total).
            memoized_dict[img_link] = image_text
            extracted_text += image_text

    return question, extracted_text

In [None]:
def safe_my_python_tool(row):
    """Turn one row of the Q&A table into an ordered list of conversation turns.

    The row's ``memory`` field is parsed with ``ast.literal_eval`` and every entry
    becomes one turn, followed by a final ``Student`` turn built from the row's
    own ``question`` / ``document_q``. Each turn is a dict with the keys ``role``,
    ``text`` and ``image context`` (in that order); text and image context come
    from ``my_python_tool``. A memory entry is labelled ``Student`` only when it
    was written by a student and is not endorsed, otherwise ``TA``.

    Progress is printed for every row whose index is a multiple of 100, using the
    notebook-level ``qa_data`` for the total. Any exception is reported and turned
    into an empty list so that a single bad row does not abort the whole run.
    """

    def make_turn(role, text, image_context):
        # Keys are inserted in the same order as before: role, text, image context.
        return {'role': role, 'text': text, 'image context': image_context}

    try:
        if row.name % 100 == 0:
            print(f'{row.name}/{len(qa_data)}')

        turns = []
        for post in ast.literal_eval(row['memory']):
            # OCR runs first, then the role is decided from the entry's metadata.
            text, image_context = my_python_tool(post['text'], post['document'])
            unendorsed_student = post['user_role'] == 'student' and not post['endorsed']
            role = 'Student' if unendorsed_student else 'TA'
            turns.append(make_turn(role, text, image_context))

        # The row's own question always closes the conversation as a student turn.
        text, image_context = my_python_tool(row.question, row.document_q)
        turns.append(make_turn('Student', text, image_context))
        return turns
    except Exception as err:
        print(f"Error processing row with index {row.name}: {err}")
        return []

While the next cell is running, you will see a progress counter for each semester (printed every 100 rows) along with any OCR errors. This cell may take a long time to run (several hours).

In [None]:
# Run the OCR pipeline for every semester in class_names and write one Excel
# file per semester next to its qa.csv.

# Absolute path under the /content/drive mount point, so the cell does not
# depend on the kernel's current working directory.
data_root = '/content/drive/MyDrive/EdSupport/Deployment/Ed_Data_Processing/Data/data_(phase_4)'

# Columns read after the OCR pass to build the exported sheet.
required_columns = ['type', 'category', 'answer']

for class_name in class_names:
    class_dir = f'{data_root}/{class_name}'
    qa_data = pd.read_csv(f'{class_dir}/qa.csv')

    # Fail before the (hours-long) OCR pass rather than after it when the export
    # columns are absent.
    missing_columns = [col for col in required_columns if col not in qa_data.columns]
    if missing_columns:
        raise KeyError(f'{class_name}/qa.csv is missing required columns: {missing_columns}')

    print('\n\n===============================================================================================')
    print(f'BEGIN PROCESSING OF {class_name}')
    print('===============================================================================================\n\n')

    # One list of conversation turns per row; safe_my_python_tool reads the
    # global qa_data for its progress output.
    test_ser = qa_data.apply(safe_my_python_tool, axis=1)
    qa_data["QuestionOCR"] = test_ser

    # `data` is the same DataFrame object as qa_data, so the columns below are
    # added to qa_data as well.
    data = qa_data
    data['Metadata'] = 'type:' + data['type'] + ' | category:' + data['category']
    data['Question'] = data['QuestionOCR']
    data['Answer'] = data['answer']

    # Export only the three columns of the simplified sheet.
    xls = data[['Question', 'Answer', 'Metadata']]
    xls.to_excel(f"{class_dir}/excel_simple_ocr.xlsx", index=False)
