In [4]:
from dotenv import dotenv_values
from pathlib import Path
import os
import pandas as pd
from azure.keyvault.secrets import SecretClient
from azure.identity import DefaultAzureCredential
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient

# Path to the .env file that holds the notebook settings. It is a relative
# path, so it is resolved against the current working directory.
env_name = "../../.env" # change this to the path of your own .env file
# Load the settings from that file; later cells look them up as config["<NAME>"].
config = dotenv_values(env_name)

# Extract data and context using Azure Form Recognizer

This code sample shows Prebuilt Document operations with the Azure Form Recognizer client library. 
The async versions of the samples require Python 3.6 or later.

To learn more, please visit the documentation - Quickstart: Form Recognizer Python client library SDKs
https://docs.microsoft.com/en-us/azure/applied-ai-services/form-recognizer/quickstarts/try-v3-python-sdk


In [5]:
"""
Remember to remove the key from your code when you're done, and never post it publicly. For production, use
secure methods to store and access your credentials. For more information, see 
https://docs.microsoft.com/en-us/azure/cognitive-services/cognitive-services-security?tabs=command-line%2Ccsharp#environment-variables-and-application-configuration
"""

# Resolve the Form Recognizer `endpoint` and `key` used by the later cells.
# KEYS_FROM == "KEYVAULT" selects Azure Key Vault; any other value falls back
# to reading both settings straight from the .env configuration.
if config['KEYS_FROM'] != "KEYVAULT":
    print('.env was selected.')

    endpoint = config["AZURE_FORM_RECOGNIZER_ENDPOINT"]
    key = config["AZURE_FORM_RECOGNIZER_KEY"]
else:
    print('keyvault was selected.')

    # Build the vault URL from the vault name configured in .env
    keyVaultName = config["KEY_VAULT_NAME"]
    KVUri = f"https://{keyVaultName}.vault.azure.net"

    credential = DefaultAzureCredential()
    client = SecretClient(vault_url=KVUri, credential=credential)

    # Secret names in the vault use dashes where the .env keys use underscores
    endpoint = client.get_secret("AZURE-FORM-RECOGNIZER-ENDPOINT").value
    key = client.get_secret("AZURE-FORM-RECOGNIZER-KEY").value


keyvault was selected.


##  Read pdf files using Azure Form Recognizer and split into chunks 
Azure Form Recognizer reads each PDF file; the extracted text is then split into overlapping chunks of lines, and the page number and line number of the first line of every chunk are saved alongside it.

In [6]:
import re

################################################################################
#################### Helper Functions ##########################################
################################################################################

# Read pdf files
def analyze_pdf(doc_path):
    '''
    Input: doc_path, path of the pdf file to analyze
    Output: the result object of a "prebuilt-document" analysis of that file

    Uses the module-level `document_analysis_client`, which must be defined
    before this function is called.
    '''
    with open(doc_path, "rb") as pdf_stream:
        analysis_poller = document_analysis_client.begin_analyze_document(
            "prebuilt-document", document=pdf_stream
        )

    # The result is fetched after the file handle has been closed
    return analysis_poller.result()

# Extract stock symbol, year, and quarter from filename
def extract_info_from_filename(filename):
    '''
    Input: filename such as "MSFTTranscriptFY23Q4"
    Output: tuple of strings (symbol, fiscal_year, fiscal_quarter), e.g. ("MSFT", "23", "4"),
            or None when the filename does not contain the expected pattern
    '''
    found = re.search(r'([A-Z]+)TranscriptFY(\d{2})Q(\d)', filename)

    # Guard clause: nothing to extract from names that do not follow the convention
    if found is None:
        return None

    # The three capture groups are, in order: ticker symbol, two-digit year, quarter digit
    ticker, year_digits, quarter_digit = found.groups()
    return ticker, year_digits, quarter_digit

# Extract line number and page number
def create_line_page_tuples(result):
    '''
    Input: result of the analyze_pdf function; it is read through result.pages,
           each page's .lines, and each line's .content
    Output: list of tuples (line, page_num, line_num), one per line of the document,
            in page order and then line order. Both numbers are 1-based, and
            line_num restarts at 1 on every page.
    '''
    return [
        (entry.content, page_no, line_no)
        for page_no, page in enumerate(result.pages, start=1)
        for line_no, entry in enumerate(page.lines, start=1)
    ]

# Retrieve page number and chunks
def chunk_with_page_number(line_page_tuples, chunk_length=10, chunk_overlap=2):
    '''
    Given the list of tuples of the form (line, page_num, line_num) and chunk length and overlap,
    create chunks of text tagged with the page number and line number of the first line in the chunk.

    line_page_tuples: list of (line, page_num, line_num) tuples
    chunk_length: number of lines in each chunk (must be at least 1)
    chunk_overlap: number of lines shared by consecutive chunks
                   (must satisfy 0 <= chunk_overlap < chunk_length)

    Output: list of tuples (chunk_text, page_num, line_num). Every line in a chunk is
            followed by a single space, so chunk_text ends with a trailing space.
            The last chunk may hold fewer than chunk_length lines.
            An empty input gives an empty list.

    Raises ValueError when chunk_length or chunk_overlap is out of range. Without this
    check the start position would never advance and the loop would not terminate.
    '''
    if chunk_length < 1:
        raise ValueError(f"chunk_length must be at least 1, got {chunk_length}")
    if not 0 <= chunk_overlap < chunk_length:
        raise ValueError(
            f"chunk_overlap must be in the range [0, chunk_length), got {chunk_overlap}"
        )

    total_lines = len(line_page_tuples)
    chunks = []
    start = 0
    while start < total_lines:
        window = line_page_tuples[start:start + chunk_length]

        # The chunk is tagged with the page and line number of its first line
        page_number, line_number = window[0][1:]
        chunk_text = "".join(entry[0] + " " for entry in window)
        chunks.append((chunk_text, page_number, line_number))

        end = start + len(window)
        if end >= total_lines:
            break
        # Step back so the next chunk repeats the last chunk_overlap lines of this one
        start = end - chunk_overlap
    return chunks


In [7]:
# Define document analysis client
document_analysis_client = DocumentAnalysisClient(
        endpoint=endpoint, credential=AzureKeyCredential(key)
    )

doc_dir = Path("DATA/")
output_dir = "AnalyzedPDF/"

# Sort the listing: os.listdir returns entries in arbitrary order, and the 'Id'
# column assigned below depends on the order in which the files are processed.
pdf_files = sorted(filename for filename in os.listdir(doc_dir) if filename.endswith('.pdf'))
if not pdf_files:
    # Fail early with a clear message instead of the later "No objects to concatenate" error
    raise FileNotFoundError(f"No .pdf files found in {doc_dir}")

# Create the output folder once, before any csv file is written
os.makedirs(output_dir, exist_ok=True)

dfs = []

for file_name in pdf_files:
    file_stem = os.path.splitext(file_name)[0]
    file_path = os.path.join(doc_dir, file_name)

    # symbol, fiscal_year, fiscal_quarter; "NULL" placeholders when the
    # filename does not follow the <SYMBOL>TranscriptFY<YY>Q<Q> convention
    values = extract_info_from_filename(file_name)
    symbol, fiscal_year, fiscal_quarter = values if values else ("NULL", "NULL", "NULL")

    # analyze the pdf using form recognizer
    result = analyze_pdf(file_path)

    # get the chunks in a tuple of the form (chunk, page_number, line_number)
    line_page_tuples = create_line_page_tuples(result)
    chunks = chunk_with_page_number(line_page_tuples=line_page_tuples, chunk_length=10, chunk_overlap=2)

    # Write results to dataframe
    df_chunks = pd.DataFrame(chunks, columns = ['Chunk', 'PageNumber', 'LineNumber'])
    df_chunks["Ticker"], df_chunks["Year"], df_chunks["Quarter"] = symbol, fiscal_year, fiscal_quarter

    # Reorder dataframe columns
    new_column_order = ['Ticker', 'Year', 'Quarter', 'Chunk', 'PageNumber', 'LineNumber']
    df_chunks = df_chunks[new_column_order]

    # Add every dataframe to the list
    dfs.append(df_chunks)

    # Save the per-file results to csv, never overwriting an existing file
    chunk_csv_path = f"{output_dir}Chunks_{file_stem}.csv"
    if not os.path.exists(chunk_csv_path):
        print('writing the results of: \n' + file_name)
        df_chunks.to_csv(chunk_csv_path, index=False)
    else:
        print(f'File: Chunks_{file_stem}.csv already exists, skipping...')

## Combine all the files
# ignore_index=True already yields a fresh 0..n-1 index
df = pd.concat(dfs, ignore_index=True)
df.insert(0, 'Id', range(1, len(df) + 1)) # Add 1-based 'Id' column

## Save to csv
df.to_csv("AnalyzedPDF/Chunks.csv", index=False)

df.head()


writing the results of: 
MSFTTranscriptFY23Q1.pdf
writing the results of: 
MSFTTranscriptFY23Q2.pdf
writing the results of: 
MSFTTranscriptFY23Q3.pdf
writing the results of: 
MSFTTranscriptFY23Q4.pdf


Unnamed: 0,Id,Ticker,Year,Quarter,Chunk,PageNumber,LineNumber
0,1,MSFT,23,1,Microsoft FY23 First Quarter Earnings Conferen...,1,1
1,2,MSFT,23,1,"On the Microsoft Investor Relations website, y...",1,9
2,3,MSFT,23,1,GAAP. They are included as additional clarifyi...,1,17
3,4,MSFT,23,1,"same in constant currency, we will refer to th...",2,6
4,5,MSFT,23,1,"predictions, projections, or other statements ...",2,14
