In [1]:
# Using Azure Document Intelligence to Extract Data
# https://learn.microsoft.com/en-us/python/api/overview/azure/ai-documentintelligence-readme?view=azure-python-preview&viewFallbackFrom=azure-python
import json
import pandas as pd
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient
import os
import glob


In [2]:
# Load variables from a local .env file into the process environment so the
# os.environ lookups in the following cell can find them.
from dotenv import load_dotenv
load_dotenv() 

True

In [3]:
# Connection settings for the Azure service, read from the environment.
# os.environ[...] raises KeyError immediately if either variable is unset,
# so a missing configuration is caught here rather than at the first API call.
endpoint = os.environ["DOCUMENTINTELLIGENCE_ENDPOINT"]
key = os.environ["DOCUMENTINTELLIGENCE_KEY"]


In [4]:
# Input folder holding the source PDFs and output folder for the generated CSVs.
BURSAPDF_DATA_FOLDER_PATH = "bursapdf/"
BURSACSV_DATA_FOLDER_PATH = "bursacsv/"

# exist_ok=True makes the cell idempotent without a separate (racy) existence
# check, and still raises if the path exists but is not a directory.
os.makedirs(BURSACSV_DATA_FOLDER_PATH, exist_ok=True)

In [5]:
def extract_tables_from_pdf_with_di(endpoint, key, file_path):
    """Analyze a document with the "prebuilt-layout" model and return its tables.

    Args:
        endpoint: Service endpoint passed to ``DocumentAnalysisClient``.
        key: API key, wrapped in an ``AzureKeyCredential``.
        file_path: Path of the document to analyze; it is opened in binary mode.

    Returns:
        A list with one dict per detected table::

            {"cells": [{"row_index": ..., "column_index": ..., "content": ...}, ...],
             "column_count": ...}

        The list is empty when the result contains no tables.
    """
    # Use the client as a context manager so its underlying transport is
    # closed when the analysis is done instead of leaking on every call.
    with DocumentAnalysisClient(endpoint, AzureKeyCredential(key)) as document_analysis_client:
        with open(file_path, "rb") as f:
            poller = document_analysis_client.begin_analyze_document(
                "prebuilt-layout", document=f
            )
        # Wait for the long-running operation while the client is still open.
        result = poller.result()

    return [
        {
            "cells": [
                {
                    "row_index": cell.row_index,
                    "column_index": cell.column_index,
                    "content": cell.content,
                }
                for cell in table.cells
            ],
            "column_count": table.column_count,
        }
        # `or []` guards against a result whose `tables` attribute is None.
        for table in (result.tables or [])
    ]

In [9]:
def convert_to_dataframe(data_json):
    """Turn extracted table records into one combined DataFrame.

    Each record's ``"cells"`` list is pivoted into a grid (rows by
    ``row_index``, columns by ``column_index``). The first grid row is used as
    the column header and removed from the data; the remaining rows of all
    tables are concatenated with a fresh 0..n-1 index.

    Args:
        data_json: Iterable of dicts, each with a ``"cells"`` list whose items
            have ``"row_index"``, ``"column_index"`` and ``"content"`` keys.

    Returns:
        The concatenated DataFrame, or an empty DataFrame when ``data_json``
        is empty or none of its tables contain any cells.
    """
    table_frames = []
    for record in data_json:
        cells = record["cells"]
        if not cells:
            # A table without cells has no header row to promote; skip it
            # instead of failing on the column selection below.
            continue
        cells_df = pd.DataFrame(cells)[["row_index", "column_index", "content"]]
        grid = cells_df.pivot(index="row_index", columns="column_index", values="content")
        # Promote the first row to the header, then drop that same row
        # positionally so header and data stay consistent whatever its label.
        grid.columns = grid.iloc[0]
        grid = grid.iloc[1:].reset_index(drop=True)
        table_frames.append(grid)

    if not table_frames:
        # pd.concat raises ValueError on an empty list.
        return pd.DataFrame()
    return pd.concat(table_frames, ignore_index=True)

In [10]:

# Get all PDF files in the BURSAPDF_DATA_FOLDER_PATH.
# glob returns matches in arbitrary order, so sort them to process the files
# in the same order on every run.
pdf_files = sorted(glob.glob(os.path.join(BURSAPDF_DATA_FOLDER_PATH, "*.pdf")))



In [12]:
# Convert every PDF into a CSV of its extracted tables.
# Errors are handled per file so that one bad document does not stop the
# remaining files from being processed.
for pdf_file in pdf_files:
    # Derive the CSV path from the PDF file name, swapping only the extension
    # (str.replace would also rewrite ".pdf" occurring elsewhere in the name).
    csv_name = os.path.splitext(os.path.basename(pdf_file))[0] + ".csv"
    csv_file = os.path.join(BURSACSV_DATA_FOLDER_PATH, csv_name)

    # Check if the CSV file already exists
    if os.path.exists(csv_file):
        print(f"File {csv_file} already exists. Skipping.")
        continue

    try:
        # Extract tables from the PDF
        extracted_data = extract_tables_from_pdf_with_di(endpoint, key, pdf_file)

        # Convert the extracted data to a dataframe
        df = convert_to_dataframe(extracted_data)

        # Do not leave an empty CSV behind: it would mark the PDF as already
        # processed on the next run.
        if df.empty:
            print(f"No table data extracted from {pdf_file}. Skipping.")
            continue

        # Save the dataframe to a CSV file
        df.to_csv(csv_file, index=False)
    except Exception as e:
        # Report which file failed and carry on with the next one.
        print(f"An error occurred while processing {pdf_file}: {e}")


File bursacsv/listed_companies_ip_2024-08-01.csv already exists. Skipping.
File bursacsv/listed_companies_ip_2024-08-02.csv already exists. Skipping.
File bursacsv/listed_companies_ip_2024-08-05.csv already exists. Skipping.
