In [1]:
from datetime import datetime
import csv
import pandas as pd
import os

In [2]:
# Results file that collects one row per analyzed bill.
csv_file = '../files/isp_azure.csv'
if not os.path.isfile(csv_file):
    # First run: make sure the target folder exists (to_csv does not create
    # missing directories), then write a header-only CSV so that later cells
    # can append rows without repeating the header.
    os.makedirs(os.path.dirname(csv_file), exist_ok=True)
    df = pd.DataFrame(columns=['image_path', 'ISP', 'date', 'amount'])
    df.to_csv(csv_file, index=False)
else:
    # The file already exists: load the rows collected so far.
    df = pd.read_csv(csv_file)

In [3]:
def convert_date(date_str):
    """Convert a "day/month/year" date string to "year-month-day" format.

    Args:
        date_str: Date text such as "25/12/2023".

    Returns:
        The date formatted as "%Y-%m-%d" when the input parses as "%d/%m/%Y".
        Any other input is returned unchanged: strings already in
        "year-month-day" format, strings that are not valid dates, and
        non-string values such as None or NaN.
    """
    try:
        date_obj = datetime.strptime(date_str, "%d/%m/%Y")
    except (ValueError, TypeError):
        # ValueError: the text does not match day/month/year.
        # TypeError: the value is not a string at all (e.g. a missing cell).
        return date_str
    return date_obj.strftime("%Y-%m-%d")

In [4]:
folder_path = '../Data/All_bills'

# os.listdir returns entries in arbitrary order; sort them so the images are
# processed (and their rows appended) in the same order on every run.
file_list = sorted(os.listdir(folder_path))

# Keep only image files, matched case-insensitively by extension
# (add more extensions to the list if needed).
image_extensions = ['.jpg', '.jpeg', '.png', '.gif', '.bmp']
image_files = [f for f in file_list if f.lower().endswith(tuple(image_extensions))]


In [5]:
!pip install azure-ai-formrecognizer==3.3.0

Collecting azure-ai-formrecognizer==3.3.0
  Obtaining dependency information for azure-ai-formrecognizer==3.3.0 from https://files.pythonhosted.org/packages/ec/d6/255f4afed1e4d0e2aaf1b988d4c5ae453b65d9b92a3370634d7523804fd7/azure_ai_formrecognizer-3.3.0-py3-none-any.whl.metadata
  Downloading azure_ai_formrecognizer-3.3.0-py3-none-any.whl.metadata (61 kB)
     ---------------------------------------- 0.0/61.8 kB ? eta -:--:--
     ------------ ------------------------- 20.5/61.8 kB 222.6 kB/s eta 0:00:01
     ------------------------- ------------ 41.0/61.8 kB 330.3 kB/s eta 0:00:01
     -------------------------------------- 61.8/61.8 kB 412.5 kB/s eta 0:00:00
Collecting azure-core<2.0.0,>=1.23.0 (from azure-ai-formrecognizer==3.3.0)
  Obtaining dependency information for azure-core<2.0.0,>=1.23.0 from https://files.pythonhosted.org/packages/9c/f8/1cf23a75cb8c2755c539ac967f3a7f607887c4979d073808134803720f0f/azure_core-1.29.5-py3-none-any.whl.metadata
  Downloading azure_core-1.29.5-py

In [6]:
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient

# Credentials are read from the environment so the secret never lives in the
# notebook. A key was previously committed in this cell; it should be treated
# as compromised and rotated in the Azure portal.
# For more information on storing credentials securely, see
# https://docs.microsoft.com/en-us/azure/cognitive-services/cognitive-services-security?tabs=command-line%2Ccsharp#environment-variables-and-application-configuration
endpoint = os.environ.get(
    "AZURE_FORM_RECOGNIZER_ENDPOINT",
    "https://thilakna-doc-intelligence-instance.cognitiveservices.azure.com/",
)
key = os.environ.get("AZURE_FORM_RECOGNIZER_KEY")
if not key:
    raise RuntimeError(
        "Set the AZURE_FORM_RECOGNIZER_KEY environment variable before running this cell."
    )
model_id = "ISP-M1"

# Initialize the DocumentAnalysisClient
document_analysis_client = DocumentAnalysisClient(
    endpoint=endpoint, credential=AzureKeyCredential(key)
)

In [8]:
# Analyze every bill image with the custom model and append one CSV row per
# document the service returns.

# Images already recorded in the CSV are skipped, so re-running this cell does
# not call the service again or append duplicate rows for them.
processed_paths = set(pd.read_csv(csv_file)['image_path'])

for image_file in image_files:
    # Construct the full path to the image file
    image_path = os.path.join(folder_path, image_file)
    if image_path in processed_paths:
        print(f"Skipping image {image_file} as it's already in the CSV.")
        continue

    try:
        # Analyze the document using the custom model
        with open(image_path, "rb") as document:
            poller = document_analysis_client.begin_analyze_document(model_id, document)
        result = poller.result()

        for analyzed_document in result.documents:
            # Prefer the parsed value; fall back to the raw text only when no
            # value was extracted (a parsed value of 0 is still a valid value).
            data = [
                field.value if field.value is not None else field.content
                for field in analyzed_document.fields.values()
            ]

            # Rows are appended without a header, so the position of each value
            # decides which CSV column (image_path, ISP, date, amount) it lands in.
            # NOTE(review): the previous code labelled these positions
            # date/amount/isp, which does not match the header order; the written
            # order is unchanged here. Confirm which model field each index holds.
            row = pd.DataFrame([[image_path, data[2], data[0], data[1]]])
            row.to_csv(csv_file, mode='a', header=False, index=False)
    except Exception as e:
        # Best effort: report the failing image and continue with the next one.
        print(f"Failed to process {image_path}: {e}")



In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
# Load the previously saved ISP data CSV and display it.
df_old = pd.read_csv(filepath_or_buffer='../files/isp_data_2.csv')
df_old

Unnamed: 0,oimage_path,ISP,date,amount
0,../Data/batch_process\upload_image_00467868600...,SLT,2022-11-02,4044.86
1,../Data/batch_process\upload_image_1.png.png,Dialog,2021-12-48,2890.44
2,../Data/batch_process\upload_image_10.png.png,Dialog,2023-01-04,2800.0
3,../Data/batch_process\upload_image_11.png.png,Dialog,2023-01-26,4200.0
4,../Data/batch_process\upload_image_12.png.png,Dialog,2023-02-27,4000.0
5,../Data/batch_process\upload_image_2.png.png,Dialog,2022-03-25,500.0
6,../Data/batch_process\upload_image_3.png.png,Dialog,2022-05-31,200.0
7,../Data/batch_process\upload_image_4.png.png,Dialog,2022-07-04,2228.56
8,../Data/batch_process\upload_image_5.png.png,Dialog,,200.0
9,../Data/batch_process\upload_image_6.png.png,Dialog,2022-08-07,200.0


In [25]:
# Load the rows written by the Azure analysis loop; csv_file is the same
# '../files/isp_azure.csv' path defined when the results file was created.
df_new = pd.read_csv(csv_file)

In [26]:
# Relabel every ISP value containing "MOBITEL" as "SLT".
# na=False makes rows with a missing ISP count as non-matching; without it the
# mask contains NaN and .loc raises instead of selecting rows.
df_new.loc[df_new["ISP"].str.contains("MOBITEL", na=False), "ISP"] = "SLT"

In [27]:
from dateutil import parser


def _to_day_month_year(value):
    """Format one date cell as 'DD-MM-YYYY'; missing values pass through unchanged."""
    if pd.isna(value):
        # Bills without an extracted date stay missing instead of raising.
        return value
    try:
        # Already in the target layout (e.g. this cell was run before): keep it.
        # Re-parsing 'DD-MM-YYYY' with dateutil's month-first default would swap
        # day and month on every re-run.
        # NOTE(review): this also treats raw 'NN-NN-YYYY' input as day-first.
        return datetime.strptime(value, '%d-%m-%Y').strftime('%d-%m-%Y')
    except ValueError:
        return parser.parse(value).strftime('%d-%m-%Y')


df_new['date'] = df_new['date'].apply(_to_day_month_year)
df_new

Unnamed: 0,image_path,ISP,date,amount
0,../Data/All_bills\0046786860019XImage-1.png,SLT,11-02-2022,4044.86
1,../Data/All_bills\1.png,Dialog,16-12-2021,2890.44
2,../Data/All_bills\10.png,Dialog,04-01-2023,2800.0
3,../Data/All_bills\11.png,Dialog,26-01-2023,1200.0
4,../Data/All_bills\12.png,Dialog,27-02-2023,1000.0
5,../Data/All_bills\2.png,Dialog,23-03-2022,500.0
6,../Data/All_bills\3.png,Dialog,31-05-2022,800.0
7,../Data/All_bills\4.png,Dialog,04-07-2022,2226.56
8,../Data/All_bills\5.png,Dialog,15-07-2022,800.0
9,../Data/All_bills\6.png,Dialog,07-08-2022,800.0
