In [72]:
import os
import re
import json
import pandas as pd

# Paths
EXTRACTED_DATA_FILE = "../data/processed/extracted_data.json"
CLEANED_DATA_FILE = "../data/processed/cleaned_data.csv"

# Define regex patterns for extracting PO fields
PO_NUMBER_PATTERN = r"(PO-PO-\d+)"  # Pattern to extract PO number from content
DELIVERY_DATE_PATTERN = r"(?i)Delivery by\s+([A-Za-z]+\s+\d{1,2},\s+\d{4})"  # Pattern to capture delivery date
PRODUCT_PATTERN = r"(\d+\s*\d+\s*units\s*of\s*([^\(]+)\s*\()"  # Pattern to capture product and units
ITEM_CODE_PATTERN = r"\(Item Code:\s*([^)]+)\)"  # Pattern to capture Item Code

# Keywords to capture in the Notes column
NOTES_KEYWORDS = ["urgent", "ASAP", "confirm availability"]

def clean_text(text):
    """Clean extracted text: remove extra spaces, special characters, and normalize text."""
    text = text.replace("\n", " ").strip()
    text = re.sub(r'\s+', ' ', text)  # Remove multiple spaces
    return text

def extract_notes(text):
    """Extract notes based on specific keywords."""
    notes = []
    for keyword in NOTES_KEYWORDS:
        if re.search(rf"\b{keyword}\b", text, re.IGNORECASE):  # Case-insensitive search
            notes.append(keyword)
    return ", ".join(notes) if notes else None  # Join keywords into a single string

def extract_fields(text):
    """Extract key fields from text using regex."""
    po_number = re.search(PO_NUMBER_PATTERN, text)
    delivery_date = re.search(DELIVERY_DATE_PATTERN, text)

    # Extract product, units, and item codes
    product_matches = re.findall(PRODUCT_PATTERN, text)  # Extract product and units
    item_codes = re.findall(ITEM_CODE_PATTERN, text)  # Extract item codes

    # Combine product, units, and item codes into a list of dictionaries
    items = []
    for i, (unit_product, product) in enumerate(product_matches):
        if i < len(item_codes):  # Ensure there's a corresponding item code
            unit = re.search(r"\d+", unit_product).group()  # Extract units from the match
            items.append({
                "Product": product.strip(),
                "Units": unit,
                "Item Code": item_codes[i]
            })

    # Extract notes
    notes = extract_notes(text)

    return {
        "PO Number": po_number.group(1) if po_number else None,
        "Delivery Date": delivery_date.group(1) if delivery_date else None,
        "Items": items,  # List of dictionaries containing product, units, and item code
        "Notes": notes  # Add extracted notes
    }

dff = pd.DataFrame()
def clean_and_structure_data():
    """Load extracted data, clean and structure it, and save to CSV."""
    if not os.path.exists(EXTRACTED_DATA_FILE):
        print(f"Error: {EXTRACTED_DATA_FILE} not found!")
        return

    with open(EXTRACTED_DATA_FILE, "r", encoding="utf-8") as f:
        raw_data = json.load(f)

    structured_data = []
    for entry in raw_data:
        filename = entry["filename"]
        content = clean_text(entry["content"])
        extracted_fields = extract_fields(content)

        # Extract PO number from filename (e.g., "PO_123_ClientName.txt" -> "PO_123")
        po_from_filename = re.search(r"PO_\d+", filename)
        po_number = po_from_filename.group(0) if po_from_filename else None

        # Extract client name from filename (e.g., "PO_123_ClientName.txt" -> "ClientName")
        client_name = re.sub(r"\.\w+$", "", filename)    # Remove .txt extension
        client_name = re.sub(r"_PO_\d+$", "", client_name)  # Remove "_PO_<digits>" suffix

        # Add extracted fields to the result
        extracted_fields["PO Number"] = po_number  # Override PO number from filename
        extracted_fields["Client Name"] = client_name
        structured_data.append(extracted_fields)

    # Convert structured data to a DataFrame for CSV output
    df = pd.DataFrame(structured_data)

    # Normalize item details for better CSV formatting
    df_items = df.explode("Items")  # Explode the list of items into separate rows
    df_items = pd.concat([df_items.drop(columns=["Items"]), df_items["Items"].apply(pd.Series)], axis=1)
    dff = df_items
    print(df_items.to_string())


    # Save the cleaned data to a CSV file
    #df_items.to_csv(CLEANED_DATA_FILE, index=False)

clean_and_structure_data()

   PO Number      Delivery Date                 Notes           Client Name             Product Units Item Code
0      PO_12  February 09, 2025                  None      Allen_Consulting  Pneumatic Cylinder   197   MA-2200
0      PO_12  February 09, 2025                  None      Allen_Consulting     Compressor Unit   322   MA-2200
0      PO_12  February 09, 2025                  None      Allen_Consulting      Hydraulic Pump   298   CU-5643
1      PO_27  February 03, 2025                  None      Allen_Consulting      Hydraulic Pump   321   MA-2200
2      PO_09  February 20, 2025                  None   Anderson_Industries  Pneumatic Cylinder   270   PC-1122
2      PO_09  February 20, 2025                  None   Anderson_Industries        Gear Reducer   446   MA-2200
3      PO_22               None                urgent   Anderson_Industries        Gear Reducer    73   CB-3300
4      PO_52   January 29, 2025                  None   Anderson_Industries       Control Valve    53   

In [69]:
import re

filename = "PO_123_ClientName.txt"

# Remove file extension
client_name = re.sub(r"\.\w+$", "", filename)  

# Remove "PO_" followed by digits and an underscore
client_name = re.sub(r"^PO_\d+_", "", client_name)  

print(client_name) 

ClientName


#using deployed AZURE NER API

In [6]:
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------

"""
FILE: sample_recognize_custom_entities.py

DESCRIPTION:
    This sample demonstrates how to recognize custom entities in documents.
    Recognizing custom entities is also available as an action type through the begin_analyze_actions API.

    For information on regional support of custom features and how to train a model to
    recognize custom entities, see https://aka.ms/azsdk/textanalytics/customentityrecognition

USAGE:
    python sample_recognize_custom_entities.py

    Set the environment variables with your own values before running the sample:
    1) AZURE_LANGUAGE_ENDPOINT - the endpoint to your Language resource.
    2) AZURE_LANGUAGE_KEY - your Language subscription key
    3) CUSTOM_ENTITIES_PROJECT_NAME - your Language Studio project name
    4) CUSTOM_ENTITIES_DEPLOYMENT_NAME - your Language Studio deployment name
"""


def sample_recognize_custom_entities() -> None:
    # [START recognize_custom_entities]
    import os
    from azure.core.credentials import AzureKeyCredential
    from azure.ai.textanalytics import TextAnalyticsClient

    RAW_DATA_DIR = "../data/raw/Thomas_Group_PO_16.txt"
    endpoint = os.environ[YOUR_ENDPOINT_VALUE]
    key = os.environ[YOUR_KEY_VALUE]
    project_name = os.environ[YOUR_PROJECT_NAME]
    deployment_name = os.environ[YOUR_DEPLOYMENT_NAME]
    path_to_sample_document = os.path.join(RAW_DATA_DIR)

    text_analytics_client = TextAnalyticsClient(
        endpoint=endpoint,
        credential=AzureKeyCredential(key),
    )

    with open(path_to_sample_document) as fd:
        document = [fd.read()]

    poller = text_analytics_client.begin_recognize_custom_entities(
        document,
        project_name=project_name,
        deployment_name=deployment_name
    )

    document_results = poller.result()
    for custom_entities_result in document_results:
        if custom_entities_result.kind == "CustomEntityRecognition":
            for entity in custom_entities_result.entities:
                print(
                    "Entity '{}' has category '{}' with confidence score of '{}'".format(
                        entity.text, entity.category, entity.confidence_score
                    )
                )
        elif custom_entities_result.is_error is True:
            print("...Is an error with code '{}' and message '{}'".format(
                custom_entities_result.error.code, custom_entities_result.error.message
                )
            )
    # [END recognize_custom_entities]


if __name__ == "__main__":
    sample_recognize_custom_entities()

ImportError: cannot import name 'PiiEntityCategory' from 'azure.ai.textanalytics._generated.models' (c:\Users\91940\anaconda3\envs\novac\lib\site-packages\azure\ai\textanalytics\_generated\models.py)