### Libraries

In [2]:
# import required libraries
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential
from azure.ai.ml import MLClient, Input
from azure.ai.ml.dsl import pipeline
from azure.ai.ml import load_component
from azureml.core import Dataset, Datastore

### Check credentials

In [2]:
# Prefer the ambient credential chain; only fall back to a browser login when
# that chain cannot actually issue a token.
try:
    default_credential = DefaultAzureCredential()
    # Requesting a management-plane token checks that the credential works,
    # not merely that it could be constructed.
    default_credential.get_token("https://management.azure.com/.default")
except Exception:
    # Construction or token acquisition failed: use interactive sign-in instead.
    credential = InteractiveBrowserCredential()
else:
    credential = default_credential

### Get workspace

In [13]:
# MLClient.from_config takes the credential as its first positional argument and
# the config location through the keyword-only `path`. Passing the filename
# positionally would hand the string "config.json" to the client as its
# credential and leave the credential resolved above unused.
ml_client = MLClient.from_config(credential=credential, path="config.json")

Found the config file in: ./config.json


In [14]:
from azureml.core import Workspace
# Load the azureml.core (SDK v1) workspace handle from the same config.json
# file that is used for ml_client; later cells use `ws` for datastores,
# pipelines and experiments.
ws = Workspace.from_config("config.json")

### Check datastore

In [None]:
# Fetch the workspace's default datastore; the bare name on the last line
# makes the notebook display it as the cell output.
datastore = ws.get_default_datastore()
datastore

### Add files from blob to datastore

In [7]:
# Create the file dataset using the paths
# NOTE(review): `datastore_paths` is not assigned in any cell visible above, so
# on a fresh kernel this line would raise NameError — confirm where it is meant
# to be defined (e.g. a list of (datastore, glob) tuples) and add that cell.
file_dataset = Dataset.File.from_files(path=datastore_paths)

### Data update step in pipeline

In [15]:
from azureml.core import Workspace, Dataset, Run
import os

def get_workspace():
    """Return the workspace to operate on.

    When executing inside a submitted run, the workspace is taken from the
    run's experiment. Otherwise (any exception while resolving the run
    context) it is loaded from the local ``config.json`` file.
    """
    try:
        # allow_offline=False makes get_context raise when there is no real run.
        active_run = Run.get_context(allow_offline=False)
        workspace = active_run.experiment.workspace
    except Exception:
        # Not inside a run: fall back to the workspace described by config.json.
        workspace = Workspace.from_config("config.json")
    return workspace

def update_or_create_pdf_dataset(datastore_name='input_pdfs', dataset_name='PDF File Dataset'):
    """Register a new version of a FileDataset covering every PDF in a datastore.

    Args:
        datastore_name: Name of the registered datastore to scan. Defaults to
            ``'input_pdfs'``, the datastore this notebook was written for.
        dataset_name: Name under which the dataset is registered. Defaults to
            ``'PDF File Dataset'``.

    Returns:
        None. The outcome is reported on stdout.

    Raises:
        KeyError: If ``datastore_name`` is not a datastore of the workspace
            (the lookup happens outside the error-reporting ``try`` block).
    """
    ws = get_workspace()

    # Access the existing datastore by its name
    datastore = ws.datastores[datastore_name]

    # Recursive glob: every .pdf file at any depth of the datastore
    path_on_datastore = (datastore, '**/*.pdf')

    try:
        # Create a FileDataset pointing to the PDF files in the datastore
        pdf_dataset = Dataset.File.from_files(path=path_on_datastore)

        # create_new_version=True lets repeated calls register a new version
        # under the same name instead of failing on the existing one.
        pdf_dataset.register(
            workspace=ws,
            name=dataset_name,
            description=f'Dataset containing all PDF files from the {datastore_name} datastore',
            create_new_version=True,
        )
        print("Dataset updated or created successfully.")
    except Exception as e:
        # Best-effort: report the failure instead of aborting the caller.
        print(f"Failed to create or update the dataset: {e}")

In [18]:
# Run the registration once, directly from the notebook; the function prints
# whether the dataset was registered or why it failed.
update_or_create_pdf_dataset()

Dataset updated or created successfully.


### Make it a step in the pipeline

In [24]:
from azureml.core import ScriptRunConfig, Experiment, Environment
from azureml.pipeline.steps import PythonScriptStep
from azureml.pipeline.core import Pipeline
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies
from azureml.pipeline.core import PipelineData

# Load the Azure ML workspace from the default config location
ws = Workspace.from_config()

# Custom run environment for the step
env = Environment(name="pdf_processing_environment")
conda_dep = CondaDependencies()
conda_dep.add_pip_package("azureml-sdk")  # Ensure the SDK is included
env.python.conda_dependencies = conda_dep

# Run configuration that carries the environment to the step
run_config = RunConfiguration()
run_config.environment = env

# Single pipeline step that runs src/update_pdf_dataset.py on the compute target
update_pdf_step = PythonScriptStep(
    name='Update PDF Dataset',
    script_name='update_pdf_dataset.py',  # Script inside source_directory
    arguments=[],  # The script takes no arguments in this version
    compute_target='test-compute-ns',  # Name of the Azure ML compute target
    runconfig=run_config,
    source_directory='src/',  # Directory containing the script
    # Reuse is disabled on purpose: the step exists to pick up files that
    # changed in the datastore, which is not a declared step input. With
    # allow_reuse=True a re-submitted pipeline would return the cached result
    # of the previous run and never refresh the dataset.
    allow_reuse=False
)


In [1]:
from azureml.core import ScriptRunConfig, Experiment, Environment, Workspace
from azureml.pipeline.steps import PythonScriptStep
from azureml.pipeline.core import Pipeline
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies
from azureml.pipeline.core import PipelineData

# Load the Azure ML workspace from the default config location
ws = Workspace.from_config()

# Custom run environment for the step
env = Environment(name="pdf_processing_environment")
conda_dep = CondaDependencies()
conda_dep.add_pip_package("azureml-sdk")  # Ensure the SDK is included
env.python.conda_dependencies = conda_dep

# Run configuration that carries the environment to the step
run_config = RunConfiguration()
run_config.environment = env

# Intermediate pipeline output, backed by the workspace's default datastore
output_pdfs = PipelineData(name='output_pdfs', datastore=ws.get_default_datastore())

# Single pipeline step that runs src/update_pdf_dataset.py on the compute target
update_pdf_step = PythonScriptStep(
    name='Update PDF Dataset',
    script_name='update_pdf_dataset.py',  # Script inside source_directory
    # The PipelineData object is passed as the value of --output_dir and is
    # also declared in `outputs` so the pipeline tracks it as a step output.
    arguments=['--output_dir', output_pdfs],
    outputs=[output_pdfs],
    compute_target='test-compute-ns',  # Name of the Azure ML compute target
    runconfig=run_config,
    source_directory='src/',  # Directory containing the script
    # Reuse is disabled on purpose: the step exists to pick up files that
    # changed in the datastore, which is not a declared step input. With
    # allow_reuse=True a re-submitted pipeline would return the cached result
    # of the previous run and never refresh the dataset.
    allow_reuse=False
)

In [3]:
# Create the pipeline
# NOTE(review): this rebinds the name `pipeline`, which the imports cell also
# brings in from azure.ai.ml.dsl; after this cell that import is no longer
# reachable under this name. Consider a distinct variable name.
pipeline = Pipeline(workspace=ws, steps=[update_pdf_step])

In [4]:
# Create an experiment and run the pipeline
# Only submission happens here; there is no wait_for_completion call in this
# cell, so `pipeline_run` is kept as the handle for following up on the run.
experiment = Experiment(ws, 'Update_PDF_Dataset_Experiment')
pipeline_run = experiment.submit(pipeline)

Created step Update PDF Dataset [52ab1003][329c318a-094b-4eff-b4ce-27018676e516], (This step will run and generate new outputs)
Submitted PipelineRun f6347b2c-e3ca-4d72-adbc-3842d47f6ef6
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/f6347b2c-e3ca-4d72-adbc-3842d47f6ef6?wsid=/subscriptions/c85fd57e-ca5c-4866-9ae0-07accff31328/resourcegroups/rs-cloud-poc-ns/workspaces/aml-cloud-poc&tid=6be6e8b9-0525-4159-a288-e8c746abe0c3


### Add further steps to pipeline: Doc AI custom model

In [4]:
import os
from azureml.core import Run, Dataset
import configparser
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient, AnalyzeResult
from azure.core.serialization import AzureJSONEncoder
import json


In [9]:
import os
from azureml.core import Run, Dataset
import configparser
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient, AnalyzeResult
from azure.core.serialization import AzureJSONEncoder
import json

# Load the Document Intelligence endpoint and key here so this cell runs on a
# fresh kernel; otherwise it would depend on a config-loading cell that only
# appears further down the notebook.
config = configparser.ConfigParser(interpolation=None)
config.read('config.ini')
doc_endpoint = config['docintel']['endpoint']
doc_key = config['docintel']['key']

# Custom model to run against the target form
model_id = "citi_test_model"

document_analysis_client = DocumentAnalysisClient(
    endpoint=doc_endpoint, credential=AzureKeyCredential(doc_key)
)

# Run the custom model. The context manager closes the PDF handle even when
# the analysis call raises; the result is collected while the file is open.
with open('test_docs/3Q23.pdf', 'rb') as form:
    poller = document_analysis_client.begin_analyze_document(model_id, form)
    result = poller.result()

# Result to dictionary
result_dict = result.to_dict()

# Persist the analysis result as JSON using the Azure-aware encoder
with open('results.json', 'w') as f:
    json.dump(result_dict, f, cls=AzureJSONEncoder)


In [11]:
# Convert the analysis result into plain Python data, then write it out as JSON.
result_dict = result.to_dict()

results_path = 'results.json'
with open(results_path, mode='w') as results_file:
    json.dump(result_dict, results_file, cls=AzureJSONEncoder)

In [7]:
import configparser

# Load credentials
# interpolation=None keeps '%' characters in keys and connection strings literal.
config = configparser.ConfigParser(interpolation=None)

# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so an empty result means config.ini is missing.
# Fail here with a clear message rather than with a KeyError on the lookups below.
if not config.read('config.ini'):
    raise FileNotFoundError(
        "config.ini was not found or could not be read from the working directory"
    )

# Document Intelligence (Form Recognizer) endpoint and key
doc_endpoint = config['docintel']['endpoint']
doc_key = config['docintel']['key']

# Storage account connection details
connection_str = config['storage']['connection_string']
storage_key = config['storage']['key']

In [8]:
from azureml.core import ScriptRunConfig, Experiment, Environment, Workspace
from azureml.pipeline.steps import PythonScriptStep
from azureml.pipeline.core import Pipeline
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies
from azureml.pipeline.core import PipelineData

# Load the Azure ML workspace from the default config location
ws = Workspace.from_config()

# Custom run environment shared by both steps
env = Environment(name="pdf_processing_environment")
conda_dep = CondaDependencies()
conda_dep.add_pip_package("azureml-sdk")  # Ensure the SDK is included
conda_dep.add_pip_package("azure-ai-formrecognizer")  # Ensure the Form Recognizer SDK is included
env.python.conda_dependencies = conda_dep

# Run configuration that carries the environment to the steps
run_config = RunConfiguration()
run_config.environment = env

# Intermediate output holding the PDFs, backed by the default datastore
output_pdfs = PipelineData(name='output_pdfs', datastore=ws.get_default_datastore())

# First step: update the PDF dataset
update_pdf_step = PythonScriptStep(
    name='Update PDF Dataset',
    script_name='update_pdf_dataset.py',  # Script inside source_directory
    arguments=['--output_dir', output_pdfs],
    outputs=[output_pdfs],
    compute_target='test-compute-ns',  # Name of the Azure ML compute target
    runconfig=run_config,
    source_directory='src/',  # Directory containing the script
    # Reuse is disabled on purpose: the step exists to pick up files that
    # changed in the datastore, which is not a declared step input. With
    # allow_reuse=True a re-submitted pipeline would return the cached result
    # of the previous run and never refresh the dataset.
    allow_reuse=False
)

# Intermediate output holding the JSON results, backed by the default datastore
output_jsons = PipelineData(name='output_jsons', datastore=ws.get_default_datastore())

# Second step: process the PDFs produced by the first step
process_pdfs_step = PythonScriptStep(
    name='Process PDFs',
    script_name='process_pdfs.py',  # Script inside source_directory
    arguments=[
        '--input_dir', output_pdfs,
        '--output_dir', output_jsons,
        # Endpoint and key are the values loaded from config.ini.
        '--doc_endpoint', doc_endpoint,
        # NOTE(review): the key travels as a plain script argument and is
        # likely visible in the run's recorded arguments — confirm, and
        # consider reading it from a Key Vault secret inside the script.
        '--doc_key', doc_key,
    ],
    # Declaring output_pdfs as an input makes this step run after the first one.
    inputs=[output_pdfs],
    outputs=[output_jsons],
    compute_target='test-compute-ns',  # Name of the Azure ML compute target
    runconfig=run_config,
    source_directory='src/',  # Directory containing the script
    # Reuse stays enabled here: the step's input is the upstream output.
    allow_reuse=True
)

In [9]:
# Create the pipeline with the steps
pipeline = Pipeline(workspace=ws, steps=[update_pdf_step, process_pdfs_step])

# Submit the pipeline
experiment = Experiment(ws, 'pdf_processing_pipeline')
pipeline_run = experiment.submit(pipeline)

# Wait for the run to finish, with show_output=True so progress is printed in the cell.
# NOTE(review): the saved output of this cell ends with "Expected a StepRun
# object but received <class 'azureml.core.run.Run'>", which the message itself
# attributes to a package conflict among the azureml-* packages — check the
# installed versions before relying on this call.
pipeline_run.wait_for_completion(show_output=True)

Created step Update PDF Dataset [6a97e33b][ac57b0a4-5005-4993-bd56-f0dc1119fe69], (This step will run and generate new outputs)
Created step Process PDFs [ec410c82][5953a45d-e169-4412-abcd-62b1837383cf], (This step will run and generate new outputs)
Submitted PipelineRun 503441ba-6bdc-4f63-af96-350c4d723a51
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/503441ba-6bdc-4f63-af96-350c4d723a51?wsid=/subscriptions/c85fd57e-ca5c-4866-9ae0-07accff31328/resourcegroups/rs-cloud-poc-ns/workspaces/aml-cloud-poc&tid=6be6e8b9-0525-4159-a288-e8c746abe0c3
PipelineRunId: 503441ba-6bdc-4f63-af96-350c4d723a51
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/503441ba-6bdc-4f63-af96-350c4d723a51?wsid=/subscriptions/c85fd57e-ca5c-4866-9ae0-07accff31328/resourcegroups/rs-cloud-poc-ns/workspaces/aml-cloud-poc&tid=6be6e8b9-0525-4159-a288-e8c746abe0c3
PipelineRun Status: NotStarted
PipelineRun Status: Running




Expected a StepRun object but received <class 'azureml.core.run.Run'> instead.
This usually indicates a package conflict with one of the dependencies of azureml-core or azureml-pipeline-core.
Please check for package conflicts in your python environment
