In [None]:
from indico import IndicoClient, IndicoConfig
from indico.filters import SubmissionFilter, or_
from indico.queries import (
    JobStatus,
    ListSubmissions,
    RetrieveStorageObject,
    SubmissionResult,
    SubmitReview,
    UpdateSubmission,
    WaitForSubmissions,
    WorkflowSubmission,
    WorkflowSubmissionDetailed,
)
import pandas as pd
from indico.queries import GraphQLRequest
from indico.queries import SubmissionResult
from pyspark.sql.functions import *
import json
from pyspark.sql.types import *



# Overview

This notebook is intended to run in Databricks and serves as an example of how to work with the Databricks File System (DBFS).

In [None]:
from indico import IndicoClient, IndicoConfig

# Read the API token from the Databricks secret scope "indico" instead of
# hard-coding it in the notebook.
API_TOKEN = dbutils.secrets.get(scope="indico", key="api_token")

# Connection settings (host + token) used to build the Indico client.
my_config = IndicoConfig(host='try.indico.io', api_token=API_TOKEN)

In [None]:
# Create the Indico API client from the configuration built in the previous cell.
client = IndicoClient(config=my_config)

In [None]:
# ID of the Indico workflow that the files are submitted to further down.
workflow_id = 701

# Mounted Files

Files in S3 can be mounted as a native drive in Databricks and accessed through the file system.

In [None]:
%fs ls dbfs:/mnt/s3_databricks/indico/SEC

path,name,size,modificationTime
dbfs:/mnt/s3_databricks/indico/SEC/097CD46E-DC14-4774-B1F5-251B31BE9882.pdf,097CD46E-DC14-4774-B1F5-251B31BE9882.pdf,90174,1662494245000
dbfs:/mnt/s3_databricks/indico/SEC/0AAC4037-BC95-4FAA-ADD8-A004B37ECC04.pdf,0AAC4037-BC95-4FAA-ADD8-A004B37ECC04.pdf,89533,1662494225000
dbfs:/mnt/s3_databricks/indico/SEC/0D9402B9-F35C-4281-A30C-9772D0682333.pdf,0D9402B9-F35C-4281-A30C-9772D0682333.pdf,87390,1662494226000
dbfs:/mnt/s3_databricks/indico/SEC/0E98AEA9-4219-4B78-82A6-7EB918742E53.pdf,0E98AEA9-4219-4B78-82A6-7EB918742E53.pdf,86854,1662494227000
dbfs:/mnt/s3_databricks/indico/SEC/0F854348-4FFB-453E-84E9-82239C0BCCBD.pdf,0F854348-4FFB-453E-84E9-82239C0BCCBD.pdf,93775,1662494227000
dbfs:/mnt/s3_databricks/indico/SEC/124CA2A1-1AA7-44E9-9B84-72837C3E89E0.pdf,124CA2A1-1AA7-44E9-9B84-72837C3E89E0.pdf,87945,1662494246000
dbfs:/mnt/s3_databricks/indico/SEC/132D5D60-FA43-4CE3-BFA5-240CCCF80464.pdf,132D5D60-FA43-4CE3-BFA5-240CCCF80464.pdf,86854,1662494246000
dbfs:/mnt/s3_databricks/indico/SEC/19FE34F0-D702-41C9-A4E1-69CB6EDB6919.pdf,19FE34F0-D702-41C9-A4E1-69CB6EDB6919.pdf,91923,1662494236000
dbfs:/mnt/s3_databricks/indico/SEC/1E4B7542-E393-45AF-9AF7-F835BD4E0524.pdf,1E4B7542-E393-45AF-9AF7-F835BD4E0524.pdf,90174,1662494228000
dbfs:/mnt/s3_databricks/indico/SEC/1EB4A60A-1FE0-44E7-9C1F-AD409C17FE93.pdf,1EB4A60A-1FE0-44E7-9C1F-AD409C17FE93.pdf,87626,1662494228000


## Submitting files from the bucket

To submit files, we first need to build a list of them using `dbutils`.

In [None]:
# List the mounted SEC folder; each entry carries path, name, size and modificationTime.
files_list = dbutils.fs.ls("dbfs:/mnt/s3_databricks/indico/SEC")

In [None]:
# Display the listing to inspect the files that are available for submission.
files_list

Out[7]: [FileInfo(path='dbfs:/mnt/s3_databricks/indico/SEC/097CD46E-DC14-4774-B1F5-251B31BE9882.pdf', name='097CD46E-DC14-4774-B1F5-251B31BE9882.pdf', size=90174, modificationTime=1662494245000),
 FileInfo(path='dbfs:/mnt/s3_databricks/indico/SEC/0AAC4037-BC95-4FAA-ADD8-A004B37ECC04.pdf', name='0AAC4037-BC95-4FAA-ADD8-A004B37ECC04.pdf', size=89533, modificationTime=1662494225000),
 FileInfo(path='dbfs:/mnt/s3_databricks/indico/SEC/0D9402B9-F35C-4281-A30C-9772D0682333.pdf', name='0D9402B9-F35C-4281-A30C-9772D0682333.pdf', size=87390, modificationTime=1662494226000),
 FileInfo(path='dbfs:/mnt/s3_databricks/indico/SEC/0E98AEA9-4219-4B78-82A6-7EB918742E53.pdf', name='0E98AEA9-4219-4B78-82A6-7EB918742E53.pdf', size=86854, modificationTime=1662494227000),
 FileInfo(path='dbfs:/mnt/s3_databricks/indico/SEC/0F854348-4FFB-453E-84E9-82239C0BCCBD.pdf', name='0F854348-4FFB-453E-84E9-82239C0BCCBD.pdf', size=93775, modificationTime=1662494227000),
 FileInfo(path='dbfs:/mnt/s3_databricks/indico/SEC/1

### Accessing files

The path of each file is available through the `path` attribute of its `FileInfo` entry.

In [None]:
# Take the DBFS URI ("dbfs:/...") of the first listed file and display it.
first_file_path = files_list[0].path
first_file_path

Out[8]: 'dbfs:/mnt/s3_databricks/indico/SEC/097CD46E-DC14-4774-B1F5-251B31BE9882.pdf'

In [None]:
# Note: a "dbfs:/..." URI cannot be passed to Indico as-is; doing so produces a
# "file not found" error. The "dbfs:" scheme has to be replaced with the "/dbfs" prefix.

def format_dbfs_path(path):
  """Convert a ``dbfs:`` URI into the equivalent ``/dbfs`` path.

  Only the leading ``dbfs:`` scheme is replaced, so a later occurrence of
  ``dbfs:`` inside the path is preserved. A path that already starts with
  ``/dbfs`` is returned unchanged, which makes the function safe to apply twice.

  Args:
    path: A path such as ``"dbfs:/mnt/bucket/file.pdf"`` or an already
      converted ``"/dbfs/mnt/bucket/file.pdf"``.

  Returns:
    The path with the ``dbfs:`` scheme replaced by ``/dbfs``.

  Raises:
    ValueError: If ``path`` starts with neither ``dbfs:`` nor ``/dbfs``.
  """
  dbfs_scheme = "dbfs:"
  local_root = "/dbfs"
  if path.startswith(dbfs_scheme):
    # Slice off the scheme instead of splitting, so the rest of the path is kept verbatim.
    return local_root + path[len(dbfs_scheme):]
  if path == local_root or path.startswith(local_root + "/"):
    # Already converted: return as-is so re-running a cell does not fail.
    return path
  raise ValueError(
    "Expected a path starting with 'dbfs:' or '/dbfs', got: {!r}".format(path)
  )

In [None]:
# Convert from the original listing entry rather than from first_file_path itself,
# so re-running this cell never feeds an already-converted path back into the helper.
first_file_path = format_dbfs_path(files_list[0].path)
first_file_path

Out[11]: '/dbfs/mnt/s3_databricks/indico/SEC/097CD46E-DC14-4774-B1F5-251B31BE9882.pdf'

In [None]:
# Build the whole upload list: one converted path per entry returned by dbutils.fs.ls.
upload_files_list = [format_dbfs_path(file_info.path) for file_info in files_list]

In [None]:
# Describe the submission (target workflow + files to upload), then send it.
submission_request = WorkflowSubmission(
    workflow_id=workflow_id,
    files=upload_files_list,
)
submission_ids = client.call(submission_request)

In [None]:
# Show the IDs returned by the submission call.
submission_ids

Out[17]: [38212,
 38213,
 38214,
 38215,
 38216,
 38217,
 38218,
 38219,
 38220,
 38221,
 38222,
 38223,
 38224,
 38225,
 38226,
 38227,
 38228,
 38229,
 38230,
 38231,
 38232,
 38233,
 38234,
 38235,
 38236,
 38237,
 38238,
 38239,
 38240,
 38241,
 38242,
 38243,
 38244,
 38245,
 38246,
 38247,
 38248,
 38249,
 38250,
 38251]