In [None]:
import os
# Service credentials come from the environment so that no secret is stored in the notebook.
key = os.environ["VISION_API_KEY"]  # subscription key handed to AnalyzeInvoices
search_key = os.environ["AZURE_SEARCH_KEY"]  # key for the Azure Search write and query
translator_key = os.environ["TRANSLATOR_KEY"]  # subscription key handed to Translate

# Azure Search service and index that receive the processed rows.
search_service = "mmlspark-azure-search"
search_index = "form-demo-index"

In [None]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def blob_to_url(blob):
  """Convert a ``wasbs://`` style blob path into the matching HTTPS URL.

  Args:
    blob: Path of the form ``<scheme>://<container>@<account-host>/<file path>``.

  Returns:
    The string ``https://<account-host>/<container>/<file path>``.

  Raises:
    ValueError: If ``blob`` contains no ``@`` separator.
  """
  # Split on the FIRST "@" only: it separates the container from the account
  # host, while any later "@" belongs to the file path and must be kept.
  prefix, separator, postfix = blob.partition("@")
  if not separator:
    raise ValueError("blob path has no '@' separator: {!r}".format(blob))
  # The container is the last path segment in front of the "@".
  container = prefix.split("/")[-1]
  # Everything up to the first "/" is the account host; the rest is the file path.
  account, _, filepath = postfix.partition("/")
  return "https://{}/{}/{}".format(account, container, filepath)


# List the files under form_subset/, keep 10 of their paths, and turn each
# wasbs:// path into an HTTPS URL stored in a single "url" column.
df2 = (
    spark.read.format("binaryFile")
    .load("wasbs://ignite2021@mmlsparkdemo.blob.core.windows.net/form_subset/*")
    .select("path")
    .limit(10)
    .select(udf(blob_to_url, StringType())("path").alias("url"))
    .cache()
)


In [None]:
# Show the generated "url" column.
display(df2)

In [None]:
# Embed the Invoice11205 SVG from the demo storage account at 40% width.
displayHTML("""
<embed src="https://mmlsparkdemo.blob.core.windows.net/ignite2021/form_svgs/Invoice11205.svg" width="40%"/>
""")

In [None]:
from synapse.ml.cognitive import AnalyzeInvoices

# Run AnalyzeInvoices on every URL in df2. Results are written to "invoices"
# and errors to "errors"; concurrency is set to 5.
analyzed_df = (
    AnalyzeInvoices()
    .setSubscriptionKey(key)
    .setLocation("eastus")
    .setImageUrlCol("url")
    .setOutputCol("invoices")
    .setErrorCol("errors")
    .setConcurrency(5)
    .transform(df2)
    .cache()
)


In [None]:
# Show the analysis output, including the "invoices" and "errors" columns.
display(analyzed_df)

In [None]:
from synapse.ml.cognitive import FormOntologyLearner

# Fit a FormOntologyLearner on the "invoices" column, apply the fitted model to
# the same data, and expand the resulting "extracted" struct into top-level
# columns next to "url".
organized_df = (
    FormOntologyLearner()
    .setInputCol("invoices")
    .setOutputCol("extracted")
    .fit(analyzed_df)
    .transform(analyzed_df)
    .select("url", "extracted.*")
    .cache()
)

In [None]:
# Show "url" alongside the fields expanded from "extracted".
display(organized_df)

In [None]:
from pyspark.sql.functions import explode, col
# Produce one row per element of "Items": explode the array into "Item",
# drop the original array, promote the fields of "Item" to top-level columns,
# then drop the now-redundant "Item" struct.
itemized_df = (
    organized_df
    .select("*", explode(col("Items")).alias("Item"))
    .drop("Items")
    .select("Item.*", "*")
    .drop("Item")
)


In [None]:
# Show the table with one row per exploded entry of "Items".
display(itemized_df)

In [None]:
# Show only the rows whose ProductCode equals 48.
display(itemized_df.filter(col("ProductCode") == 48))

In [None]:
from synapse.ml.cognitive import Translate

# Translate every "Description" into the four target languages, keep the first
# element of output.translations as "Translations", and drop the raw output and
# error columns.
translated_df = (
    Translate()
    .setSubscriptionKey(translator_key)
    .setLocation("eastus")
    .setTextCol("Description")
    .setErrorCol("TranslationError")
    .setOutputCol("output")
    .setToLanguage(["zh-Hans", "fr", "ru", "cy"])
    .setConcurrency(5)
    .transform(itemized_df)
    .withColumn("Translations", col("output.translations")[0])
    .drop("output", "TranslationError")
    .cache()
)


In [None]:
# Show the items together with the added "Translations" column.
display(translated_df)

In [None]:
from synapse.ml.cognitive import *
from pyspark.sql.functions import monotonically_increasing_id, lit

# Give every row a string "DocID" key and an "upload" action, then write the
# rows to the configured Azure Search index.
(
    translated_df
    .withColumn("DocID", monotonically_increasing_id().cast("string"))
    .withColumn("SearchAction", lit("upload"))
    .writeToAzureSearch(
        subscriptionKey=search_key,
        actionCol="SearchAction",
        serviceName=search_service,
        indexName=search_index,
        keyCol="DocID",
    )
)


In [None]:
import requests
# Query the index through the Azure Search REST endpoint for documents matching "door".
url = 'https://{}.search.windows.net/indexes/{}/docs/search?api-version=2019-05-06'.format(search_service, search_index)
# requests has no default timeout; set one so the cell cannot hang forever
# if the search service does not respond.
requests.post(url, json={"search": "door"}, headers={"api-key": search_key}, timeout=30).json()