In [1]:
import pyspark.sql.functions as F
from pyspark.sql.functions import col, flatten

import pyspark.sql.types as T
from synapse.ml.services.translate import *
from synapse.ml.services.openai import *
from synapse.ml.services import AzureSearchWriter

StatementMeta(, 46c984ab-cd8a-4a61-879a-36cfed4c4beb, 3, Finished, Available)

In [5]:
# Load the BBC News dataset from the Files area.
# The file is tab-delimited and its first row holds the column names.
bbcnews = (
    spark.read
    .format("csv")
    .options(header="true", delimiter="\t")
    .load("Files/bbcnews/bbc-news-data.csv")
)

StatementMeta(, 46c984ab-cd8a-4a61-879a-36cfed4c4beb, 7, Finished, Available)

## Use Azure AI Translator to translate 'title' and 'content' from English to French ##

In [7]:
# Declare SynapseML Translate transformers (one per text column).
# A single target language is shared by both transformers so that the title
# and the content of a record are always translated into the same French
# variant; mixing "fr" and "fr-ca" would yield inconsistent records.
TARGET_LANGUAGE = "fr"

# Translates the 'title' column; the raw service response lands in 'title_fr'.
translate_title = (
    Translate()
    .setTextCol("title")
    .setToLanguage([TARGET_LANGUAGE])
    .setOutputCol("title_fr")
    .setConcurrency(5)
)

# Translates the 'content' column; the raw service response lands in 'content_fr'.
translate_content = (
    Translate()
    .setTextCol("content")
    .setToLanguage([TARGET_LANGUAGE])
    .setOutputCol("content_fr")
    .setConcurrency(5)
)

StatementMeta(, 46c984ab-cd8a-4a61-879a-36cfed4c4beb, 9, Finished, Available)

In [8]:
# Run the two translators back to back on the Spark DataFrame:
# the title translator first, then the content translator on its result.
bbcnews_translated = translate_content.transform(
    translate_title.transform(bbcnews)
)

StatementMeta(, 46c984ab-cd8a-4a61-879a-36cfed4c4beb, 10, Finished, Available)

In [9]:
# Pull the translated text out of the nested translator output.
# For each column: flatten the nested 'translations' arrays into one array,
# then keep the 'text' field of its first element.
# NOTE(review): both columns are overwritten in place, so re-running this cell
# without re-running the translation cell will likely fail because
# 'title_fr.translations' / 'content_fr.translations' no longer exist.
bbcnews_translated = (
    bbcnews_translated
    .withColumn("title_fr", flatten(col("title_fr.translations")))
    .withColumn("title_fr", col("title_fr.text").getItem(0))
    .withColumn("content_fr", flatten(col("content_fr.translations")))
    .withColumn("content_fr", col("content_fr.text").getItem(0))
)

StatementMeta(, 46c984ab-cd8a-4a61-879a-36cfed4c4beb, 11, Finished, Available)

## Use Azure OpenAI Ada to Fetch Embedding for 'content' Column ##

In [10]:
# Configure the Azure OpenAI embedding transformer:
# it reads the original 'content' column and writes to 'embedding'.
embedding = (
    OpenAIEmbedding()
    .setTextCol("content")
    .setOutputCol("embedding")
    .setDeploymentName("text-embedding-ada-002")  # deployment name of the embedding model
)

# Compute the embeddings on top of the translated DataFrame.
bbcnews_embedding = embedding.transform(bbcnews_translated)

StatementMeta(, 46c984ab-cd8a-4a61-879a-36cfed4c4beb, 12, Finished, Available)

In [11]:
# Convert from Vector to Array
def _vector_to_float_list(vector):
    """Return the values of a vector as a plain Python list.

    Args:
        vector: An object exposing ``toArray()`` (the embedding vector), or
            ``None`` when the row has no embedding.

    Returns:
        A list of Python floats, or ``None`` if ``vector`` is ``None``.
    """
    # A null embedding would otherwise raise AttributeError inside the UDF
    # and abort the whole Spark job; pass the null through instead.
    if vector is None:
        return None
    return vector.toArray().tolist()


# UDF wrapper; the declared return type is an array of 32-bit floats.
to_array = F.udf(_vector_to_float_list, T.ArrayType(T.FloatType()))
bbcnews_embedding = bbcnews_embedding.withColumn('embedding', to_array('embedding'))

StatementMeta(, 46c984ab-cd8a-4a61-879a-36cfed4c4beb, 13, Finished, Available)

In [12]:
# Persist the embedded DataFrame to the Silver layer in Delta format,
# overwriting whatever is already stored at the target path.
# NOTE(review): YOUR_ABFSS_PATH is not defined in the cells shown here;
# it must be set to the destination ABFSS path before this cell runs.
(
    bbcnews_embedding.write
    .format("delta")
    .mode("overwrite")
    .save(f"{YOUR_ABFSS_PATH}")
)

StatementMeta(, 46c984ab-cd8a-4a61-879a-36cfed4c4beb, 14, Submitted, Running)