In [1]:
import pyspark.sql.functions as F
from pyspark.sql.functions import col, flatten

import pyspark.sql.types as T
from synapse.ml.services.translate import *
from synapse.ml.services.openai import *
from synapse.ml.services import AzureSearchWriter

StatementMeta(, 49fe56d5-e5fb-4333-a8a7-820be63728fc, 3, Finished, Available)

In [2]:
# Load the enriched BBC News rows from the SilverLakehouse table and keep a single row per title
bbcnews_enriched_query = "SELECT * FROM SilverLakehouse.bbcnews_enriched"
bbcnews = (
    spark.sql(bbcnews_enriched_query)
    .dropDuplicates(subset=['title'])
)

StatementMeta(, 49fe56d5-e5fb-4333-a8a7-820be63728fc, 4, Finished, Available)

## Use Azure AI Translator to translate 'title' and 'content' from English to French ##

In [3]:
# Declare Synapse ML Transformers
# Both columns are translated to the same target language so that "title_fr" and
# "content_fr" stay consistent (previously the title used "fr" while the content used "fr-ca").
TRANSLATE_TO_LANGUAGE = "fr"

# Translates the "title" column; the service response is written to "title_fr".
translate_title = (Translate()
    .setTextCol("title")
    .setToLanguage([TRANSLATE_TO_LANGUAGE])
    .setOutputCol("title_fr")
    .setConcurrency(5))

# Translates the "content" column; the service response is written to "content_fr".
translate_content = (Translate()
    .setTextCol("content")
    .setToLanguage([TRANSLATE_TO_LANGUAGE])
    .setOutputCol("content_fr")
    .setConcurrency(5))

StatementMeta(, 49fe56d5-e5fb-4333-a8a7-820be63728fc, 5, Finished, Available)

In [4]:
# Run the two translators back to back: the title transformer is applied to the
# source DataFrame first, and the content transformer is applied to its result.
bbcnews_translated = translate_content.transform(
    translate_title.transform(bbcnews)
)

StatementMeta(, 49fe56d5-e5fb-4333-a8a7-820be63728fc, 6, Finished, Available)

In [5]:
# Replace each translator output column with the plain translated text:
# first flatten the nested "translations" arrays into one array, then take
# the "text" field of its first element.
for _translated_col in ("title_fr", "content_fr"):
    bbcnews_translated = (
        bbcnews_translated
        .withColumn(_translated_col, flatten(col(f"{_translated_col}.translations")))
        .withColumn(_translated_col, col(f"{_translated_col}.text")[0])
    )

StatementMeta(, 49fe56d5-e5fb-4333-a8a7-820be63728fc, 7, Finished, Available)

## Use Azure OpenAI Ada to Fetch Embedding for 'content' Column ##

In [6]:
# Configure the Azure OpenAI embedding transformer: it reads the "content"
# column and writes its result to a new "embedding" column.
embedding = OpenAIEmbedding()
embedding.setDeploymentName("text-embedding-ada-002")  # name of the Azure OpenAI deployment to call
embedding.setTextCol("content")
embedding.setOutputCol("embedding")

# Apply the transformer to the translated DataFrame
bbcnews_embedding = embedding.transform(bbcnews_translated)

StatementMeta(, 49fe56d5-e5fb-4333-a8a7-820be63728fc, 8, Finished, Available)

In [7]:
# Convert from Vector to Array
# A null vector is passed through as null instead of raising inside the UDF
# (calling .toArray() on None would fail the whole job).
# NOTE(review): presumably a null appears when the embedding request for a row fails — confirm.
to_array = F.udf(
    lambda v: v.toArray().tolist() if v is not None else None,
    T.ArrayType(T.FloatType()),
)

# Overwrites the "embedding" column with its array form.
bbcnews_embedding = bbcnews_embedding.withColumn('embedding', to_array('embedding'))

StatementMeta(, 49fe56d5-e5fb-4333-a8a7-820be63728fc, 9, Finished, Available)

In [8]:
# Persist the DataFrame with embeddings as a Delta table (bbcnews_silver),
# overwriting whatever is already stored at that location.
bbcnews_silver_path = "abfss://7339dc5b-3819-4259-a1a5-4d776fa17255@msit-onelake.dfs.fabric.microsoft.com/7f69bba2-4ae3-44ae-93e8-1b0a6b8f2b81/Tables/bbcnews_silver"
(
    bbcnews_embedding.write
    .format("delta")
    .mode("overwrite")
    .save(bbcnews_silver_path)
)

StatementMeta(, 49fe56d5-e5fb-4333-a8a7-820be63728fc, 10, Finished, Available)