# 0. Read Parameters from Azure Data Factory

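Reference — how to create an Azure Data Factory pipeline that runs a Databricks notebook and passes parameters to it: https://docs.microsoft.com/en-us/azure/data-factory/transform-data-using-databricks-notebook#create-a-pipeline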

In [2]:
# Declare the "input" text widget (empty default value, empty label) so the
# notebook can receive a parameter of that name, then read and print its value.
# The value is read once via dbutils.widgets.get(); the legacy getArgument()
# helper is deprecated and returned the same widget value.
dbutils.widgets.text("input", "", "")
parameter = dbutils.widgets.get("input")
print(parameter)

# 1. Access to Data Lake Gen 2

In [4]:
# --- Notebook configuration: replace every <...> placeholder before running ---

# Databricks secret scope and key that hold the storage account access key
SCOPE_NAME = "<your-db-secrets-scope-name>"
KEY_NAME = "<your-db-secrets-key-name>"

# Azure Data Lake Storage Gen2 account and the container inside it
STORAGE_ACCOUNT_NAME = "<your-storageaccount-name>"
STORAGE_CONTAINER = "<your-storageaccount-container-name>"

# Local file path placeholder
LOCAL_FILE_PATH = "<your-local-file-path>"

In [5]:
# Configure Spark with the account key for the ADLS Gen2 endpoint of the
# storage account; the key itself is fetched from the Databricks secret scope.
spark.conf.set(
  "fs.azure.account.key.{}.dfs.core.windows.net".format(STORAGE_ACCOUNT_NAME),
  dbutils.secrets.get(scope=SCOPE_NAME, key=KEY_NAME),
)

In [6]:
# Build the abfss:// URI of the container root and list the entries stored there
FOLDER_PATH = "abfss://{container}@{account}.dfs.core.windows.net/".format(
  container=STORAGE_CONTAINER,
  account=STORAGE_ACCOUNT_NAME,
)
file_list = dbutils.fs.ls(FOLDER_PATH)

In [7]:
# Load the first *file* of the listing as a CSV without a header row (columns
# are therefore named _c0, _c1, ...). Directory entries are skipped so that a
# folder in the container root -- e.g. the "output" folder this notebook writes
# in its last step -- is never picked up as the input on a later run.
input_files = [entry for entry in file_list if not entry.isDir()]
if not input_files:
  # Fail with an actionable message instead of a bare IndexError
  raise FileNotFoundError("No input file found in " + FOLDER_PATH)

data = spark.read.csv(input_files[0].path)
display(data)

_c0,_c1,_c2,_c3,_c4,_c5
00001.jpg,30,52,246,147,181
00002.jpg,100,19,576,203,103
00003.jpg,51,105,968,659,145
00004.jpg,67,84,581,407,187
00005.jpg,140,151,593,339,185
00006.jpg,20,77,420,301,78
00007.jpg,249,166,2324,1459,118
00008.jpg,119,215,1153,719,165
00009.jpg,1,7,275,183,32
00010.jpg,28,55,241,177,60


In [8]:
# Expose the DataFrame as the temp view "data" so the %sql and %r cells
# further down can query it by name
data.createOrReplaceTempView(name="data")

# 2. Access Data in SQL

In [10]:
%sql
-- Show every row and column of the "data" temp view
SELECT * FROM data

_c0,_c1,_c2,_c3,_c4,_c5
00001.jpg,30,52,246,147,181
00002.jpg,100,19,576,203,103
00003.jpg,51,105,968,659,145
00004.jpg,67,84,581,407,187
00005.jpg,140,151,593,339,185
00006.jpg,20,77,420,301,78
00007.jpg,249,166,2324,1459,118
00008.jpg,119,215,1153,719,165
00009.jpg,1,7,275,183,32
00010.jpg,28,55,241,177,60


# 3. Access Data in SparkR

In [12]:
%r
# Attach SparkR, query the "data" temp view into a SparkDataFrame and render it
library(SparkR)
data <- SparkR::sql("SELECT * FROM data")
display(data)

_c0,_c1,_c2,_c3,_c4,_c5
00001.jpg,30,52,246,147,181
00002.jpg,100,19,576,203,103
00003.jpg,51,105,968,659,145
00004.jpg,67,84,581,407,187
00005.jpg,140,151,593,339,185
00006.jpg,20,77,420,301,78
00007.jpg,249,166,2324,1459,118
00008.jpg,119,215,1153,719,165
00009.jpg,1,7,275,183,32
00010.jpg,28,55,241,177,60


# 4. Convert Data to R data.frame

In [14]:
%r
# Pull all rows of the SparkDataFrame into a local R data.frame
r_data <- SparkR::collect(data)

# 5. Execute R Script

In [16]:
%r
#####################################
# TODO: replace the body of this cell with your own R script
#####################################

# insert script here
# Inputs available at this point: `data` (the SparkDataFrame returned by sql()
# in step 3) and `r_data` (its collected R data.frame copy from step 4).
# The script must assign its result to `output`, which step 6 registers as a view.

# Remove this line. This is just a placeholder to demonstrate the next steps:
# it passes the input through unchanged so that `output` exists.
output <- data

#####################################
# TODO
#####################################

# 6. Pass Data to PySpark

In [18]:
%r
# Register table so it is accessible via Python Context.
# createOrReplaceTempView() works on a SparkDataFrame only. A user script that
# operates on the collected `r_data` will typically produce a local R
# data.frame, so convert that case to a SparkDataFrame first; a result that is
# already a SparkDataFrame is registered unchanged.
if (is.data.frame(output)) {
  output <- createDataFrame(output)
}
createOrReplaceTempView(output, "output")

# 7. Access Data in PySpark

In [20]:
# Read the "output" temp view (registered from the R cell) back into a
# PySpark DataFrame and render it
output = spark.sql("SELECT * FROM output")
display(output)

_c0,_c1,_c2,_c3,_c4,_c5
00001.jpg,30,52,246,147,181
00002.jpg,100,19,576,203,103
00003.jpg,51,105,968,659,145
00004.jpg,67,84,581,407,187
00005.jpg,140,151,593,339,185
00006.jpg,20,77,420,301,78
00007.jpg,249,166,2324,1459,118
00008.jpg,119,215,1153,719,165
00009.jpg,1,7,275,183,32
00010.jpg,28,55,241,177,60


# 8. Save Output on Data Lake Gen 2

In [22]:
import os

# FOLDER_PATH already ends with "/", so this resolves to "<container root>/output"
OUTPUT_PATH = os.path.join(FOLDER_PATH, "output")

# repartition(1) collapses the result into one partition before writing, and
# header=True writes the column names as the first row.
# mode("overwrite") replaces the result of a previous run; with the default
# save mode the write raises an error as soon as OUTPUT_PATH already exists,
# which makes every run after the first one fail.
output.repartition(1).write.mode("overwrite").csv(OUTPUT_PATH, header=True)