# Use a notebook with Apache Spark to query a KQL database

For documentation on how to use this notebook, see [Use a notebook with Apache Spark to query a KQL database](https://learn.microsoft.com/fabric/real-time-analytics/spark-connector).

In [None]:
# Azure storage access info for the source data: the "green" folder of the
# "nyctlc" container in the "azureopendatastorage" account.
blob_account_name = "azureopendatastorage"
blob_container_name = "nyctlc"
blob_relative_path = "green"
# Empty SAS token -- presumably sufficient because the container is publicly
# readable; supply a real token if you point this at a private container.
blob_sas_token = r""

# Allow Spark to read from Blob storage remotely by registering the SAS token
# for this container/account pair.
wasbs_path = 'wasbs://%s@%s.blob.core.windows.net/%s' % (blob_container_name, blob_account_name, blob_relative_path)
spark.conf.set(
  'fs.azure.sas.%s.%s.blob.core.windows.net' % (blob_container_name, blob_account_name),
  blob_sas_token)
print('Remote blob path: ' + wasbs_path)

# Spark reads the parquet lazily: no rows are loaded until an action runs.
df = spark.read.parquet(wasbs_path)

# Print the schema. printSchema is a method and must be called; a bare
# `df.printSchema` only references the method and prints nothing.
df.printSchema()

# Display top 10 rows (this is the first action, so it triggers the actual read).
print('Displaying top 10 rows: ')
df.show(10)

In [None]:
# Destination of the write: cluster URI, database name, and table name.
# Replace every <...> placeholder before running the cells below.
kustoUri = "https://<enter-your-URI-here>.kusto.data.microsoft.com"
database = "<enter-your-database-name>"
table = "<enter-your-table-name>"  # for example, GreenTaxiData

In [None]:
# Example: write the DataFrame to Kusto. The source data was read as a blob into
# `df` from Azure Open Data for GreenTaxi / Limousines in NYC.
#
# The access token is created from the current user's credential and is used to
# write to the target Kusto table (for example, GreenTaxiData). The user therefore
# needs 'user' privileges or above on the target database, plus table 'admin'
# privileges if the table already exists. If the table does not exist, it is
# created with the DataFrame schema.
#
# Note: the save mode is "Append", so each run of this cell adds the rows again.
(
    df.write.format("com.microsoft.kusto.spark.synapse.datasource")
    .option("kustoCluster", kustoUri)
    .option("kustoDatabase", database)
    .option("kustoTable", table)
    .option("accessToken", mssparkutils.credentials.getToken(kustoUri))
    .option("tableCreateOptions", "CreateIfNotExist")
    .mode("Append")
    .save()
)

In [None]:
# Example: read data back from the KQL database. The query retrieves the max and
# min fares and trip distances recorded for every month of the years 2014 to 2020.
#
# The query is built from the configured `table` variable rather than a hard-coded
# table name, so it reads from the same table the write cell populated.
# NOTE(review): a table name containing spaces or special characters would need
# KQL bracket quoting (['name']) -- confirm if such names are expected.
kustoQuery = (
    table
    + " |  where puYear between (2014 .. 2020 )"
    + " | summarize  MaxDistance=max(tripDistance) , MaxFare = max(fareAmount) ,MinDistance=min(tripDistance) , MinFare = min(fareAmount) by puYear,puMonth"
    + " | order by puYear,puMonth desc"
)

# The access token is obtained from the current user's credential for kustoUri.
kustoDf = (
    spark.read
    .format("com.microsoft.kusto.spark.synapse.datasource")
    .option("accessToken", mssparkutils.credentials.getToken(kustoUri))
    .option("kustoCluster", kustoUri)
    .option("kustoDatabase", database)
    .option("kustoQuery", kustoQuery)
    .load()
)

In [None]:
# Print the query result as a text table (first 20 rows, long cell values truncated).
kustoDf.show(n=20, truncate=True)