In [1]:
###   REAL-TIME TARGETED ADVERTISING ON MEETUP   ###

####################################
#REFERENCE DOCUMENTATION
####################################
#https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#module-pyspark.sql.functions
#https://databricks.com/blog/2017/02/23/working-complex-data-formats-structured-streaming-apache-spark-2-1.html
#https://databricks.com/blog/2017/04/26/processing-data-in-apache-kafka-with-structured-streaming-in-apache-spark-2-2.html
#https://docs.databricks.com/spark/latest/dataframes-datasets/complex-nested-data.html
#https://docs.azuredatabricks.net/_static/notebooks/transform-complex-data-types-python.html
#https://databricks.com/blog/2018/03/13/introducing-stream-stream-joins-in-apache-spark-2-3.html
#https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html
####################################


from pyspark.sql.types import *
from pyspark.sql.functions import from_json, to_json, explode, expr, col, lower, struct #array_contains

#Storage layout: every input, output and checkpoint folder sits under one project root
rootPath = "abfss://pbchamp@uwbigdatatechnologies.dfs.core.windows.net/FinalProject"

#Parquet archive of matched RSVPs, plus the checkpoint folder used by that sink
archivePath = "{}/RSVPArchive".format(rootPath)
archiveCheckpointPath = "{}/RSVPArchiveCheckpoint".format(rootPath)

#Static advertiser input folder and the advertiser-facing output folder
advertiserPath = "{}/Advertiser".format(rootPath)
advertiserOutputPath = "{}/AdvertiserOutput".format(rootPath)

#Separate checkpoint folders for the two advertiser-output sinks (parquet and Kafka)
advertiserOutputCheckpointPath = "{}/AdvertiserOutputCheckpoint".format(rootPath)
advertiserOutputCheckpoint2Path = "{}/AdvertiserOutputCheckpoint2".format(rootPath)

#Start from a clean slate (non-prod only): recursively remove the previous run's outputs
# and checkpoints. The advertiser input folder is not touched.
for stalePath in (
  archivePath,
  archiveCheckpointPath,
  advertiserOutputPath,
  advertiserOutputCheckpointPath,
  advertiserOutputCheckpoint2Path,
):
  dbutils.fs.rm(stalePath, True)  #second argument True = recursive delete

#Static advertiser reference data, read once as a batch (non-streaming) DataFrame.
# multiLine is enabled so a JSON record may span several lines of the file.
advertiserDF = (
  spark
    .read
    .option("multiLine", True)
    .json(advertiserPath + "/advertiser.json")
)

####################################
#Read Meetup RSVP stream from Kafka
####################################
server = "ubuntuserver010.westus2.cloudapp.azure.com:9092"
input_topic = "meetupallrsvps"
output_topic = "meetupad"
offset = "latest"

#Kafka source settings gathered in one place
kafkaSourceOptions = {
  "kafka.bootstrap.servers": server,
  "subscribe": input_topic,
  "startingOffsets": offset,
}

#Streaming DataFrame over the RSVP topic
kafkaMeetupDF = spark.readStream.format("kafka").options(**kafkaSourceOptions).load()

#Keep only the Kafka key and value, each cast to a string
meetupRawDF = kafkaMeetupDF.selectExpr(
  "CAST(key AS STRING)",
  "CAST(value AS STRING)",
)
####################################

#Nested schema for the RSVP JSON, built from named sub-schemas.
#Only the fields selected downstream are declared.
venueSchema = StructType([
  StructField("lon", DoubleType()),
  StructField("lat", DoubleType()),
])

memberSchema = StructType([
  StructField("member_id", LongType()),
])

eventSchema = StructType([
  StructField("event_name", StringType()),
])

#group_topics is an array of structs, each carrying a topic_name
groupSchema = StructType([
  StructField("group_topics", ArrayType(StructType([
    StructField("topic_name", StringType()),
  ]))),
])

jsonSchemaRSVP = StructType([
  StructField("venue", venueSchema),
  StructField("response", StringType()),
  StructField("member", memberSchema),
  StructField("rsvp_id", LongType()),
  StructField("mtime", LongType()),
  StructField("event", eventSchema),
  StructField("group", groupSchema),
])

#Stage 1: parse the Kafka value string into one struct column ("meetup") using jsonSchemaRSVP
rsvpStructDF = meetupRawDF.select(
  from_json("value", jsonSchemaRSVP).alias("meetup")
)

#Stage 2: flatten the nested JSON by selecting individual fields instead of "*";
# topic_name comes out as an array holding every topic name of the group
rsvpFlatDF = rsvpStructDF.select(
  "meetup.rsvp_id",
  "meetup.member.member_id",
  "meetup.mtime",
  "meetup.response",
  "meetup.event.event_name",
  "meetup.venue.lon",
  "meetup.venue.lat",
  "meetup.group.group_topics.topic_name",
)

#Stage 3: one row per topic name (exposed as "keyword"), keeping only 'yes' responses
meetupDF = (
  rsvpFlatDF
    .select(
      "rsvp_id", "member_id", "mtime", "response", "event_name", "lon", "lat",
      explode("topic_name").alias("keyword"),
    )
    .filter("response = 'yes'")
)

#Geographic targeting: the join condition is an expression (not an equality on columns)
# that checks whether the Meetup venue's lon/lat falls inside an advertiser's bounding box
joinPredicate = "mu.lon between ad.minlon and ad.maxlon AND mu.lat between ad.minlat and ad.maxlat"

#The aliases "mu" and "ad" are the names the predicate refers to
meetupEvents = meetupDF.alias("mu")
advertiserTargets = advertiserDF.alias("ad")

meetupAdDF = meetupEvents.join(advertiserTargets, on=expr(joinPredicate), how="inner")

#Explode the advertiser's keywords (one row per keyword) and keep only the rows where that
# keyword equals the Meetup group's keyword (topic), compared case-insensitively.
#Note: Joining the two DataFrames on an expression using the function array_intersect()
#  (e.g. "size(array_intersect(t.topic_name, f.keywords)) > 0") was explored, but the Meetup
#  keywords are mixed case and no efficient way to use the lower() function was found.
#  Thus, explode() is used.
#NOTE(review): distinct() on a streaming DataFrame is a stateful de-duplication and no
#  watermark is defined here, so its state presumably grows for as long as the query runs -
#  confirm this is acceptable for the expected run time.
meetupAd2DF = (
  meetupAdDF
    .withColumn("adKeywords", explode("keywords"))
    #Lower-case BOTH sides: previously only the Meetup keyword was lower-cased, so an advertiser
    # keyword containing any upper-case letter could never match. Results are unchanged for
    # advertiser keywords that are already lower case.
    .filter(lower(col("keyword")) == lower(col("adKeywords")))
    #Drop the per-keyword columns so an RSVP matching several keywords collapses to one row
    .select("rsvp_id", "member_id", "mtime", "response", "event_name", "lon", "lat",
            "advertiser", "city", "keywords", "minlon", "maxlon", "minlat", "maxlat")
    .distinct()
)

#Persist every RSVP that met an advertiser's targets as parquet files under archivePath;
# the query's checkpoint is kept under archiveCheckpointPath
archiveSinkOptions = {
  "path": archivePath,
  "checkpointLocation": archiveCheckpointPath,
}

meetupArchiveQuery = (
  meetupAd2DF
    .writeStream
    .format("parquet")
    .options(**archiveSinkOptions)
    .start()
)

#Columns the advertiser actually needs; de-duplicated again after the other columns are dropped
adOutputColumns = ["rsvp_id", "member_id", "event_name", "advertiser", "city"]
meetupAdFinalDF = meetupAd2DF.select(*adOutputColumns).distinct()

#Write the key advertising data to disk as parquet (the same rows are also sent to a
# Kafka topic for advertising consumption)
adOutputSinkOptions = {
  "path": advertiserOutputPath,
  "checkpointLocation": advertiserOutputCheckpointPath,
}

meetupAdArchiveQuery = (
  meetupAdFinalDF
    .writeStream
    .format("parquet")
    .options(**adOutputSinkOptions)
    .start()
)

#Shape each row as a Kafka record: rsvp_id (as a string) is the key, and the remaining
# advertising fields are serialised into a JSON string as the value
kafkaKey = col("rsvp_id").cast("string").alias("key")
kafkaValue = to_json(struct("member_id", "event_name", "advertiser", "city")).alias("value")

meetupAdKafkaDF = meetupAdFinalDF.select(kafkaKey, kafkaValue)

#Publish the key/value records to the output topic on the same Kafka server the RSVPs are
# read from; this sink has its own checkpoint folder
kafkaSinkOptions = {
  "kafka.bootstrap.servers": server,
  "topic": output_topic,
  "checkpointLocation": advertiserOutputCheckpoint2Path,
}

meetupAdKafkaQuery = (
  meetupAdKafkaDF
    .writeStream
    .format("kafka")
    .options(**kafkaSinkOptions)
    .start()
)


In [2]:
#Show the archive-level matches (RSVP fields plus the advertiser's targeting columns); sample output below
display(meetupAd2DF)

rsvp_id,member_id,mtime,response,event_name,lon,lat,advertiser,city,keywords,minlon,maxlon,minlat,maxlat
1833241628,212641076,1584292353986,yes,KubeFlow +Keras/TensorFlow 2.0 +TF Extended (TFX) +Kubernetes +Airflow +PyTorch,-122.399445,37.788803,Databricks,San Francisco,"List(spark, hadoop, big data, machine learning, ai, data pipeline)",-122.938614,-121.763077,37.178392,38.52294
1833242400,187579401,1584293059792,yes,KubeFlow +Keras/TensorFlow 2.0 +TF Extended (TFX) +Kubernetes +Airflow +PyTorch,-122.399445,37.788803,Databricks,San Francisco,"List(spark, hadoop, big data, machine learning, ai, data pipeline)",-122.938614,-121.763077,37.178392,38.52294
1833242400,187579401,1584293060000,yes,KubeFlow +Keras/TensorFlow 2.0 +TF Extended (TFX) +Kubernetes +Airflow +PyTorch,-122.399445,37.788803,Databricks,San Francisco,"List(spark, hadoop, big data, machine learning, ai, data pipeline)",-122.938614,-121.763077,37.178392,38.52294
1833243534,193660532,1584294086745,yes,"Deep Learning (Tensor Flow, DJL and DL4J ) for Java Developers",-121.894905,37.332855,Databricks,San Francisco,"List(spark, hadoop, big data, machine learning, ai, data pipeline)",-122.938614,-121.763077,37.178392,38.52294
1833245145,184104333,1584294910544,yes,"Deep Learning (Tensor Flow, DJL and DL4J ) for Java Developers",-121.894905,37.332855,Databricks,San Francisco,"List(spark, hadoop, big data, machine learning, ai, data pipeline)",-122.938614,-121.763077,37.178392,38.52294
1833245725,214186432,1584295467396,yes,Scalable Systems Study Group,-121.95524,37.354107,Databricks,San Francisco,"List(spark, hadoop, big data, machine learning, ai, data pipeline)",-122.938614,-121.763077,37.178392,38.52294
1833245908,161070382,1584295631151,yes,Deep Learning 101: Artificial Intelligence Based On The Brain,-122.0096,37.383553,Databricks,San Francisco,"List(spark, hadoop, big data, machine learning, ai, data pipeline)",-122.938614,-121.763077,37.178392,38.52294
1833246282,11411429,1584295953103,yes,KubeFlow +Keras/TensorFlow 2.0 +TF Extended (TFX) +Kubernetes +Airflow +PyTorch,-122.399445,37.788803,Databricks,San Francisco,"List(spark, hadoop, big data, machine learning, ai, data pipeline)",-122.938614,-121.763077,37.178392,38.52294
1833246282,11411429,1584295957414,yes,KubeFlow +Keras/TensorFlow 2.0 +TF Extended (TFX) +Kubernetes +Airflow +PyTorch,-122.399445,37.788803,Databricks,San Francisco,"List(spark, hadoop, big data, machine learning, ai, data pipeline)",-122.938614,-121.763077,37.178392,38.52294
1833247406,110703232,1584296963212,yes,"Butterflies, big data and the multifarious stressors of the Anthropocene",-122.27363,38.023796,Databricks,San Francisco,"List(spark, hadoop, big data, machine learning, ai, data pipeline)",-122.938614,-121.763077,37.178392,38.52294


In [3]:
#Show the de-duplicated advertiser-facing rows (five columns only); sample output below
display(meetupAdFinalDF)

rsvp_id,member_id,event_name,advertiser,city
1833241628,212641076,KubeFlow +Keras/TensorFlow 2.0 +TF Extended (TFX) +Kubernetes +Airflow +PyTorch,Databricks,San Francisco
1833242400,187579401,KubeFlow +Keras/TensorFlow 2.0 +TF Extended (TFX) +Kubernetes +Airflow +PyTorch,Databricks,San Francisco
1833243534,193660532,"Deep Learning (Tensor Flow, DJL and DL4J ) for Java Developers",Databricks,San Francisco
1833245145,184104333,"Deep Learning (Tensor Flow, DJL and DL4J ) for Java Developers",Databricks,San Francisco
1833245725,214186432,Scalable Systems Study Group,Databricks,San Francisco
1833245908,161070382,Deep Learning 101: Artificial Intelligence Based On The Brain,Databricks,San Francisco
1833246282,11411429,KubeFlow +Keras/TensorFlow 2.0 +TF Extended (TFX) +Kubernetes +Airflow +PyTorch,Databricks,San Francisco
1833247406,110703232,"Butterflies, big data and the multifarious stressors of the Anthropocene",Databricks,San Francisco
1833247571,185424938,KubeFlow +Keras/TensorFlow 2.0 +TF Extended (TFX) +Kubernetes +Airflow +PyTorch,Databricks,San Francisco


In [4]:
#Show the key/value records as shaped for the output Kafka topic; sample output below
display(meetupAdKafkaDF)

key,value
1833241628,"{""member_id"":212641076,""event_name"":""KubeFlow +Keras/TensorFlow 2.0 +TF Extended (TFX) +Kubernetes +Airflow +PyTorch"",""advertiser"":""Databricks"",""city"":""San Francisco""}"
1833242400,"{""member_id"":187579401,""event_name"":""KubeFlow +Keras/TensorFlow 2.0 +TF Extended (TFX) +Kubernetes +Airflow +PyTorch"",""advertiser"":""Databricks"",""city"":""San Francisco""}"
1833243534,"{""member_id"":193660532,""event_name"":""Deep Learning (Tensor Flow, DJL and DL4J ) for Java Developers"",""advertiser"":""Databricks"",""city"":""San Francisco""}"
1833245145,"{""member_id"":184104333,""event_name"":""Deep Learning (Tensor Flow, DJL and DL4J ) for Java Developers"",""advertiser"":""Databricks"",""city"":""San Francisco""}"
1833245725,"{""member_id"":214186432,""event_name"":""Scalable Systems Study Group"",""advertiser"":""Databricks"",""city"":""San Francisco""}"
1833245908,"{""member_id"":161070382,""event_name"":""Deep Learning 101: Artificial Intelligence Based On The Brain"",""advertiser"":""Databricks"",""city"":""San Francisco""}"
1833246282,"{""member_id"":11411429,""event_name"":""KubeFlow +Keras/TensorFlow 2.0 +TF Extended (TFX) +Kubernetes +Airflow +PyTorch"",""advertiser"":""Databricks"",""city"":""San Francisco""}"
1833247406,"{""member_id"":110703232,""event_name"":""Butterflies, big data and the multifarious stressors of the Anthropocene"",""advertiser"":""Databricks"",""city"":""San Francisco""}"
1833247571,"{""member_id"":185424938,""event_name"":""KubeFlow +Keras/TensorFlow 2.0 +TF Extended (TFX) +Kubernetes +Airflow +PyTorch"",""advertiser"":""Databricks"",""city"":""San Francisco""}"
