In [None]:
# Hosted notebook environments (Google Colab, Hopsworks) may not have the local `synthetic_data` package
import os
import urllib.request

from IPython import get_ipython


def need_download_modules():
    """Tell whether the helper modules have to be downloaded.

    Returns:
        bool: True when the string form of the current IPython shell contains
        'google.colab', or when the HOPSWORKS_PROJECT_ID environment variable
        is set; False otherwise.
    """
    shell_repr = str(get_ipython())
    return 'google.colab' in shell_repr or 'HOPSWORKS_PROJECT_ID' in os.environ

# Location of the tutorial's `synthetic_data` helper package on GitHub (raw files).
SYNTHETIC_DATA_BASE_URL = (
    "https://raw.githubusercontent.com/manu-sj/hopsworks-tutorials/FSTORE-1107/"
    "advanced_tutorials/pyspark_streaming/synthetic_data"
)
# Files that make up the `synthetic_data` package.
SYNTHETIC_DATA_FILES = (
    "synthetic_data.py",
    "create_transaction_stream.py",
    "init_kafka.py",
    "__init__.py",
)

if need_download_modules():
    print("⚙️ Downloading modules...")
    # exist_ok=True keeps the cell re-runnable without shelling out to `mkdir -p`.
    os.makedirs("synthetic_data", exist_ok=True)
    for module_file in SYNTHETIC_DATA_FILES:
        # urlretrieve overwrites an existing file (wget would write `<name>.1`
        # on a re-run) and raises on HTTP/network errors, so the success
        # message below is only printed when every file was fetched.
        urllib.request.urlretrieve(
            f"{SYNTHETIC_DATA_BASE_URL}/{module_file}",
            os.path.join("synthetic_data", module_file),
        )
    print('✅ Done!')
else:
    print("Local environment")

In [None]:
from pyspark.sql import SparkSession

from pyspark.sql.functions import (
    from_json,
    window,
    avg,
    count,
    stddev,
    explode,
    date_format,
    col,
    mean,
    pandas_udf,
    PandasUDFType)

from pyspark.sql.types import (
    LongType,
    DoubleType,
    StringType,
    TimestampType,
    StructType,
    StructField,
)

In [None]:
import hopsworks
import hsfs
from hsfs import engine

# Kafka topic this notebook subscribes to.
KAFKA_TOPIC_NAME = "credit_card_transactions"

# Log in to Hopsworks and get the project's feature store handle.
project = hopsworks.login()
fs = project.get_feature_store()

# Call hsfs.connection(); its return value is discarded.
# NOTE(review): presumably needed to initialise the hsfs engine used below — confirm.
hsfs.connection()

# Ask the hsfs engine for the Kafka client options of this feature store.
# NOTE(review): `_get_kafka_config` is underscore-prefixed (not public API) and
# may change between hsfs versions.
hsfs_engine = engine.get_instance()
kafka_config = hsfs_engine._get_kafka_config(feature_store_id=fs.id)

In [None]:
# `spark` is not created anywhere else in this notebook. getOrCreate() returns
# the kernel's active session when one exists (e.g. a PySpark kernel) and
# builds a new one otherwise, so the cell also works on a fresh plain kernel.
spark = SparkSession.builder.getOrCreate()

# Get data from the source: a streaming read of the Kafka topic.
df_read = (
    spark.readStream
    .format("kafka")
    .options(**kafka_config)
    # Start from the earliest available offsets of the topic.
    .option("startingOffsets", "earliest")
    # Cap the number of offsets consumed per trigger.
    .option("maxOffsetsPerTrigger", 100)
    .option("subscribe", KAFKA_TOPIC_NAME)
    .load()
)

In [None]:
# Schema used to parse the JSON transaction messages.
# Every field is declared nullable (third StructField argument is True).
parse_schema = StructType([
    StructField(field_name, field_type(), True)
    for field_name, field_type in (
        ("tid", StringType),
        ("datetime", TimestampType),
        ("cc_num", LongType),
        ("category", StringType),
        ("amount", DoubleType),
        ("latitude", DoubleType),
        ("longitude", DoubleType),
        ("city", StringType),
        ("country", StringType),
    )
])

In [None]:
# Deserialize the messages and build the typed streaming DataFrame.
transaction_streaming_df = (
    df_read
    # Cast the record's `value` column to a string.
    .selectExpr("CAST(value AS STRING)")
    # Parse that string as JSON into a struct shaped by `parse_schema`.
    .select(from_json("value", parse_schema).alias("value"))
    # Flatten the struct: one top-level column per parsed field.
    .select(
        "value.tid",
        "value.datetime",
        "value.cc_num",
        "value.category",
        "value.amount",
        "value.latitude",
        "value.longitude",
        "value.city",
        "value.country",
    )
    # Explicitly cast every column to its target type.
    .selectExpr(
        "CAST(tid as string)",
        "CAST(datetime as timestamp)",
        "CAST(cc_num as long)",
        "CAST(category as string)",
        "CAST(amount as double)",
        "CAST(latitude as double)",
        "CAST(longitude as double)",
        "CAST(city as string)",
        "CAST(country as string)",
    )
)

In [None]:
# Feature group handle for the transactions (version 1), obtained through
# `get_or_create_feature_group`. Keyed by card number, with the transaction
# timestamp as event time.
trans_fg = fs.get_or_create_feature_group(
    name="transactions",
    version=1,
    description="Transaction data",
    primary_key=["cc_num"],
    event_time="datetime",
    online_enabled=True,
    stream=True,
    # Note: no partition key is passed; a `month` partition key is left disabled here.
)

In [None]:
# Start inserting the streaming DataFrame into the feature group. The returned
# handle is kept in `q` so it can be inspected in a later cell.
# NOTE(review): presumably a Spark StreamingQuery — confirm against the hsfs docs.
q = trans_fg.insert_stream(transaction_streaming_df)

In [None]:
# Show the `status` attribute of the handle returned by insert_stream
# (as the cell's last expression, it is rendered as the cell output).
q.status