From efc9c31be9d5f356cd08af305637b926772b8ca0 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Wed, 10 Jan 2024 11:00:08 +0100 Subject: [PATCH 01/16] Draft pyspark streaming --- .../pyspark_streaming/Feature Pipeline.ipynb | 182 +++++++++++ .../create_transaction_stream.py | 32 ++ .../synthetic_data/init_kafka.py | 85 +++++ .../synthetic_data/synthetic_data.py | 294 ++++++++++++++++++ 4 files changed, 593 insertions(+) create mode 100644 advanced_tutorials/pyspark_streaming/Feature Pipeline.ipynb create mode 100644 advanced_tutorials/pyspark_streaming/synthetic_data/create_transaction_stream.py create mode 100644 advanced_tutorials/pyspark_streaming/synthetic_data/init_kafka.py create mode 100644 advanced_tutorials/pyspark_streaming/synthetic_data/synthetic_data.py diff --git a/advanced_tutorials/pyspark_streaming/Feature Pipeline.ipynb b/advanced_tutorials/pyspark_streaming/Feature Pipeline.ipynb new file mode 100644 index 00000000..fb1e4a0a --- /dev/null +++ b/advanced_tutorials/pyspark_streaming/Feature Pipeline.ipynb @@ -0,0 +1,182 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "id": "6c7fb51b-602a-4cff-85d4-6aceb4ef9f29", + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.sql import SparkSession\n", + "\n", + "from pyspark.sql.functions import (\n", + " from_json,\n", + " window,\n", + " avg,\n", + " count,\n", + " stddev,\n", + " explode,\n", + " date_format,\n", + " col,\n", + " mean,\n", + " pandas_udf,\n", + " PandasUDFType)\n", + "\n", + "from pyspark.sql.types import (\n", + " LongType,\n", + " DoubleType,\n", + " StringType,\n", + " TimestampType,\n", + " StructType,\n", + " StructField,\n", + ")\n", + "from synthetic_data import init_kafka" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d8b68b2d-3236-463f-a872-ba76cf506b64", + "metadata": {}, + "outputs": [], + "source": [ + "import hopsworks\n", + "import hsfs\n", + "from hsfs import engine\n", + "\n", + "project = hopsworks.login()\n", + "fs = project.get_feature_store()\n", + "hsfs.connection()\n", + "kafka_config = engine.get_instance()._get_kafka_config(feature_store_id=fs.id)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1c67cfb8-c336-42c0-ba94-e65679efa5b8", + "metadata": {}, + "outputs": [], + "source": [ + "# get data from the source\n", + "df_read = spark \\\n", + " .readStream \\\n", + " .format(\"kafka\") \\\n", + " .options(**kafka_config) \\\n", + " .option(\"startingOffsets\", \"earliest\") \\\n", + " .option(\"maxOffsetsPerTrigger\", 100) \\\n", + " .option(\"subscribe\", init_kafka.KAFKA_TOPIC_NAME) \\\n", + " .load()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7ddf926c-9ea7-4997-859f-745da0618d51", + "metadata": {}, + "outputs": [], + "source": [ + "parse_schema = StructType([StructField(\"tid\", StringType(), True),\n", + " StructField(\"datetime\", TimestampType(), True),\n", + " StructField(\"cc_num\", LongType(), True),\n", + " StructField(\"category\", StringType(), True),\n", + " StructField(\"amount\", DoubleType(), True),\n", + " StructField(\"latitude\", DoubleType(), True),\n", + " StructField(\"longitude\", DoubleType(), True),\n", + " StructField(\"city\", StringType(), True),\n", + " StructField(\"country\", StringType(), True),\n", + " ])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4543ea17-1ae4-4aa1-a91a-bc00ecea45d8", + "metadata": {}, + "outputs": [], + "source": [ + "# Deserialize data from and create streaming query\n", + "transaction_streaming_df 
= df_read.selectExpr(\"CAST(value AS STRING)\") \\\n", + " .select(from_json(\"value\", parse_schema).alias(\"value\")) \\\n", + " .select(\"value.tid\",\n", + " \"value.datetime\",\n", + " \"value.cc_num\",\n", + " \"value.category\",\n", + " \"value.amount\",\n", + " \"value.latitude\",\n", + " \"value.longitude\",\n", + " \"value.city\",\n", + " \"value.country\") \\\n", + " .selectExpr(\"CAST(tid as string)\",\n", + " \"CAST(datetime as timestamp)\",\n", + " \"CAST(cc_num as long)\",\n", + " \"CAST(category as string)\",\n", + " \"CAST(amount as double)\",\n", + " \"CAST(latitude as double)\",\n", + " \"CAST(longitude as double)\",\n", + " \"CAST(city as string)\",\n", + " \"CAST(country as string)\"\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "80600685-7ba1-449f-a005-33fe4cda29cd", + "metadata": {}, + "outputs": [], + "source": [ + "trans_fg = fs.get_or_create_feature_group(\n", + " name=\"transactions\",\n", + " version=1,\n", + " description=\"Transaction data\",\n", + " primary_key=['cc_num'],\n", + " event_time='datetime',\n", + " #partition_key=['month'],\n", + " stream=True,\n", + " online_enabled=True\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6b6f8a91-fe60-4e97-be7f-f17ee4b27c17", + "metadata": {}, + "outputs": [], + "source": [ + "q = trans_fg.insert_stream(transaction_streaming_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bed738f7-72c5-497c-816c-0d3f0cafdc0e", + "metadata": {}, + "outputs": [], + "source": [ + "q.status" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.18" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/advanced_tutorials/pyspark_streaming/synthetic_data/create_transaction_stream.py b/advanced_tutorials/pyspark_streaming/synthetic_data/create_transaction_stream.py new file mode 100644 index 00000000..3f640f98 --- /dev/null +++ b/advanced_tutorials/pyspark_streaming/synthetic_data/create_transaction_stream.py @@ -0,0 +1,32 @@ +import init_kafka +import synthetic_data +import hsfs +from hsfs.core.storage_connector_api import StorageConnectorApi +from confluent_kafka import Producer + +# creating kafka topic and schema +init_kafka.init() + +# Creating simulated data +data_simulater = synthetic_data.synthetic_data() +credit_cards, trans_df = data_simulater.create_simulated_transactions() +#TODO : Remove and make this into a different dataframe +trans_df = trans_df.drop(["fraud_label"], axis = 1) + +# Getting Kafka Configurations +connection = hsfs.connection() +fs = connection.get_feature_store() +sc_api = StorageConnectorApi() +sc = sc_api.get_kafka_connector(fs.id, False) + +kafka_config = sc.confluent_options() +kafka_config["group.id"] = "test_groupid" + +producer = Producer(kafka_config) + +for index, transaction in trans_df.iterrows(): + producer.produce(init_kafka.KAFKA_TOPIC_NAME, transaction.to_json()) + producer.flush() # Synchronising read and write blocking call + if index % 100 == 0 : + print(f'index {index}') + diff --git a/advanced_tutorials/pyspark_streaming/synthetic_data/init_kafka.py b/advanced_tutorials/pyspark_streaming/synthetic_data/init_kafka.py new file mode 100644 index 
00000000..1eddd7e3 --- /dev/null +++ b/advanced_tutorials/pyspark_streaming/synthetic_data/init_kafka.py @@ -0,0 +1,85 @@ +import hopsworks + +# create kafka topic +KAFKA_TOPIC_NAME = "credit_card_transactions" +SCHEMA_NAME = "credit_card_transactions_schema" + +schema = { + "type": "record", + "name": SCHEMA_NAME, + "namespace": "io.hops.examples.flink.examples", + "fields": [ + { + "name": "tid", + "type": [ + "null", + "string" + ] + }, + { + "name": "datetime", + "type": [ + "null", + { + "type": "long", + "logicalType": "timestamp-micros" + } + ] + }, + { + "name": "cc_num", + "type": [ + "null", + "long" + ] + }, + { + "name": "category", + "type": [ + "null", + "string" + ] + }, + { + "name": "amount", + "type": [ + "null", + "double" + ] + }, + { + "name": "latitude", + "type": [ + "null", + "double" + ] + }, + { + "name": "longitude", + "type": [ + "null", + "double" + ] + }, + { + "name": "city", + "type": [ + "null", + "string" + ] + }, + { + "name": "country", + "type": [ + "null", + "string" + ] + }, + ] +} + +def init(): + project = hopsworks.login() + kafka_api = project.get_kafka_api() + kafka_api.create_schema(SCHEMA_NAME, schema) + kafka_api.create_topic(KAFKA_TOPIC_NAME, SCHEMA_NAME, 1, replicas=1, partitions=1) diff --git a/advanced_tutorials/pyspark_streaming/synthetic_data/synthetic_data.py b/advanced_tutorials/pyspark_streaming/synthetic_data/synthetic_data.py new file mode 100644 index 00000000..d92f442a --- /dev/null +++ b/advanced_tutorials/pyspark_streaming/synthetic_data/synthetic_data.py @@ -0,0 +1,294 @@ +from faker import Faker +import numpy as np +import pandas as pd +import datetime +import random +import bisect +from dataclasses import dataclass +from typing import List +import hashlib + +@dataclass +class transaction_probablity: + min_value : float + max_value : float + probablity : float + +@dataclass +class categories_price_probality: + category : str + min_price : float + max_price : float + probablity : float + +@dataclass +class transaction_price_distribution: + price_dist : List[transaction_probablity] + +@dataclass +class categories_price_distribution: + category_dist : List[categories_price_probality] + +class synthetic_data: + def __init__(self, total_unique_users : int = 100, + total_unique_transactions : int = 54000, + cash_withrawal_cards_cards_total : int = 2000, + total_unique_cash_withdrawals : int = 1200, + fraud_ratio : int = 0.0025, + atm_withrawal_seq_length : list = [3, 4, 5, 6, 7, 8, 9, 10], + attack_chain_length = [3, 4, 5, 6, 7, 8, 9, 10], + normal_atm_radius : float = 0.01, + transaction_distribution : transaction_price_distribution = transaction_price_distribution(price_dist=[transaction_probablity(min_value=0.01, max_value=1.01, probablity=0.05), + transaction_probablity(min_value=1, max_value=11.01, probablity=0.075), + transaction_probablity(min_value=10, max_value=100.01, probablity=0.525), + transaction_probablity(min_value=100, max_value=1000.01, probablity=0.25), + transaction_probablity(min_value=1000, max_value=10000.01, probablity=0.099), + transaction_probablity(min_value=10000, max_value=30000.01, probablity=0.001),]), + categories_distribution : categories_price_distribution = categories_price_distribution(category_dist=[categories_price_probality(category="Grocery", min_price=0.01, max_price=100, probablity=0.5), + categories_price_probality(category="Restaurant/Cafeteria", min_price=1, max_price=100, probablity=0.2), + categories_price_probality(category="Health/Beauty", min_price=10, max_price=500.01, 
probablity=0.1), + categories_price_probality(category="Domestic Transport", min_price=10, max_price=100.01, probablity=0.1), + categories_price_probality(category="Clothing", min_price=10, max_price=2000.01, probablity=0.05), + categories_price_probality(category="Electronics", min_price=100, max_price=10000.01, probablity=0.02), + categories_price_probality(category="Sports/Outdoors", min_price=10, max_price=100.01, probablity=0.015), + categories_price_probality(category="Holliday/Travel", min_price=10, max_price=100.01, probablity=0.014), + categories_price_probality(category="Jewelery", min_price=10, max_price=100.01, probablity=0.001), + ])): + + # Setting up parameter for simulated data generation + self.TOTAL_UNIQUE_USERS = total_unique_users + self.TOTAL_UNIQUE_TRANSACTIONS = total_unique_transactions + self.TOTAL_CASH_WITHRAWALS = cash_withrawal_cards_cards_total + self.TOTAL_UNIQUE_CASH_WITHDRAWALS = total_unique_cash_withdrawals + self.ATM_WITHRAWAL_SEQ_LENGTH = atm_withrawal_seq_length + self.NORMAL_ATM_RADIUS = normal_atm_radius + self.AMOUNT_DISTRIBUTION_PERCENTAGES = transaction_distribution + self.CATEGORY_PERC_PRICE = categories_distribution + self.NUMBER_OF_FRAUDULENT_TRANSACTIONS = self.TOTAL_UNIQUE_TRANSACTIONS * fraud_ratio + self.ATTACK_CHAIN_LENGTHS = attack_chain_length + self.NUMBER_OF_FRAUDULENT_ATM_TRANSACTIONS = self.TOTAL_CASH_WITHRAWALS * fraud_ratio + + self.DATE_FORMAT = '%Y-%m-%d %H:%M:%S' + self.END_DATE = datetime.datetime.now().strftime(self.DATE_FORMAT) + self.START_DATE = (datetime.datetime.now() - datetime.timedelta(days=30 * 6)).strftime(self.DATE_FORMAT) + + # Setting up faker + self.faker = Faker() + + # TODO : Make setting seeds a funcion and create a parameter for it + self.faker.seed_locale('en_US', 0) + seed = 12345 + random.seed(seed) + np.random.seed(seed) + self.faker.seed_instance(seed) + + + + + + def generate_list_credit_cards(self) -> pd.DataFrame: + """ Function that generates and returns a dataframe of credit card numbers + """ + credit_cards = [] + delta_time_object = datetime.datetime.strptime(self.START_DATE, self.DATE_FORMAT) + + # Generating credit card number, expiry date using faker + for _ in range(self.TOTAL_UNIQUE_USERS): + address = self.faker.local_latlng(country_code='US') + age = 0 + profile = None + while age < 18 or age > 100: + profile = self.faker.profile(fields=['name', 'mail', 'birthdate']) + bday = profile['birthdate'] + delta = datetime.datetime.now() - datetime.datetime(bday.year, bday.month, bday.day) + age = int(delta.days / 365) + + credit_cards.append({'cc_num': self.faker.credit_card_number(card_type='visa'), + 'cc_provider': random.choice(['visa', 'mastercard']), + 'cc_type' : random.choice(["credit", "debit"]), + 'cc_expiration_date': self.faker.credit_card_expire(start=delta_time_object, end="+5y",date_format="%m/%y"), + 'name' : profile["name"], + 'mail' : profile["name"], + 'birthdate' : profile["birthdate"], + 'age' : age, + 'city' : address[2], + 'country_of_residence' : address[3]}) + return pd.DataFrame(credit_cards) + + def generate_transaction_amounts(self, number_of_transactions : int) -> list: + """Function that returns all a simulated list of transaction amounts based on the transaction probablity distributions + """ + amounts = [] + for price_dist in self.AMOUNT_DISTRIBUTION_PERCENTAGES.price_dist: + n = int(number_of_transactions * price_dist.probablity) + start, end = price_dist.min_value, price_dist.max_value + for _ in range(n): + amounts.append(round(np.random.uniform(start, end + 1),2)) 
+ return amounts + + def create_transaction(self, timestamp, credit_card_number, category, amount, latitude, longitude, city, country, fraud_label): + transaction_id = self.generate_transaction_id(timestamp, credit_card_number, category) + return {'tid': transaction_id, + 'datetime': timestamp.strftime(self.DATE_FORMAT), + 'cc_num': credit_card_number, + 'category': category, + 'amount': amount, + 'latitude': latitude, + 'longitude': longitude, + 'city': city, + 'country': country, + 'fraud_label': fraud_label} + + def generate_card_transactions(self, card_amounts, credit_card_numbers) -> list: + categories = [] + transaction_data_start = datetime.datetime.strptime(self.START_DATE, self.DATE_FORMAT) + transaction_data_end = datetime.datetime.strptime(self.END_DATE, self.DATE_FORMAT) + total_transactions = len(card_amounts) + + # Catgeories for Card transactions + for category_dist in self.CATEGORY_PERC_PRICE.category_dist: + n = round(total_transactions * category_dist.probablity) + for _ in range(n): + credit_card_number = random.choice(credit_card_numbers) + point_of_tr = self.faker.local_latlng(country_code="US") + timestamp = self.faker.date_time_between(start_date=transaction_data_start, end_date=transaction_data_end, tzinfo=None) + min_price_i = bisect.bisect_left(card_amounts, category_dist.min_price) + max_price_i = bisect.bisect_right(card_amounts, category_dist.max_price, lo=min_price_i) + + transaction = self.create_transaction(timestamp=timestamp, + credit_card_number=credit_card_number, + category=category_dist.category, + amount=random.choice(card_amounts[min_price_i:max_price_i]), + latitude=point_of_tr[0], + longitude=point_of_tr[1], + city=point_of_tr[2], + country=point_of_tr[3], + fraud_label=0) + categories.append(transaction) + random.shuffle(categories) + return categories + + def generate_atm_withdrawal(self, withdrawal_amounts, credit_card_numbers): + + transaction_data_start = datetime.datetime.strptime(self.START_DATE, self.DATE_FORMAT) + transaction_data_end = datetime.datetime.strptime(self.END_DATE, self.DATE_FORMAT) + + cash_withdrawals = [] + atm_count = 0 + while atm_count < self.TOTAL_UNIQUE_CASH_WITHDRAWALS: + for ATM_WITHRAWAL_SEQ in self.ATM_WITHRAWAL_SEQ_LENGTH: + # interval in hours between normal cash withdrawals + delta = random.randint(6, 168) + credit_card_number = random.choice(credit_card_numbers) + point_of_tr = self.faker.local_latlng(country_code="US") + withdrawal_timestamp = self.faker.date_time_between(start_date=transaction_data_start, end_date=transaction_data_end, tzinfo=None) + + # Generating sequence of transactions using the generated data + for _ in range(ATM_WITHRAWAL_SEQ): + withdrawal_timestamp = withdrawal_timestamp - datetime.timedelta(hours=delta) + transaction = self.create_transaction(timestamp=withdrawal_timestamp, + credit_card_number=credit_card_number, + category="Cash Withdrawal", + amount=random.choice(withdrawal_amounts), + latitude=self.faker.coordinate(point_of_tr[0], self.NORMAL_ATM_RADIUS), # updating latitude for sequence of transactions + longitude=self.faker.coordinate(point_of_tr[1], self.NORMAL_ATM_RADIUS), # updating longitude for sequence of transactions + city=point_of_tr[2], + country=point_of_tr[3], + fraud_label=0) + cash_withdrawals.append(transaction) + atm_count+=ATM_WITHRAWAL_SEQ + return cash_withdrawals + + def generate_transaction_id(self, timestamp: str, credit_card_number: str, transaction_amount: float) -> str: + """.""" + hashable = f'{timestamp}{credit_card_number}{transaction_amount}' + 
hexdigest = hashlib.md5(hashable.encode('utf-8')).hexdigest() + return hexdigest + + def create_transactions_data(self, credit_card_numbers): + # Schema transaction dataframe - tid, datetime, cc_num, category, amount, latitude, longitude, city, country + + # Creating transaction amounts + card_transactions_amounts = self.generate_transaction_amounts(self.TOTAL_UNIQUE_TRANSACTIONS) + cash_withdrawal_amounts = self.generate_transaction_amounts(self.TOTAL_UNIQUE_CASH_WITHDRAWALS) + + # Creating corresponding transaction categories + card_transactions = self.generate_card_transactions(card_transactions_amounts, credit_card_numbers) + atm_transactions = self.generate_atm_withdrawal(cash_withdrawal_amounts, credit_card_numbers) + + # Creating fraud transactions + fraud_card_transactions = self.create_fraud_card_transactions(card_transactions) + fraud_atm_transactions = self.create_atm_fraud_transactions(atm_transactions) + + transactions_dataframe = pd.DataFrame(card_transactions + atm_transactions + fraud_card_transactions + fraud_atm_transactions) + + return transactions_dataframe + + def create_fraud_card_transactions(self, card_transactions): + # Create frauds labels for normal transactions using attack chains, attack happens when multiple transactions happen in very close timeperiods + # for different categories of data + + # Creating attack chains -> selecting card for which fraud happens and adding fraud transactions for it + tot_fraud_tractions = 0 + fraud_transactions = [] + + transaction_data_start = datetime.datetime.strptime(self.START_DATE, self.DATE_FORMAT) + transaction_data_end = datetime.datetime.strptime(self.END_DATE, self.DATE_FORMAT) + + while tot_fraud_tractions < self.NUMBER_OF_FRAUDULENT_TRANSACTIONS: + selected_target_transaction = random.choice(card_transactions) + attack_chain_length = random.choice(self.ATTACK_CHAIN_LENGTHS) + + attack_amounts = self.generate_transaction_amounts(attack_chain_length) + attack_transactions = self.generate_card_transactions(attack_amounts, [selected_target_transaction["cc_num"]]) + timestamp = self.faker.date_time_between(start_date=transaction_data_start, end_date=transaction_data_end, tzinfo=None) + + for i in range(len(attack_transactions)): + # interval in seconds between fraudulent attacks + delta = random.randint(30, 120) + attack_transactions[i]["datetime"] = (timestamp + datetime.timedelta(seconds=delta)).strftime(self.DATE_FORMAT) + attack_transactions[i]["fraud_label"] = 1 + fraud_transactions.append(attack_transactions[i]) + tot_fraud_tractions+=1 + + if(tot_fraud_tractions == self.NUMBER_OF_FRAUDULENT_TRANSACTIONS): + break + + return fraud_transactions + + + def create_atm_fraud_transactions(self, atm_transactions): + # Fraud ATM transactions are transactions that are out if US in a sequence of ATM transactions and been done in quick sequence + fraud_atm_transactions = [] + tot_fraud_tractions = 0 + + attack_amounts = self.generate_transaction_amounts(self.NUMBER_OF_FRAUDULENT_ATM_TRANSACTIONS) + + while tot_fraud_tractions < self.NUMBER_OF_FRAUDULENT_ATM_TRANSACTIONS: + selected_target_transaction = random.choice(atm_transactions) + delta = random.randint(1, 5) + fraudulent_atm_location = self.faker.location_on_land() + while fraudulent_atm_location[3] == 'US': + fraudulent_atm_location = self.faker.location_on_land() + withdrawal_timestamp = datetime.datetime.strptime(selected_target_transaction["datetime"], self.DATE_FORMAT) + datetime.timedelta(delta) + transaction = 
self.create_transaction(timestamp=withdrawal_timestamp, + credit_card_number=selected_target_transaction["cc_num"], + category="Cash Withdrawal", + amount=random.choice(attack_amounts), + latitude=fraudulent_atm_location[0], + longitude=fraudulent_atm_location[1], + city=fraudulent_atm_location[2], + country=fraudulent_atm_location[3], + fraud_label=1) + fraud_atm_transactions.append(transaction) + tot_fraud_tractions+=1 + return fraud_atm_transactions + + + + def create_simulated_transactions(self): + credit_cards = self.generate_list_credit_cards() + + transactions = self.create_transactions_data(credit_cards["cc_num"].tolist()) + + return credit_cards, transactions \ No newline at end of file From 743704d4c4cb1717e2f3021e50fd43dad7b276fe Mon Sep 17 00:00:00 2001 From: manu-sj Date: Wed, 10 Jan 2024 11:06:59 +0100 Subject: [PATCH 02/16] adding init.py for module --- advanced_tutorials/pyspark_streaming/synthetic_data/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 advanced_tutorials/pyspark_streaming/synthetic_data/__init__.py diff --git a/advanced_tutorials/pyspark_streaming/synthetic_data/__init__.py b/advanced_tutorials/pyspark_streaming/synthetic_data/__init__.py new file mode 100644 index 00000000..e69de29b From 1d93b79647854777dae6abf360f15b573365e131 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Wed, 10 Jan 2024 11:50:48 +0100 Subject: [PATCH 03/16] adding code to download libaries in hopsworks jupyter --- .../pyspark_streaming/Feature Pipeline.ipynb | 41 ++++++++++++++++++- 1 file changed, 39 insertions(+), 2 deletions(-) diff --git a/advanced_tutorials/pyspark_streaming/Feature Pipeline.ipynb b/advanced_tutorials/pyspark_streaming/Feature Pipeline.ipynb index fb1e4a0a..d94705c2 100644 --- a/advanced_tutorials/pyspark_streaming/Feature Pipeline.ipynb +++ b/advanced_tutorials/pyspark_streaming/Feature Pipeline.ipynb @@ -2,8 +2,45 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, - "id": "6c7fb51b-602a-4cff-85d4-6aceb4ef9f29", + "execution_count": 6, + "id": "4891664b-4f4f-4201-adab-a47ee90820b8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Local environment\n" + ] + } + ], + "source": [ + "# Hosted notebook environments may not have the local features package\n", + "import os\n", + "\n", + "def need_download_modules():\n", + " if 'google.colab' in str(get_ipython()):\n", + " return True\n", + " if 'HOPSWORKS_PROJECT_ID' in os.environ:\n", + " return True\n", + " return False\n", + "\n", + "if need_download_modules():\n", + " print(\"⚙️ Downloading modules...\")\n", + " os.system('mkdir -p synthetic_data')\n", + " os.system('cd synthetic_data && wget https://raw.githubusercontent.com/manu-sj/hopsworks-tutorials/FSTORE-1107/advanced_tutorials/pyspark_streaming/synthetic_data/synthetic_data.py')\n", + " os.system('cd synthetic_data && wget https://raw.githubusercontent.com/manu-sj/hopsworks-tutorials/FSTORE-1107/advanced_tutorials/pyspark_streaming/synthetic_data/create_transaction_stream.py')\n", + " os.system('cd synthetic_data && wget https://raw.githubusercontent.com/manu-sj/hopsworks-tutorials/FSTORE-1107/advanced_tutorials/pyspark_streaming/synthetic_data/init_kafka.py')\n", + " os.system('cd synthetic_data && wget https://raw.githubusercontent.com/manu-sj/hopsworks-tutorials/FSTORE-1107/advanced_tutorials/pyspark_streaming/synthetic_data/__init__.py')\n", + " print('✅ Done!')\n", + "else:\n", + " print(\"Local environment\")" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "823ec8e3-6363-4d69-baf0-f1caf8ad75ef", "metadata": {}, "outputs": [], "source": [ From 485d6e0094c5026a8910a351cc0a73d1b437d3ff Mon Sep 17 00:00:00 2001 From: manu-sj Date: Wed, 10 Jan 2024 13:25:39 +0100 Subject: [PATCH 04/16] adding topic for testing --- .../pyspark_streaming/Feature Pipeline.ipynb | 22 +++++++------------ 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/advanced_tutorials/pyspark_streaming/Feature Pipeline.ipynb b/advanced_tutorials/pyspark_streaming/Feature Pipeline.ipynb index d94705c2..305daaca 100644 --- a/advanced_tutorials/pyspark_streaming/Feature Pipeline.ipynb +++ b/advanced_tutorials/pyspark_streaming/Feature Pipeline.ipynb @@ -2,21 +2,15 @@ "cells": [ { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "4891664b-4f4f-4201-adab-a47ee90820b8", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Local environment\n" - ] - } - ], + "outputs": [], "source": [ "# Hosted notebook environments may not have the local features package\n", "import os\n", + "from IPython import get_ipython\n", + "\n", "\n", "def need_download_modules():\n", " if 'google.colab' in str(get_ipython()):\n", @@ -66,8 +60,7 @@ " TimestampType,\n", " StructType,\n", " StructField,\n", - ")\n", - "from synthetic_data import init_kafka" + ")" ] }, { @@ -84,7 +77,8 @@ "project = hopsworks.login()\n", "fs = project.get_feature_store()\n", "hsfs.connection()\n", - "kafka_config = engine.get_instance()._get_kafka_config(feature_store_id=fs.id)" + "kafka_config = engine.get_instance()._get_kafka_config(feature_store_id=fs.id)\n", + "KAFKA_TOPIC_NAME = \"credit_card_transactions\"" ] }, { @@ -101,7 +95,7 @@ " .options(**kafka_config) \\\n", " .option(\"startingOffsets\", \"earliest\") \\\n", " .option(\"maxOffsetsPerTrigger\", 100) \\\n", - " .option(\"subscribe\", init_kafka.KAFKA_TOPIC_NAME) \\\n", + " .option(\"subscribe\", KAFKA_TOPIC_NAME) \\\n", " .load()" ] }, From a8278d28fe2dc6281ca5e6d361e8675a1fc7bbc9 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Wed, 10 Jan 2024 13:36:02 +0100 Subject: [PATCH 05/16] adding notebook for creating simulated data stream --- .../Simulated Stream Creation.ipynb | 138 ++++++++++++++++++ 1 file changed, 138 insertions(+) create mode 100644 advanced_tutorials/pyspark_streaming/Simulated Stream Creation.ipynb diff --git a/advanced_tutorials/pyspark_streaming/Simulated Stream Creation.ipynb b/advanced_tutorials/pyspark_streaming/Simulated Stream Creation.ipynb new file mode 100644 index 00000000..e3ad15f9 --- /dev/null +++ b/advanced_tutorials/pyspark_streaming/Simulated Stream Creation.ipynb @@ -0,0 +1,138 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "ba683f8a-3ba8-40aa-bce1-75009fe337b1", + "metadata": {}, + "outputs": [], + "source": [ + "# TODO : File Duplicate of create_transaction_stream.py and will be removed recreated for testing" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "bc61381c-9f8e-4fce-aaa2-e25f52216fce", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Local environment\n" + ] + } + ], + "source": [ + "# Hosted notebook environments may not have the local features package\n", + "import os\n", + "from IPython import get_ipython\n", + "\n", + "\n", + "def need_download_modules():\n", + " if 'google.colab' in str(get_ipython()):\n", + " return True\n", + " if 'HOPSWORKS_PROJECT_ID' in os.environ:\n", + " return True\n", + " return 
False\n", + "\n", + "if need_download_modules():\n", + " print(\"⚙️ Downloading modules...\")\n", + " os.system('mkdir -p synthetic_data')\n", + " os.system('cd synthetic_data && wget https://raw.githubusercontent.com/manu-sj/hopsworks-tutorials/FSTORE-1107/advanced_tutorials/pyspark_streaming/synthetic_data/synthetic_data.py')\n", + " os.system('cd synthetic_data && wget https://raw.githubusercontent.com/manu-sj/hopsworks-tutorials/FSTORE-1107/advanced_tutorials/pyspark_streaming/synthetic_data/create_transaction_stream.py')\n", + " os.system('cd synthetic_data && wget https://raw.githubusercontent.com/manu-sj/hopsworks-tutorials/FSTORE-1107/advanced_tutorials/pyspark_streaming/synthetic_data/init_kafka.py')\n", + " os.system('cd synthetic_data && wget https://raw.githubusercontent.com/manu-sj/hopsworks-tutorials/FSTORE-1107/advanced_tutorials/pyspark_streaming/synthetic_data/__init__.py')\n", + " print('✅ Done!')\n", + "else:\n", + " print(\"Local environment\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "3aacce52-7c45-4910-8cad-f58a261fcd9d", + "metadata": {}, + "outputs": [], + "source": [ + "import synthetic_data.init_kafka\n", + "import synthetic_data.synthetic_data\n", + "import hsfs\n", + "from hsfs.core.storage_connector_api import StorageConnectorApi\n", + "from confluent_kafka import Producer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e3dd27d1-7e9c-4ea4-9937-6c6cc0e381e4", + "metadata": {}, + "outputs": [], + "source": [ + "# creating kafka topic and schema\n", + "init_kafka.init() \n", + "\n", + "# Creating simulated data\n", + "data_simulater = synthetic_data.synthetic_data()\n", + "credit_cards, trans_df = data_simulater.create_simulated_transactions()\n", + "#TODO : Remove and make this into a different dataframe\n", + "trans_df = trans_df.drop([\"fraud_label\"], axis = 1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "39843ef7-ab0b-475e-aa83-da271146f631", + "metadata": {}, + "outputs": [], + "source": [ + "# Getting Kafka Configurations\n", + "connection = hsfs.connection()\n", + "fs = connection.get_feature_store()\n", + "sc_api = StorageConnectorApi()\n", + "sc = sc_api.get_kafka_connector(fs.id, False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2a4a51b6-144e-4c3d-8d31-62cba5dd9dd0", + "metadata": {}, + "outputs": [], + "source": [ + "kafka_config = sc.confluent_options()\n", + "kafka_config[\"group.id\"] = \"test_groupid\"\n", + "\n", + "producer = Producer(kafka_config)\n", + "\n", + "for index, transaction in trans_df.iterrows():\n", + " producer.produce(init_kafka.KAFKA_TOPIC_NAME, transaction.to_json())\n", + " producer.flush() # Synchronising read and write blocking call\n", + " if index % 100 == 0 :\n", + " print(f'index {index}')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.18" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 10c2673fa2570b5e593711fee6096f8aa8296820 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Wed, 10 Jan 2024 13:46:10 +0100 Subject: [PATCH 06/16] correcting imports --- .../pyspark_streaming/Simulated Stream Creation.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/advanced_tutorials/pyspark_streaming/Simulated Stream Creation.ipynb b/advanced_tutorials/pyspark_streaming/Simulated Stream Creation.ipynb index e3ad15f9..6af088c1 100644 --- a/advanced_tutorials/pyspark_streaming/Simulated Stream Creation.ipynb +++ b/advanced_tutorials/pyspark_streaming/Simulated Stream Creation.ipynb @@ -56,8 +56,8 @@ "metadata": {}, "outputs": [], "source": [ - "import synthetic_data.init_kafka\n", - "import synthetic_data.synthetic_data\n", + "from synthetic_data import init_kafka\n", + "from synthetic_data import synthetic_data\n", "import hsfs\n", "from hsfs.core.storage_connector_api import StorageConnectorApi\n", "from confluent_kafka import Producer" From dce72c36f613011bcee6868742fb7be1aa23d921 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Thu, 7 Mar 2024 16:43:08 +0100 Subject: [PATCH 07/16] working base pyspark streaming --- .../pyspark_streaming/Feature Pipeline.ipynb | 289 +++++++++++++++--- .../Simulated Stream Creation.ipynb | 286 +++++++++++++---- .../features/transactions.py | 150 +++++++++ .../create_transaction_stream.py | 32 -- .../synthetic_data/init_kafka.py | 85 ------ 5 files changed, 629 insertions(+), 213 deletions(-) create mode 100644 advanced_tutorials/pyspark_streaming/features/transactions.py delete mode 100644 advanced_tutorials/pyspark_streaming/synthetic_data/create_transaction_stream.py delete mode 100644 advanced_tutorials/pyspark_streaming/synthetic_data/init_kafka.py diff --git a/advanced_tutorials/pyspark_streaming/Feature Pipeline.ipynb b/advanced_tutorials/pyspark_streaming/Feature Pipeline.ipynb index 305daaca..fbb94bf9 100644 --- a/advanced_tutorials/pyspark_streaming/Feature Pipeline.ipynb +++ b/advanced_tutorials/pyspark_streaming/Feature Pipeline.ipynb @@ -1,11 +1,48 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "23feb96b", + "metadata": {}, + "source": [ + "# **Hopsworks Feature Store** - Part 01: Feature Pipeline\n", + "\n", + "**Note**: This tutorial does not support Google Colab.\n", + "\n", + "This is the first part of the quick start series of tutorials about Hopsworks Feature Store. As part of this first module, you will work with data related to credit card transactions. \n", + "The objective of this tutorial is to demonstrate how to work with the **Hopworks Feature Store** for batch data with a goal of training and saving a model that can predict fraudulent transactions. Then try it on retrieved from Feature Store batch data.\n", + "\n", + "\n", + "## 🗒️ This notebook is divided in 3 sections:\n", + "1. Loading the data and feature engineeing,\n", + "2. Connect to the Hopsworks Feature Store,\n", + "3. Create feature groups and upload them to the Feature Store.\n", + "\n", + "![tutorial-flow](../images/01_featuregroups.png)\n", + "\n", + "First of all you will load the data and do some feature engineering on it." + ] + }, { "cell_type": "code", - "execution_count": null, - "id": "4891664b-4f4f-4201-adab-a47ee90820b8", + "execution_count": 60, + "id": "3b9be52b", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⚙️ Downloading modules...\n", + "0\n", + "0\n", + "0\n", + "0\n", + "0\n", + "✅ Done!" 
+ ] + } + ], "source": [ "# Hosted notebook environments may not have the local features package\n", "import os\n", @@ -31,12 +68,48 @@ " print(\"Local environment\")" ] }, + { + "cell_type": "markdown", + "id": "5a1b2a31", + "metadata": {}, + "source": [ + "## 📝 Imports" + ] + }, { "cell_type": "code", - "execution_count": null, - "id": "823ec8e3-6363-4d69-baf0-f1caf8ad75ef", + "execution_count": 1, + "id": "709b0826", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Starting Spark application\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
ID | Application ID | Kind | State | Spark UI | Driver log
11 | application_1709431794369_0071 | pyspark | idle | Link | Link
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SparkSession available as 'spark'.\n" + ] + } + ], "source": [ "from pyspark.sql import SparkSession\n", "\n", @@ -63,28 +136,75 @@ ")" ] }, + { + "cell_type": "markdown", + "id": "aa510030", + "metadata": {}, + "source": [ + "## 📡 Connecting to Hopsworks Feature Store " + ] + }, { "cell_type": "code", - "execution_count": null, - "id": "d8b68b2d-3236-463f-a872-ba76cf506b64", + "execution_count": 30, + "id": "6a6f8e70", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Connection closed.\n", + "Connected. Call `.close()` to terminate connection gracefully.\n", + "\n", + "Logged in to project, explore it here https://staging.cloud.hopsworks.ai/p/124\n", + "Connected. Call `.close()` to terminate connection gracefully." + ] + } + ], "source": [ "import hopsworks\n", - "import hsfs\n", - "from hsfs import engine\n", + "from hsfs.core.storage_connector_api import StorageConnectorApi\n", "\n", "project = hopsworks.login()\n", "fs = project.get_feature_store()\n", - "hsfs.connection()\n", - "kafka_config = engine.get_instance()._get_kafka_config(feature_store_id=fs.id)\n", - "KAFKA_TOPIC_NAME = \"credit_card_transactions\"" + "sc_api = StorageConnectorApi()" ] }, { "cell_type": "code", - "execution_count": null, - "id": "1c67cfb8-c336-42c0-ba94-e65679efa5b8", + "execution_count": 31, + "id": "50cbe0d3", + "metadata": {}, + "outputs": [], + "source": [ + "sc_api = StorageConnectorApi()\n", + "kafka_connector = sc_api.get_kafka_connector(feature_store_id=fs.id, external=False)\n", + "kafka_config = kafka_connector.spark_options()" + ] + }, + { + "cell_type": "markdown", + "id": "15bf280a", + "metadata": {}, + "source": [ + "## 🗂 Reading from Kakfa Stream " + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "7c337242", + "metadata": {}, + "outputs": [], + "source": [ + "KAFKA_TOPIC_NAME = \"transactions_topic\"" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "b9bc53b5", "metadata": {}, "outputs": [], "source": [ @@ -101,8 +221,8 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "7ddf926c-9ea7-4997-859f-745da0618d51", + "execution_count": 46, + "id": "ed65282b", "metadata": {}, "outputs": [], "source": [ @@ -115,13 +235,14 @@ " StructField(\"longitude\", DoubleType(), True),\n", " StructField(\"city\", StringType(), True),\n", " StructField(\"country\", StringType(), True),\n", + " StructField(\"fraud_label\", StringType(), True),\n", " ])" ] }, { "cell_type": "code", - "execution_count": null, - "id": "4543ea17-1ae4-4aa1-a91a-bc00ecea45d8", + "execution_count": 47, + "id": "b1827d0d", "metadata": {}, "outputs": [], "source": [ @@ -136,7 +257,8 @@ " \"value.latitude\",\n", " \"value.longitude\",\n", " \"value.city\",\n", - " \"value.country\") \\\n", + " \"value.country\",\n", + " \"value.fraud_label\") \\\n", " .selectExpr(\"CAST(tid as string)\",\n", " \"CAST(datetime as timestamp)\",\n", " \"CAST(cc_num as long)\",\n", @@ -145,14 +267,72 @@ " \"CAST(latitude as double)\",\n", " \"CAST(longitude as double)\",\n", " \"CAST(city as string)\",\n", - " \"CAST(country as string)\"\n", + " \"CAST(country as string)\",\n", + " \"CAST(fraud_label as string)\"\n", " )" ] }, + { + "cell_type": "markdown", + "id": "c83f2598", + "metadata": {}, + "source": [ + "## 🛠️ Feature Engineering " + ] + }, { "cell_type": "code", 
"execution_count": null, - "id": "80600685-7ba1-449f-a005-33fe4cda29cd", + "id": "177a1de5", + "metadata": {}, + "outputs": [], + "source": [ + "# read profile data to cogroup with streaming dataframe\n", + "profile_fg = fs.get_or_create_feature_group(\n", + " name=\"profile\",\n", + " version=1)\n", + "\n", + "profile_df = profile_fg.read()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3d821a64", + "metadata": {}, + "outputs": [], + "source": [ + "windowed_transaction_df = transaction_streaming_df \\\n", + " .selectExpr(\"tid\",\n", + " \"datetime\",\n", + " \"cc_num\",\n", + " \"category\",\n", + " \"CAST(amount as double)\",\n", + " \"radians(latitude) as latitude\",\n", + " \"radians(longitude) as longitude\",\n", + " \"city\",\n", + " \"country\") \\\n", + " .withWatermark(\"datetime\", \"24 hours\") \\\n", + " .groupBy(window(\"datetime\", \"168 hours\")) \\\n", + " .applyInPandas(transactions.loc_delta_t_minus_1, schema=schema1) \\\n", + " .withWatermark(\"datetime\", \"24 hours\") \\\n", + " .groupBy(window(\"datetime\", \"168 hours\")) \\\n", + " .applyInPandas(transactions.time_delta_t_minus_1, schema=schema2) \\\n", + " .withColumn(\"month\", udf(col(\"datetime\"))) \\\n", + " .withWatermark(\"datetime\", \"24 hours\") \\\n", + " .groupBy(window(\"datetime\", \"168 hours\")) \\\n", + " .cogroup(profile_df.groupby(\"cc_provider\")) \\\n", + " .applyInPandas(transactions.card_owner_age, schema=schema3) \\\n", + " .withWatermark(\"datetime\", \"24 hours\") \\\n", + " .groupBy(window(\"datetime\", \"168 hours\")) \\\n", + " .cogroup(profile_df.groupby(\"cc_provider\")) \\\n", + " .applyInPandas(transactions.expiry_days, schema=schema4)" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "cc2d81a2", "metadata": {}, "outputs": [], "source": [ @@ -161,7 +341,6 @@ " version=1,\n", " description=\"Transaction data\",\n", " primary_key=['cc_num'],\n", - " event_time='datetime',\n", " #partition_key=['month'],\n", " stream=True,\n", " online_enabled=True\n", @@ -170,42 +349,76 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "6b6f8a91-fe60-4e97-be7f-f17ee4b27c17", + "execution_count": 49, + "id": "7905c4a2", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Feature Group created successfully, explore it at \n", + "https://staging.cloud.hopsworks.ai/p/124/fs/72/fg/154" + ] + } + ], "source": [ "q = trans_fg.insert_stream(transaction_streaming_df)" ] }, { "cell_type": "code", - "execution_count": null, - "id": "bed738f7-72c5-497c-816c-0d3f0cafdc0e", + "execution_count": 58, + "id": "4f0202d1", "metadata": {}, "outputs": [], "source": [ - "q.status" + "q.stop()" ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "c6b0d90a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Launching job: transactions_1_offline_fg_materialization\n", + "Job started successfully, you can follow the progress at \n", + "https://staging.cloud.hopsworks.ai/p/124/jobs/named/transactions_1_offline_fg_materialization/executions" + ] + } + ], + "source": [ + "trans_fg.materialization_job.run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d8216031", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "PySpark", "language": "python", - "name": "python3" + "name": "pysparkkernel" }, "language_info": { 
"codemirror_mode": { - "name": "ipython", + "name": "python", "version": 3 }, - "file_extension": ".py", "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.18" + "name": "pyspark", + "pygments_lexer": "python3" } }, "nbformat": 4, diff --git a/advanced_tutorials/pyspark_streaming/Simulated Stream Creation.ipynb b/advanced_tutorials/pyspark_streaming/Simulated Stream Creation.ipynb index 6af088c1..87bee961 100644 --- a/advanced_tutorials/pyspark_streaming/Simulated Stream Creation.ipynb +++ b/advanced_tutorials/pyspark_streaming/Simulated Stream Creation.ipynb @@ -1,122 +1,292 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "d48f2deb", + "metadata": {}, + "source": [ + "# **Hopsworks Feature Store** \n", + "\n", + "This notebook creates a data stream using Hopsworks Internal Kafka\n", + "\n", + "## 🗒️ This notebook is divided into the following sections:\n", + "\n", + "1. Creating Simulated Data\n", + "2. Creating Kafka Topic and Schema in Hopsworks Feature Store\n", + "3. Sending Data to Kafka" + ] + }, + { + "cell_type": "markdown", + "id": "15e3878a", + "metadata": {}, + "source": [ + "## 📝 Imports" + ] + }, { "cell_type": "code", "execution_count": 1, - "id": "ba683f8a-3ba8-40aa-bce1-75009fe337b1", + "id": "c633d356", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!pip install faker --quiet" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "bfc5f6e0", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from synthetic_data import synthetic_data\n", + "from confluent_kafka import Producer" + ] + }, + { + "cell_type": "markdown", + "id": "154e65f1", + "metadata": {}, + "source": [ + "## ✏️ Creating Simulated Data " + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "9444c0b2", "metadata": {}, "outputs": [], "source": [ - "# TODO : File Duplicate of create_transaction_stream.py and will be removed recreated for testing" + "data_simulater = synthetic_data.synthetic_data()\n", + "\n", + "credit_cards, trans_df = data_simulater.create_simulated_transactions()" + ] + }, + { + "cell_type": "markdown", + "id": "2f3fb67b", + "metadata": {}, + "source": [ + "## 📡 Connecting to Hopsworks Feature Store " ] }, { "cell_type": "code", - "execution_count": 2, - "id": "bc61381c-9f8e-4fce-aaa2-e25f52216fce", + "execution_count": 12, + "id": "83a2d0d3", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Local environment\n" + "Connection closed.\n", + "Connected. 
Call `.close()` to terminate connection gracefully.\n", + "\n", + "Logged in to project, explore it here https://staging.cloud.hopsworks.ai/p/124\n" ] } ], "source": [ - "# Hosted notebook environments may not have the local features package\n", - "import os\n", - "from IPython import get_ipython\n", - "\n", + "import hopsworks\n", "\n", - "def need_download_modules():\n", - " if 'google.colab' in str(get_ipython()):\n", - " return True\n", - " if 'HOPSWORKS_PROJECT_ID' in os.environ:\n", - " return True\n", - " return False\n", + "project = hopsworks.login()\n", "\n", - "if need_download_modules():\n", - " print(\"⚙️ Downloading modules...\")\n", - " os.system('mkdir -p synthetic_data')\n", - " os.system('cd synthetic_data && wget https://raw.githubusercontent.com/manu-sj/hopsworks-tutorials/FSTORE-1107/advanced_tutorials/pyspark_streaming/synthetic_data/synthetic_data.py')\n", - " os.system('cd synthetic_data && wget https://raw.githubusercontent.com/manu-sj/hopsworks-tutorials/FSTORE-1107/advanced_tutorials/pyspark_streaming/synthetic_data/create_transaction_stream.py')\n", - " os.system('cd synthetic_data && wget https://raw.githubusercontent.com/manu-sj/hopsworks-tutorials/FSTORE-1107/advanced_tutorials/pyspark_streaming/synthetic_data/init_kafka.py')\n", - " os.system('cd synthetic_data && wget https://raw.githubusercontent.com/manu-sj/hopsworks-tutorials/FSTORE-1107/advanced_tutorials/pyspark_streaming/synthetic_data/__init__.py')\n", - " print('✅ Done!')\n", - "else:\n", - " print(\"Local environment\")" + "kafka_api = project.get_kafka_api()" + ] + }, + { + "cell_type": "markdown", + "id": "de5aa4a9", + "metadata": {}, + "source": [ + "## ⚙️ Kafka Topic and Schema Creation " ] }, { "cell_type": "code", - "execution_count": 3, - "id": "3aacce52-7c45-4910-8cad-f58a261fcd9d", + "execution_count": 13, + "id": "dbed6eb4", "metadata": {}, "outputs": [], "source": [ - "from synthetic_data import init_kafka\n", - "from synthetic_data import synthetic_data\n", - "import hsfs\n", - "from hsfs.core.storage_connector_api import StorageConnectorApi\n", - "from confluent_kafka import Producer" + "# create kafka topic\n", + "KAFKA_TOPIC_NAME = \"transactions_topic\"\n", + "SCHEMA_NAME = \"transactions_schema\"" ] }, { "cell_type": "code", - "execution_count": null, - "id": "e3dd27d1-7e9c-4ea4-9937-6c6cc0e381e4", + "execution_count": 14, + "id": "3b388b1a", "metadata": {}, "outputs": [], "source": [ - "# creating kafka topic and schema\n", - "init_kafka.init() \n", - "\n", - "# Creating simulated data\n", - "data_simulater = synthetic_data.synthetic_data()\n", - "credit_cards, trans_df = data_simulater.create_simulated_transactions()\n", - "#TODO : Remove and make this into a different dataframe\n", - "trans_df = trans_df.drop([\"fraud_label\"], axis = 1)" + "schema = {\n", + " \"type\": \"record\",\n", + " \"name\": SCHEMA_NAME,\n", + " \"namespace\": \"io.hops.examples.pyspark.example\",\n", + " \"fields\": [\n", + " {\n", + " \"name\": \"tid\",\n", + " \"type\": [\n", + " \"null\",\n", + " \"string\"\n", + " ]\n", + " },\n", + " {\n", + " \"name\": \"datetime\",\n", + " \"type\": [\n", + " \"null\",\n", + " {\n", + " \"type\": \"long\",\n", + " \"logicalType\": \"timestamp-micros\"\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"name\": \"cc_num\",\n", + " \"type\": [\n", + " \"null\",\n", + " \"long\"\n", + " ]\n", + " },\n", + " {\n", + " \"name\": \"category\",\n", + " \"type\": [\n", + " \"null\",\n", + " \"string\"\n", + " ]\n", + " },\n", + " {\n", + " \"name\": \"amount\",\n", + " 
\"type\": [\n", + " \"null\",\n", + " \"double\"\n", + " ]\n", + " },\n", + " {\n", + " \"name\": \"latitude\",\n", + " \"type\": [\n", + " \"null\",\n", + " \"double\"\n", + " ]\n", + " },\n", + " {\n", + " \"name\": \"longitude\",\n", + " \"type\": [\n", + " \"null\",\n", + " \"double\"\n", + " ]\n", + " },\n", + " {\n", + " \"name\": \"city\",\n", + " \"type\": [\n", + " \"null\",\n", + " \"string\"\n", + " ]\n", + " },\n", + " {\n", + " \"name\": \"country\",\n", + " \"type\": [\n", + " \"null\",\n", + " \"string\"\n", + " ]\n", + " },\n", + " {\n", + " \"name\": \"fraud_label\",\n", + " \"type\": [\n", + " \"null\",\n", + " \"string\"\n", + " ]\n", + " },\n", + " ]\n", + "}" ] }, { "cell_type": "code", - "execution_count": null, - "id": "39843ef7-ab0b-475e-aa83-da271146f631", + "execution_count": 15, + "id": "8a093dd7", "metadata": {}, "outputs": [], "source": [ - "# Getting Kafka Configurations\n", - "connection = hsfs.connection()\n", - "fs = connection.get_feature_store()\n", - "sc_api = StorageConnectorApi()\n", - "sc = sc_api.get_kafka_connector(fs.id, False)" + "if KAFKA_TOPIC_NAME not in [topic.name for topic in kafka_api.get_topics()]:\n", + " kafka_api.create_schema(SCHEMA_NAME, schema)\n", + " kafka_api.create_topic(KAFKA_TOPIC_NAME, SCHEMA_NAME, 1, replicas=1, partitions=1)" + ] + }, + { + "cell_type": "markdown", + "id": "135c4072", + "metadata": {}, + "source": [ + "## 📡 Sending Data using created Kafka Topic " + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "2d47085f", + "metadata": {}, + "outputs": [], + "source": [ + "trans_df[\"tid\"] = trans_df[\"tid\"].astype(\"string\")\n", + "trans_df[\"datetime\"] = pd.to_datetime(trans_df[\"datetime\"])\n", + "trans_df[\"cc_num\"] = trans_df[\"cc_num\"].astype(\"int64\")\n", + "trans_df[\"category\"] = trans_df[\"cc_num\"].astype(\"string\")\n", + "trans_df[\"amount\"] = trans_df[\"amount\"].astype(\"double\")\n", + "trans_df[\"latitude\"] = trans_df[\"latitude\"].astype(\"double\")\n", + "trans_df[\"longitude\"] = trans_df[\"longitude\"].astype(\"double\")\n", + "trans_df[\"city\"] = trans_df[\"city\"].astype(\"string\")\n", + "trans_df[\"country\"] = trans_df[\"country\"].astype(\"string\")\n", + "trans_df[\"fraud_label\"] = trans_df[\"fraud_label\"].astype(\"string\")" ] }, { "cell_type": "code", "execution_count": null, - "id": "2a4a51b6-144e-4c3d-8d31-62cba5dd9dd0", + "id": "61263e9b", "metadata": {}, "outputs": [], "source": [ - "kafka_config = sc.confluent_options()\n", - "kafka_config[\"group.id\"] = \"test_groupid\"\n", + "kafka_config = kafka_api.get_default_config()\n", "\n", "producer = Producer(kafka_config)\n", "\n", "for index, transaction in trans_df.iterrows():\n", - " producer.produce(init_kafka.KAFKA_TOPIC_NAME, transaction.to_json())\n", - " producer.flush() # Synchronising read and write blocking call\n", - " if index % 100 == 0 :\n", - " print(f'index {index}')" + " producer.produce(KAFKA_TOPIC_NAME, transaction.to_json())\n", + " producer.flush()" + ] + }, + { + "cell_type": "markdown", + "id": "6a5d0799", + "metadata": {}, + "source": [ + "---\n", + "## ⏭️ **Next:** Part 01: Feature Pipeline\n", + "\n", + "In the following notebook you will use the created Kafka stream to insert data into a Feature Group" ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -130,7 +300,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.18" + 
"version": "3.10.11" } }, "nbformat": 4, diff --git a/advanced_tutorials/pyspark_streaming/features/transactions.py b/advanced_tutorials/pyspark_streaming/features/transactions.py new file mode 100644 index 00000000..c5cecd42 --- /dev/null +++ b/advanced_tutorials/pyspark_streaming/features/transactions.py @@ -0,0 +1,150 @@ +import numpy as np +import pandas as pd + + +def haversine(long: int, lat: int, shift: int) -> float: + """Compute Haversine distance between each consecutive coordinate in (long, lat). + + Args: + - long: int ... + - lat: int ... + - shift: int ... + Returns: + - float: ... + """ + + long_shifted = long.shift(shift) + lat_shifted = lat.shift(shift) + long_diff = long_shifted - long + lat_diff = lat_shifted - lat + + a = np.sin(lat_diff / 2.0) ** 2 + b = np.cos(lat) * np.cos(lat_shifted) * np.sin(long_diff / 2.0) ** 2 + c = 2 * np.arcsin(np.sqrt(a + b)) + + return c + + +def get_year_month(datetime_col: pd.Series) -> pd.Series: + """Compute year and month string from datetime column. + + - datetime_col: pd.Series of datetime + Returns: + - pd.Series: year and month string + """ + + year_month = datetime_col.map(lambda x: str(x.year) + "-" + str(x.month)) + return year_month + + +def time_shift(datetime_col: pd.Series, shift: int) -> pd.Series: + """Compute time difference between each consecutive transaction. + + Args: + - datetime_col: pd.Series of datetime + - shift: int value of time step + Returns: + - pd.Series: + """ + time_shifted = datetime_col.shift(shift) + return time_shifted + + +def loc_delta_t_minus_1(df: pd.DataFrame) -> pd.DataFrame: + """Computes previous location of the transaction + + Args: + - df: DataFrame that contains the transaction data + Returns: + - DataFrame: containing the new feature + """ + df["loc_delta_t_minus_1"] = df.groupby("cc_num") \ + .apply(lambda x: haversine(x["longitude"], x["latitude"], -1)) \ + .reset_index(level=0, drop=True) \ + .fillna(0) + df = df.drop_duplicates(subset=['cc_num', 'datetime']).reset_index(drop=True) + return df + + +def time_delta_t_minus_1(df: pd.DataFrame) -> pd.DataFrame: + """Computes time difference in days between current and previous transaction + + Args: + - df: DataFrame that contains the transaction data + Returns: + - DataFrame: containing the new feature + """ + df["time_delta_t_minus_1"] = df.groupby("cc_num") \ + .apply(lambda x: time_shift(x["datetime"], -1)) \ + .reset_index(level=0, drop=True) + df["time_delta_t_minus_1"] = time_delta(df.time_delta_t_minus_1, df.datetime, 'D') + df["time_delta_t_minus_1"] = df.time_delta_t_minus_1.fillna(0) + df["country"] = df["country"].fillna("US") + df = df.drop_duplicates(subset=['cc_num', 'datetime']).reset_index(drop=True) + return df + + +# +def card_owner_age(trans_df: pd.DataFrame, profiles_df: pd.DataFrame) -> pd.DataFrame: + """Computes age of card owner at the time of transaction in years + Args: + - trans_df: pd.DataFrame + - credit_cards_df: pd.DataFrame + Returns: + - pd.DataFrame: + """ + age_df = trans_df.merge(profiles_df, on="cc_num", how="left") + trans_df["age_at_transaction"] = time_delta(age_df["datetime"], age_df["birthdate"], 'Y') + return trans_df + + +def expiry_days(trans_df: pd.DataFrame, profiles_df: pd.DataFrame) -> pd.DataFrame: + """Computes days until card expires at the time of transaction + Args: + - trans_df: pd.DataFrame + - credit_cards_df: pd.DataFrame + Returns: + - pd.DataFrame: + """ + card_expiry_df = trans_df.merge(profiles_df, on="cc_num", how="left") + trans_df["days_until_card_expires"] = \ + 
time_delta(card_expiry_df["cc_expiration_date"], card_expiry_df["datetime"], 'D') + return trans_df + + +def is_merchant_abroad(trans_df: pd.DataFrame, profiles_df: pd.DataFrame) -> pd.DataFrame: + """Computes if merchant location is abroad from card holders location + Args: + - trans_df: pd.DataFrame + - credit_cards_df: pd.DataFrame + Returns: + - pd.DataFrame: + """ + merged_df = trans_df.merge(profiles_df, on="cc_num", how="left") + trans_df["is_merchant_abroad"] = merged_df["country"] == merged_df["country_of_residence"] + return trans_df + + +def time_delta(date1: pd.Series, date2: pd.Series, unit: str) -> pd.Series: + """Computes time difference in days between 2 pandas datetime series + + Args: + - date1: pd.Series that contains datetime + - date2: pd.Series that contains datetime + - unit: time unit: 'D' or 'Y' days or years respectively + Returns: + - pd.Series: containing the time delta in units provided + """ + return (date1 - date2) / np.timedelta64(1, unit) + + +def select_features(df: pd.DataFrame) -> pd.DataFrame: + """ + Args: + - df: DataFrame + Returns: + - DataFrame: + """ + return df[ + ["tid", "datetime", "month", "cc_num", "amount", "country", "loc_delta_t_minus_1", "time_delta_t_minus_1", + "days_until_card_expires", "age_at_transaction"]] \ No newline at end of file diff --git a/advanced_tutorials/pyspark_streaming/synthetic_data/create_transaction_stream.py b/advanced_tutorials/pyspark_streaming/synthetic_data/create_transaction_stream.py deleted file mode 100644 index 3f640f98..00000000 --- a/advanced_tutorials/pyspark_streaming/synthetic_data/create_transaction_stream.py +++ /dev/null @@ -1,32 +0,0 @@ -import init_kafka -import synthetic_data -import hsfs -from hsfs.core.storage_connector_api import StorageConnectorApi -from confluent_kafka import Producer - -# creating kafka topic and schema -init_kafka.init() - -# Creating simulated data -data_simulater = synthetic_data.synthetic_data() -credit_cards, trans_df = data_simulater.create_simulated_transactions() -#TODO : Remove and make this into a different dataframe -trans_df = trans_df.drop(["fraud_label"], axis = 1) - -# Getting Kafka Configurations -connection = hsfs.connection() -fs = connection.get_feature_store() -sc_api = StorageConnectorApi() -sc = sc_api.get_kafka_connector(fs.id, False) - -kafka_config = sc.confluent_options() -kafka_config["group.id"] = "test_groupid" - -producer = Producer(kafka_config) - -for index, transaction in trans_df.iterrows(): - producer.produce(init_kafka.KAFKA_TOPIC_NAME, transaction.to_json()) - producer.flush() # Synchronising read and write blocking call - if index % 100 == 0 : - print(f'index {index}') - diff --git a/advanced_tutorials/pyspark_streaming/synthetic_data/init_kafka.py b/advanced_tutorials/pyspark_streaming/synthetic_data/init_kafka.py deleted file mode 100644 index 1eddd7e3..00000000 --- a/advanced_tutorials/pyspark_streaming/synthetic_data/init_kafka.py +++ /dev/null @@ -1,85 +0,0 @@ -import hopsworks - -# create kafka topic -KAFKA_TOPIC_NAME = "credit_card_transactions" -SCHEMA_NAME = "credit_card_transactions_schema" - -schema = { - "type": "record", - "name": SCHEMA_NAME, - "namespace": "io.hops.examples.flink.examples", - "fields": [ - { - "name": "tid", - "type": [ - "null", - "string" - ] - }, - { - "name": "datetime", - "type": [ - "null", - { - "type": "long", - "logicalType": "timestamp-micros" - } - ] - }, - { - "name": "cc_num", - "type": [ - "null", - "long" - ] - }, - { - "name": "category", - "type": [ - "null", - "string" - ] - }, - { - 
"name": "amount", - "type": [ - "null", - "double" - ] - }, - { - "name": "latitude", - "type": [ - "null", - "double" - ] - }, - { - "name": "longitude", - "type": [ - "null", - "double" - ] - }, - { - "name": "city", - "type": [ - "null", - "string" - ] - }, - { - "name": "country", - "type": [ - "null", - "string" - ] - }, - ] -} - -def init(): - project = hopsworks.login() - kafka_api = project.get_kafka_api() - kafka_api.create_schema(SCHEMA_NAME, schema) - kafka_api.create_topic(KAFKA_TOPIC_NAME, SCHEMA_NAME, 1, replicas=1, partitions=1) From d96dfc27d1e2b418c37887cceb9ec6245ad46cc5 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Tue, 12 Mar 2024 11:10:19 +0100 Subject: [PATCH 08/16] adding all pipeline pyspark streaming --- .../Batch Inference Pipeline.ipynb | 219 +++++++ .../pyspark_streaming/Feature Pipeline.ipynb | 584 ++++++++++++----- .../Simulated Stream Creation.ipynb | 189 +++++- .../pyspark_streaming/Training Pipeline.ipynb | 592 ++++++++++++++++++ 4 files changed, 1408 insertions(+), 176 deletions(-) create mode 100644 advanced_tutorials/pyspark_streaming/Batch Inference Pipeline.ipynb create mode 100644 advanced_tutorials/pyspark_streaming/Training Pipeline.ipynb diff --git a/advanced_tutorials/pyspark_streaming/Batch Inference Pipeline.ipynb b/advanced_tutorials/pyspark_streaming/Batch Inference Pipeline.ipynb new file mode 100644 index 00000000..ddffe655 --- /dev/null +++ b/advanced_tutorials/pyspark_streaming/Batch Inference Pipeline.ipynb @@ -0,0 +1,219 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "c9fec917", + "metadata": {}, + "source": [ + "# **Hopsworks Feature Store** - Part 03: Batch Inference\n", + "\n", + "\n", + "## 🗒️ This notebook is divided into the following sections:\n", + "\n", + "1. Load batch data.\n", + "2. Predict using model from Model Registry." + ] + }, + { + "cell_type": "markdown", + "id": "731cffd4", + "metadata": {}, + "source": [ + "## 📝 Imports" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b96c942c", + "metadata": {}, + "outputs": [], + "source": [ + "import joblib" + ] + }, + { + "cell_type": "markdown", + "id": "dfb09860", + "metadata": {}, + "source": [ + "## 📡 Connecting to Hopsworks Feature Store " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6c71483d", + "metadata": {}, + "outputs": [], + "source": [ + "import hopsworks\n", + "\n", + "project = hopsworks.login()\n", + "\n", + "fs = project.get_feature_store()" + ] + }, + { + "cell_type": "markdown", + "id": "c637bf4c", + "metadata": {}, + "source": [ + "## ⚙️ Feature View Retrieval\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b8bb207", + "metadata": {}, + "outputs": [], + "source": [ + "# Retrieve the 'transactions_view_fraud_batch_fv' feature view\n", + "feature_view = fs.get_feature_view(\n", + " name='transactions_view_fraud_batch_fv',\n", + " version=1,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "8ec3cafe", + "metadata": {}, + "source": [ + "## 🗄 Model Registry\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1b412d1e", + "metadata": {}, + "outputs": [], + "source": [ + "# Get the model registry\n", + "mr = project.get_model_registry()" + ] + }, + { + "cell_type": "markdown", + "id": "d0587a64", + "metadata": {}, + "source": [ + "## 🚀 Fetch and test the model\n", + "\n", + "Finally you can start making predictions with your model! Retrieve your model from Hopsworks model registry." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f0be44ff", + "metadata": {}, + "outputs": [], + "source": [ + "# Retrieve the model from the model registry\n", + "retrieved_model = mr.get_model(\n", + " name=\"xgboost_fraud_batch_model\",\n", + " version=1,\n", + ")\n", + "\n", + "# Download the saved model files to a local directory\n", + "saved_model_dir = retrieved_model.download()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "88b09ee2", + "metadata": {}, + "outputs": [], + "source": [ + "# Load the saved XGBoost model using joblib from the downloaded model directory\n", + "retrieved_xgboost_model = joblib.load(saved_model_dir + \"/xgboost_fraud_batch_model.pkl\")\n", + "\n", + "# Display the retrieved XGBoost model\n", + "retrieved_xgboost_model" + ] + }, + { + "cell_type": "markdown", + "id": "aba7238a", + "metadata": {}, + "source": [ + "---\n", + "## 🔮 Batch Prediction \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "34ac6398", + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize batch scoring\n", + "feature_view.init_batch_scoring(1)\n", + "\n", + "# Get the batch data\n", + "batch_data = feature_view.get_batch_data()\n", + "\n", + "# Drop the \"datetime\" column from the batch_data DataFrame along the specified axis (axis=1 means columns)\n", + "batch_data.drop([\"datetime\"], axis=1, inplace=True)\n", + "\n", + "# Display the first 3 rows\n", + "batch_data.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0695d933", + "metadata": {}, + "outputs": [], + "source": [ + "# Use the retrieved XGBoost model to make predictions on the batch data\n", + "predictions = retrieved_xgboost_model.predict(batch_data)\n", + "\n", + "# Display the first five predictions\n", + "predictions[:5]" + ] + }, + { + "cell_type": "markdown", + "id": "5ccd12a2", + "metadata": {}, + "source": [ + "---\n", + "\n", + "### 🥳 Next Steps \n", + "Congratulations you've now completed the Fraud Batch tutorial for Managed Hopsworks.\n", + "\n", + "Check out our other tutorials on ➡ https://github.com/logicalclocks/hopsworks-tutorials\n", + "\n", + "Or documentation at ➡ https://docs.hopsworks.ai" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.18" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/advanced_tutorials/pyspark_streaming/Feature Pipeline.ipynb b/advanced_tutorials/pyspark_streaming/Feature Pipeline.ipynb index fbb94bf9..77954411 100644 --- a/advanced_tutorials/pyspark_streaming/Feature Pipeline.ipynb +++ b/advanced_tutorials/pyspark_streaming/Feature Pipeline.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "23feb96b", + "id": "a49874d9", "metadata": {}, "source": [ "# **Hopsworks Feature Store** - Part 01: Feature Pipeline\n", @@ -10,7 +10,7 @@ "**Note**: This tutorial does not support Google Colab.\n", "\n", "This is the first part of the quick start series of tutorials about Hopsworks Feature Store. As part of this first module, you will work with data related to credit card transactions. 
\n", - "The objective of this tutorial is to demonstrate how to work with the **Hopworks Feature Store** for batch data with a goal of training and saving a model that can predict fraudulent transactions. Then try it on retrieved from Feature Store batch data.\n", + "The objective of this tutorial is to demonstrate how to work with the **Hopworks Feature Store** for streaming data with a goal of training and saving a model that can predict fraudulent transactions. Then try it on retrieved from Feature Store batch data.\n", "\n", "\n", "## 🗒️ This notebook is divided in 3 sections:\n", @@ -18,59 +18,14 @@ "2. Connect to the Hopsworks Feature Store,\n", "3. Create feature groups and upload them to the Feature Store.\n", "\n", - "![tutorial-flow](../images/01_featuregroups.png)\n", + "![tutorial-flow](../../images/01_featuregroups.png)\n", "\n", "First of all you will load the data and do some feature engineering on it." ] }, - { - "cell_type": "code", - "execution_count": 60, - "id": "3b9be52b", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "⚙️ Downloading modules...\n", - "0\n", - "0\n", - "0\n", - "0\n", - "0\n", - "✅ Done!" - ] - } - ], - "source": [ - "# Hosted notebook environments may not have the local features package\n", - "import os\n", - "from IPython import get_ipython\n", - "\n", - "\n", - "def need_download_modules():\n", - " if 'google.colab' in str(get_ipython()):\n", - " return True\n", - " if 'HOPSWORKS_PROJECT_ID' in os.environ:\n", - " return True\n", - " return False\n", - "\n", - "if need_download_modules():\n", - " print(\"⚙️ Downloading modules...\")\n", - " os.system('mkdir -p synthetic_data')\n", - " os.system('cd synthetic_data && wget https://raw.githubusercontent.com/manu-sj/hopsworks-tutorials/FSTORE-1107/advanced_tutorials/pyspark_streaming/synthetic_data/synthetic_data.py')\n", - " os.system('cd synthetic_data && wget https://raw.githubusercontent.com/manu-sj/hopsworks-tutorials/FSTORE-1107/advanced_tutorials/pyspark_streaming/synthetic_data/create_transaction_stream.py')\n", - " os.system('cd synthetic_data && wget https://raw.githubusercontent.com/manu-sj/hopsworks-tutorials/FSTORE-1107/advanced_tutorials/pyspark_streaming/synthetic_data/init_kafka.py')\n", - " os.system('cd synthetic_data && wget https://raw.githubusercontent.com/manu-sj/hopsworks-tutorials/FSTORE-1107/advanced_tutorials/pyspark_streaming/synthetic_data/__init__.py')\n", - " print('✅ Done!')\n", - "else:\n", - " print(\"Local environment\")" - ] - }, { "cell_type": "markdown", - "id": "5a1b2a31", + "id": "a3f58f72", "metadata": {}, "source": [ "## 📝 Imports" @@ -79,7 +34,7 @@ { "cell_type": "code", "execution_count": 1, - "id": "709b0826", + "id": "8ab11319", "metadata": {}, "outputs": [ { @@ -93,7 +48,7 @@ "data": { "text/html": [ "\n", - "
IDApplication IDKindStateSpark UIDriver log
11application_1709431794369_0071pysparkidleLinkLink
" + "IDApplication IDKindStateSpark UIDriver log8application_1710149630761_0019pysparkidleLinkLink" ], "text/plain": [ "" @@ -111,53 +66,46 @@ } ], "source": [ + "import datetime\n", "from pyspark.sql import SparkSession\n", "\n", - "from pyspark.sql.functions import (\n", - " from_json,\n", - " window,\n", - " avg,\n", - " count,\n", - " stddev,\n", - " explode,\n", - " date_format,\n", - " col,\n", - " mean,\n", - " pandas_udf,\n", - " PandasUDFType)\n", + "from pyspark.sql.types import * \n", + "from pyspark.sql.functions import * \n", "\n", - "from pyspark.sql.types import (\n", - " LongType,\n", - " DoubleType,\n", - " StringType,\n", - " TimestampType,\n", - " StructType,\n", - " StructField,\n", - ")" + "from pyspark.sql.functions import pandas_udf" ] }, { "cell_type": "markdown", - "id": "aa510030", + "id": "5b2fa848", "metadata": {}, "source": [ "## 📡 Connecting to Hopsworks Feature Store " ] }, + { + "cell_type": "markdown", + "id": "2cc70118-2fcf-4978-9821-3348fcef6873", + "metadata": {}, + "source": [ + "In this tutorial a simulated data stream was created using Hopsworks internal Kafka.\n", + "\n", + "Hopsworks allows to access internal Kafka using the storage connector api. See more information in the [documention](https://docs.hopsworks.ai/feature-store-api/3.7/generated/api/storage_connector_api/#kafka)." + ] + }, { "cell_type": "code", - "execution_count": 30, - "id": "6a6f8e70", + "execution_count": 2, + "id": "5513a1fb", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Connection closed.\n", "Connected. Call `.close()` to terminate connection gracefully.\n", "\n", - "Logged in to project, explore it here https://staging.cloud.hopsworks.ai/p/124\n", + "Logged in to project, explore it here https://aff99120-da3e-11ee-8cd7-4f3734b3ce24.cloud.hopsworks.ai/p/3192\n", "Connected. Call `.close()` to terminate connection gracefully." ] } @@ -167,14 +115,24 @@ "from hsfs.core.storage_connector_api import StorageConnectorApi\n", "\n", "project = hopsworks.login()\n", + "\n", "fs = project.get_feature_store()\n", + "\n", "sc_api = StorageConnectorApi()" ] }, + { + "cell_type": "markdown", + "id": "1faab2a4-6ec6-49c5-8a4d-d6494f7191a9", + "metadata": {}, + "source": [ + "Let get the kafka configurations needed for read from hopsworks internal Kafka using the storage connector api." + ] + }, { "cell_type": "code", - "execution_count": 31, - "id": "50cbe0d3", + "execution_count": 3, + "id": "5728b511", "metadata": {}, "outputs": [], "source": [ @@ -185,30 +143,29 @@ }, { "cell_type": "markdown", - "id": "15bf280a", + "id": "7cf43aa8", "metadata": {}, "source": [ "## 🗂 Reading from Kakfa Stream " ] }, { - "cell_type": "code", - "execution_count": 32, - "id": "7c337242", + "cell_type": "markdown", + "id": "9dae009c-69e6-4197-b5ac-121afd59e52b", "metadata": {}, - "outputs": [], "source": [ - "KAFKA_TOPIC_NAME = \"transactions_topic\"" + "After obatining the Kafka configurations we can use it along with the topic name to create a streaming dataframe." 
] }, { "cell_type": "code", - "execution_count": 45, - "id": "b9bc53b5", + "execution_count": 5, + "id": "37e018f4", "metadata": {}, "outputs": [], "source": [ - "# get data from the source\n", + "KAFKA_TOPIC_NAME = \"transactions_topic\"\n", + "\n", "df_read = spark \\\n", " .readStream \\\n", " .format(\"kafka\") \\\n", @@ -219,10 +176,18 @@ " .load()" ] }, + { + "cell_type": "markdown", + "id": "0cf31aaa-5d50-4696-a440-647a3d971910", + "metadata": {}, + "source": [ + "To extract the requierd data from streaming dataframe the correct schema has be defined and used" + ] + }, { "cell_type": "code", - "execution_count": 46, - "id": "ed65282b", + "execution_count": 6, + "id": "21bb5954", "metadata": {}, "outputs": [], "source": [ @@ -239,15 +204,23 @@ " ])" ] }, + { + "cell_type": "markdown", + "id": "ff3eb48e-f427-4b76-95d8-c310e268998d", + "metadata": {}, + "source": [ + "Extracting data from the streaming dataframe and casting it to get the required schema" + ] + }, { "cell_type": "code", - "execution_count": 47, - "id": "b1827d0d", + "execution_count": 7, + "id": "4fc025cb", "metadata": {}, "outputs": [], "source": [ "# Deserialize data from and create streaming query\n", - "transaction_streaming_df = df_read.selectExpr(\"CAST(value AS STRING)\") \\\n", + "streaming_df = df_read.selectExpr(\"CAST(value AS STRING)\") \\\n", " .select(from_json(\"value\", parse_schema).alias(\"value\")) \\\n", " .select(\"value.tid\",\n", " \"value.datetime\",\n", @@ -274,20 +247,29 @@ }, { "cell_type": "markdown", - "id": "c83f2598", + "id": "8b1022b9", "metadata": {}, "source": [ "## 🛠️ Feature Engineering " ] }, + { + "cell_type": "markdown", + "id": "379e0fea-35f3-4aa6-8807-224d827d38d8", + "metadata": {}, + "source": [ + "Now that we have a streaming dataframe that contains the data we can use it to engineer features. We would need the also need profiles to effectively engineer features.\n", + "\n", + "So next you can read data from profiles feature group" + ] + }, { "cell_type": "code", - "execution_count": null, - "id": "177a1de5", + "execution_count": 8, + "id": "e44035c3", "metadata": {}, "outputs": [], "source": [ - "# read profile data to cogroup with streaming dataframe\n", "profile_fg = fs.get_or_create_feature_group(\n", " name=\"profile\",\n", " version=1)\n", @@ -295,62 +277,199 @@ "profile_df = profile_fg.read()" ] }, + { + "cell_type": "markdown", + "id": "8d352f12-06e6-46d7-b29b-48d0f8dc6aa9", + "metadata": {}, + "source": [ + "Fraudulent transactions can differ from regular ones in many different ways. Typical red flags would for instance be a large transaction volume/frequency in the span of a few hours. It could also be the case that elderly people in particular are targeted by fraudsters. To facilitate model learning you will create additional features based on these patterns. In particular, you will create two types of features:\n", + "1. **Features that aggregate data from different data sources**. This could for instance be the age of a customer at the time of a transaction, which combines the `birthdate` feature from `profiles` with the `datetime` feature from `transactions`.\n", + "2. **Features that aggregate data from multiple time steps**. An example of this could be the transaction frequency of a credit card in the span of a few hours, which is computed using a window function." 
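Before computing these features it can help to sanity-check the parsed stream. The sketch below is not part of the original notebook: it writes a handful of parsed records to Spark's console sink, assuming the `streaming_df` defined above, and stops the query again so it does not interfere with the feature group ingestion queries started later.

```python
# Optional sanity check: print a few parsed records from the stream, then stop the query.
# Assumes `streaming_df` as defined above; the 30-second timeout is illustrative.
debug_query = (
    streaming_df.writeStream
    .format("console")
    .option("truncate", False)
    .option("numRows", 5)
    .start()
)

debug_query.awaitTermination(30)  # let a micro-batch or two print to the driver log
debug_query.stop()
```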
+ ] + }, { "cell_type": "code", - "execution_count": null, - "id": "3d821a64", + "execution_count": 9, + "id": "9c834a5c", "metadata": {}, "outputs": [], "source": [ - "windowed_transaction_df = transaction_streaming_df \\\n", - " .selectExpr(\"tid\",\n", - " \"datetime\",\n", - " \"cc_num\",\n", - " \"category\",\n", - " \"CAST(amount as double)\",\n", - " \"radians(latitude) as latitude\",\n", - " \"radians(longitude) as longitude\",\n", - " \"city\",\n", - " \"country\") \\\n", - " .withWatermark(\"datetime\", \"24 hours\") \\\n", - " .groupBy(window(\"datetime\", \"168 hours\")) \\\n", - " .applyInPandas(transactions.loc_delta_t_minus_1, schema=schema1) \\\n", - " .withWatermark(\"datetime\", \"24 hours\") \\\n", - " .groupBy(window(\"datetime\", \"168 hours\")) \\\n", - " .applyInPandas(transactions.time_delta_t_minus_1, schema=schema2) \\\n", - " .withColumn(\"month\", udf(col(\"datetime\"))) \\\n", - " .withWatermark(\"datetime\", \"24 hours\") \\\n", - " .groupBy(window(\"datetime\", \"168 hours\")) \\\n", - " .cogroup(profile_df.groupby(\"cc_provider\")) \\\n", - " .applyInPandas(transactions.card_owner_age, schema=schema3) \\\n", - " .withWatermark(\"datetime\", \"24 hours\") \\\n", - " .groupBy(window(\"datetime\", \"168 hours\")) \\\n", - " .cogroup(profile_df.groupby(\"cc_provider\")) \\\n", - " .applyInPandas(transactions.expiry_days, schema=schema4)" + "transaction_streaming_df = (\n", + " streaming_df.join(profile_df.drop(\"city\"), on=\"cc_num\", how=\"left\")\n", + " .withColumn(\"cc_expiration_date\", to_timestamp(\"cc_expiration_date\", \"mm/dd\"))\n", + " .withColumn(\"age_at_transaction\", datediff(col(\"datetime\"),col(\"birthdate\")))\n", + " .withColumn(\"days_until_card_expires\", datediff(col(\"datetime\"),col(\"cc_expiration_date\")))\n", + " .select([\"tid\", \"datetime\", \"cc_num\", \"category\", \"amount\", \"latitude\", \"longitude\", \"city\", \"country\", \"fraud_label\", \"age_at_transaction\", \"days_until_card_expires\", \"cc_expiration_date\"])\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "f08e38b0-9c86-4a66-acec-6a56cda2073c", + "metadata": {}, + "source": [ + "Next, you will create features that aggregate credit card data over a period of time.\n", + "\n", + "Here for simplicity we take the average, standard deviation and frequency of transaction amount over a period of 4 hours" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "3745b224", + "metadata": {}, + "outputs": [], + "source": [ + "windowed_4h_aggregation_df = (\n", + " streaming_df.withWatermark(\"datetime\", \"168 hours\")\n", + " .groupBy(window(\"datetime\", \"4 hours\", \"1 hour\"), \"cc_num\")\n", + " .agg(\n", + " avg(\"amount\").alias(\"avg_amt_per_4h\"),\n", + " stddev(\"amount\").alias(\"stdev_amt_per_4h\"),\n", + " count(\"cc_num\").alias(\"num_trans_per_4h\"),\n", + " )\n", + " .na.fill({\"stdev_amt_per_4h\":0})\n", + " .selectExpr(\n", + " \"cc_num\",\n", + " \"current_timestamp() as event_time\",\n", + " \"num_trans_per_4h\",\n", + " \"avg_amt_per_4h\",\n", + " \"stdev_amt_per_4h\",\n", + " )\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "a7a8cfae", + "metadata": {}, + "source": [ + "## 👮🏻‍♂️ Great Expectations " ] }, { "cell_type": "code", - "execution_count": 48, - "id": "cc2d81a2", + "execution_count": 11, + "id": "9fe44d86", + "metadata": {}, + "outputs": [], + "source": [ + "import great_expectations as ge\n", + "from great_expectations.core import ExpectationSuite, ExpectationConfiguration\n", + "\n", + "# Set the expectation 
suite name to \"transactions_suite\"\n", + "expectation_suite_transactions = ge.core.ExpectationSuite(\n", + " expectation_suite_name=\"transactions_suite\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "f234cb69", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\"kwargs\": {\"column\": \"tid\", \"mostly\": 0.0}, \"expectation_type\": \"expect_column_values_to_be_null\", \"meta\": {}}\n", + "{\"kwargs\": {\"column\": \"datetime\", \"mostly\": 0.0}, \"expectation_type\": \"expect_column_values_to_be_null\", \"meta\": {}}\n", + "{\"kwargs\": {\"column\": \"cc_num\", \"mostly\": 0.0}, \"expectation_type\": \"expect_column_values_to_be_null\", \"meta\": {}}" + ] + } + ], + "source": [ + "# Check binary fraud_label column to be in set [0,1]\n", + "expectation_suite_transactions.add_expectation(\n", + " ExpectationConfiguration(\n", + " expectation_type=\"expect_column_distinct_values_to_be_in_set\",\n", + " kwargs={\n", + " \"column\": \"fraud_label\",\n", + " \"value_set\": [0, 1],\n", + " }\n", + " )\n", + ")\n", + "\n", + "# Check amount column to be not negative\n", + "expectation_suite_transactions.add_expectation(\n", + " ExpectationConfiguration(\n", + " expectation_type=\"expect_column_values_to_be_between\",\n", + " kwargs={\n", + " \"column\": \"amount\",\n", + " \"min_value\": 0.0,\n", + " }\n", + " )\n", + ")\n", + "\n", + "# Loop through specified columns ('tid', 'datetime', 'cc_num') and add expectations for null values\n", + "for column in ['tid', 'datetime', 'cc_num']:\n", + " expectation_suite_transactions.add_expectation(\n", + " ExpectationConfiguration(\n", + " expectation_type=\"expect_column_values_to_be_null\",\n", + " kwargs={\n", + " \"column\": column,\n", + " \"mostly\": 0.0,\n", + " }\n", + " )\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "2e3adf53", + "metadata": {}, + "source": [ + "## 💾 Storing streaming dataframes in Hopsworks Feature Store " + ] + }, + { + "cell_type": "markdown", + "id": "fb8dee9a-de08-4ab7-a9ae-2cd667d0cff7", + "metadata": {}, + "source": [ + "### 🪄 Creating Feature Groups \n", + "\n", + "A [feature group](https://docs.hopsworks.ai/3.0/concepts/fs/feature_group/fg_overview/) can be seen as a collection of conceptually related features. In this case, you will create a feature group for the transaction data and a feature group for the windowed aggregations on the transaction data. Both will have `cc_num` as primary key, which will allow you to join them when creating a dataset in the next tutorial.\n", + "\n", + "Feature groups can also be used to define a namespace for features. For instance, in a real-life setting you would likely want to experiment with different window lengths. In that case, you can create feature groups with identical schema for each window length. \n", + "\n", + "Before you can create a feature group you need to connect to Hopsworks feature store." 
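As a concrete illustration of the namespacing idea above, the sketch below (not part of the original notebook) creates one feature group per window length. The window values and the naming scheme are hypothetical; `fs` is the feature store handle obtained earlier.

```python
# Illustrative only: identical-schema feature groups for several window lengths.
# The window values and the naming scheme are hypothetical.
window_lengths = ["4h", "12h", "24h"]

for window in window_lengths:
    fs.get_or_create_feature_group(
        name=f"transactions_aggs_{window}_fraud_streaming_fg",
        version=1,
        description=f"Aggregated transaction data over {window} windows.",
        primary_key=["cc_num"],
        event_time="event_time",
        online_enabled=True,
        stream=True,
    )
```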
+ ] }, { "cell_type": "code", "execution_count": 13, "id": "5ceb25c5", "metadata": {}, "outputs": [], "source": [ "trans_fg = fs.get_or_create_feature_group(\n", - " name=\"transactions\",\n", + " name=\"transactions_fraud_streaming_fg\",\n", " version=1,\n", " description=\"Transaction data\",\n", - " primary_key=['cc_num'],\n", - " #partition_key=['month'],\n", + " primary_key=[\"cc_num\"],\n", + " event_time=\"datetime\",\n", + " online_enabled=True,\n", " stream=True,\n", - " online_enabled=True\n", + " expectation_suite=expectation_suite_transactions,\n", ")" ] }, + { "cell_type": "markdown", "id": "ef421264-6956-4226-87d5-4e7b1c5dd0b6", "metadata": {}, "source": [ "A full list of arguments can be found in the [documentation](https://docs.hopsworks.ai/feature-store-api/latest/generated/api/feature_store_api/#create_feature_group).\n", "\n", "At this point, you have only specified some metadata for the feature group. It does not store any data or even have a schema defined for the data. To insert a streaming dataframe into a feature group you can use the `insert_stream` function. You can find more details in the [documentation](https://docs.hopsworks.ai/feature-store-api/3.7/generated/api/feature_group_api/#insert_stream)."
] }, { "cell_type": "code", - "execution_count": 49, - "id": "7905c4a2", + "execution_count": 14, + "id": "7b370459", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Feature Group created successfully, explore it at \n", - "https://staging.cloud.hopsworks.ai/p/124/fs/72/fg/154" + "https://aff99120-da3e-11ee-8cd7-4f3734b3ce24.cloud.hopsworks.ai/p/3192/fs/3140/fg/3118\n", + "StatisticsWarning: Stream ingestion for feature group `transactions_fraud_streaming_fg`, with version `1` will not compute statistics." ] } ], "source": [ - "q = trans_fg.insert_stream(transaction_streaming_df)" + "# Insert data into feature group\n", + "trans_fg_query = trans_fg.insert_stream(transaction_streaming_df)" ] }, + { "cell_type": "markdown", "id": "e2d564f6-c8f6-48ea-aca2-0ae22bd0bcd1", "metadata": {}, "source": [ "\n", "The `insert_stream` function returns a `StreamingQuery` object which can be used to check the status of the streaming query.\n" ] }, { "cell_type": "code", - "execution_count": 58, - "id": "4f0202d1", + "execution_count": 16, + "id": "52b86e8c", "metadata": {}, - "outputs": [], + "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'message': 'Processing new data', 'isDataAvailable': True, 'isTriggerActive': True}" ] } ], "source": [ - "q.stop()" + "trans_fg_query.status" ] }, + { "cell_type": "markdown", "id": "40f288ea-f0ad-48a0-901e-de06cbea5617", "metadata": {}, "source": [ "The `insert_stream` function inserts the data into the online feature store; to materialize the data in the offline store you need to run the materialization job manually. The materialization job can also be run on a schedule using the `schedule` function. You can find more details in the [documentation](https://docs.hopsworks.ai/hopsworks-api/3.7/generated/api/jobs/#schedule)."
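For reference, a minimal sketch of scheduling the materialization job is shown below; the cron expression is illustrative, and since the notebook only does `import datetime`, the UTC timezone is referenced through the module as `datetime.timezone.utc`.

```python
import datetime

# Illustrative schedule: materialize the offline store at the top of every hour (UTC).
trans_fg.materialization_job.schedule(
    cron_expression="0 0 * ? * * *",
    start_time=datetime.datetime.now(tz=datetime.timezone.utc),
)
```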
+ ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "3f56bbff", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "# Get or create the 'transactions' feature group with specified window aggregations\n", + "window_aggs_streaming_fg = fs.get_or_create_feature_group(\n", + " name=f\"transactions_aggs_fraud_streaming_fg\",\n", + " version=1,\n", + " description=f\"Aggregate transaction data over 5 minute windows.\",\n", + " primary_key=[\"cc_num\"],\n", + " event_time=\"event_time\",\n", + " online_enabled=True,\n", + " stream=True,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "c054288f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Feature Group created successfully, explore it at \n", + "https://aff99120-da3e-11ee-8cd7-4f3734b3ce24.cloud.hopsworks.ai/p/3192/fs/3140/fg/3119\n", + "StatisticsWarning: Stream ingestion for feature group `transactions_aggs_fraud_streaming_fg`, with version `1` will not compute statistics." + ] + } + ], + "source": [ + "window_aggs_streaming_fg_query = window_aggs_streaming_fg.insert_stream(windowed_4h_aggregation_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "b2c6e16f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'message': 'Getting offsets from KafkaV2[Subscribe[transactions_topic]]', 'isDataAvailable': False, 'isTriggerActive': True}" + ] + } + ], + "source": [ + "window_aggs_streaming_fg_query.status" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "aa973de6", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "An error was encountered:\n", + "The Hopsworks Job failed, use the Hopsworks UI to access the job logs\n", + "Traceback (most recent call last):\n", + " File \"/srv/hops/anaconda/envs/theenv/lib/python3.10/site-packages/hsfs/core/job.py\", line 124, in run\n", + " self._wait_for_job(await_termination=await_termination)\n", + " File \"/srv/hops/anaconda/envs/theenv/lib/python3.10/site-packages/hsfs/core/job.py\", line 242, in _wait_for_job\n", + " raise FeatureStoreException(\n", + "hsfs.client.exceptions.FeatureStoreException: The Hopsworks Job failed, use the Hopsworks UI to access the job logs\n", + "\n" + ] + } + ], + "source": [ + "window_aggs_streaming_fg.materialization_job.schedule(cron_expression = \"0 10 * ? * * *\", start_time=datetime.datetime.now(tz=timezone.utc))" + ] + }, + { + "cell_type": "markdown", + "id": "ce70361d", + "metadata": {}, + "source": [ + "## ⏭️ **Next:** Part 02: Training Pipeline\n", + " \n", + "\n", + "In the following notebook you will use your feature groups to create a dataset you can train a model on." 
+ ] } ], "metadata": { diff --git a/advanced_tutorials/pyspark_streaming/Simulated Stream Creation.ipynb b/advanced_tutorials/pyspark_streaming/Simulated Stream Creation.ipynb index 87bee961..109b386a 100644 --- a/advanced_tutorials/pyspark_streaming/Simulated Stream Creation.ipynb +++ b/advanced_tutorials/pyspark_streaming/Simulated Stream Creation.ipynb @@ -2,10 +2,10 @@ "cells": [ { "cell_type": "markdown", - "id": "d48f2deb", + "id": "aa514fb7", "metadata": {}, "source": [ - "# **Hopsworks Feature Store** \n", + "# **Hopsworks Feature Store** \n", "\n", "This notebook creates a data stream using Hopsworks Internal Kafka\n", "\n", @@ -18,7 +18,7 @@ }, { "cell_type": "markdown", - "id": "15e3878a", + "id": "8d2f5c11", "metadata": {}, "source": [ "## 📝 Imports" @@ -27,7 +27,7 @@ { "cell_type": "code", "execution_count": 1, - "id": "c633d356", + "id": "df72129e", "metadata": {}, "outputs": [ { @@ -45,8 +45,8 @@ }, { "cell_type": "code", - "execution_count": 10, - "id": "bfc5f6e0", + "execution_count": 12, + "id": "cbeaa9ed", "metadata": {}, "outputs": [], "source": [ @@ -57,46 +57,71 @@ }, { "cell_type": "markdown", - "id": "154e65f1", + "id": "ca2b2938", "metadata": {}, "source": [ "## ✏️ Creating Simulated Data " ] }, + { + "cell_type": "markdown", + "id": "1ef161e6-3e87-4bd0-a190-c37fd1a3e444", + "metadata": {}, + "source": [ + "A simulated dataset for credit card Transactions is created so that the data can be send using a Kafka stream. The data created is split into two different dataframes:\n", + "\n", + "* profiles_df: credit card user information such as birthdate and city of residence, along with credict card information such as the expiration date and provider.\n", + "* trans_df: events containing information about when a credit card was used, such as a timestamp, location, and the amount spent. A boolean fraud_label variable (True/False) tells us whether a transaction was fraudulent or not.\n", + "\n", + "In a production system, these data would originate from separate data sources or tables, and probably separate data pipelines. Both files have a common credit card number column cc_num, which you will use later to join features together from the different datasets.\n", + "\n", + "Now you can go ahead and create the data." + ] + }, { "cell_type": "code", - "execution_count": 11, - "id": "9444c0b2", + "execution_count": 13, + "id": "40038992", "metadata": {}, "outputs": [], "source": [ "data_simulater = synthetic_data.synthetic_data()\n", "\n", - "credit_cards, trans_df = data_simulater.create_simulated_transactions()" + "profiles_df, trans_df = data_simulater.create_simulated_transactions()" ] }, { "cell_type": "markdown", - "id": "2f3fb67b", + "id": "f8c852a2", "metadata": {}, "source": [ "## 📡 Connecting to Hopsworks Feature Store " ] }, + { + "cell_type": "markdown", + "id": "e5dd189e-2424-4462-9f5f-a634950be10a", + "metadata": {}, + "source": [ + "After creating the simulated data let us connect with Hopsworks Feature Store.\n", + "\n", + "Hopsworks provides an internal Kafka which can be accessed using the KafkaAPI. See [documentation](https://docs.hopsworks.ai/3.7/user_guides/projects/kafka/create_schema/#introduction) for more details." + ] + }, { "cell_type": "code", - "execution_count": 12, - "id": "83a2d0d3", + "execution_count": 4, + "id": "d7aff338", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Connection closed.\n", "Connected. 
Call `.close()` to terminate connection gracefully.\n", "\n", - "Logged in to project, explore it here https://staging.cloud.hopsworks.ai/p/124\n" + "Logged in to project, explore it here https://aff99120-da3e-11ee-8cd7-4f3734b3ce24.cloud.hopsworks.ai/p/3192\n", + "Connected. Call `.close()` to terminate connection gracefully.\n" ] } ], @@ -105,21 +130,79 @@ "\n", "project = hopsworks.login()\n", "\n", - "kafka_api = project.get_kafka_api()" + "kafka_api = project.get_kafka_api()\n", + "\n", + "fs = project.get_feature_store()" + ] + }, + { + "cell_type": "markdown", + "id": "c7e69f9f", + "metadata": {}, + "source": [ + "## 🪄 Creating Feature Groups " + ] + }, + { + "cell_type": "markdown", + "id": "ebdd5944-dea4-4a31-bbea-87aafbef0bc2", + "metadata": {}, + "source": [ + "Profiles data can be directly inserted as a feature group directly since they are not update fequently.\n", + "\n", + "To create a feature group you need to give it a name and specify a primary key. It is also good to provide a description of the contents of the feature group and a version number, if it is not defined it will automatically be incremented to `1`.\n", + "\n", + "A full list of arguments can be found in the [documentation](https://docs.hopsworks.ai/feature-store-api/latest/generated/api/feature_store_api/#create_feature_group)." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "2838cb7a", + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'fs' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[1], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m profile_fg \u001b[38;5;241m=\u001b[39m \u001b[43mfs\u001b[49m\u001b[38;5;241m.\u001b[39mget_or_create_feature_group(\n\u001b[1;32m 2\u001b[0m name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mprofile\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 3\u001b[0m primary_key\u001b[38;5;241m=\u001b[39m[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcc_num\u001b[39m\u001b[38;5;124m\"\u001b[39m],\n\u001b[1;32m 4\u001b[0m partition_key\u001b[38;5;241m=\u001b[39m[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcc_provider\u001b[39m\u001b[38;5;124m\"\u001b[39m],\n\u001b[1;32m 5\u001b[0m version\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m)\n\u001b[1;32m 7\u001b[0m profile_df \u001b[38;5;241m=\u001b[39m profile_fg\u001b[38;5;241m.\u001b[39minsert(profiles_df)\n", + "\u001b[0;31mNameError\u001b[0m: name 'fs' is not defined" + ] + } + ], + "source": [ + "profile_fg = fs.get_or_create_feature_group(\n", + " name=\"profile\",\n", + " primary_key=[\"cc_num\"],\n", + " partition_key=[\"cc_provider\"],\n", + " version=1)\n", + "\n", + "profile_df = profile_fg.insert(profiles_df)" ] }, { "cell_type": "markdown", - "id": "de5aa4a9", + "id": "13e85615", "metadata": {}, "source": [ "## ⚙️ Kafka Topic and Schema Creation " ] }, + { + "cell_type": "markdown", + "id": "7d973e90-66fd-47c1-9b11-f9aaebdf0caf", + "metadata": {}, + "source": [ + "To create a Kafka stream for transactions a topic and schema must be create. 
The schema used must follow Apache Avro specification, more details can be found in the [documentation](https://avro.apache.org/docs/1.11.1/specification/).\n" + ] + }, { "cell_type": "code", - "execution_count": 13, - "id": "dbed6eb4", + "execution_count": 6, + "id": "5ba10159", "metadata": {}, "outputs": [], "source": [ @@ -130,8 +213,8 @@ }, { "cell_type": "code", - "execution_count": 14, - "id": "3b388b1a", + "execution_count": 7, + "id": "e39dfb68", "metadata": {}, "outputs": [], "source": [ @@ -217,10 +300,18 @@ "}" ] }, + { + "cell_type": "markdown", + "id": "fab6d27f-72f3-4142-ae4a-8a28803b5490", + "metadata": {}, + "source": [ + "After the schema is created the topic and the associated schema must be registered in Hopsworks so that the topic can be used." + ] + }, { "cell_type": "code", - "execution_count": 15, - "id": "8a093dd7", + "execution_count": 8, + "id": "99ca36ed", "metadata": {}, "outputs": [], "source": [ @@ -231,16 +322,26 @@ }, { "cell_type": "markdown", - "id": "135c4072", + "id": "51408f5f", "metadata": {}, "source": [ "## 📡 Sending Data using created Kafka Topic " ] }, + { + "cell_type": "markdown", + "id": "53de5bb3-1fcd-4136-8c02-d5003aa18b13", + "metadata": {}, + "source": [ + "While sending data through Kafka we must make sure that the data types are in the same format specified in the schema. \n", + "\n", + "Let's make sure that the dataframe has all the components in the correct format." + ] + }, { "cell_type": "code", - "execution_count": 16, - "id": "2d47085f", + "execution_count": 9, + "id": "ff97db95", "metadata": {}, "outputs": [], "source": [ @@ -256,15 +357,41 @@ "trans_df[\"fraud_label\"] = trans_df[\"fraud_label\"].astype(\"string\")" ] }, + { + "cell_type": "markdown", + "id": "ab5c49b5-cfd6-451b-bdf7-a8130c261b5e", + "metadata": {}, + "source": [ + "Lets get the configuration needed for the producer to used Hopsworks internal kafka using the KafkaAPI" + ] + }, { "cell_type": "code", "execution_count": null, - "id": "61263e9b", + "id": "88f58727-2ca3-4c23-9c5f-a86fa934d63b", "metadata": {}, "outputs": [], "source": [ - "kafka_config = kafka_api.get_default_config()\n", + "kafka_config = kafka_api.get_default_config()" + ] + }, + { + "cell_type": "markdown", + "id": "ef6ea6ba-677d-4155-aea9-6d1be073086a", + "metadata": {}, + "source": [ + "Finally, lets create a producer using the Kafka configuration and send data into it.\n", "\n", + "It is important to note that the data passed to the producer must be a json or it must be avro encoded." 
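The cell that follows sends the records one by one. As a variant (a sketch, not the original code), the loop below adds a delivery callback and a non-blocking `poll`, and flushes once at the end rather than after every message; it assumes `kafka_config`, `trans_df` and `KAFKA_TOPIC_NAME` as defined in this notebook.

```python
from confluent_kafka import Producer

def delivery_report(err, msg):
    # Called once per produced message; report failures instead of dropping them silently.
    if err is not None:
        print(f"Delivery failed: {err}")

producer = Producer(kafka_config)

for _, transaction in trans_df.iterrows():
    producer.produce(KAFKA_TOPIC_NAME, transaction.to_json(), on_delivery=delivery_report)
    producer.poll(0)  # serve delivery callbacks without blocking

producer.flush()  # block until all outstanding messages are delivered
```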
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "f70cf209", + "metadata": {}, + "outputs": [], + "source": [ "producer = Producer(kafka_config)\n", "\n", "for index, transaction in trans_df.iterrows():\n", @@ -274,7 +401,7 @@ }, { "cell_type": "markdown", - "id": "6a5d0799", + "id": "aeb76a7b", "metadata": {}, "source": [ "---\n", @@ -286,7 +413,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -300,7 +427,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.11" + "version": "3.8.18" } }, "nbformat": 4, diff --git a/advanced_tutorials/pyspark_streaming/Training Pipeline.ipynb b/advanced_tutorials/pyspark_streaming/Training Pipeline.ipynb new file mode 100644 index 00000000..a037b423 --- /dev/null +++ b/advanced_tutorials/pyspark_streaming/Training Pipeline.ipynb @@ -0,0 +1,592 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# **Hopsworks Feature Store** - Part 02: Training Pipeline\n", + "\n", + "This notebook explains how to read from a feature group, create training dataset within the feature store, train a model and save it to model registry.\n", + "\n", + "## 🗒️ This notebook is divided into the following sections:\n", + "\n", + "1. Fetch Feature Groups.\n", + "2. Define Transformation functions.\n", + "3. Create Feature Views.\n", + "4. Create Training Dataset with training, validation and test splits.\n", + "5. Train the model.\n", + "6. Register model in Hopsworks Model Registry.\n", + "\n", + "![part2](../../images/02_training-dataset.png) " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 📝 Imports" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install -U xgboost --quiet" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import joblib\n", + "import os\n", + "\n", + "import pandas as pd\n", + "import numpy as np\n", + "from matplotlib import pyplot\n", + "import seaborn as sns\n", + "\n", + "import xgboost as xgb\n", + "from sklearn.metrics import confusion_matrix\n", + "from sklearn.metrics import f1_score\n", + "\n", + "# Mute warnings\n", + "import warnings\n", + "warnings.filterwarnings(\"ignore\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 📡 Connecting to Hopsworks Feature Store " + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Connected. Call `.close()` to terminate connection gracefully.\n", + "\n", + "Logged in to project, explore it here https://aff99120-da3e-11ee-8cd7-4f3734b3ce24.cloud.hopsworks.ai/p/3192\n", + "Connected. Call `.close()` to terminate connection gracefully.\n" + ] + } + ], + "source": [ + "import hopsworks\n", + "\n", + "project = hopsworks.login()\n", + "\n", + "fs = project.get_feature_store()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 🔪 Feature Selection \n", + "\n", + "You will start by selecting all the features you want to include for model training/inference." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "# Retrieve feature groups\n", + "trans_fg = fs.get_feature_group(\n", + " name='transactions_fraud_streaming_fg', \n", + " version=1,\n", + ")\n", + "window_aggs_fg = fs.get_feature_group(\n", + " name='transactions_aggs_fraud_streaming_fg', \n", + " version=1,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "# Select features for training data.\n", + "selected_features = trans_fg.select([\"fraud_label\", \"category\", \"amount\", \"datetime\", \"age_at_transaction\", \"days_until_card_expires\"])\\\n", + " .join(window_aggs_fg.select_except([\"cc_num\", \"event_time\"]))" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DeprecationWarning: ssl.PROTOCOL_TLS is deprecated\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Finished: Reading data from Hopsworks, using Hive (21.07s) \n" + ] + } + ], + "source": [ + "# Uncomment this if you would like to view your selected features\n", + "df = selected_features.read(read_options={\"use_hive\":True})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Recall that you computed the features in `transactions_4h_aggs_fraud_batch_fg` using 4-hour aggregates. If you had created multiple feature groups with identical schema for different window lengths, and wanted to include them in the join you would need to include a prefix argument in the join to avoid feature name clash. See the [documentation](https://docs.hopsworks.ai/feature-store-api/latest/generated/api/query_api/#join) for more details." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 🤖 Transformation Functions \n", + "\n", + "\n", + "You will preprocess our data using *min-max scaling* on numerical features and *label encoding* on categorical features. To do this you simply define a mapping between our features and transformation functions. This ensures that transformation functions such as *min-max scaling* are fitted only on the training data (and not the validation/test data), which ensures that there is no data leakage." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "# Load transformation functions.\n", + "label_encoder = fs.get_transformation_function(name=\"label_encoder\")\n", + "\n", + "# Map features to transformations.\n", + "transformation_functions = {\n", + " \"category\": label_encoder,\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ⚙️ Feature View Creation \n", + "\n", + "The Feature Views allows schema in form of a query with filters, define a model target feature/label and additional transformation functions.\n", + "In order to create a Feature View you may use `fs.create_feature_view()`. Here we try first to get the feature view, and if we can't an exception is thrown and we create the feature view." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Feature view created successfully, explore it at \n", + "https://aff99120-da3e-11ee-8cd7-4f3734b3ce24.cloud.hopsworks.ai/p/3192/fs/3140/fv/transactions_view_fraud_batch_fv/version/1\n" + ] + } + ], + "source": [ + "# Get or create the 'transactions_view_fraud_batch_fv' feature view\n", + "feature_view = fs.get_or_create_feature_view(\n", + " name='transactions_view_fraud_batch_fv',\n", + " version=1,\n", + " query=selected_features,\n", + " labels=[\"fraud_label\"],\n", + " transformation_functions=transformation_functions,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The feature view is now visible in the UI.\n", + "\n", + "![fg-overview](../images/fv_overview.gif)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 🏋️ Training Dataset Creation\n", + "\n", + "In Hopsworks training data is a query where the projection (set of features) is determined by the parent FeatureView with an optional snapshot on disk of the data returned by the query.\n", + "\n", + "**Training Dataset may contain splits such as:** \n", + "* Training set - the subset of training data used to train a model.\n", + "* Validation set - the subset of training data used to evaluate hparams when training a model\n", + "* Test set - the holdout subset of training data used to evaluate a mode\n", + "\n", + "Training dataset is created using `feature_view.train_validation_test_split()` method." + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DeprecationWarning: ssl.PROTOCOL_TLS is deprecated\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Finished: Reading data from Hopsworks, using Hive (25.04s) \n" + ] + }, + { + "ename": "KeyError", + "evalue": "DataType(null)", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mTEST_SIZE\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0.2\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m X_train, X_test, y_train, y_test = feature_view.train_test_split(\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0mtest_size\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mTEST_SIZE\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mread_options\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m\"use_hive\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m )\n", + "\u001b[0;32m/srv/hops/anaconda/envs/theenv/lib/python3.10/site-packages/hsfs/usage.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 196\u001b[0m \u001b[0;31m# Disable usage AFTER import hsfs, return function itself\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 197\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0m_is_enabled\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 198\u001b[0;31m 
\u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 199\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 200\u001b[0m \u001b[0mstart_time\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mperf_counter\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/srv/hops/anaconda/envs/theenv/lib/python3.10/site-packages/hsfs/feature_view.py\u001b[0m in \u001b[0;36mtrain_test_split\u001b[0;34m(self, test_size, train_start, train_end, test_start, test_end, description, extra_filter, statistics_config, read_options, spine, primary_keys, event_time, training_helper_columns)\u001b[0m\n\u001b[1;32m 2203\u001b[0m \u001b[0mextra_filter\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mextra_filter\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2204\u001b[0m )\n\u001b[0;32m-> 2205\u001b[0;31m td, df = self._feature_view_engine.get_training_data(\n\u001b[0m\u001b[1;32m 2206\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2207\u001b[0m \u001b[0mread_options\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/srv/hops/anaconda/envs/theenv/lib/python3.10/site-packages/hsfs/core/feature_view_engine.py\u001b[0m in \u001b[0;36mget_training_data\u001b[0;34m(self, feature_view_obj, read_options, splits, training_dataset_obj, training_dataset_version, spine, primary_keys, event_time, training_helper_columns)\u001b[0m\n\u001b[1;32m 352\u001b[0m \u001b[0mspine\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mspine\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 353\u001b[0m )\n\u001b[0;32m--> 354\u001b[0;31m split_df = engine.get_instance().get_training_data(\n\u001b[0m\u001b[1;32m 355\u001b[0m \u001b[0mtd_updated\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfeature_view_obj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mquery\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mread_options\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 356\u001b[0m )\n", + "\u001b[0;32m/srv/hops/anaconda/envs/theenv/lib/python3.10/site-packages/hsfs/engine/python.py\u001b[0m in \u001b[0;36mget_training_data\u001b[0;34m(self, training_dataset_obj, feature_view_obj, query_obj, read_options)\u001b[0m\n\u001b[1;32m 628\u001b[0m ):\n\u001b[1;32m 629\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mtraining_dataset_obj\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplits\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 630\u001b[0;31m return self._prepare_transform_split_df(\n\u001b[0m\u001b[1;32m 631\u001b[0m \u001b[0mquery_obj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtraining_dataset_obj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfeature_view_obj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mread_options\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 632\u001b[0m )\n", + "\u001b[0;32m/srv/hops/anaconda/envs/theenv/lib/python3.10/site-packages/hsfs/engine/python.py\u001b[0m in \u001b[0;36m_prepare_transform_split_df\u001b[0;34m(self, query_obj, training_dataset_obj, feature_view_obj, read_option)\u001b[0m\n\u001b[1;32m 686\u001b[0m \u001b[0;31m# apply 
transformations\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 687\u001b[0m \u001b[0;31m# 1st parametrise transformation functions with dt split stats\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 688\u001b[0;31m transformation_function_engine.TransformationFunctionEngine.populate_builtin_transformation_functions(\n\u001b[0m\u001b[1;32m 689\u001b[0m \u001b[0mtraining_dataset_obj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfeature_view_obj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mresult_dfs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 690\u001b[0m )\n", + "\u001b[0;32m/srv/hops/anaconda/envs/theenv/lib/python3.10/site-packages/hsfs/core/transformation_function_engine.py\u001b[0m in \u001b[0;36mpopulate_builtin_transformation_functions\u001b[0;34m(training_dataset, feature_view_obj, dataset)\u001b[0m\n\u001b[1;32m 274\u001b[0m \u001b[0;31m# compute statistics before transformations are applied\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 275\u001b[0m stats = (\n\u001b[0;32m--> 276\u001b[0;31m TransformationFunctionEngine.compute_transformation_fn_statistics(\n\u001b[0m\u001b[1;32m 277\u001b[0m \u001b[0mtraining_dataset\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 278\u001b[0m \u001b[0mbuiltin_tffn_features\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/srv/hops/anaconda/envs/theenv/lib/python3.10/site-packages/hsfs/core/transformation_function_engine.py\u001b[0m in \u001b[0;36mcompute_transformation_fn_statistics\u001b[0;34m(training_dataset_obj, builtin_tffn_features, label_encoder_features, feature_dataframe, feature_view_obj)\u001b[0m\n\u001b[1;32m 238\u001b[0m \u001b[0mfeature_view_obj\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 239\u001b[0m ) -> statistics.Statistics:\n\u001b[0;32m--> 240\u001b[0;31m return training_dataset_obj._statistics_engine.compute_transformation_fn_statistics(\n\u001b[0m\u001b[1;32m 241\u001b[0m \u001b[0mtd_metadata_instance\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtraining_dataset_obj\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 242\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbuiltin_tffn_features\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m# excluding label encoded features\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/srv/hops/anaconda/envs/theenv/lib/python3.10/site-packages/hsfs/core/statistics_engine.py\u001b[0m in \u001b[0;36mcompute_transformation_fn_statistics\u001b[0;34m(self, td_metadata_instance, columns, label_encoder_features, feature_dataframe, feature_view_obj)\u001b[0m\n\u001b[1;32m 237\u001b[0m \"\"\"\n\u001b[1;32m 238\u001b[0m \u001b[0mcomputation_time\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfloat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdatetime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnow\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtimestamp\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m*\u001b[0m \u001b[0;36m1000\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 239\u001b[0;31m stats_str = self._profile_transformation_fn_statistics(\n\u001b[0m\u001b[1;32m 240\u001b[0m 
\u001b[0mfeature_dataframe\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlabel_encoder_features\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 241\u001b[0m )\n", + "\u001b[0;32m/srv/hops/anaconda/envs/theenv/lib/python3.10/site-packages/hsfs/core/statistics_engine.py\u001b[0m in \u001b[0;36m_profile_transformation_fn_statistics\u001b[0;34m(self, feature_dataframe, columns, label_encoder_features)\u001b[0m\n\u001b[1;32m 383\u001b[0m \u001b[0;31m# compute statistics for all features with transformation fn\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 384\u001b[0m \u001b[0mall_columns\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mcolumns\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mlabel_encoder_features\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 385\u001b[0;31m stats_str = engine.get_instance().profile(\n\u001b[0m\u001b[1;32m 386\u001b[0m \u001b[0mfeature_dataframe\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mall_columns\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 387\u001b[0m )\n", + "\u001b[0;32m/srv/hops/anaconda/envs/theenv/lib/python3.10/site-packages/hsfs/engine/python.py\u001b[0m in \u001b[0;36mprofile\u001b[0;34m(self, df, relevant_columns, correlations, histograms, exact_uniqueness)\u001b[0m\n\u001b[1;32m 407\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mpa\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtypes\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_large_list\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfield\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 408\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mpa\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtypes\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_struct\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfield\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 409\u001b[0;31m ) and PYARROW_HOPSWORKS_DTYPE_MAPPING[field.type] in [\"timestamp\", \"date\"]:\n\u001b[0m\u001b[1;32m 410\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mfield\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mfield\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 411\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mKeyError\u001b[0m: DataType(null)" + ] + } + ], + "source": [ + "TEST_SIZE = 0.2\n", + "\n", + "X_train, X_test, y_train, y_test = feature_view.train_test_split(\n", + " test_size = TEST_SIZE, read_options={\"use_hive\":True}\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_train.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + 
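+    "# Note: reindexing the labels below keeps them aligned with the sorted features,\n",
+    "# which matters because the datetime column is dropped in a later cell.\n",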
"# Sort the X_train DataFrame based on the \"datetime\" column in ascending order\n", + "X_train = X_train.sort_values(\"datetime\")\n", + "\n", + "# Reindex the y_train Series to match the order of rows in the sorted X_train DataFrame\n", + "y_train = y_train.reindex(X_train.index)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Sort the X_test DataFrame based on the \"datetime\" column in ascending order\n", + "X_test = X_test.sort_values(\"datetime\")\n", + "\n", + "# Reindex the y_test Series to match the order of rows in the sorted X_test DataFrame\n", + "y_test = y_test.reindex(X_test.index)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Drop the \"datetime\" column from the X_train DataFrame along the specified axis (axis=1 means columns)\n", + "X_train.drop([\"datetime\"], axis=1, inplace=True)\n", + "\n", + "# Drop the \"datetime\" column from the X_test DataFrame along the specified axis (axis=1 means columns)\n", + "X_test.drop([\"datetime\"], axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Display the normalized value counts of the y_train Series\n", + "y_train.value_counts(normalize=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notice that the distribution is extremely skewed, which is natural considering that fraudulent transactions make up a tiny part of all transactions. Thus you should somehow address the class imbalance. There are many approaches for this, such as weighting the loss function, over- or undersampling, creating synthetic data, or modifying the decision threshold. In this example, you will use the simplest method which is to just supply a class weight parameter to our learning algorithm. The class weight will affect how much importance is attached to each class, which in our case means that higher importance will be placed on positive (fraudulent) samples." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 🧬 Modeling\n", + "\n", + "Next you will train a model. Here, you set larger class weight for the positive class." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create an instance of the XGBClassifier\n", + "clf = xgb.XGBClassifier()\n", + "\n", + "# Fit the classifier on the training data\n", + "clf.fit(X_train.values, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Predict the training data using the trained classifier\n", + "y_pred_train = clf.predict(X_train.values)\n", + "\n", + "# Predict the test data using the trained classifier\n", + "y_pred_test = clf.predict(X_test.values)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Compute f1 score\n", + "metrics = {\n", + " \"f1_score\": f1_score(y_test, y_pred_test, average='macro')\n", + "}\n", + "metrics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Generate the confusion matrix using the true labels (y_test) and predicted labels (y_pred_test)\n", + "results = confusion_matrix(y_test, y_pred_test)\n", + "\n", + "# Print the confusion matrix\n", + "print(results)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create a DataFrame from the confusion matrix results with appropriate labels\n", + "df_cm = pd.DataFrame(\n", + " results, \n", + " ['True Normal', 'True Fraud'],\n", + " ['Pred Normal', 'Pred Fraud'],\n", + ")\n", + "\n", + "# Create a heatmap using seaborn with annotations\n", + "cm = sns.heatmap(df_cm, annot=True)\n", + "\n", + "# Get the figure from the heatmap and display it\n", + "fig = cm.get_figure()\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "### ⚙️ Model Schema\n", + "\n", + "The model needs to be set up with a [Model Schema](https://docs.hopsworks.ai/3.0/user_guides/mlops/registry/model_schema/), which describes the inputs and outputs for a model.\n", + "\n", + "A Model Schema can be automatically generated from training examples, as shown below." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from hsml.schema import Schema\n", + "from hsml.model_schema import ModelSchema\n", + "\n", + "# Define the input schema using the values of X_train\n", + "input_schema = Schema(X_train.values)\n", + "\n", + "# Define the output schema using y_train\n", + "output_schema = Schema(y_train)\n", + "\n", + "# Create a ModelSchema object specifying the input and output schemas\n", + "model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)\n", + "\n", + "# Convert the model schema to a dictionary for further inspection or serialization\n", + "model_schema.to_dict()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 📝 Register model\n", + "\n", + "One of the features in Hopsworks is the model registry. This is where we can store different versions of models and compare their performance. Models from the registry can then be served as API endpoints." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Specify the directory where the model will be saved\n", + "model_dir = \"fraud_batch_model\"\n", + "\n", + "# Check if the directory exists, and create it if it doesn't\n", + "if not os.path.isdir(model_dir):\n", + " os.mkdir(model_dir)\n", + "\n", + "# Save the trained XGBoost model using joblib\n", + "joblib.dump(clf, model_dir + '/xgboost_fraud_batch_model.pkl')\n", + "\n", + "# Save the confusion matrix heatmap as an image in the model directory\n", + "fig.savefig(model_dir + \"/confusion_matrix.png\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Get the model registry\n", + "mr = project.get_model_registry()\n", + "\n", + "# Create a new model in the model registry\n", + "fraud_model = mr.python.create_model(\n", + " name=\"xgboost_fraud_batch_model\", # Name for the model\n", + " metrics=metrics, # Metrics used for evaluation\n", + " model_schema=model_schema, # Schema defining the model's input and output\n", + " input_example=X_train.sample(), # Example input data for reference\n", + " description=\"Fraud Batch Predictor\", # Description of the model\n", + ")\n", + "\n", + "# Save the model to the specified directory\n", + "fraud_model.save(model_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## ⏭️ **Next:** Part 03: Batch Inference\n", + "\n", + "In the following notebook you will use your model for batch inference.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "interpreter": { + "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.18" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From d543c687550751109e709a240ef3d89f92a8e75f Mon Sep 17 00:00:00 2001 From: manu-sj Date: Tue, 12 Mar 2024 13:54:35 +0100 Subject: [PATCH 09/16] updated tutorial --- .../pyspark_streaming/Feature Pipeline.ipynb | 298 +++++------------- .../Simulated Stream Creation.ipynb | 116 +++---- .../pyspark_streaming/Training Pipeline.ipynb | 4 +- 3 files changed, 128 insertions(+), 290 deletions(-) diff --git a/advanced_tutorials/pyspark_streaming/Feature Pipeline.ipynb b/advanced_tutorials/pyspark_streaming/Feature Pipeline.ipynb index 77954411..bed4d8c2 100644 --- a/advanced_tutorials/pyspark_streaming/Feature Pipeline.ipynb +++ b/advanced_tutorials/pyspark_streaming/Feature Pipeline.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "a49874d9", + "id": "965b4213", "metadata": {}, "source": [ "# **Hopsworks Feature Store** - Part 01: Feature Pipeline\n", @@ -25,7 +25,7 @@ }, { "cell_type": "markdown", - "id": "a3f58f72", + "id": "aab60e44", "metadata": {}, "source": [ "## 📝 Imports" @@ -33,38 +33,10 @@ }, { "cell_type": "code", - "execution_count": 1, - "id": "8ab11319", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Starting Spark application\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
<table><tr><th>ID</th><th>Application ID</th><th>Kind</th><th>State</th><th>Spark UI</th><th>Driver log</th></tr>
<tr><td>8</td><td>application_1710149630761_0019</td><td>pyspark</td><td>idle</td><td>Link</td><td>Link</td></tr></table>
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "SparkSession available as 'spark'.\n" - ] - } - ], + "execution_count": null, + "id": "a11ab3b5", + "metadata": {}, + "outputs": [], "source": [ "import datetime\n", "from pyspark.sql import SparkSession\n", @@ -77,7 +49,7 @@ }, { "cell_type": "markdown", - "id": "5b2fa848", + "id": "34923d04", "metadata": {}, "source": [ "## 📡 Connecting to Hopsworks Feature Store " @@ -85,7 +57,7 @@ }, { "cell_type": "markdown", - "id": "2cc70118-2fcf-4978-9821-3348fcef6873", + "id": "51a0f1c2", "metadata": {}, "source": [ "In this tutorial a simulated data stream was created using Hopsworks internal Kafka.\n", @@ -95,21 +67,10 @@ }, { "cell_type": "code", - "execution_count": 2, - "id": "5513a1fb", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Connected. Call `.close()` to terminate connection gracefully.\n", - "\n", - "Logged in to project, explore it here https://aff99120-da3e-11ee-8cd7-4f3734b3ce24.cloud.hopsworks.ai/p/3192\n", - "Connected. Call `.close()` to terminate connection gracefully." - ] - } - ], + "execution_count": null, + "id": "c9865406", + "metadata": {}, + "outputs": [], "source": [ "import hopsworks\n", "from hsfs.core.storage_connector_api import StorageConnectorApi\n", @@ -123,7 +84,7 @@ }, { "cell_type": "markdown", - "id": "1faab2a4-6ec6-49c5-8a4d-d6494f7191a9", + "id": "ac509612", "metadata": {}, "source": [ "Let get the kafka configurations needed for read from hopsworks internal Kafka using the storage connector api." @@ -131,8 +92,8 @@ }, { "cell_type": "code", - "execution_count": 3, - "id": "5728b511", + "execution_count": null, + "id": "0de9afd8", "metadata": {}, "outputs": [], "source": [ @@ -143,7 +104,7 @@ }, { "cell_type": "markdown", - "id": "7cf43aa8", + "id": "0eab6707", "metadata": {}, "source": [ "## 🗂 Reading from Kakfa Stream " @@ -151,7 +112,7 @@ }, { "cell_type": "markdown", - "id": "9dae009c-69e6-4197-b5ac-121afd59e52b", + "id": "6ed0a756", "metadata": {}, "source": [ "After obatining the Kafka configurations we can use it along with the topic name to create a streaming dataframe." 
@@ -159,8 +120,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "id": "37e018f4", + "execution_count": null, + "id": "6695bfd7", "metadata": {}, "outputs": [], "source": [ @@ -178,7 +139,7 @@ }, { "cell_type": "markdown", - "id": "0cf31aaa-5d50-4696-a440-647a3d971910", + "id": "aeae1061", "metadata": {}, "source": [ "To extract the requierd data from streaming dataframe the correct schema has be defined and used" @@ -186,8 +147,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "id": "21bb5954", + "execution_count": null, + "id": "4796c9f2", "metadata": {}, "outputs": [], "source": [ @@ -206,7 +167,7 @@ }, { "cell_type": "markdown", - "id": "ff3eb48e-f427-4b76-95d8-c310e268998d", + "id": "0bd61722", "metadata": {}, "source": [ "Extracting data from the streaming dataframe and casting it to get the required schema" @@ -214,8 +175,8 @@ }, { "cell_type": "code", - "execution_count": 7, - "id": "4fc025cb", + "execution_count": null, + "id": "bf6c49bf", "metadata": {}, "outputs": [], "source": [ @@ -247,7 +208,7 @@ }, { "cell_type": "markdown", - "id": "8b1022b9", + "id": "8d2cd3af", "metadata": {}, "source": [ "## 🛠️ Feature Engineering " @@ -255,7 +216,7 @@ }, { "cell_type": "markdown", - "id": "379e0fea-35f3-4aa6-8807-224d827d38d8", + "id": "e82fa500", "metadata": {}, "source": [ "Now that we have a streaming dataframe that contains the data we can use it to engineer features. We would need the also need profiles to effectively engineer features.\n", @@ -265,8 +226,8 @@ }, { "cell_type": "code", - "execution_count": 8, - "id": "e44035c3", + "execution_count": null, + "id": "53cee8d1", "metadata": {}, "outputs": [], "source": [ @@ -279,7 +240,7 @@ }, { "cell_type": "markdown", - "id": "8d352f12-06e6-46d7-b29b-48d0f8dc6aa9", + "id": "0384d06c", "metadata": {}, "source": [ "Fraudulent transactions can differ from regular ones in many different ways. Typical red flags would for instance be a large transaction volume/frequency in the span of a few hours. It could also be the case that elderly people in particular are targeted by fraudsters. To facilitate model learning you will create additional features based on these patterns. 
In particular, you will create two types of features:\n", @@ -289,8 +250,8 @@ }, { "cell_type": "code", - "execution_count": 9, - "id": "9c834a5c", + "execution_count": null, + "id": "13893981", "metadata": {}, "outputs": [], "source": [ @@ -305,7 +266,7 @@ }, { "cell_type": "markdown", - "id": "f08e38b0-9c86-4a66-acec-6a56cda2073c", + "id": "4f289b0e", "metadata": {}, "source": [ "Next, you will create features that aggregate credit card data over a period of time.\n", @@ -315,8 +276,8 @@ }, { "cell_type": "code", - "execution_count": 10, - "id": "3745b224", + "execution_count": null, + "id": "0d7bef5d", "metadata": {}, "outputs": [], "source": [ @@ -341,7 +302,7 @@ }, { "cell_type": "markdown", - "id": "a7a8cfae", + "id": "79ad3597", "metadata": {}, "source": [ "## 👮🏻‍♂️ Great Expectations " @@ -349,8 +310,8 @@ }, { "cell_type": "code", - "execution_count": 11, - "id": "9fe44d86", + "execution_count": null, + "id": "05e258e9", "metadata": {}, "outputs": [], "source": [ @@ -365,20 +326,10 @@ }, { "cell_type": "code", - "execution_count": 12, - "id": "f234cb69", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\"kwargs\": {\"column\": \"tid\", \"mostly\": 0.0}, \"expectation_type\": \"expect_column_values_to_be_null\", \"meta\": {}}\n", - "{\"kwargs\": {\"column\": \"datetime\", \"mostly\": 0.0}, \"expectation_type\": \"expect_column_values_to_be_null\", \"meta\": {}}\n", - "{\"kwargs\": {\"column\": \"cc_num\", \"mostly\": 0.0}, \"expectation_type\": \"expect_column_values_to_be_null\", \"meta\": {}}" - ] - } - ], + "execution_count": null, + "id": "c7c0ed99", + "metadata": {}, + "outputs": [], "source": [ "# Check binary fraud_label column to be in set [0,1]\n", "expectation_suite_transactions.add_expectation(\n", @@ -417,7 +368,7 @@ }, { "cell_type": "markdown", - "id": "2e3adf53", + "id": "7a96dda9", "metadata": {}, "source": [ "## 💾 Storing streaming dataframes in Hopsworks Feature Store " @@ -425,7 +376,7 @@ }, { "cell_type": "markdown", - "id": "fb8dee9a-de08-4ab7-a9ae-2cd667d0cff7", + "id": "6a9dc0b0", "metadata": {}, "source": [ "### 🪄 Creating Feature Groups \n", @@ -439,8 +390,8 @@ }, { "cell_type": "code", - "execution_count": 13, - "id": "5ceb25c5", + "execution_count": null, + "id": "d852d5da", "metadata": {}, "outputs": [], "source": [ @@ -458,7 +409,7 @@ }, { "cell_type": "markdown", - "id": "ef421264-6956-4226-87d5-4e7b1c5dd0b6", + "id": "9a1ad822", "metadata": {}, "source": [ "A full list of arguments can be found in the [documentation](https://docs.hopsworks.ai/feature-store-api/latest/generated/api/feature_store_api/#create_feature_group).\n", @@ -468,20 +419,10 @@ }, { "cell_type": "code", - "execution_count": 14, - "id": "7b370459", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Feature Group created successfully, explore it at \n", - "https://aff99120-da3e-11ee-8cd7-4f3734b3ce24.cloud.hopsworks.ai/p/3192/fs/3140/fg/3118\n", - "StatisticsWarning: Stream ingestion for feature group `transactions_fraud_streaming_fg`, with version `1` will not compute statistics." 
- ] - } - ], + "execution_count": null, + "id": "eecffe7d", + "metadata": {}, + "outputs": [], "source": [ "# Insert data into feature group\n", "trans_fg_query = trans_fg.insert_stream(transaction_streaming_df)" @@ -489,7 +430,7 @@ }, { "cell_type": "markdown", - "id": "e2d564f6-c8f6-48ea-aca2-0ae22bd0bcd1", + "id": "f491c169", "metadata": {}, "source": [ "\n", @@ -498,25 +439,17 @@ }, { "cell_type": "code", - "execution_count": 16, - "id": "52b86e8c", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'message': 'Processing new data', 'isDataAvailable': True, 'isTriggerActive': True}" - ] - } - ], + "execution_count": null, + "id": "0ccd5221", + "metadata": {}, + "outputs": [], "source": [ "trans_fg_query.status" ] }, { "cell_type": "markdown", - "id": "40f288ea-f0ad-48a0-901e-de06cbea5617", + "id": "0408766d", "metadata": {}, "source": [ "The `insert_stream` function inserts the data into the online feature store so to materialize the data in the offline store you need to manually run the materialization job. The materialization job can also be run on schedule using the `schedule` function. You can find more details in the [documentation](https://docs.hopsworks.ai/hopsworks-api/3.7/generated/api/jobs/#schedule)." @@ -524,49 +457,20 @@ }, { "cell_type": "code", - "execution_count": 15, - "id": "de766b09", + "execution_count": null, + "id": "46411431", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Launching job: transactions_fraud_streaming_fg_1_offline_fg_materialization\n", - "Job started successfully, you can follow the progress at \n", - "https://aff99120-da3e-11ee-8cd7-4f3734b3ce24.cloud.hopsworks.ai/p/3192/jobs/named/transactions_fraud_streaming_fg_1_offline_fg_materialization/executions" - ] - } - ], + "outputs": [], "source": [ - "trans_fg.materialization_job.schedule(cron_expression = \"0 10 * ? * * *\", start_time=datetime.datetime.now(tz=timezone.utc))" + "trans_fg.materialization_job.schedule(cron_expression = \"0 /10 * ? * * *\", start_time=datetime.datetime.now(tz=datetime.timezone.utc))" ] }, { "cell_type": "code", - "execution_count": 17, - "id": "0e2c385b", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "" - ] - } - ], + "execution_count": null, + "id": "e0671d72", + "metadata": {}, + "outputs": [], "source": [ "# Update feature descriptions\n", "feature_descriptions = [\n", @@ -590,7 +494,7 @@ }, { "cell_type": "markdown", - "id": "d3832bb4-7786-4893-ad50-8aa37e29d278", + "id": "2dfa5c7e", "metadata": {}, "source": [ "At the creation of the feature group, you will be prompted with an URL that will directly link to it; there you will be able to explore some of the aspects of your newly created feature group." @@ -598,7 +502,7 @@ }, { "cell_type": "markdown", - "id": "3103a4fc-c298-4760-98d3-953ad3f7e7cb", + "id": "35a41ff0", "metadata": {}, "source": [ "You can move on and do the same thing for the feature group with our windows aggregation." 
@@ -606,8 +510,8 @@ }, { "cell_type": "code", - "execution_count": 18, - "id": "3f56bbff", + "execution_count": null, + "id": "1316fac2", "metadata": {}, "outputs": [], "source": [ @@ -625,71 +529,37 @@ }, { "cell_type": "code", - "execution_count": 19, - "id": "c054288f", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Feature Group created successfully, explore it at \n", - "https://aff99120-da3e-11ee-8cd7-4f3734b3ce24.cloud.hopsworks.ai/p/3192/fs/3140/fg/3119\n", - "StatisticsWarning: Stream ingestion for feature group `transactions_aggs_fraud_streaming_fg`, with version `1` will not compute statistics." - ] - } - ], + "execution_count": null, + "id": "7e81cdde", + "metadata": {}, + "outputs": [], "source": [ "window_aggs_streaming_fg_query = window_aggs_streaming_fg.insert_stream(windowed_4h_aggregation_df)" ] }, { "cell_type": "code", - "execution_count": 20, - "id": "b2c6e16f", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'message': 'Getting offsets from KafkaV2[Subscribe[transactions_topic]]', 'isDataAvailable': False, 'isTriggerActive': True}" - ] - } - ], + "execution_count": null, + "id": "854e2099", + "metadata": {}, + "outputs": [], "source": [ "window_aggs_streaming_fg_query.status" ] }, { "cell_type": "code", - "execution_count": 21, - "id": "aa973de6", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "An error was encountered:\n", - "The Hopsworks Job failed, use the Hopsworks UI to access the job logs\n", - "Traceback (most recent call last):\n", - " File \"/srv/hops/anaconda/envs/theenv/lib/python3.10/site-packages/hsfs/core/job.py\", line 124, in run\n", - " self._wait_for_job(await_termination=await_termination)\n", - " File \"/srv/hops/anaconda/envs/theenv/lib/python3.10/site-packages/hsfs/core/job.py\", line 242, in _wait_for_job\n", - " raise FeatureStoreException(\n", - "hsfs.client.exceptions.FeatureStoreException: The Hopsworks Job failed, use the Hopsworks UI to access the job logs\n", - "\n" - ] - } - ], - "source": [ - "window_aggs_streaming_fg.materialization_job.schedule(cron_expression = \"0 10 * ? * * *\", start_time=datetime.datetime.now(tz=timezone.utc))" + "execution_count": null, + "id": "d7c881d4", + "metadata": {}, + "outputs": [], + "source": [ + "window_aggs_streaming_fg.materialization_job.schedule(cron_expression = \"0 /10 * ? 
* * *\", start_time=datetime.datetime.now(tz=datetime.timezone.utc))" ] }, { "cell_type": "markdown", - "id": "ce70361d", + "id": "55c1c129", "metadata": {}, "source": [ "## ⏭️ **Next:** Part 02: Training Pipeline\n", diff --git a/advanced_tutorials/pyspark_streaming/Simulated Stream Creation.ipynb b/advanced_tutorials/pyspark_streaming/Simulated Stream Creation.ipynb index 109b386a..9dcb52d5 100644 --- a/advanced_tutorials/pyspark_streaming/Simulated Stream Creation.ipynb +++ b/advanced_tutorials/pyspark_streaming/Simulated Stream Creation.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "aa514fb7", + "id": "01134591", "metadata": {}, "source": [ "# **Hopsworks Feature Store** \n", @@ -18,7 +18,7 @@ }, { "cell_type": "markdown", - "id": "8d2f5c11", + "id": "b11b8bbb", "metadata": {}, "source": [ "## 📝 Imports" @@ -26,27 +26,18 @@ }, { "cell_type": "code", - "execution_count": 1, - "id": "df72129e", + "execution_count": null, + "id": "6291c85d", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", - "\u001b[0m" - ] - } - ], + "outputs": [], "source": [ "!pip install faker --quiet" ] }, { "cell_type": "code", - "execution_count": 12, - "id": "cbeaa9ed", + "execution_count": null, + "id": "69221b06", "metadata": {}, "outputs": [], "source": [ @@ -57,7 +48,7 @@ }, { "cell_type": "markdown", - "id": "ca2b2938", + "id": "cb89600a", "metadata": {}, "source": [ "## ✏️ Creating Simulated Data " @@ -65,7 +56,7 @@ }, { "cell_type": "markdown", - "id": "1ef161e6-3e87-4bd0-a190-c37fd1a3e444", + "id": "e8c40f77", "metadata": {}, "source": [ "A simulated dataset for credit card Transactions is created so that the data can be send using a Kafka stream. The data created is split into two different dataframes:\n", @@ -80,8 +71,8 @@ }, { "cell_type": "code", - "execution_count": 13, - "id": "40038992", + "execution_count": null, + "id": "f52e17b5", "metadata": {}, "outputs": [], "source": [ @@ -92,7 +83,7 @@ }, { "cell_type": "markdown", - "id": "f8c852a2", + "id": "a25dab2e", "metadata": {}, "source": [ "## 📡 Connecting to Hopsworks Feature Store " @@ -100,7 +91,7 @@ }, { "cell_type": "markdown", - "id": "e5dd189e-2424-4462-9f5f-a634950be10a", + "id": "d2724ae1", "metadata": {}, "source": [ "After creating the simulated data let us connect with Hopsworks Feature Store.\n", @@ -110,21 +101,10 @@ }, { "cell_type": "code", - "execution_count": 4, - "id": "d7aff338", + "execution_count": null, + "id": "9f8d54d4", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Connected. Call `.close()` to terminate connection gracefully.\n", - "\n", - "Logged in to project, explore it here https://aff99120-da3e-11ee-8cd7-4f3734b3ce24.cloud.hopsworks.ai/p/3192\n", - "Connected. 
Call `.close()` to terminate connection gracefully.\n" - ] - } - ], + "outputs": [], "source": [ "import hopsworks\n", "\n", @@ -137,7 +117,7 @@ }, { "cell_type": "markdown", - "id": "c7e69f9f", + "id": "bc7e3a00", "metadata": {}, "source": [ "## 🪄 Creating Feature Groups " @@ -145,7 +125,7 @@ }, { "cell_type": "markdown", - "id": "ebdd5944-dea4-4a31-bbea-87aafbef0bc2", + "id": "1d04f813", "metadata": {}, "source": [ "Profiles data can be directly inserted as a feature group directly since they are not update fequently.\n", @@ -157,22 +137,10 @@ }, { "cell_type": "code", - "execution_count": 1, - "id": "2838cb7a", + "execution_count": null, + "id": "45d1c575", "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'fs' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[1], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m profile_fg \u001b[38;5;241m=\u001b[39m \u001b[43mfs\u001b[49m\u001b[38;5;241m.\u001b[39mget_or_create_feature_group(\n\u001b[1;32m 2\u001b[0m name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mprofile\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 3\u001b[0m primary_key\u001b[38;5;241m=\u001b[39m[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcc_num\u001b[39m\u001b[38;5;124m\"\u001b[39m],\n\u001b[1;32m 4\u001b[0m partition_key\u001b[38;5;241m=\u001b[39m[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcc_provider\u001b[39m\u001b[38;5;124m\"\u001b[39m],\n\u001b[1;32m 5\u001b[0m version\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m)\n\u001b[1;32m 7\u001b[0m profile_df \u001b[38;5;241m=\u001b[39m profile_fg\u001b[38;5;241m.\u001b[39minsert(profiles_df)\n", - "\u001b[0;31mNameError\u001b[0m: name 'fs' is not defined" - ] - } - ], + "outputs": [], "source": [ "profile_fg = fs.get_or_create_feature_group(\n", " name=\"profile\",\n", @@ -185,7 +153,7 @@ }, { "cell_type": "markdown", - "id": "13e85615", + "id": "c795ab24", "metadata": {}, "source": [ "## ⚙️ Kafka Topic and Schema Creation " @@ -193,7 +161,7 @@ }, { "cell_type": "markdown", - "id": "7d973e90-66fd-47c1-9b11-f9aaebdf0caf", + "id": "3c7b477c", "metadata": {}, "source": [ "To create a Kafka stream for transactions a topic and schema must be create. The schema used must follow Apache Avro specification, more details can be found in the [documentation](https://avro.apache.org/docs/1.11.1/specification/).\n" @@ -201,8 +169,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "id": "5ba10159", + "execution_count": null, + "id": "a0599566", "metadata": {}, "outputs": [], "source": [ @@ -213,8 +181,8 @@ }, { "cell_type": "code", - "execution_count": 7, - "id": "e39dfb68", + "execution_count": null, + "id": "8faf4d44", "metadata": {}, "outputs": [], "source": [ @@ -302,7 +270,7 @@ }, { "cell_type": "markdown", - "id": "fab6d27f-72f3-4142-ae4a-8a28803b5490", + "id": "038abaad", "metadata": {}, "source": [ "After the schema is created the topic and the associated schema must be registered in Hopsworks so that the topic can be used." 
@@ -310,8 +278,8 @@ }, { "cell_type": "code", - "execution_count": 8, - "id": "99ca36ed", + "execution_count": null, + "id": "3e00b681", "metadata": {}, "outputs": [], "source": [ @@ -322,7 +290,7 @@ }, { "cell_type": "markdown", - "id": "51408f5f", + "id": "535a692b", "metadata": {}, "source": [ "## 📡 Sending Data using created Kafka Topic " @@ -330,7 +298,7 @@ }, { "cell_type": "markdown", - "id": "53de5bb3-1fcd-4136-8c02-d5003aa18b13", + "id": "b9ba16ec", "metadata": {}, "source": [ "While sending data through Kafka we must make sure that the data types are in the same format specified in the schema. \n", @@ -340,8 +308,8 @@ }, { "cell_type": "code", - "execution_count": 9, - "id": "ff97db95", + "execution_count": null, + "id": "375b2760", "metadata": {}, "outputs": [], "source": [ @@ -359,7 +327,7 @@ }, { "cell_type": "markdown", - "id": "ab5c49b5-cfd6-451b-bdf7-a8130c261b5e", + "id": "b2c27d9e", "metadata": {}, "source": [ "Lets get the configuration needed for the producer to used Hopsworks internal kafka using the KafkaAPI" @@ -368,7 +336,7 @@ { "cell_type": "code", "execution_count": null, - "id": "88f58727-2ca3-4c23-9c5f-a86fa934d63b", + "id": "1d44e3f1", "metadata": {}, "outputs": [], "source": [ @@ -377,7 +345,7 @@ }, { "cell_type": "markdown", - "id": "ef6ea6ba-677d-4155-aea9-6d1be073086a", + "id": "58d8ef0a", "metadata": {}, "source": [ "Finally, lets create a producer using the Kafka configuration and send data into it.\n", @@ -387,8 +355,8 @@ }, { "cell_type": "code", - "execution_count": 10, - "id": "f70cf209", + "execution_count": null, + "id": "9b416972", "metadata": {}, "outputs": [], "source": [ @@ -401,7 +369,7 @@ }, { "cell_type": "markdown", - "id": "aeb76a7b", + "id": "1bbc57f9", "metadata": {}, "source": [ "---\n", @@ -413,7 +381,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -427,7 +395,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.18" + "version": "3.10.11" } }, "nbformat": 4, diff --git a/advanced_tutorials/pyspark_streaming/Training Pipeline.ipynb b/advanced_tutorials/pyspark_streaming/Training Pipeline.ipynb index a037b423..cd8b7b61 100644 --- a/advanced_tutorials/pyspark_streaming/Training Pipeline.ipynb +++ b/advanced_tutorials/pyspark_streaming/Training Pipeline.ipynb @@ -570,7 +570,7 @@ "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -584,7 +584,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.18" + "version": "3.10.11" } }, "nbformat": 4, From 775321804184d1924f84acd6290acfc52fec55ed Mon Sep 17 00:00:00 2001 From: manu-sj Date: Tue, 12 Mar 2024 17:35:37 +0100 Subject: [PATCH 10/16] fixed pyspark streaming --- .../pyspark_streaming/Feature Pipeline.ipynb | 296 ++++++++++++------ .../Simulated Stream Creation.ipynb | 73 ++--- .../pyspark_streaming/Training Pipeline.ipynb | 124 ++------ 3 files changed, 268 insertions(+), 225 deletions(-) diff --git a/advanced_tutorials/pyspark_streaming/Feature Pipeline.ipynb b/advanced_tutorials/pyspark_streaming/Feature Pipeline.ipynb index bed4d8c2..ff906266 100644 --- a/advanced_tutorials/pyspark_streaming/Feature Pipeline.ipynb +++ b/advanced_tutorials/pyspark_streaming/Feature Pipeline.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": 
"markdown", - "id": "965b4213", + "id": "f557a124", "metadata": {}, "source": [ "# **Hopsworks Feature Store** - Part 01: Feature Pipeline\n", @@ -25,7 +25,7 @@ }, { "cell_type": "markdown", - "id": "aab60e44", + "id": "3d3533e4", "metadata": {}, "source": [ "## 📝 Imports" @@ -33,10 +33,38 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "a11ab3b5", - "metadata": {}, - "outputs": [], + "execution_count": 1, + "id": "abcf4927", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Starting Spark application\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
<table><tr><th>ID</th><th>Application ID</th><th>Kind</th><th>State</th><th>Spark UI</th><th>Driver log</th></tr>
<tr><td>4</td><td>application_1710240545690_0034</td><td>pyspark</td><td>idle</td><td>Link</td><td>Link</td></tr></table>
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SparkSession available as 'spark'.\n" + ] + } + ], "source": [ "import datetime\n", "from pyspark.sql import SparkSession\n", @@ -49,7 +77,7 @@ }, { "cell_type": "markdown", - "id": "34923d04", + "id": "0d91d9b3", "metadata": {}, "source": [ "## 📡 Connecting to Hopsworks Feature Store " @@ -57,7 +85,7 @@ }, { "cell_type": "markdown", - "id": "51a0f1c2", + "id": "08de4fa1", "metadata": {}, "source": [ "In this tutorial a simulated data stream was created using Hopsworks internal Kafka.\n", @@ -67,10 +95,22 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "c9865406", - "metadata": {}, - "outputs": [], + "execution_count": 28, + "id": "887e0fe2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Connection closed.\n", + "Connected. Call `.close()` to terminate connection gracefully.\n", + "\n", + "Logged in to project, explore it here https://aeb68620-e05a-11ee-8a54-7b257af950ba.cloud.hopsworks.ai/p/120\n", + "Connected. Call `.close()` to terminate connection gracefully." + ] + } + ], "source": [ "import hopsworks\n", "from hsfs.core.storage_connector_api import StorageConnectorApi\n", @@ -84,7 +124,7 @@ }, { "cell_type": "markdown", - "id": "ac509612", + "id": "7d385ec3", "metadata": {}, "source": [ "Let get the kafka configurations needed for read from hopsworks internal Kafka using the storage connector api." @@ -92,8 +132,8 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "0de9afd8", + "execution_count": 29, + "id": "45559bf5", "metadata": {}, "outputs": [], "source": [ @@ -104,7 +144,7 @@ }, { "cell_type": "markdown", - "id": "0eab6707", + "id": "25e08835", "metadata": {}, "source": [ "## 🗂 Reading from Kakfa Stream " @@ -112,7 +152,7 @@ }, { "cell_type": "markdown", - "id": "6ed0a756", + "id": "1c6e4941", "metadata": {}, "source": [ "After obatining the Kafka configurations we can use it along with the topic name to create a streaming dataframe." 
@@ -120,8 +160,8 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "6695bfd7", + "execution_count": 30, + "id": "ceb8f4e3", "metadata": {}, "outputs": [], "source": [ @@ -132,14 +172,14 @@ " .format(\"kafka\") \\\n", " .options(**kafka_config) \\\n", " .option(\"startingOffsets\", \"earliest\") \\\n", - " .option(\"maxOffsetsPerTrigger\", 100) \\\n", + " .option(\"maxOffsetsPerTrigger\", 1000) \\\n", " .option(\"subscribe\", KAFKA_TOPIC_NAME) \\\n", " .load()" ] }, { "cell_type": "markdown", - "id": "aeae1061", + "id": "3e262255", "metadata": {}, "source": [ "To extract the requierd data from streaming dataframe the correct schema has be defined and used" @@ -147,8 +187,8 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "4796c9f2", + "execution_count": 31, + "id": "34f7b4d8", "metadata": {}, "outputs": [], "source": [ @@ -161,13 +201,13 @@ " StructField(\"longitude\", DoubleType(), True),\n", " StructField(\"city\", StringType(), True),\n", " StructField(\"country\", StringType(), True),\n", - " StructField(\"fraud_label\", StringType(), True),\n", + " StructField(\"fraud_label\", IntegerType(), True),\n", " ])" ] }, { "cell_type": "markdown", - "id": "0bd61722", + "id": "697fc7c3", "metadata": {}, "source": [ "Extracting data from the streaming dataframe and casting it to get the required schema" @@ -175,8 +215,8 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "bf6c49bf", + "execution_count": 32, + "id": "98da586d", "metadata": {}, "outputs": [], "source": [ @@ -202,13 +242,13 @@ " \"CAST(longitude as double)\",\n", " \"CAST(city as string)\",\n", " \"CAST(country as string)\",\n", - " \"CAST(fraud_label as string)\"\n", + " \"CAST(fraud_label as integer)\"\n", " )" ] }, { "cell_type": "markdown", - "id": "8d2cd3af", + "id": "a76f2831", "metadata": {}, "source": [ "## 🛠️ Feature Engineering " @@ -216,7 +256,7 @@ }, { "cell_type": "markdown", - "id": "e82fa500", + "id": "9a807fe3", "metadata": {}, "source": [ "Now that we have a streaming dataframe that contains the data we can use it to engineer features. We would need the also need profiles to effectively engineer features.\n", @@ -226,8 +266,8 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "53cee8d1", + "execution_count": 33, + "id": "e316e63e", "metadata": {}, "outputs": [], "source": [ @@ -240,7 +280,7 @@ }, { "cell_type": "markdown", - "id": "0384d06c", + "id": "ccf9356d", "metadata": {}, "source": [ "Fraudulent transactions can differ from regular ones in many different ways. Typical red flags would for instance be a large transaction volume/frequency in the span of a few hours. It could also be the case that elderly people in particular are targeted by fraudsters. To facilitate model learning you will create additional features based on these patterns. 
In particular, you will create two types of features:\n", @@ -250,14 +290,14 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "13893981", + "execution_count": 34, + "id": "06625b5b", "metadata": {}, "outputs": [], "source": [ "transaction_streaming_df = (\n", " streaming_df.join(profile_df.drop(\"city\"), on=\"cc_num\", how=\"left\")\n", - " .withColumn(\"cc_expiration_date\", to_timestamp(\"cc_expiration_date\", \"mm/dd\"))\n", + " .withColumn(\"cc_expiration_date\", to_timestamp(\"cc_expiration_date\", \"mm/yy\"))\n", " .withColumn(\"age_at_transaction\", datediff(col(\"datetime\"),col(\"birthdate\")))\n", " .withColumn(\"days_until_card_expires\", datediff(col(\"datetime\"),col(\"cc_expiration_date\")))\n", " .select([\"tid\", \"datetime\", \"cc_num\", \"category\", \"amount\", \"latitude\", \"longitude\", \"city\", \"country\", \"fraud_label\", \"age_at_transaction\", \"days_until_card_expires\", \"cc_expiration_date\"])\n", @@ -266,7 +306,7 @@ }, { "cell_type": "markdown", - "id": "4f289b0e", + "id": "033491cc", "metadata": {}, "source": [ "Next, you will create features that aggregate credit card data over a period of time.\n", @@ -276,8 +316,8 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "0d7bef5d", + "execution_count": 35, + "id": "6fc06022", "metadata": {}, "outputs": [], "source": [ @@ -288,11 +328,12 @@ " avg(\"amount\").alias(\"avg_amt_per_4h\"),\n", " stddev(\"amount\").alias(\"stdev_amt_per_4h\"),\n", " count(\"cc_num\").alias(\"num_trans_per_4h\"),\n", + " collect_list(\"datetime\").alias(\"datetime\"),\n", " )\n", " .na.fill({\"stdev_amt_per_4h\":0})\n", " .selectExpr(\n", " \"cc_num\",\n", - " \"current_timestamp() as event_time\",\n", + " \"explode(datetime) as datetime\",\n", " \"num_trans_per_4h\",\n", " \"avg_amt_per_4h\",\n", " \"stdev_amt_per_4h\",\n", @@ -302,7 +343,7 @@ }, { "cell_type": "markdown", - "id": "79ad3597", + "id": "4e7956d9", "metadata": {}, "source": [ "## 👮🏻‍♂️ Great Expectations " @@ -310,8 +351,8 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "05e258e9", + "execution_count": 36, + "id": "dbb409eb", "metadata": {}, "outputs": [], "source": [ @@ -326,10 +367,20 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "c7c0ed99", - "metadata": {}, - "outputs": [], + "execution_count": 37, + "id": "b55183a1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\"kwargs\": {\"column\": \"tid\", \"mostly\": 0.0}, \"expectation_type\": \"expect_column_values_to_be_null\", \"meta\": {}}\n", + "{\"kwargs\": {\"column\": \"datetime\", \"mostly\": 0.0}, \"expectation_type\": \"expect_column_values_to_be_null\", \"meta\": {}}\n", + "{\"kwargs\": {\"column\": \"cc_num\", \"mostly\": 0.0}, \"expectation_type\": \"expect_column_values_to_be_null\", \"meta\": {}}" + ] + } + ], "source": [ "# Check binary fraud_label column to be in set [0,1]\n", "expectation_suite_transactions.add_expectation(\n", @@ -368,7 +419,7 @@ }, { "cell_type": "markdown", - "id": "7a96dda9", + "id": "481467d9", "metadata": {}, "source": [ "## 💾 Storing streaming dataframes in Hopsworks Feature Store " @@ -376,7 +427,7 @@ }, { "cell_type": "markdown", - "id": "6a9dc0b0", + "id": "e7d78c59", "metadata": {}, "source": [ "### 🪄 Creating Feature Groups \n", @@ -390,8 +441,8 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "d852d5da", + "execution_count": 38, + "id": "5df5ec28", "metadata": {}, "outputs": [], "source": [ @@ -409,7 +460,7 @@ }, { "cell_type": "markdown", - 
"id": "9a1ad822", + "id": "56069df2", "metadata": {}, "source": [ "A full list of arguments can be found in the [documentation](https://docs.hopsworks.ai/feature-store-api/latest/generated/api/feature_store_api/#create_feature_group).\n", @@ -419,10 +470,19 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "eecffe7d", - "metadata": {}, - "outputs": [], + "execution_count": 39, + "id": "72f5141a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Feature Group created successfully, explore it at \n", + "https://aeb68620-e05a-11ee-8a54-7b257af950ba.cloud.hopsworks.ai/p/120/fs/68/fg/40" + ] + } + ], "source": [ "# Insert data into feature group\n", "trans_fg_query = trans_fg.insert_stream(transaction_streaming_df)" @@ -430,7 +490,7 @@ }, { "cell_type": "markdown", - "id": "f491c169", + "id": "cceb282f", "metadata": {}, "source": [ "\n", @@ -439,17 +499,25 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "0ccd5221", - "metadata": {}, - "outputs": [], + "execution_count": 41, + "id": "8b8a7765", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'message': 'Waiting for data to arrive', 'isDataAvailable': False, 'isTriggerActive': False}" + ] + } + ], "source": [ "trans_fg_query.status" ] }, { "cell_type": "markdown", - "id": "0408766d", + "id": "e65f9f97", "metadata": {}, "source": [ "The `insert_stream` function inserts the data into the online feature store so to materialize the data in the offline store you need to manually run the materialization job. The materialization job can also be run on schedule using the `schedule` function. You can find more details in the [documentation](https://docs.hopsworks.ai/hopsworks-api/3.7/generated/api/jobs/#schedule)." @@ -457,20 +525,47 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "46411431", - "metadata": {}, - "outputs": [], + "execution_count": 47, + "id": "8d56c6c8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "" + ] + } + ], "source": [ "trans_fg.materialization_job.schedule(cron_expression = \"0 /10 * ? * * *\", start_time=datetime.datetime.now(tz=datetime.timezone.utc))" ] }, { "cell_type": "code", - "execution_count": null, - "id": "e0671d72", - "metadata": {}, - "outputs": [], + "execution_count": 42, + "id": "5a48ff86", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "" + ] + } + ], "source": [ "# Update feature descriptions\n", "feature_descriptions = [\n", @@ -494,7 +589,7 @@ }, { "cell_type": "markdown", - "id": "2dfa5c7e", + "id": "89aa87ab", "metadata": {}, "source": [ "At the creation of the feature group, you will be prompted with an URL that will directly link to it; there you will be able to explore some of the aspects of your newly created feature group." @@ -502,7 +597,7 @@ }, { "cell_type": "markdown", - "id": "35a41ff0", + "id": "11b45eff", "metadata": {}, "source": [ "You can move on and do the same thing for the feature group with our windows aggregation." 
@@ -510,8 +605,8 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "1316fac2", + "execution_count": 43, + "id": "37c05dfe", "metadata": {}, "outputs": [], "source": [ @@ -521,7 +616,7 @@ " version=1,\n", " description=f\"Aggregate transaction data over 5 minute windows.\",\n", " primary_key=[\"cc_num\"],\n", - " event_time=\"event_time\",\n", + " event_time=\"datetime\",\n", " online_enabled=True,\n", " stream=True,\n", ")" @@ -529,37 +624,62 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "7e81cdde", - "metadata": {}, - "outputs": [], + "execution_count": 44, + "id": "88ae7e96", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Feature Group created successfully, explore it at \n", + "https://aeb68620-e05a-11ee-8a54-7b257af950ba.cloud.hopsworks.ai/p/120/fs/68/fg/41" + ] + } + ], "source": [ "window_aggs_streaming_fg_query = window_aggs_streaming_fg.insert_stream(windowed_4h_aggregation_df)" ] }, { "cell_type": "code", - "execution_count": null, - "id": "854e2099", - "metadata": {}, - "outputs": [], + "execution_count": 45, + "id": "f3d24fc8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'message': 'Getting offsets from KafkaV2[Subscribe[transactions_topic2]]', 'isDataAvailable': False, 'isTriggerActive': True}" + ] + } + ], "source": [ "window_aggs_streaming_fg_query.status" ] }, { "cell_type": "code", - "execution_count": null, - "id": "d7c881d4", - "metadata": {}, - "outputs": [], + "execution_count": 46, + "id": "e345d63f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "" + ] + } + ], "source": [ "window_aggs_streaming_fg.materialization_job.schedule(cron_expression = \"0 /10 * ? * * *\", start_time=datetime.datetime.now(tz=datetime.timezone.utc))" ] }, { "cell_type": "markdown", - "id": "55c1c129", + "id": "b58a0d0f", "metadata": {}, "source": [ "## ⏭️ **Next:** Part 02: Training Pipeline\n", diff --git a/advanced_tutorials/pyspark_streaming/Simulated Stream Creation.ipynb b/advanced_tutorials/pyspark_streaming/Simulated Stream Creation.ipynb index 9dcb52d5..e33d7fd0 100644 --- a/advanced_tutorials/pyspark_streaming/Simulated Stream Creation.ipynb +++ b/advanced_tutorials/pyspark_streaming/Simulated Stream Creation.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "01134591", + "id": "ec8c8032", "metadata": {}, "source": [ "# **Hopsworks Feature Store** \n", @@ -18,7 +18,7 @@ }, { "cell_type": "markdown", - "id": "b11b8bbb", + "id": "aded425e", "metadata": {}, "source": [ "## 📝 Imports" @@ -27,7 +27,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6291c85d", + "id": "2f19234f", "metadata": {}, "outputs": [], "source": [ @@ -37,7 +37,7 @@ { "cell_type": "code", "execution_count": null, - "id": "69221b06", + "id": "b3af7bd7", "metadata": {}, "outputs": [], "source": [ @@ -48,7 +48,7 @@ }, { "cell_type": "markdown", - "id": "cb89600a", + "id": "3f518fdc", "metadata": {}, "source": [ "## ✏️ Creating Simulated Data " @@ -56,7 +56,7 @@ }, { "cell_type": "markdown", - "id": "e8c40f77", + "id": "c1d00b58", "metadata": {}, "source": [ "A simulated dataset for credit card Transactions is created so that the data can be send using a Kafka stream. 
The data created is split into two different dataframes:\n", @@ -72,7 +72,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f52e17b5", + "id": "ac0ba76a", "metadata": {}, "outputs": [], "source": [ @@ -83,7 +83,7 @@ }, { "cell_type": "markdown", - "id": "a25dab2e", + "id": "7f6e693a", "metadata": {}, "source": [ "## 📡 Connecting to Hopsworks Feature Store " @@ -91,7 +91,7 @@ }, { "cell_type": "markdown", - "id": "d2724ae1", + "id": "875fd4cf", "metadata": {}, "source": [ "After creating the simulated data let us connect with Hopsworks Feature Store.\n", @@ -102,7 +102,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9f8d54d4", + "id": "f99d8fa9", "metadata": {}, "outputs": [], "source": [ @@ -117,7 +117,7 @@ }, { "cell_type": "markdown", - "id": "bc7e3a00", + "id": "39963af6", "metadata": {}, "source": [ "## 🪄 Creating Feature Groups " @@ -125,7 +125,7 @@ }, { "cell_type": "markdown", - "id": "1d04f813", + "id": "37ea20cb", "metadata": {}, "source": [ "Profiles data can be directly inserted as a feature group directly since they are not update fequently.\n", @@ -138,7 +138,7 @@ { "cell_type": "code", "execution_count": null, - "id": "45d1c575", + "id": "a0c25ad8", "metadata": {}, "outputs": [], "source": [ @@ -146,14 +146,15 @@ " name=\"profile\",\n", " primary_key=[\"cc_num\"],\n", " partition_key=[\"cc_provider\"],\n", + " online_enabled=True,\n", " version=1)\n", "\n", - "profile_df = profile_fg.insert(profiles_df)" + "profile_fg.insert(profiles_df)" ] }, { "cell_type": "markdown", - "id": "c795ab24", + "id": "23ecb7e0", "metadata": {}, "source": [ "## ⚙️ Kafka Topic and Schema Creation " @@ -161,7 +162,7 @@ }, { "cell_type": "markdown", - "id": "3c7b477c", + "id": "261658ea", "metadata": {}, "source": [ "To create a Kafka stream for transactions a topic and schema must be create. The schema used must follow Apache Avro specification, more details can be found in the [documentation](https://avro.apache.org/docs/1.11.1/specification/).\n" @@ -170,7 +171,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a0599566", + "id": "995e9d7f", "metadata": {}, "outputs": [], "source": [ @@ -182,7 +183,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8faf4d44", + "id": "add5051d", "metadata": {}, "outputs": [], "source": [ @@ -261,7 +262,7 @@ " \"name\": \"fraud_label\",\n", " \"type\": [\n", " \"null\",\n", - " \"string\"\n", + " \"int\"\n", " ]\n", " },\n", " ]\n", @@ -270,7 +271,7 @@ }, { "cell_type": "markdown", - "id": "038abaad", + "id": "76977aa9", "metadata": {}, "source": [ "After the schema is created the topic and the associated schema must be registered in Hopsworks so that the topic can be used." @@ -279,7 +280,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3e00b681", + "id": "03a0085a", "metadata": {}, "outputs": [], "source": [ @@ -290,7 +291,7 @@ }, { "cell_type": "markdown", - "id": "535a692b", + "id": "abb7a3fa", "metadata": {}, "source": [ "## 📡 Sending Data using created Kafka Topic " @@ -298,7 +299,7 @@ }, { "cell_type": "markdown", - "id": "b9ba16ec", + "id": "085fc4dd", "metadata": {}, "source": [ "While sending data through Kafka we must make sure that the data types are in the same format specified in the schema. 
\n", @@ -309,25 +310,25 @@ { "cell_type": "code", "execution_count": null, - "id": "375b2760", + "id": "4ab04a1a", "metadata": {}, "outputs": [], "source": [ "trans_df[\"tid\"] = trans_df[\"tid\"].astype(\"string\")\n", - "trans_df[\"datetime\"] = pd.to_datetime(trans_df[\"datetime\"])\n", + "trans_df[\"datetime\"] = trans_df[\"datetime\"].astype(\"datetime64[s]\").astype(\"int64\")\n", "trans_df[\"cc_num\"] = trans_df[\"cc_num\"].astype(\"int64\")\n", - "trans_df[\"category\"] = trans_df[\"cc_num\"].astype(\"string\")\n", + "trans_df[\"category\"] = trans_df[\"category\"].astype(\"string\")\n", "trans_df[\"amount\"] = trans_df[\"amount\"].astype(\"double\")\n", "trans_df[\"latitude\"] = trans_df[\"latitude\"].astype(\"double\")\n", "trans_df[\"longitude\"] = trans_df[\"longitude\"].astype(\"double\")\n", "trans_df[\"city\"] = trans_df[\"city\"].astype(\"string\")\n", "trans_df[\"country\"] = trans_df[\"country\"].astype(\"string\")\n", - "trans_df[\"fraud_label\"] = trans_df[\"fraud_label\"].astype(\"string\")" + "trans_df[\"fraud_label\"] = trans_df[\"fraud_label\"].astype(\"int\")" ] }, { "cell_type": "markdown", - "id": "b2c27d9e", + "id": "1e50e1ef", "metadata": {}, "source": [ "Lets get the configuration needed for the producer to used Hopsworks internal kafka using the KafkaAPI" @@ -336,7 +337,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1d44e3f1", + "id": "f4b4a5ae", "metadata": {}, "outputs": [], "source": [ @@ -345,7 +346,7 @@ }, { "cell_type": "markdown", - "id": "58d8ef0a", + "id": "97b5bc4f", "metadata": {}, "source": [ "Finally, lets create a producer using the Kafka configuration and send data into it.\n", @@ -356,7 +357,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9b416972", + "id": "8b89e6c5", "metadata": {}, "outputs": [], "source": [ @@ -364,12 +365,14 @@ "\n", "for index, transaction in trans_df.iterrows():\n", " producer.produce(KAFKA_TOPIC_NAME, transaction.to_json())\n", - " producer.flush()" + " if index % 1000 == 0:\n", + " producer.flush()\n", + " print(f'Finished sending index {index}')" ] }, { "cell_type": "markdown", - "id": "1bbc57f9", + "id": "9c8645c3", "metadata": {}, "source": [ "---\n", @@ -381,7 +384,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -395,7 +398,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.11" + "version": "3.8.18" } }, "nbformat": 4, diff --git a/advanced_tutorials/pyspark_streaming/Training Pipeline.ipynb b/advanced_tutorials/pyspark_streaming/Training Pipeline.ipynb index cd8b7b61..875b1a6c 100644 --- a/advanced_tutorials/pyspark_streaming/Training Pipeline.ipynb +++ b/advanced_tutorials/pyspark_streaming/Training Pipeline.ipynb @@ -38,7 +38,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -68,20 +68,9 @@ }, { "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Connected. Call `.close()` to terminate connection gracefully.\n", - "\n", - "Logged in to project, explore it here https://aff99120-da3e-11ee-8cd7-4f3734b3ce24.cloud.hopsworks.ai/p/3192\n", - "Connected. 
Call `.close()` to terminate connection gracefully.\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "import hopsworks\n", "\n", @@ -101,7 +90,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -118,38 +107,23 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Select features for training data.\n", "selected_features = trans_fg.select([\"fraud_label\", \"category\", \"amount\", \"datetime\", \"age_at_transaction\", \"days_until_card_expires\"])\\\n", - " .join(window_aggs_fg.select_except([\"cc_num\", \"event_time\"]))" + " .join(window_aggs_fg.select_except([\"cc_num\", \"datetime\"]))" ] }, { "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "DeprecationWarning: ssl.PROTOCOL_TLS is deprecated\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Finished: Reading data from Hopsworks, using Hive (21.07s) \n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# Uncomment this if you would like to view your selected features\n", - "df = selected_features.read(read_options={\"use_hive\":True})" + "selected_features.read()" ] }, { @@ -178,7 +152,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -203,18 +177,9 @@ }, { "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Feature view created successfully, explore it at \n", - "https://aff99120-da3e-11ee-8cd7-4f3734b3ce24.cloud.hopsworks.ai/p/3192/fs/3140/fv/transactions_view_fraud_batch_fv/version/1\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# Get or create the 'transactions_view_fraud_batch_fv' feature view\n", "feature_view = fs.get_or_create_feature_view(\n", @@ -232,7 +197,7 @@ "source": [ "The feature view is now visible in the UI.\n", "\n", - "![fg-overview](../images/fv_overview.gif)" + "![fg-overview](../../images/fv_overview.gif)" ] }, { @@ -253,62 +218,17 @@ }, { "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "DeprecationWarning: ssl.PROTOCOL_TLS is deprecated\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Finished: Reading data from Hopsworks, using Hive (25.04s) \n" - ] - }, - { - "ename": "KeyError", - "evalue": "DataType(null)", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mTEST_SIZE\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0.2\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m X_train, X_test, y_train, y_test = feature_view.train_test_split(\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0mtest_size\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mTEST_SIZE\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mread_options\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m\"use_hive\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m )\n", - "\u001b[0;32m/srv/hops/anaconda/envs/theenv/lib/python3.10/site-packages/hsfs/usage.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 196\u001b[0m \u001b[0;31m# Disable usage AFTER import hsfs, return function itself\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 197\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0m_is_enabled\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 198\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 199\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 200\u001b[0m \u001b[0mstart_time\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mperf_counter\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/srv/hops/anaconda/envs/theenv/lib/python3.10/site-packages/hsfs/feature_view.py\u001b[0m in \u001b[0;36mtrain_test_split\u001b[0;34m(self, test_size, train_start, train_end, test_start, test_end, description, extra_filter, statistics_config, read_options, spine, primary_keys, event_time, training_helper_columns)\u001b[0m\n\u001b[1;32m 2203\u001b[0m \u001b[0mextra_filter\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mextra_filter\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2204\u001b[0m )\n\u001b[0;32m-> 2205\u001b[0;31m td, df = self._feature_view_engine.get_training_data(\n\u001b[0m\u001b[1;32m 2206\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2207\u001b[0m \u001b[0mread_options\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/srv/hops/anaconda/envs/theenv/lib/python3.10/site-packages/hsfs/core/feature_view_engine.py\u001b[0m in \u001b[0;36mget_training_data\u001b[0;34m(self, feature_view_obj, read_options, splits, training_dataset_obj, training_dataset_version, spine, primary_keys, event_time, training_helper_columns)\u001b[0m\n\u001b[1;32m 352\u001b[0m \u001b[0mspine\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mspine\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 353\u001b[0m )\n\u001b[0;32m--> 354\u001b[0;31m split_df = engine.get_instance().get_training_data(\n\u001b[0m\u001b[1;32m 355\u001b[0m \u001b[0mtd_updated\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfeature_view_obj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mquery\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mread_options\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 356\u001b[0m )\n", - "\u001b[0;32m/srv/hops/anaconda/envs/theenv/lib/python3.10/site-packages/hsfs/engine/python.py\u001b[0m in \u001b[0;36mget_training_data\u001b[0;34m(self, training_dataset_obj, feature_view_obj, query_obj, read_options)\u001b[0m\n\u001b[1;32m 628\u001b[0m ):\n\u001b[1;32m 629\u001b[0m \u001b[0;32mif\u001b[0m 
\u001b[0mtraining_dataset_obj\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplits\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 630\u001b[0;31m return self._prepare_transform_split_df(\n\u001b[0m\u001b[1;32m 631\u001b[0m \u001b[0mquery_obj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtraining_dataset_obj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfeature_view_obj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mread_options\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 632\u001b[0m )\n", - "\u001b[0;32m/srv/hops/anaconda/envs/theenv/lib/python3.10/site-packages/hsfs/engine/python.py\u001b[0m in \u001b[0;36m_prepare_transform_split_df\u001b[0;34m(self, query_obj, training_dataset_obj, feature_view_obj, read_option)\u001b[0m\n\u001b[1;32m 686\u001b[0m \u001b[0;31m# apply transformations\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 687\u001b[0m \u001b[0;31m# 1st parametrise transformation functions with dt split stats\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 688\u001b[0;31m transformation_function_engine.TransformationFunctionEngine.populate_builtin_transformation_functions(\n\u001b[0m\u001b[1;32m 689\u001b[0m \u001b[0mtraining_dataset_obj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfeature_view_obj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mresult_dfs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 690\u001b[0m )\n", - "\u001b[0;32m/srv/hops/anaconda/envs/theenv/lib/python3.10/site-packages/hsfs/core/transformation_function_engine.py\u001b[0m in \u001b[0;36mpopulate_builtin_transformation_functions\u001b[0;34m(training_dataset, feature_view_obj, dataset)\u001b[0m\n\u001b[1;32m 274\u001b[0m \u001b[0;31m# compute statistics before transformations are applied\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 275\u001b[0m stats = (\n\u001b[0;32m--> 276\u001b[0;31m TransformationFunctionEngine.compute_transformation_fn_statistics(\n\u001b[0m\u001b[1;32m 277\u001b[0m \u001b[0mtraining_dataset\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 278\u001b[0m \u001b[0mbuiltin_tffn_features\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/srv/hops/anaconda/envs/theenv/lib/python3.10/site-packages/hsfs/core/transformation_function_engine.py\u001b[0m in \u001b[0;36mcompute_transformation_fn_statistics\u001b[0;34m(training_dataset_obj, builtin_tffn_features, label_encoder_features, feature_dataframe, feature_view_obj)\u001b[0m\n\u001b[1;32m 238\u001b[0m \u001b[0mfeature_view_obj\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 239\u001b[0m ) -> statistics.Statistics:\n\u001b[0;32m--> 240\u001b[0;31m return training_dataset_obj._statistics_engine.compute_transformation_fn_statistics(\n\u001b[0m\u001b[1;32m 241\u001b[0m \u001b[0mtd_metadata_instance\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtraining_dataset_obj\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 242\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbuiltin_tffn_features\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m# excluding label encoded features\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/srv/hops/anaconda/envs/theenv/lib/python3.10/site-packages/hsfs/core/statistics_engine.py\u001b[0m in 
\u001b[0;36mcompute_transformation_fn_statistics\u001b[0;34m(self, td_metadata_instance, columns, label_encoder_features, feature_dataframe, feature_view_obj)\u001b[0m\n\u001b[1;32m 237\u001b[0m \"\"\"\n\u001b[1;32m 238\u001b[0m \u001b[0mcomputation_time\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfloat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdatetime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnow\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtimestamp\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m*\u001b[0m \u001b[0;36m1000\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 239\u001b[0;31m stats_str = self._profile_transformation_fn_statistics(\n\u001b[0m\u001b[1;32m 240\u001b[0m \u001b[0mfeature_dataframe\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlabel_encoder_features\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 241\u001b[0m )\n", - "\u001b[0;32m/srv/hops/anaconda/envs/theenv/lib/python3.10/site-packages/hsfs/core/statistics_engine.py\u001b[0m in \u001b[0;36m_profile_transformation_fn_statistics\u001b[0;34m(self, feature_dataframe, columns, label_encoder_features)\u001b[0m\n\u001b[1;32m 383\u001b[0m \u001b[0;31m# compute statistics for all features with transformation fn\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 384\u001b[0m \u001b[0mall_columns\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mcolumns\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mlabel_encoder_features\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 385\u001b[0;31m stats_str = engine.get_instance().profile(\n\u001b[0m\u001b[1;32m 386\u001b[0m \u001b[0mfeature_dataframe\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mall_columns\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 387\u001b[0m )\n", - "\u001b[0;32m/srv/hops/anaconda/envs/theenv/lib/python3.10/site-packages/hsfs/engine/python.py\u001b[0m in \u001b[0;36mprofile\u001b[0;34m(self, df, relevant_columns, correlations, histograms, exact_uniqueness)\u001b[0m\n\u001b[1;32m 407\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mpa\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtypes\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_large_list\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfield\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 408\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mpa\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtypes\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_struct\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfield\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 409\u001b[0;31m ) and PYARROW_HOPSWORKS_DTYPE_MAPPING[field.type] in [\"timestamp\", \"date\"]:\n\u001b[0m\u001b[1;32m 410\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mfield\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mfield\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 411\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mKeyError\u001b[0m: DataType(null)" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "TEST_SIZE = 0.2\n", "\n", "X_train, X_test, y_train, y_test = feature_view.train_test_split(\n", - " test_size = TEST_SIZE, read_options={\"use_hive\":True}\n", + " test_size = TEST_SIZE\n", ")" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "X_train.head()" - ] - }, { "cell_type": "code", "execution_count": null, @@ -570,7 +490,7 @@ "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" }, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -584,7 +504,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.11" + "version": "3.8.18" } }, "nbformat": 4, From 470313a27a3dff00cf2173b35a16228ccee4a240 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Wed, 13 Mar 2024 09:25:24 +0100 Subject: [PATCH 11/16] clearning all outputs from notebooks --- .../Batch Inference Pipeline.ipynb | 36 +-- .../pyspark_streaming/Feature Pipeline.ipynb | 285 +++++------------- .../Simulated Stream Creation.ipynb | 62 ++-- .../pyspark_streaming/Training Pipeline.ipynb | 13 +- 4 files changed, 135 insertions(+), 261 deletions(-) diff --git a/advanced_tutorials/pyspark_streaming/Batch Inference Pipeline.ipynb b/advanced_tutorials/pyspark_streaming/Batch Inference Pipeline.ipynb index ddffe655..d9938f28 100644 --- a/advanced_tutorials/pyspark_streaming/Batch Inference Pipeline.ipynb +++ b/advanced_tutorials/pyspark_streaming/Batch Inference Pipeline.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "c9fec917", + "id": "e90e00b0", "metadata": {}, "source": [ "# **Hopsworks Feature Store** - Part 03: Batch Inference\n", @@ -16,7 +16,7 @@ }, { "cell_type": "markdown", - "id": "731cffd4", + "id": "cc0cd899", "metadata": {}, "source": [ "## 📝 Imports" @@ -25,7 +25,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b96c942c", + "id": "c2518efd", "metadata": {}, "outputs": [], "source": [ @@ -34,7 +34,7 @@ }, { "cell_type": "markdown", - "id": "dfb09860", + "id": "9a2baade", "metadata": {}, "source": [ "## 📡 Connecting to Hopsworks Feature Store " @@ -43,7 +43,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6c71483d", + "id": "76f137ab", "metadata": {}, "outputs": [], "source": [ @@ -56,7 +56,7 @@ }, { "cell_type": "markdown", - "id": "c637bf4c", + "id": "dad24a4a", "metadata": {}, "source": [ "## ⚙️ Feature View Retrieval\n" @@ -65,7 +65,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9b8bb207", + "id": "5fdeaa7c", "metadata": {}, "outputs": [], "source": [ @@ -78,7 +78,7 @@ }, { "cell_type": "markdown", - "id": "8ec3cafe", + "id": "30536208", "metadata": {}, "source": [ "## 🗄 Model Registry\n" @@ -87,7 +87,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1b412d1e", + "id": "5852273c", "metadata": {}, "outputs": [], "source": [ @@ -97,7 +97,7 @@ }, { "cell_type": "markdown", - "id": "d0587a64", + "id": "57e7e430", "metadata": {}, "source": [ "## 🚀 Fetch and test the 
model\n", @@ -108,7 +108,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f0be44ff", + "id": "6572e398", "metadata": {}, "outputs": [], "source": [ @@ -125,7 +125,7 @@ { "cell_type": "code", "execution_count": null, - "id": "88b09ee2", + "id": "808f87fb", "metadata": {}, "outputs": [], "source": [ @@ -138,7 +138,7 @@ }, { "cell_type": "markdown", - "id": "aba7238a", + "id": "3cb590c3", "metadata": {}, "source": [ "---\n", @@ -148,7 +148,7 @@ { "cell_type": "code", "execution_count": null, - "id": "34ac6398", + "id": "ac5a8e74", "metadata": {}, "outputs": [], "source": [ @@ -168,7 +168,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0695d933", + "id": "cff98e14", "metadata": {}, "outputs": [], "source": [ @@ -181,7 +181,7 @@ }, { "cell_type": "markdown", - "id": "5ccd12a2", + "id": "15f38b1a", "metadata": {}, "source": [ "---\n", @@ -197,7 +197,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -211,7 +211,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.18" + "version": "3.10.11" } }, "nbformat": 4, diff --git a/advanced_tutorials/pyspark_streaming/Feature Pipeline.ipynb b/advanced_tutorials/pyspark_streaming/Feature Pipeline.ipynb index ff906266..d19618cf 100644 --- a/advanced_tutorials/pyspark_streaming/Feature Pipeline.ipynb +++ b/advanced_tutorials/pyspark_streaming/Feature Pipeline.ipynb @@ -2,10 +2,10 @@ "cells": [ { "cell_type": "markdown", - "id": "f557a124", + "id": "796a3208", "metadata": {}, "source": [ - "# **Hopsworks Feature Store** - Part 01: Feature Pipeline\n", + "# **Hopsworks Feature Store** - Part 01: Feature Pipeline\n", "\n", "**Note**: This tutorial does not support Google Colab.\n", "\n", @@ -25,7 +25,7 @@ }, { "cell_type": "markdown", - "id": "3d3533e4", + "id": "e52dcf15", "metadata": {}, "source": [ "## 📝 Imports" @@ -33,38 +33,10 @@ }, { "cell_type": "code", - "execution_count": 1, - "id": "abcf4927", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Starting Spark application\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
ID | Application ID | Kind | State | Spark UI | Driver log
4 | application_1710240545690_0034 | pyspark | idle | Link | Link
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "SparkSession available as 'spark'.\n" - ] - } - ], + "execution_count": null, + "id": "277a2c21", + "metadata": {}, + "outputs": [], "source": [ "import datetime\n", "from pyspark.sql import SparkSession\n", @@ -77,7 +49,7 @@ }, { "cell_type": "markdown", - "id": "0d91d9b3", + "id": "b4954000", "metadata": {}, "source": [ "## 📡 Connecting to Hopsworks Feature Store " @@ -85,7 +57,7 @@ }, { "cell_type": "markdown", - "id": "08de4fa1", + "id": "a26a358a", "metadata": {}, "source": [ "In this tutorial a simulated data stream was created using Hopsworks internal Kafka.\n", @@ -95,22 +67,10 @@ }, { "cell_type": "code", - "execution_count": 28, - "id": "887e0fe2", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Connection closed.\n", - "Connected. Call `.close()` to terminate connection gracefully.\n", - "\n", - "Logged in to project, explore it here https://aeb68620-e05a-11ee-8a54-7b257af950ba.cloud.hopsworks.ai/p/120\n", - "Connected. Call `.close()` to terminate connection gracefully." - ] - } - ], + "execution_count": null, + "id": "fd531aa7", + "metadata": {}, + "outputs": [], "source": [ "import hopsworks\n", "from hsfs.core.storage_connector_api import StorageConnectorApi\n", @@ -124,7 +84,7 @@ }, { "cell_type": "markdown", - "id": "7d385ec3", + "id": "ca1a2afc", "metadata": {}, "source": [ "Let get the kafka configurations needed for read from hopsworks internal Kafka using the storage connector api." @@ -132,8 +92,8 @@ }, { "cell_type": "code", - "execution_count": 29, - "id": "45559bf5", + "execution_count": null, + "id": "157f95d3", "metadata": {}, "outputs": [], "source": [ @@ -144,7 +104,7 @@ }, { "cell_type": "markdown", - "id": "25e08835", + "id": "eadbccb2", "metadata": {}, "source": [ "## 🗂 Reading from Kakfa Stream " @@ -152,7 +112,7 @@ }, { "cell_type": "markdown", - "id": "1c6e4941", + "id": "9a012fe5", "metadata": {}, "source": [ "After obatining the Kafka configurations we can use it along with the topic name to create a streaming dataframe." @@ -160,8 +120,8 @@ }, { "cell_type": "code", - "execution_count": 30, - "id": "ceb8f4e3", + "execution_count": null, + "id": "46bc5d77", "metadata": {}, "outputs": [], "source": [ @@ -179,7 +139,7 @@ }, { "cell_type": "markdown", - "id": "3e262255", + "id": "6d8ffdb4", "metadata": {}, "source": [ "To extract the requierd data from streaming dataframe the correct schema has be defined and used" @@ -187,8 +147,8 @@ }, { "cell_type": "code", - "execution_count": 31, - "id": "34f7b4d8", + "execution_count": null, + "id": "d30ac91e", "metadata": {}, "outputs": [], "source": [ @@ -207,7 +167,7 @@ }, { "cell_type": "markdown", - "id": "697fc7c3", + "id": "00b5d19b", "metadata": {}, "source": [ "Extracting data from the streaming dataframe and casting it to get the required schema" @@ -215,8 +175,8 @@ }, { "cell_type": "code", - "execution_count": 32, - "id": "98da586d", + "execution_count": null, + "id": "7b722ce1", "metadata": {}, "outputs": [], "source": [ @@ -248,7 +208,7 @@ }, { "cell_type": "markdown", - "id": "a76f2831", + "id": "445e1ef3", "metadata": {}, "source": [ "## 🛠️ Feature Engineering " @@ -256,7 +216,7 @@ }, { "cell_type": "markdown", - "id": "9a807fe3", + "id": "91ef3185", "metadata": {}, "source": [ "Now that we have a streaming dataframe that contains the data we can use it to engineer features. 
We would need the also need profiles to effectively engineer features.\n", @@ -266,8 +226,8 @@ }, { "cell_type": "code", - "execution_count": 33, - "id": "e316e63e", + "execution_count": null, + "id": "aecfb3a6", "metadata": {}, "outputs": [], "source": [ @@ -280,7 +240,7 @@ }, { "cell_type": "markdown", - "id": "ccf9356d", + "id": "409a7fe6", "metadata": {}, "source": [ "Fraudulent transactions can differ from regular ones in many different ways. Typical red flags would for instance be a large transaction volume/frequency in the span of a few hours. It could also be the case that elderly people in particular are targeted by fraudsters. To facilitate model learning you will create additional features based on these patterns. In particular, you will create two types of features:\n", @@ -290,8 +250,8 @@ }, { "cell_type": "code", - "execution_count": 34, - "id": "06625b5b", + "execution_count": null, + "id": "ed4d88cf", "metadata": {}, "outputs": [], "source": [ @@ -306,7 +266,7 @@ }, { "cell_type": "markdown", - "id": "033491cc", + "id": "3203c52e", "metadata": {}, "source": [ "Next, you will create features that aggregate credit card data over a period of time.\n", @@ -316,8 +276,8 @@ }, { "cell_type": "code", - "execution_count": 35, - "id": "6fc06022", + "execution_count": null, + "id": "5336fb47", "metadata": {}, "outputs": [], "source": [ @@ -343,7 +303,7 @@ }, { "cell_type": "markdown", - "id": "4e7956d9", + "id": "689e6caf", "metadata": {}, "source": [ "## 👮🏻‍♂️ Great Expectations " @@ -351,8 +311,8 @@ }, { "cell_type": "code", - "execution_count": 36, - "id": "dbb409eb", + "execution_count": null, + "id": "9e6e55d1", "metadata": {}, "outputs": [], "source": [ @@ -367,20 +327,10 @@ }, { "cell_type": "code", - "execution_count": 37, - "id": "b55183a1", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\"kwargs\": {\"column\": \"tid\", \"mostly\": 0.0}, \"expectation_type\": \"expect_column_values_to_be_null\", \"meta\": {}}\n", - "{\"kwargs\": {\"column\": \"datetime\", \"mostly\": 0.0}, \"expectation_type\": \"expect_column_values_to_be_null\", \"meta\": {}}\n", - "{\"kwargs\": {\"column\": \"cc_num\", \"mostly\": 0.0}, \"expectation_type\": \"expect_column_values_to_be_null\", \"meta\": {}}" - ] - } - ], + "execution_count": null, + "id": "642bed6f", + "metadata": {}, + "outputs": [], "source": [ "# Check binary fraud_label column to be in set [0,1]\n", "expectation_suite_transactions.add_expectation(\n", @@ -419,7 +369,7 @@ }, { "cell_type": "markdown", - "id": "481467d9", + "id": "e0a49f96", "metadata": {}, "source": [ "## 💾 Storing streaming dataframes in Hopsworks Feature Store " @@ -427,7 +377,7 @@ }, { "cell_type": "markdown", - "id": "e7d78c59", + "id": "19a3a7df", "metadata": {}, "source": [ "### 🪄 Creating Feature Groups \n", @@ -441,8 +391,8 @@ }, { "cell_type": "code", - "execution_count": 38, - "id": "5df5ec28", + "execution_count": null, + "id": "41e8f6f4", "metadata": {}, "outputs": [], "source": [ @@ -460,7 +410,7 @@ }, { "cell_type": "markdown", - "id": "56069df2", + "id": "cbd55773", "metadata": {}, "source": [ "A full list of arguments can be found in the [documentation](https://docs.hopsworks.ai/feature-store-api/latest/generated/api/feature_store_api/#create_feature_group).\n", @@ -470,19 +420,10 @@ }, { "cell_type": "code", - "execution_count": 39, - "id": "72f5141a", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Feature Group created successfully, explore it at 
\n", - "https://aeb68620-e05a-11ee-8a54-7b257af950ba.cloud.hopsworks.ai/p/120/fs/68/fg/40" - ] - } - ], + "execution_count": null, + "id": "181a44f5", + "metadata": {}, + "outputs": [], "source": [ "# Insert data into feature group\n", "trans_fg_query = trans_fg.insert_stream(transaction_streaming_df)" @@ -490,7 +431,7 @@ }, { "cell_type": "markdown", - "id": "cceb282f", + "id": "9cf7e915", "metadata": {}, "source": [ "\n", @@ -499,25 +440,17 @@ }, { "cell_type": "code", - "execution_count": 41, - "id": "8b8a7765", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'message': 'Waiting for data to arrive', 'isDataAvailable': False, 'isTriggerActive': False}" - ] - } - ], + "execution_count": null, + "id": "34620d72", + "metadata": {}, + "outputs": [], "source": [ "trans_fg_query.status" ] }, { "cell_type": "markdown", - "id": "e65f9f97", + "id": "f12cd70e", "metadata": {}, "source": [ "The `insert_stream` function inserts the data into the online feature store so to materialize the data in the offline store you need to manually run the materialization job. The materialization job can also be run on schedule using the `schedule` function. You can find more details in the [documentation](https://docs.hopsworks.ai/hopsworks-api/3.7/generated/api/jobs/#schedule)." @@ -525,47 +458,20 @@ }, { "cell_type": "code", - "execution_count": 47, - "id": "8d56c6c8", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "" - ] - } - ], + "execution_count": null, + "id": "412e3bf8", + "metadata": {}, + "outputs": [], "source": [ "trans_fg.materialization_job.schedule(cron_expression = \"0 /10 * ? * * *\", start_time=datetime.datetime.now(tz=datetime.timezone.utc))" ] }, { "cell_type": "code", - "execution_count": 42, - "id": "5a48ff86", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "" - ] - } - ], + "execution_count": null, + "id": "2686e5fe", + "metadata": {}, + "outputs": [], "source": [ "# Update feature descriptions\n", "feature_descriptions = [\n", @@ -589,7 +495,7 @@ }, { "cell_type": "markdown", - "id": "89aa87ab", + "id": "b98ea202", "metadata": {}, "source": [ "At the creation of the feature group, you will be prompted with an URL that will directly link to it; there you will be able to explore some of the aspects of your newly created feature group." @@ -597,7 +503,7 @@ }, { "cell_type": "markdown", - "id": "11b45eff", + "id": "d0b7742c", "metadata": {}, "source": [ "You can move on and do the same thing for the feature group with our windows aggregation." 
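For context on the cells touched in the hunk above: the handle returned by insert_stream is an ordinary Spark Structured Streaming query, so the usual monitoring calls apply while records flow into the online store. A minimal sketch, assuming the notebook's own variable name trans_fg_query; the polling loop itself is only illustrative and not part of the notebook:

    import time

    # trans_fg_query is the StreamingQuery returned by trans_fg.insert_stream(transaction_streaming_df)
    for _ in range(3):
        print(trans_fg_query.status)        # e.g. {'message': 'Waiting for data to arrive', ...}
        print(trans_fg_query.lastProgress)  # per micro-batch metrics; None until the first batch completes
        time.sleep(10)

    # Stop the query gracefully when the feature pipeline should shut down.
    # trans_fg_query.stop()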
@@ -605,8 +511,8 @@ }, { "cell_type": "code", - "execution_count": 43, - "id": "37c05dfe", + "execution_count": null, + "id": "2bb92c90", "metadata": {}, "outputs": [], "source": [ @@ -624,62 +530,37 @@ }, { "cell_type": "code", - "execution_count": 44, - "id": "88ae7e96", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Feature Group created successfully, explore it at \n", - "https://aeb68620-e05a-11ee-8a54-7b257af950ba.cloud.hopsworks.ai/p/120/fs/68/fg/41" - ] - } - ], + "execution_count": null, + "id": "9245f285", + "metadata": {}, + "outputs": [], "source": [ "window_aggs_streaming_fg_query = window_aggs_streaming_fg.insert_stream(windowed_4h_aggregation_df)" ] }, { "cell_type": "code", - "execution_count": 45, - "id": "f3d24fc8", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'message': 'Getting offsets from KafkaV2[Subscribe[transactions_topic2]]', 'isDataAvailable': False, 'isTriggerActive': True}" - ] - } - ], + "execution_count": null, + "id": "eff44a9f", + "metadata": {}, + "outputs": [], "source": [ "window_aggs_streaming_fg_query.status" ] }, { "cell_type": "code", - "execution_count": 46, - "id": "e345d63f", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "" - ] - } - ], + "execution_count": null, + "id": "1a184e6b", + "metadata": {}, + "outputs": [], "source": [ "window_aggs_streaming_fg.materialization_job.schedule(cron_expression = \"0 /10 * ? * * *\", start_time=datetime.datetime.now(tz=datetime.timezone.utc))" ] }, { "cell_type": "markdown", - "id": "b58a0d0f", + "id": "db9cd9f2", "metadata": {}, "source": [ "## ⏭️ **Next:** Part 02: Training Pipeline\n", diff --git a/advanced_tutorials/pyspark_streaming/Simulated Stream Creation.ipynb b/advanced_tutorials/pyspark_streaming/Simulated Stream Creation.ipynb index e33d7fd0..eb4a6177 100644 --- a/advanced_tutorials/pyspark_streaming/Simulated Stream Creation.ipynb +++ b/advanced_tutorials/pyspark_streaming/Simulated Stream Creation.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "ec8c8032", + "id": "7ea95c96", "metadata": {}, "source": [ "# **Hopsworks Feature Store** \n", @@ -18,7 +18,7 @@ }, { "cell_type": "markdown", - "id": "aded425e", + "id": "892ac2b2", "metadata": {}, "source": [ "## 📝 Imports" @@ -27,7 +27,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2f19234f", + "id": "dab3506a", "metadata": {}, "outputs": [], "source": [ @@ -37,7 +37,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b3af7bd7", + "id": "af0f0380", "metadata": {}, "outputs": [], "source": [ @@ -48,7 +48,7 @@ }, { "cell_type": "markdown", - "id": "3f518fdc", + "id": "a4241a88", "metadata": {}, "source": [ "## ✏️ Creating Simulated Data " @@ -56,7 +56,7 @@ }, { "cell_type": "markdown", - "id": "c1d00b58", + "id": "b710c76f", "metadata": {}, "source": [ "A simulated dataset for credit card Transactions is created so that the data can be send using a Kafka stream. 
The data created is split into two different dataframes:\n", @@ -72,7 +72,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ac0ba76a", + "id": "b4aa68ce", "metadata": {}, "outputs": [], "source": [ @@ -83,7 +83,7 @@ }, { "cell_type": "markdown", - "id": "7f6e693a", + "id": "4f29f0a3", "metadata": {}, "source": [ "## 📡 Connecting to Hopsworks Feature Store " @@ -91,7 +91,7 @@ }, { "cell_type": "markdown", - "id": "875fd4cf", + "id": "c049f064", "metadata": {}, "source": [ "After creating the simulated data let us connect with Hopsworks Feature Store.\n", @@ -102,7 +102,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f99d8fa9", + "id": "7d55f2b0", "metadata": {}, "outputs": [], "source": [ @@ -117,7 +117,7 @@ }, { "cell_type": "markdown", - "id": "39963af6", + "id": "f95f31d7", "metadata": {}, "source": [ "## 🪄 Creating Feature Groups " @@ -125,7 +125,7 @@ }, { "cell_type": "markdown", - "id": "37ea20cb", + "id": "fc807022", "metadata": {}, "source": [ "Profiles data can be directly inserted as a feature group directly since they are not update fequently.\n", @@ -138,7 +138,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a0c25ad8", + "id": "8f94c972", "metadata": {}, "outputs": [], "source": [ @@ -154,7 +154,7 @@ }, { "cell_type": "markdown", - "id": "23ecb7e0", + "id": "72505a6f", "metadata": {}, "source": [ "## ⚙️ Kafka Topic and Schema Creation " @@ -162,7 +162,7 @@ }, { "cell_type": "markdown", - "id": "261658ea", + "id": "63f7878f", "metadata": {}, "source": [ "To create a Kafka stream for transactions a topic and schema must be create. The schema used must follow Apache Avro specification, more details can be found in the [documentation](https://avro.apache.org/docs/1.11.1/specification/).\n" @@ -171,7 +171,7 @@ { "cell_type": "code", "execution_count": null, - "id": "995e9d7f", + "id": "e35f7c59", "metadata": {}, "outputs": [], "source": [ @@ -183,7 +183,7 @@ { "cell_type": "code", "execution_count": null, - "id": "add5051d", + "id": "3d811fb7", "metadata": {}, "outputs": [], "source": [ @@ -271,7 +271,7 @@ }, { "cell_type": "markdown", - "id": "76977aa9", + "id": "845b1ed6", "metadata": {}, "source": [ "After the schema is created the topic and the associated schema must be registered in Hopsworks so that the topic can be used." @@ -280,7 +280,7 @@ { "cell_type": "code", "execution_count": null, - "id": "03a0085a", + "id": "6db8a245", "metadata": {}, "outputs": [], "source": [ @@ -291,7 +291,7 @@ }, { "cell_type": "markdown", - "id": "abb7a3fa", + "id": "db6ad79d", "metadata": {}, "source": [ "## 📡 Sending Data using created Kafka Topic " @@ -299,7 +299,7 @@ }, { "cell_type": "markdown", - "id": "085fc4dd", + "id": "ed183fe2", "metadata": {}, "source": [ "While sending data through Kafka we must make sure that the data types are in the same format specified in the schema. 
\n", @@ -310,7 +310,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4ab04a1a", + "id": "4f487adb", "metadata": {}, "outputs": [], "source": [ @@ -328,7 +328,7 @@ }, { "cell_type": "markdown", - "id": "1e50e1ef", + "id": "6562d393", "metadata": {}, "source": [ "Lets get the configuration needed for the producer to used Hopsworks internal kafka using the KafkaAPI" @@ -337,7 +337,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f4b4a5ae", + "id": "75a2a99d", "metadata": {}, "outputs": [], "source": [ @@ -346,7 +346,7 @@ }, { "cell_type": "markdown", - "id": "97b5bc4f", + "id": "69ac8aeb", "metadata": {}, "source": [ "Finally, lets create a producer using the Kafka configuration and send data into it.\n", @@ -357,22 +357,22 @@ { "cell_type": "code", "execution_count": null, - "id": "8b89e6c5", + "id": "e4bcf8e9", "metadata": {}, "outputs": [], "source": [ "producer = Producer(kafka_config)\n", "\n", "for index, transaction in trans_df.iterrows():\n", - " producer.produce(KAFKA_TOPIC_NAME, transaction.to_json())\n", + " producer.produce(KAFKA_TOPIC_NAME, transaction.to_json()\n", + " producer.flush()\n", " if index % 1000 == 0:\n", - " producer.flush()\n", " print(f'Finished sending index {index}')" ] }, { "cell_type": "markdown", - "id": "9c8645c3", + "id": "a637d022", "metadata": {}, "source": [ "---\n", @@ -384,7 +384,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -398,7 +398,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.18" + "version": "3.10.11" } }, "nbformat": 4, diff --git a/advanced_tutorials/pyspark_streaming/Training Pipeline.ipynb b/advanced_tutorials/pyspark_streaming/Training Pipeline.ipynb index 875b1a6c..e3536d05 100644 --- a/advanced_tutorials/pyspark_streaming/Training Pipeline.ipynb +++ b/advanced_tutorials/pyspark_streaming/Training Pipeline.ipynb @@ -123,7 +123,7 @@ "outputs": [], "source": [ "# Uncomment this if you would like to view your selected features\n", - "selected_features.read()" + "#selected_features.read()" ] }, { @@ -476,13 +476,6 @@ "\n", "In the following notebook you will use your model for batch inference.\n" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -490,7 +483,7 @@ "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -504,7 +497,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.18" + "version": "3.10.11" } }, "nbformat": 4, From 8bed028da66ba9952a96809da35f6a0cead99269 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Thu, 4 Apr 2024 09:35:07 +0200 Subject: [PATCH 12/16] renaming filenames in tutorial --- ...{Simulated Stream Creation.ipynb => 0_stream_simulation.ipynb} | 0 .../{Feature Pipeline.ipynb => 1_feature_pipeline.ipynb} | 0 .../{Training Pipeline.ipynb => 2_training_pipeline.ipynb} | 0 ...{Batch Inference Pipeline.ipynb => 3_inference_pipeline.ipynb} | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename advanced_tutorials/pyspark_streaming/{Simulated Stream Creation.ipynb => 0_stream_simulation.ipynb} (100%) rename advanced_tutorials/pyspark_streaming/{Feature Pipeline.ipynb => 1_feature_pipeline.ipynb} (100%) rename 
advanced_tutorials/pyspark_streaming/{Training Pipeline.ipynb => 2_training_pipeline.ipynb} (100%) rename advanced_tutorials/pyspark_streaming/{Batch Inference Pipeline.ipynb => 3_inference_pipeline.ipynb} (100%) diff --git a/advanced_tutorials/pyspark_streaming/Simulated Stream Creation.ipynb b/advanced_tutorials/pyspark_streaming/0_stream_simulation.ipynb similarity index 100% rename from advanced_tutorials/pyspark_streaming/Simulated Stream Creation.ipynb rename to advanced_tutorials/pyspark_streaming/0_stream_simulation.ipynb diff --git a/advanced_tutorials/pyspark_streaming/Feature Pipeline.ipynb b/advanced_tutorials/pyspark_streaming/1_feature_pipeline.ipynb similarity index 100% rename from advanced_tutorials/pyspark_streaming/Feature Pipeline.ipynb rename to advanced_tutorials/pyspark_streaming/1_feature_pipeline.ipynb diff --git a/advanced_tutorials/pyspark_streaming/Training Pipeline.ipynb b/advanced_tutorials/pyspark_streaming/2_training_pipeline.ipynb similarity index 100% rename from advanced_tutorials/pyspark_streaming/Training Pipeline.ipynb rename to advanced_tutorials/pyspark_streaming/2_training_pipeline.ipynb diff --git a/advanced_tutorials/pyspark_streaming/Batch Inference Pipeline.ipynb b/advanced_tutorials/pyspark_streaming/3_inference_pipeline.ipynb similarity index 100% rename from advanced_tutorials/pyspark_streaming/Batch Inference Pipeline.ipynb rename to advanced_tutorials/pyspark_streaming/3_inference_pipeline.ipynb From 93cae220d266914ddb827618dfb064c59458a14d Mon Sep 17 00:00:00 2001 From: manu-sj Date: Thu, 4 Apr 2024 13:24:06 +0200 Subject: [PATCH 13/16] adding online inference pipeline instead of batch inference --- .../1_feature_pipeline.ipynb | 84 ++++---- .../2_training_pipeline.ipynb | 200 +++++++++++++++++- .../3_inference_pipeline.ipynb | 185 +++++++++------- 3 files changed, 343 insertions(+), 126 deletions(-) diff --git a/advanced_tutorials/pyspark_streaming/1_feature_pipeline.ipynb b/advanced_tutorials/pyspark_streaming/1_feature_pipeline.ipynb index d19618cf..6756154e 100644 --- a/advanced_tutorials/pyspark_streaming/1_feature_pipeline.ipynb +++ b/advanced_tutorials/pyspark_streaming/1_feature_pipeline.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "796a3208", + "id": "0a074034", "metadata": {}, "source": [ "# **Hopsworks Feature Store** - Part 01: Feature Pipeline\n", @@ -25,7 +25,7 @@ }, { "cell_type": "markdown", - "id": "e52dcf15", + "id": "5d62a292", "metadata": {}, "source": [ "## 📝 Imports" @@ -34,7 +34,7 @@ { "cell_type": "code", "execution_count": null, - "id": "277a2c21", + "id": "e450d4bf", "metadata": {}, "outputs": [], "source": [ @@ -49,7 +49,7 @@ }, { "cell_type": "markdown", - "id": "b4954000", + "id": "e002f27d", "metadata": {}, "source": [ "## 📡 Connecting to Hopsworks Feature Store " @@ -57,7 +57,7 @@ }, { "cell_type": "markdown", - "id": "a26a358a", + "id": "dc98e7e0", "metadata": {}, "source": [ "In this tutorial a simulated data stream was created using Hopsworks internal Kafka.\n", @@ -68,7 +68,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fd531aa7", + "id": "74e6ce5a", "metadata": {}, "outputs": [], "source": [ @@ -84,7 +84,7 @@ }, { "cell_type": "markdown", - "id": "ca1a2afc", + "id": "afd8dc5c", "metadata": {}, "source": [ "Let get the kafka configurations needed for read from hopsworks internal Kafka using the storage connector api." 
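On the storage-connector remark in the context line above: the hunk only shows cell ids changing, so the exact call is not visible here. A rough equivalent using the public feature-store interface might look like the sketch below; the connector name, the spark_options() helper, and the topic variable (the topic registered in the stream-simulation notebook) are assumptions, not code taken from the notebook:

    # Assumed connector name; the project's internal Kafka is exposed through a storage connector.
    kafka_connector = fs.get_storage_connector("kafka_connector")
    kafka_config = kafka_connector.spark_options()  # assumed helper returning Spark-ready Kafka options

    df_read = (
        spark.readStream.format("kafka")
        .options(**kafka_config)
        .option("subscribe", KAFKA_TOPIC_NAME)      # topic created in the stream-simulation notebook
        .option("startingOffsets", "earliest")
        .load()
    )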
@@ -93,7 +93,7 @@ { "cell_type": "code", "execution_count": null, - "id": "157f95d3", + "id": "366279ee", "metadata": {}, "outputs": [], "source": [ @@ -104,7 +104,7 @@ }, { "cell_type": "markdown", - "id": "eadbccb2", + "id": "15a4c865", "metadata": {}, "source": [ "## 🗂 Reading from Kakfa Stream " @@ -112,7 +112,7 @@ }, { "cell_type": "markdown", - "id": "9a012fe5", + "id": "f161bc46", "metadata": {}, "source": [ "After obatining the Kafka configurations we can use it along with the topic name to create a streaming dataframe." @@ -121,7 +121,7 @@ { "cell_type": "code", "execution_count": null, - "id": "46bc5d77", + "id": "e9e41875", "metadata": {}, "outputs": [], "source": [ @@ -139,7 +139,7 @@ }, { "cell_type": "markdown", - "id": "6d8ffdb4", + "id": "cd56e20a", "metadata": {}, "source": [ "To extract the requierd data from streaming dataframe the correct schema has be defined and used" @@ -148,7 +148,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d30ac91e", + "id": "ed1bf8a6", "metadata": {}, "outputs": [], "source": [ @@ -167,7 +167,7 @@ }, { "cell_type": "markdown", - "id": "00b5d19b", + "id": "edebd990", "metadata": {}, "source": [ "Extracting data from the streaming dataframe and casting it to get the required schema" @@ -176,7 +176,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7b722ce1", + "id": "c8519ee0", "metadata": {}, "outputs": [], "source": [ @@ -208,7 +208,7 @@ }, { "cell_type": "markdown", - "id": "445e1ef3", + "id": "2a5f66b9", "metadata": {}, "source": [ "## 🛠️ Feature Engineering " @@ -216,7 +216,7 @@ }, { "cell_type": "markdown", - "id": "91ef3185", + "id": "1eeed896", "metadata": {}, "source": [ "Now that we have a streaming dataframe that contains the data we can use it to engineer features. We would need the also need profiles to effectively engineer features.\n", @@ -227,7 +227,7 @@ { "cell_type": "code", "execution_count": null, - "id": "aecfb3a6", + "id": "88118c25", "metadata": {}, "outputs": [], "source": [ @@ -240,7 +240,7 @@ }, { "cell_type": "markdown", - "id": "409a7fe6", + "id": "3860457e", "metadata": {}, "source": [ "Fraudulent transactions can differ from regular ones in many different ways. Typical red flags would for instance be a large transaction volume/frequency in the span of a few hours. It could also be the case that elderly people in particular are targeted by fraudsters. To facilitate model learning you will create additional features based on these patterns. 
In particular, you will create two types of features:\n", @@ -251,7 +251,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ed4d88cf", + "id": "7ac337a0", "metadata": {}, "outputs": [], "source": [ @@ -266,7 +266,7 @@ }, { "cell_type": "markdown", - "id": "3203c52e", + "id": "aaed3775", "metadata": {}, "source": [ "Next, you will create features that aggregate credit card data over a period of time.\n", @@ -277,7 +277,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5336fb47", + "id": "4dc63cc4", "metadata": {}, "outputs": [], "source": [ @@ -303,7 +303,7 @@ }, { "cell_type": "markdown", - "id": "689e6caf", + "id": "506a259c", "metadata": {}, "source": [ "## 👮🏻‍♂️ Great Expectations " @@ -312,7 +312,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9e6e55d1", + "id": "59849666", "metadata": {}, "outputs": [], "source": [ @@ -328,7 +328,7 @@ { "cell_type": "code", "execution_count": null, - "id": "642bed6f", + "id": "cb4f7d8c", "metadata": {}, "outputs": [], "source": [ @@ -369,7 +369,7 @@ }, { "cell_type": "markdown", - "id": "e0a49f96", + "id": "2e91b086", "metadata": {}, "source": [ "## 💾 Storing streaming dataframes in Hopsworks Feature Store " @@ -377,7 +377,7 @@ }, { "cell_type": "markdown", - "id": "19a3a7df", + "id": "e62519d0", "metadata": {}, "source": [ "### 🪄 Creating Feature Groups \n", @@ -392,7 +392,7 @@ { "cell_type": "code", "execution_count": null, - "id": "41e8f6f4", + "id": "d32291c9", "metadata": {}, "outputs": [], "source": [ @@ -410,7 +410,7 @@ }, { "cell_type": "markdown", - "id": "cbd55773", + "id": "3f246fbd", "metadata": {}, "source": [ "A full list of arguments can be found in the [documentation](https://docs.hopsworks.ai/feature-store-api/latest/generated/api/feature_store_api/#create_feature_group).\n", @@ -421,7 +421,7 @@ { "cell_type": "code", "execution_count": null, - "id": "181a44f5", + "id": "a8527dec", "metadata": {}, "outputs": [], "source": [ @@ -431,7 +431,7 @@ }, { "cell_type": "markdown", - "id": "9cf7e915", + "id": "1755d3fd", "metadata": {}, "source": [ "\n", @@ -441,7 +441,7 @@ { "cell_type": "code", "execution_count": null, - "id": "34620d72", + "id": "06694d9d", "metadata": {}, "outputs": [], "source": [ @@ -450,7 +450,7 @@ }, { "cell_type": "markdown", - "id": "f12cd70e", + "id": "0039419b", "metadata": {}, "source": [ "The `insert_stream` function inserts the data into the online feature store so to materialize the data in the offline store you need to manually run the materialization job. The materialization job can also be run on schedule using the `schedule` function. You can find more details in the [documentation](https://docs.hopsworks.ai/hopsworks-api/3.7/generated/api/jobs/#schedule)." @@ -459,7 +459,7 @@ { "cell_type": "code", "execution_count": null, - "id": "412e3bf8", + "id": "a22a1198", "metadata": {}, "outputs": [], "source": [ @@ -469,7 +469,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2686e5fe", + "id": "9c090719", "metadata": {}, "outputs": [], "source": [ @@ -495,7 +495,7 @@ }, { "cell_type": "markdown", - "id": "b98ea202", + "id": "50b55791", "metadata": {}, "source": [ "At the creation of the feature group, you will be prompted with an URL that will directly link to it; there you will be able to explore some of the aspects of your newly created feature group." @@ -503,7 +503,7 @@ }, { "cell_type": "markdown", - "id": "d0b7742c", + "id": "748bb8b4", "metadata": {}, "source": [ "You can move on and do the same thing for the feature group with our windows aggregation." 
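As a concrete illustration of the 4-hour windowed aggregation the notebook builds before writing to window_aggs_streaming_fg: the hunks above only show cell ids changing, so the watermark, expressions, and aliases below are assumptions patterned on the surrounding text rather than the notebook's actual cell:

    from pyspark.sql.functions import window, avg, stddev, count, col

    windowed_4h_aggregation_df = (
        transaction_streaming_df
        .withWatermark("datetime", "4 hours")
        .groupBy(window(col("datetime"), "4 hours"), col("cc_num"))
        .agg(
            avg("amount").alias("avg_amt_per_4h"),
            stddev("amount").alias("stdev_amt_per_4h"),
            count("cc_num").alias("num_trans_per_4h"),
        )
        .select(
            col("cc_num"),
            col("window.end").alias("datetime"),
            col("avg_amt_per_4h"),
            col("stdev_amt_per_4h"),
            col("num_trans_per_4h"),
        )
    )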
@@ -512,7 +512,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2bb92c90", + "id": "7afbdcdd", "metadata": {}, "outputs": [], "source": [ @@ -531,7 +531,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9245f285", + "id": "2dd0a6fd", "metadata": {}, "outputs": [], "source": [ @@ -541,7 +541,7 @@ { "cell_type": "code", "execution_count": null, - "id": "eff44a9f", + "id": "0144d454", "metadata": {}, "outputs": [], "source": [ @@ -551,7 +551,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1a184e6b", + "id": "63633259", "metadata": {}, "outputs": [], "source": [ @@ -560,7 +560,7 @@ }, { "cell_type": "markdown", - "id": "db9cd9f2", + "id": "164c0151", "metadata": {}, "source": [ "## ⏭️ **Next:** Part 02: Training Pipeline\n", diff --git a/advanced_tutorials/pyspark_streaming/2_training_pipeline.ipynb b/advanced_tutorials/pyspark_streaming/2_training_pipeline.ipynb index e3536d05..e0f8c17f 100644 --- a/advanced_tutorials/pyspark_streaming/2_training_pipeline.ipynb +++ b/advanced_tutorials/pyspark_streaming/2_training_pipeline.ipynb @@ -183,7 +183,7 @@ "source": [ "# Get or create the 'transactions_view_fraud_batch_fv' feature view\n", "feature_view = fs.get_or_create_feature_view(\n", - " name='transactions_view_fraud_batch_fv',\n", + " name='transactions_view_streaming_fv',\n", " version=1,\n", " query=selected_features,\n", " labels=[\"fraud_label\"],\n", @@ -432,14 +432,14 @@ "outputs": [], "source": [ "# Specify the directory where the model will be saved\n", - "model_dir = \"fraud_batch_model\"\n", + "model_dir = \"fraud_streaming_model\"\n", "\n", "# Check if the directory exists, and create it if it doesn't\n", "if not os.path.isdir(model_dir):\n", " os.mkdir(model_dir)\n", "\n", "# Save the trained XGBoost model using joblib\n", - "joblib.dump(clf, model_dir + '/xgboost_fraud_batch_model.pkl')\n", + "joblib.dump(clf, model_dir + '/xgboost_fraud_streaming_model.pkl')\n", "\n", "# Save the confusion matrix heatmap as an image in the model directory\n", "fig.savefig(model_dir + \"/confusion_matrix.png\")" @@ -456,7 +456,7 @@ "\n", "# Create a new model in the model registry\n", "fraud_model = mr.python.create_model(\n", - " name=\"xgboost_fraud_batch_model\", # Name for the model\n", + " name=\"xgboost_fraud_streaming_model\", # Name for the model\n", " metrics=metrics, # Metrics used for evaluation\n", " model_schema=model_schema, # Schema defining the model's input and output\n", " input_example=X_train.sample(), # Example input data for reference\n", @@ -467,15 +467,201 @@ "fraud_model.save(model_dir)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 🚀 Model Deployment\n", + "\n", + "\n", + "### About Model Serving\n", + "Models can be served via KFServing or \"default\" serving, which means a Docker container exposing a Flask server. For KFServing models, or models written in Tensorflow, you do not need to write a prediction file (see the section below). However, for sklearn models using default serving, you do need to proceed to write a prediction file.\n", + "\n", + "In order to use KFServing, you must have Kubernetes installed and enabled on your cluster." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 📎 Predictor script for Python models\n", + "\n", + "\n", + "Scikit-learn and XGBoost models are deployed as Python models, in which case you need to provide a **Predict** class that implements the **predict** method. 
The **predict()** method invokes the model on the inputs and returns the prediction as a list.\n", + "\n", + "The **init()** method is run when the predictor is loaded into memory, loading the model from the local directory it is materialized to, *ARTIFACT_FILES_PATH*.\n", + "\n", + "The directive \"%%writefile\" writes out the cell before to the given Python file. We will use the **predict_example.py** file to create a deployment for our model. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile predict_example.py\n", + "import os\n", + "import numpy as np\n", + "import hsfs\n", + "import joblib\n", + "\n", + "\n", + "class Predict(object):\n", + "\n", + " def __init__(self):\n", + " \"\"\" Initializes the serving state, reads a trained model\"\"\" \n", + " # Get feature store handle\n", + " fs_conn = hsfs.connection()\n", + " self.fs = fs_conn.get_feature_store()\n", + " \n", + " # Get feature view\n", + " self.fv = self.fs.get_feature_view(\"transactions_view_streaming_fv\", 1)\n", + " \n", + " # Initialize serving\n", + " self.fv.init_serving(1)\n", + "\n", + " # Load the trained model\n", + " self.model = joblib.load(os.environ[\"ARTIFACT_FILES_PATH\"] + \"/xgboost_fraud_streaming_model.pkl\")\n", + " print(\"Initialization Complete\")\n", + "\n", + " def predict(self, inputs):\n", + " \"\"\" Serves a prediction request usign a trained model\"\"\"\n", + " feature_vector = self.fv.get_feature_vector({\"cc_num\": inputs[0][0]})\n", + " indexes_to_remove = [0,1,2]\n", + " feature_vector = [\n", + " i \n", + " for j, i \n", + " in enumerate(feature_vector) \n", + " if j not in indexes_to_remove\n", + " ] \n", + " return self.model.predict(np.asarray(feature_vector).reshape(1, -1)).tolist() # Numpy Arrays are not JSON serializable" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you wonder why we use the path Models/fraud_tutorial_model/1/model.pkl, it is useful to know that the Data Sets tab in the Hopsworks UI lets you browse among the different files in the project. Registered models will be found underneath the Models directory. Since you saved you model with the name fraud_tutorial_model, that's the directory you should look in. 1 is just the version of the model you want to deploy.\n", + "\n", + "This script needs to be put into a known location in the Hopsworks file system. Let's call the file predict_example.py and put it in the Models directory." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create the deployment\n", + "Here, you fetch the model you want from the model registry and define a configuration for the deployment. For the configuration, you need to specify the serving type (default or KFserving)." 
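The deploy() call in the next cell passes a predictor_script_path variable that is not defined anywhere in the hunks shown, while the markdown above says the script should be placed in the Models directory. One way to produce that path is the sketch below, using the project's dataset API; the upload destination and path handling are assumptions, not code taken from the notebook:

    import os

    # Upload the predictor script written above to the project's Models dataset (assumed destination).
    dataset_api = project.get_dataset_api()
    uploaded_file_path = dataset_api.upload("predict_example.py", "Models", overwrite=True)

    # Absolute path inside the Hopsworks file system, as expected by the script_file argument.
    predictor_script_path = os.path.join("/Projects", project.name, uploaded_file_path)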
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Deploy the fraud model\n", + "deployment = fraud_model.deploy(\n", + " name=\"fraudonlinemodeldeployment\", # Specify a name for the deployment\n", + " script_file=predictor_script_path, # Provide the path to the Python script for prediction\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Print the name of the deployment\n", + "print(\"Deployment: \" + deployment.name)\n", + "\n", + "# Display information about the deployment\n", + "deployment.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Deployment is warming up...\")\n", + "time.sleep(45)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### The deployment has now been registered. However, to start it you need to run the following command:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Start the deployment and wait for it to be in a running state for up to 300 seconds\n", + "deployment.start(await_running=300)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Get the current state of the deployment\n", + "deployment.get_state().describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# To troubleshoot you can use `get_logs()` method\n", + "deployment.get_logs(component='predictor')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Stop Deployment\n", + "To stop the deployment you simply run:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Stop the deployment and wait for it to be in a stopped state for up to 180 seconds\n", + "deployment.stop(await_stopped=180)" + ] + }, { "cell_type": "markdown", "metadata": {}, "source": [ "---\n", - "## ⏭️ **Next:** Part 03: Batch Inference\n", + "## ⏭️ **Next:** Part 04: Online Inference\n", "\n", "In the following notebook you will use your model for batch inference.\n" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -483,7 +669,7 @@ "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" }, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -497,7 +683,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.11" + "version": "3.8.18" } }, "nbformat": 4, diff --git a/advanced_tutorials/pyspark_streaming/3_inference_pipeline.ipynb b/advanced_tutorials/pyspark_streaming/3_inference_pipeline.ipynb index d9938f28..49406d38 100644 --- a/advanced_tutorials/pyspark_streaming/3_inference_pipeline.ipynb +++ b/advanced_tutorials/pyspark_streaming/3_inference_pipeline.ipynb @@ -2,192 +2,223 @@ "cells": [ { "cell_type": "markdown", - "id": "e90e00b0", + "id": "f4f43ac5", "metadata": {}, "source": [ - "# **Hopsworks Feature Store** - Part 03: Batch Inference\n", - "\n", - "\n", - "## 🗒️ This notebook is divided into the following sections:\n", - "\n", - "1. Load batch data.\n", - "2. Predict using model from Model Registry." 
+ "# **Hopsworks Feature Store** - Part 03: Inference Pipeline\n" ] }, { "cell_type": "markdown", - "id": "cc0cd899", + "id": "48cfbe50", "metadata": {}, "source": [ - "## 📝 Imports" + "## 📡 Connecting to Hopsworks Feature Store " ] }, { "cell_type": "code", "execution_count": null, - "id": "c2518efd", + "id": "fbcb8dda", "metadata": {}, "outputs": [], "source": [ - "import joblib" + "import hopsworks\n", + "\n", + "project = hopsworks.login()\n", + "\n", + "fs = project.get_feature_store()" ] }, { "cell_type": "markdown", - "id": "9a2baade", + "id": "76f36860", "metadata": {}, "source": [ - "## 📡 Connecting to Hopsworks Feature Store " + "## ⚙️ Feature Retrieval\n", + "Let's retrieve a feature group in order to get cc_num values. The use the feature view to retrive the feature's needed to make predictions." ] }, { "cell_type": "code", "execution_count": null, - "id": "76f137ab", + "id": "11e35a86", "metadata": {}, "outputs": [], "source": [ - "import hopsworks\n", - "\n", - "project = hopsworks.login()\n", + "trans_fg = fs.get_feature_group(\n", + " name=\"transactions_fraud_streaming_fg\",\n", + " version=1,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "92d5aa29", + "metadata": {}, + "outputs": [], + "source": [ + "# Retrieve the first 5 unique credit card numbers (cc_nums)\n", + "cc_nums = trans_fg.select('cc_num').show(5).cc_num.values\n", "\n", - "fs = project.get_feature_store()" + "# Display the obtained cc_nums\n", + "cc_nums" ] }, { "cell_type": "markdown", - "id": "dad24a4a", + "id": "053da208", "metadata": {}, "source": [ - "## ⚙️ Feature View Retrieval\n" + "Now we initlize the feature view using the training data version to retrive feature vector's" + ] + }, + { + "cell_type": "markdown", + "id": "c4549b3c", + "metadata": {}, + "source": [ + "## 🗄 Model Registry\n" ] }, { "cell_type": "code", "execution_count": null, - "id": "5fdeaa7c", + "id": "d9f51718", "metadata": {}, "outputs": [], "source": [ - "# Retrieve the 'transactions_view_fraud_batch_fv' feature view\n", - "feature_view = fs.get_feature_view(\n", - " name='transactions_view_fraud_batch_fv',\n", - " version=1,\n", - ")" + "# Get the Model Registry\n", + "mr = project.get_model_registry()" ] }, { "cell_type": "markdown", - "id": "30536208", + "id": "590d597d", "metadata": {}, "source": [ - "## 🗄 Model Registry\n" + "## 🚀 Fetch Deployment" ] }, { "cell_type": "code", "execution_count": null, - "id": "5852273c", + "id": "f91ac6ba", "metadata": {}, "outputs": [], "source": [ - "# Get the model registry\n", - "mr = project.get_model_registry()" + "# Access the Model Serving\n", + "ms = project.get_model_serving()\n", + "\n", + "# Specify the deployment name\n", + "deployment_name = \"fraudonlinemodeldeployment\"\n", + "\n", + "# Get the deployment with the specified name\n", + "deployment = ms.get_deployment(deployment_name)\n", + "\n", + "# Start the deployment and wait for it to be in a running state for up to 300 seconds\n", + "deployment.start(await_running=300)" ] }, { "cell_type": "markdown", - "id": "57e7e430", + "id": "765057a0", "metadata": {}, "source": [ - "## 🚀 Fetch and test the model\n", + "## 🔮 Predicting using deployment\n", + "\n", "\n", - "Finally you can start making predictions with your model! Retrieve your model from Hopsworks model registry." 
+ "Finally you can start making predictions with your model!\n", + "\n", + "Send inference requests to the deployed model as follows:" ] }, { "cell_type": "code", "execution_count": null, - "id": "6572e398", + "id": "6becd6b1", "metadata": {}, "outputs": [], "source": [ - "# Retrieve the model from the model registry\n", - "retrieved_model = mr.get_model(\n", - " name=\"xgboost_fraud_batch_model\",\n", - " version=1,\n", - ")\n", - "\n", - "# Download the saved model files to a local directory\n", - "saved_model_dir = retrieved_model.download()" + "# Get the first credit card number\n", + "cc_num = cc_nums[0]\n", + "cc_num" ] }, { "cell_type": "code", "execution_count": null, - "id": "808f87fb", + "id": "650af1b0", "metadata": {}, "outputs": [], "source": [ - "# Load the saved XGBoost model using joblib from the downloaded model directory\n", - "retrieved_xgboost_model = joblib.load(saved_model_dir + \"/xgboost_fraud_batch_model.pkl\")\n", - "\n", - "# Display the retrieved XGBoost model\n", - "retrieved_xgboost_model" + "# Make a prediction\n", + "deployment.predict(\n", + " inputs=[int(cc_num)],\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ae6fdbd5", + "metadata": {}, + "outputs": [], + "source": [ + "# Predict for several cc_nums\n", + "predictions = [\n", + " deployment.predict(inputs=[int(cc_num)])['predictions'] \n", + " for cc_num\n", + " in cc_nums\n", + "]\n", + "predictions" ] }, { "cell_type": "markdown", - "id": "3cb590c3", + "id": "61371f50", "metadata": {}, "source": [ - "---\n", - "## 🔮 Batch Prediction \n" + "### Stop Deployment\n", + "To stop the deployment you simply run:" ] }, { "cell_type": "code", "execution_count": null, - "id": "ac5a8e74", + "id": "b9b01645", "metadata": {}, "outputs": [], "source": [ - "# Initialize batch scoring\n", - "feature_view.init_batch_scoring(1)\n", - "\n", - "# Get the batch data\n", - "batch_data = feature_view.get_batch_data()\n", - "\n", - "# Drop the \"datetime\" column from the batch_data DataFrame along the specified axis (axis=1 means columns)\n", - "batch_data.drop([\"datetime\"], axis=1, inplace=True)\n", - "\n", - "# Display the first 3 rows\n", - "batch_data.head(3)" + "# Stop the deployment\n", + "deployment.stop(await_stopped=180)" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "cff98e14", + "cell_type": "markdown", + "id": "933bde72", "metadata": {}, - "outputs": [], "source": [ - "# Use the retrieved XGBoost model to make predictions on the batch data\n", - "predictions = retrieved_xgboost_model.predict(batch_data)\n", + "## 👾 StreamLit App\n", + "\n", + "\n", + "If you want to see interactive dashboards - use a **StreamLit App**.\n", + "\n", + "Type the next commands in terminal to run a Streamlit App:\n", "\n", - "# Display the first five predictions\n", - "predictions[:5]" + "`python -m streamlit run streamlit_app.py`" ] }, { "cell_type": "markdown", - "id": "15f38b1a", + "id": "2f4544e7", "metadata": {}, "source": [ "---\n", "\n", "### 🥳 Next Steps \n", - "Congratulations you've now completed the Fraud Batch tutorial for Managed Hopsworks.\n", + "Congratulations you've now completed the Fraud Online tutorial for Managed Hopsworks.\n", "\n", "Check out our other tutorials on ➡ https://github.com/logicalclocks/hopsworks-tutorials\n", "\n", @@ -197,7 +228,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -211,7 +242,7 @@ "name": "python", "nbconvert_exporter": 
"python", "pygments_lexer": "ipython3", - "version": "3.10.11" + "version": "3.8.18" } }, "nbformat": 4, From a2466ec4c8cd40d8efa376617518adab20987f18 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Fri, 5 Apr 2024 16:46:15 +0200 Subject: [PATCH 14/16] pyspark streaming online inference with deployments --- .../0_stream_simulation.ipynb | 59 ++++++------- .../1_feature_pipeline.ipynb | 84 +++++++++---------- .../2_training_pipeline.ipynb | 39 ++++++--- .../3_inference_pipeline.ipynb | 57 +++++-------- 4 files changed, 119 insertions(+), 120 deletions(-) diff --git a/advanced_tutorials/pyspark_streaming/0_stream_simulation.ipynb b/advanced_tutorials/pyspark_streaming/0_stream_simulation.ipynb index eb4a6177..3335541b 100644 --- a/advanced_tutorials/pyspark_streaming/0_stream_simulation.ipynb +++ b/advanced_tutorials/pyspark_streaming/0_stream_simulation.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "7ea95c96", + "id": "0ea184f1", "metadata": {}, "source": [ "# **Hopsworks Feature Store** \n", @@ -18,7 +18,7 @@ }, { "cell_type": "markdown", - "id": "892ac2b2", + "id": "10abb24d", "metadata": {}, "source": [ "## 📝 Imports" @@ -27,7 +27,7 @@ { "cell_type": "code", "execution_count": null, - "id": "dab3506a", + "id": "03dfa5b6", "metadata": {}, "outputs": [], "source": [ @@ -37,7 +37,7 @@ { "cell_type": "code", "execution_count": null, - "id": "af0f0380", + "id": "a66d98be", "metadata": {}, "outputs": [], "source": [ @@ -48,7 +48,7 @@ }, { "cell_type": "markdown", - "id": "a4241a88", + "id": "0ac7a545", "metadata": {}, "source": [ "## ✏️ Creating Simulated Data " @@ -56,7 +56,7 @@ }, { "cell_type": "markdown", - "id": "b710c76f", + "id": "bef551fc", "metadata": {}, "source": [ "A simulated dataset for credit card Transactions is created so that the data can be send using a Kafka stream. The data created is split into two different dataframes:\n", @@ -72,7 +72,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b4aa68ce", + "id": "bd1cd308", "metadata": {}, "outputs": [], "source": [ @@ -83,7 +83,7 @@ }, { "cell_type": "markdown", - "id": "4f29f0a3", + "id": "808b22a0", "metadata": {}, "source": [ "## 📡 Connecting to Hopsworks Feature Store " @@ -91,7 +91,7 @@ }, { "cell_type": "markdown", - "id": "c049f064", + "id": "cc12ccdd", "metadata": {}, "source": [ "After creating the simulated data let us connect with Hopsworks Feature Store.\n", @@ -102,7 +102,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7d55f2b0", + "id": "7f57df5f", "metadata": {}, "outputs": [], "source": [ @@ -117,7 +117,7 @@ }, { "cell_type": "markdown", - "id": "f95f31d7", + "id": "1e42b84e", "metadata": {}, "source": [ "## 🪄 Creating Feature Groups " @@ -125,7 +125,7 @@ }, { "cell_type": "markdown", - "id": "fc807022", + "id": "591a2813", "metadata": {}, "source": [ "Profiles data can be directly inserted as a feature group directly since they are not update fequently.\n", @@ -138,7 +138,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8f94c972", + "id": "685f905b", "metadata": {}, "outputs": [], "source": [ @@ -154,7 +154,7 @@ }, { "cell_type": "markdown", - "id": "72505a6f", + "id": "d6e18288", "metadata": {}, "source": [ "## ⚙️ Kafka Topic and Schema Creation " @@ -162,7 +162,7 @@ }, { "cell_type": "markdown", - "id": "63f7878f", + "id": "e6ac3791", "metadata": {}, "source": [ "To create a Kafka stream for transactions a topic and schema must be create. 
The schema used must follow Apache Avro specification, more details can be found in the [documentation](https://avro.apache.org/docs/1.11.1/specification/).\n" @@ -171,7 +171,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e35f7c59", + "id": "1bec71eb", "metadata": {}, "outputs": [], "source": [ @@ -183,7 +183,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3d811fb7", + "id": "998326f1", "metadata": {}, "outputs": [], "source": [ @@ -271,7 +271,7 @@ }, { "cell_type": "markdown", - "id": "845b1ed6", + "id": "a527b15a", "metadata": {}, "source": [ "After the schema is created the topic and the associated schema must be registered in Hopsworks so that the topic can be used." @@ -280,7 +280,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6db8a245", + "id": "9d30e9f3", "metadata": {}, "outputs": [], "source": [ @@ -291,7 +291,7 @@ }, { "cell_type": "markdown", - "id": "db6ad79d", + "id": "24d19231", "metadata": {}, "source": [ "## 📡 Sending Data using created Kafka Topic " @@ -299,7 +299,7 @@ }, { "cell_type": "markdown", - "id": "ed183fe2", + "id": "538cec35", "metadata": {}, "source": [ "While sending data through Kafka we must make sure that the data types are in the same format specified in the schema. \n", @@ -310,7 +310,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4f487adb", + "id": "86ce202b", "metadata": {}, "outputs": [], "source": [ @@ -328,7 +328,7 @@ }, { "cell_type": "markdown", - "id": "6562d393", + "id": "e39eb952", "metadata": {}, "source": [ "Lets get the configuration needed for the producer to used Hopsworks internal kafka using the KafkaAPI" @@ -337,7 +337,7 @@ { "cell_type": "code", "execution_count": null, - "id": "75a2a99d", + "id": "d42c4f71", "metadata": {}, "outputs": [], "source": [ @@ -346,7 +346,7 @@ }, { "cell_type": "markdown", - "id": "69ac8aeb", + "id": "46393a4e", "metadata": {}, "source": [ "Finally, lets create a producer using the Kafka configuration and send data into it.\n", @@ -357,22 +357,23 @@ { "cell_type": "code", "execution_count": null, - "id": "e4bcf8e9", + "id": "23636fc7", "metadata": {}, "outputs": [], "source": [ "producer = Producer(kafka_config)\n", "\n", "for index, transaction in trans_df.iterrows():\n", - " producer.produce(KAFKA_TOPIC_NAME, transaction.to_json()\n", - " producer.flush()\n", + " producer.produce(KAFKA_TOPIC_NAME, transaction.to_json())\n", + " \n", " if index % 1000 == 0:\n", + " producer.flush()\n", " print(f'Finished sending index {index}')" ] }, { "cell_type": "markdown", - "id": "a637d022", + "id": "721e47bc", "metadata": {}, "source": [ "---\n", diff --git a/advanced_tutorials/pyspark_streaming/1_feature_pipeline.ipynb b/advanced_tutorials/pyspark_streaming/1_feature_pipeline.ipynb index 6756154e..53c366ab 100644 --- a/advanced_tutorials/pyspark_streaming/1_feature_pipeline.ipynb +++ b/advanced_tutorials/pyspark_streaming/1_feature_pipeline.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "0a074034", + "id": "44c0c599", "metadata": {}, "source": [ "# **Hopsworks Feature Store** - Part 01: Feature Pipeline\n", @@ -25,7 +25,7 @@ }, { "cell_type": "markdown", - "id": "5d62a292", + "id": "3d46ee8c", "metadata": {}, "source": [ "## 📝 Imports" @@ -34,7 +34,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e450d4bf", + "id": "19073f37", "metadata": {}, "outputs": [], "source": [ @@ -49,7 +49,7 @@ }, { "cell_type": "markdown", - "id": "e002f27d", + "id": "caff7a87", "metadata": {}, "source": [ "## 📡 Connecting to Hopsworks Feature Store 
" @@ -57,7 +57,7 @@ }, { "cell_type": "markdown", - "id": "dc98e7e0", + "id": "4579f529", "metadata": {}, "source": [ "In this tutorial a simulated data stream was created using Hopsworks internal Kafka.\n", @@ -68,7 +68,7 @@ { "cell_type": "code", "execution_count": null, - "id": "74e6ce5a", + "id": "33166eae", "metadata": {}, "outputs": [], "source": [ @@ -84,7 +84,7 @@ }, { "cell_type": "markdown", - "id": "afd8dc5c", + "id": "c06ba96d", "metadata": {}, "source": [ "Let get the kafka configurations needed for read from hopsworks internal Kafka using the storage connector api." @@ -93,7 +93,7 @@ { "cell_type": "code", "execution_count": null, - "id": "366279ee", + "id": "a85ba466", "metadata": {}, "outputs": [], "source": [ @@ -104,7 +104,7 @@ }, { "cell_type": "markdown", - "id": "15a4c865", + "id": "e4884ae0", "metadata": {}, "source": [ "## 🗂 Reading from Kakfa Stream " @@ -112,7 +112,7 @@ }, { "cell_type": "markdown", - "id": "f161bc46", + "id": "022c1e5f", "metadata": {}, "source": [ "After obatining the Kafka configurations we can use it along with the topic name to create a streaming dataframe." @@ -121,7 +121,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e9e41875", + "id": "ae5d7aaa", "metadata": {}, "outputs": [], "source": [ @@ -139,7 +139,7 @@ }, { "cell_type": "markdown", - "id": "cd56e20a", + "id": "141ef286", "metadata": {}, "source": [ "To extract the requierd data from streaming dataframe the correct schema has be defined and used" @@ -148,7 +148,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ed1bf8a6", + "id": "fe5f8ac1", "metadata": {}, "outputs": [], "source": [ @@ -167,7 +167,7 @@ }, { "cell_type": "markdown", - "id": "edebd990", + "id": "fb15985f", "metadata": {}, "source": [ "Extracting data from the streaming dataframe and casting it to get the required schema" @@ -176,7 +176,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c8519ee0", + "id": "c3d17ff0", "metadata": {}, "outputs": [], "source": [ @@ -208,7 +208,7 @@ }, { "cell_type": "markdown", - "id": "2a5f66b9", + "id": "b4c8cc32", "metadata": {}, "source": [ "## 🛠️ Feature Engineering " @@ -216,7 +216,7 @@ }, { "cell_type": "markdown", - "id": "1eeed896", + "id": "e0fd2687", "metadata": {}, "source": [ "Now that we have a streaming dataframe that contains the data we can use it to engineer features. We would need the also need profiles to effectively engineer features.\n", @@ -227,7 +227,7 @@ { "cell_type": "code", "execution_count": null, - "id": "88118c25", + "id": "ad03f840", "metadata": {}, "outputs": [], "source": [ @@ -240,7 +240,7 @@ }, { "cell_type": "markdown", - "id": "3860457e", + "id": "112c5afa", "metadata": {}, "source": [ "Fraudulent transactions can differ from regular ones in many different ways. Typical red flags would for instance be a large transaction volume/frequency in the span of a few hours. It could also be the case that elderly people in particular are targeted by fraudsters. To facilitate model learning you will create additional features based on these patterns. 
In particular, you will create two types of features:\n", @@ -251,7 +251,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7ac337a0", + "id": "7890ec89", "metadata": {}, "outputs": [], "source": [ @@ -266,7 +266,7 @@ }, { "cell_type": "markdown", - "id": "aaed3775", + "id": "d7f6cfff", "metadata": {}, "source": [ "Next, you will create features that aggregate credit card data over a period of time.\n", @@ -277,7 +277,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4dc63cc4", + "id": "49588d5e", "metadata": {}, "outputs": [], "source": [ @@ -303,7 +303,7 @@ }, { "cell_type": "markdown", - "id": "506a259c", + "id": "9bfcd026", "metadata": {}, "source": [ "## 👮🏻‍♂️ Great Expectations " @@ -312,7 +312,7 @@ { "cell_type": "code", "execution_count": null, - "id": "59849666", + "id": "e0d3fc83", "metadata": {}, "outputs": [], "source": [ @@ -328,7 +328,7 @@ { "cell_type": "code", "execution_count": null, - "id": "cb4f7d8c", + "id": "2ef7fae1", "metadata": {}, "outputs": [], "source": [ @@ -369,7 +369,7 @@ }, { "cell_type": "markdown", - "id": "2e91b086", + "id": "9c139ea7", "metadata": {}, "source": [ "## 💾 Storing streaming dataframes in Hopsworks Feature Store " @@ -377,7 +377,7 @@ }, { "cell_type": "markdown", - "id": "e62519d0", + "id": "4ef0699f", "metadata": {}, "source": [ "### 🪄 Creating Feature Groups \n", @@ -392,7 +392,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d32291c9", + "id": "4bba5a68", "metadata": {}, "outputs": [], "source": [ @@ -410,7 +410,7 @@ }, { "cell_type": "markdown", - "id": "3f246fbd", + "id": "de01705e", "metadata": {}, "source": [ "A full list of arguments can be found in the [documentation](https://docs.hopsworks.ai/feature-store-api/latest/generated/api/feature_store_api/#create_feature_group).\n", @@ -421,7 +421,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a8527dec", + "id": "b61c8985", "metadata": {}, "outputs": [], "source": [ @@ -431,7 +431,7 @@ }, { "cell_type": "markdown", - "id": "1755d3fd", + "id": "ccff167b", "metadata": {}, "source": [ "\n", @@ -441,7 +441,7 @@ { "cell_type": "code", "execution_count": null, - "id": "06694d9d", + "id": "510c5daf", "metadata": {}, "outputs": [], "source": [ @@ -450,7 +450,7 @@ }, { "cell_type": "markdown", - "id": "0039419b", + "id": "b0dd2b8a", "metadata": {}, "source": [ "The `insert_stream` function inserts the data into the online feature store so to materialize the data in the offline store you need to manually run the materialization job. The materialization job can also be run on schedule using the `schedule` function. You can find more details in the [documentation](https://docs.hopsworks.ai/hopsworks-api/3.7/generated/api/jobs/#schedule)." @@ -459,7 +459,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a22a1198", + "id": "b6ad17c1", "metadata": {}, "outputs": [], "source": [ @@ -469,7 +469,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9c090719", + "id": "eca6acfc", "metadata": {}, "outputs": [], "source": [ @@ -495,7 +495,7 @@ }, { "cell_type": "markdown", - "id": "50b55791", + "id": "e377e90f", "metadata": {}, "source": [ "At the creation of the feature group, you will be prompted with an URL that will directly link to it; there you will be able to explore some of the aspects of your newly created feature group." @@ -503,7 +503,7 @@ }, { "cell_type": "markdown", - "id": "748bb8b4", + "id": "82ae2b51", "metadata": {}, "source": [ "You can move on and do the same thing for the feature group with our windows aggregation." 
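Instead of triggering the materialization job by hand, the `schedule` function mentioned above can put it on a recurring schedule. The snippet below is a hedged sketch: the feature group handle, the Quartz-style cron expression, and the exact `schedule()` signature are assumptions, so check the linked jobs documentation for your Hopsworks version.

```python
from datetime import datetime, timezone

# Assumed handle for the streaming feature group created earlier in this notebook.
trans_fg = fs.get_feature_group(name="transactions_fraud_streaming_fg", version=1)

# Run the offline materialization job once a day at midnight (assumed Quartz cron syntax).
trans_fg.materialization_job.schedule(
    cron_expression="0 0 0 * * ?",
    start_time=datetime.now(tz=timezone.utc),
)
```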
@@ -512,7 +512,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7afbdcdd", + "id": "c31e898a", "metadata": {}, "outputs": [], "source": [ @@ -531,7 +531,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2dd0a6fd", + "id": "3586447c", "metadata": {}, "outputs": [], "source": [ @@ -541,7 +541,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0144d454", + "id": "dc40716e", "metadata": {}, "outputs": [], "source": [ @@ -551,7 +551,7 @@ { "cell_type": "code", "execution_count": null, - "id": "63633259", + "id": "af97ebb1", "metadata": {}, "outputs": [], "source": [ @@ -560,7 +560,7 @@ }, { "cell_type": "markdown", - "id": "164c0151", + "id": "49296a35", "metadata": {}, "source": [ "## ⏭️ **Next:** Part 02: Training Pipeline\n", diff --git a/advanced_tutorials/pyspark_streaming/2_training_pipeline.ipynb b/advanced_tutorials/pyspark_streaming/2_training_pipeline.ipynb index e0f8c17f..42d7667e 100644 --- a/advanced_tutorials/pyspark_streaming/2_training_pipeline.ipynb +++ b/advanced_tutorials/pyspark_streaming/2_training_pipeline.ipynb @@ -53,6 +53,7 @@ "import xgboost as xgb\n", "from sklearn.metrics import confusion_matrix\n", "from sklearn.metrics import f1_score\n", + "import time\n", "\n", "# Mute warnings\n", "import warnings\n", @@ -527,15 +528,8 @@ "\n", " def predict(self, inputs):\n", " \"\"\" Serves a prediction request usign a trained model\"\"\"\n", - " feature_vector = self.fv.get_feature_vector({\"cc_num\": inputs[0][0]})\n", - " indexes_to_remove = [0,1,2]\n", - " feature_vector = [\n", - " i \n", - " for j, i \n", - " in enumerate(feature_vector) \n", - " if j not in indexes_to_remove\n", - " ] \n", - " return self.model.predict(np.asarray(feature_vector).reshape(1, -1)).tolist() # Numpy Arrays are not JSON serializable" + " feature_vector = self.fv.get_feature_vector({\"cc_num\": inputs[0][0]}, return_type=\"pandas\").drop([\"datetime\"], axis=1).values\n", + " return self.model.predict(feature_vector.reshape(1, -1)).tolist() # Numpy Arrays are not JSON serializable" ] }, { @@ -547,6 +541,25 @@ "This script needs to be put into a known location in the Hopsworks file system. Let's call the file predict_example.py and put it in the Models directory." 
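Before uploading the script, a quick check from this notebook that the served vector matches what the model expects can save a failed deployment. The sketch below mirrors the updated `predict()` above; the sample card number is a placeholder and `fs` is assumed to be the feature store handle already in scope.

```python
# Sanity-check the shape of the vector the predictor will feed to XGBoost (sketch).
fv = fs.get_feature_view("transactions_view_streaming_fv", 1)
fv.init_serving(1)

sample_cc_num = 4467360740682089  # placeholder key, replace with a real cc_num
vector = fv.get_feature_vector({"cc_num": sample_cc_num}, return_type="pandas")
model_input = vector.drop(["datetime"], axis=1).values.reshape(1, -1)

print("Columns served:", list(vector.columns))
print("Shape fed to the model:", model_input.shape)  # should match the training feature count
```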
] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Get the dataset API for the current project\n", + "dataset_api = project.get_dataset_api()\n", + "\n", + "# Specify the local file path of the Python script to be uploaded\n", + "local_script_path = \"predict_example.py\"\n", + "\n", + "# Upload the Python script to the \"Models\", and overwrite if it already exists\n", + "uploaded_file_path = dataset_api.upload(local_script_path, \"Models\", overwrite=True)\n", + "\n", + "# Create the full path to the uploaded script for future reference\n", + "predictor_script_path = os.path.join(\"/Projects\", project.name, uploaded_file_path)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -651,9 +664,9 @@ "metadata": {}, "source": [ "---\n", - "## ⏭️ **Next:** Part 04: Online Inference\n", + "## ⏭️ **Next:** Part 03: Online Inference\n", "\n", - "In the following notebook you will use your model for batch inference.\n" + "In the following notebook you will use your model for online inference.\n" ] }, { @@ -669,7 +682,7 @@ "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -683,7 +696,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.18" + "version": "3.10.11" } }, "nbformat": 4, diff --git a/advanced_tutorials/pyspark_streaming/3_inference_pipeline.ipynb b/advanced_tutorials/pyspark_streaming/3_inference_pipeline.ipynb index 49406d38..d6ccbccd 100644 --- a/advanced_tutorials/pyspark_streaming/3_inference_pipeline.ipynb +++ b/advanced_tutorials/pyspark_streaming/3_inference_pipeline.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "f4f43ac5", + "id": "0619bc2f", "metadata": {}, "source": [ "# **Hopsworks Feature Store** - Part 03: Inference Pipeline\n" @@ -10,7 +10,7 @@ }, { "cell_type": "markdown", - "id": "48cfbe50", + "id": "e89e50c4", "metadata": {}, "source": [ "## 📡 Connecting to Hopsworks Feature Store " @@ -19,7 +19,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fbcb8dda", + "id": "e8e5e62d", "metadata": {}, "outputs": [], "source": [ @@ -32,7 +32,7 @@ }, { "cell_type": "markdown", - "id": "76f36860", + "id": "eb5f1f04", "metadata": {}, "source": [ "## ⚙️ Feature Retrieval\n", @@ -42,7 +42,7 @@ { "cell_type": "code", "execution_count": null, - "id": "11e35a86", + "id": "7d3c3248", "metadata": {}, "outputs": [], "source": [ @@ -55,7 +55,7 @@ { "cell_type": "code", "execution_count": null, - "id": "92d5aa29", + "id": "ee9ad61a", "metadata": {}, "outputs": [], "source": [ @@ -68,7 +68,7 @@ }, { "cell_type": "markdown", - "id": "053da208", + "id": "95705703", "metadata": {}, "source": [ "Now we initlize the feature view using the training data version to retrive feature vector's" @@ -76,7 +76,7 @@ }, { "cell_type": "markdown", - "id": "c4549b3c", + "id": "4fd466c3", "metadata": {}, "source": [ "## 🗄 Model Registry\n" @@ -85,7 +85,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d9f51718", + "id": "76d991cf", "metadata": {}, "outputs": [], "source": [ @@ -95,7 +95,7 @@ }, { "cell_type": "markdown", - "id": "590d597d", + "id": "b5701bf8", "metadata": {}, "source": [ "## 🚀 Fetch Deployment" @@ -104,7 +104,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f91ac6ba", + "id": "3af8b8e0", "metadata": {}, "outputs": [], "source": [ @@ -123,7 +123,7 @@ }, { "cell_type": 
"markdown", - "id": "765057a0", + "id": "49cf822e", "metadata": {}, "source": [ "## 🔮 Predicting using deployment\n", @@ -137,7 +137,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6becd6b1", + "id": "06c26a3a", "metadata": {}, "outputs": [], "source": [ @@ -149,7 +149,7 @@ { "cell_type": "code", "execution_count": null, - "id": "650af1b0", + "id": "2252bdef", "metadata": {}, "outputs": [], "source": [ @@ -162,7 +162,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ae6fdbd5", + "id": "798fb9ed", "metadata": {}, "outputs": [], "source": [ @@ -177,7 +177,7 @@ }, { "cell_type": "markdown", - "id": "61371f50", + "id": "ca4f3828", "metadata": {}, "source": [ "### Stop Deployment\n", @@ -187,7 +187,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b9b01645", + "id": "e9145083", "metadata": {}, "outputs": [], "source": [ @@ -197,28 +197,13 @@ }, { "cell_type": "markdown", - "id": "933bde72", - "metadata": {}, - "source": [ - "## 👾 StreamLit App\n", - "\n", - "\n", - "If you want to see interactive dashboards - use a **StreamLit App**.\n", - "\n", - "Type the next commands in terminal to run a Streamlit App:\n", - "\n", - "`python -m streamlit run streamlit_app.py`" - ] - }, - { - "cell_type": "markdown", - "id": "2f4544e7", + "id": "7ce48444", "metadata": {}, "source": [ "---\n", "\n", "### 🥳 Next Steps \n", - "Congratulations you've now completed the Fraud Online tutorial for Managed Hopsworks.\n", + "Congratulations you've now completed the PySpark Streaming tutorial for Managed Hopsworks.\n", "\n", "Check out our other tutorials on ➡ https://github.com/logicalclocks/hopsworks-tutorials\n", "\n", @@ -228,7 +213,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -242,7 +227,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.18" + "version": "3.10.11" } }, "nbformat": 4, From 3ab3c53d43533e7ea7ae3b981840b67acd26d349 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Mon, 8 Apr 2024 09:24:02 +0200 Subject: [PATCH 15/16] moving polars and pyspark streaming tutorial to integrations folder --- .../polars/quickstart_polars.ipynb | 0 .../pyspark_streaming/0_stream_simulation.ipynb | 0 .../pyspark_streaming/1_feature_pipeline.ipynb | 0 .../pyspark_streaming/2_training_pipeline.ipynb | 0 .../pyspark_streaming/3_inference_pipeline.ipynb | 0 .../pyspark_streaming/features/transactions.py | 0 .../pyspark_streaming/synthetic_data/__init__.py | 0 .../pyspark_streaming/synthetic_data/synthetic_data.py | 0 8 files changed, 0 insertions(+), 0 deletions(-) rename {advanced_tutorials => integrations}/polars/quickstart_polars.ipynb (100%) rename {advanced_tutorials => integrations}/pyspark_streaming/0_stream_simulation.ipynb (100%) rename {advanced_tutorials => integrations}/pyspark_streaming/1_feature_pipeline.ipynb (100%) rename {advanced_tutorials => integrations}/pyspark_streaming/2_training_pipeline.ipynb (100%) rename {advanced_tutorials => integrations}/pyspark_streaming/3_inference_pipeline.ipynb (100%) rename {advanced_tutorials => integrations}/pyspark_streaming/features/transactions.py (100%) rename {advanced_tutorials => integrations}/pyspark_streaming/synthetic_data/__init__.py (100%) rename {advanced_tutorials => integrations}/pyspark_streaming/synthetic_data/synthetic_data.py (100%) diff --git a/advanced_tutorials/polars/quickstart_polars.ipynb b/integrations/polars/quickstart_polars.ipynb similarity index 100% 
rename from advanced_tutorials/polars/quickstart_polars.ipynb rename to integrations/polars/quickstart_polars.ipynb diff --git a/advanced_tutorials/pyspark_streaming/0_stream_simulation.ipynb b/integrations/pyspark_streaming/0_stream_simulation.ipynb similarity index 100% rename from advanced_tutorials/pyspark_streaming/0_stream_simulation.ipynb rename to integrations/pyspark_streaming/0_stream_simulation.ipynb diff --git a/advanced_tutorials/pyspark_streaming/1_feature_pipeline.ipynb b/integrations/pyspark_streaming/1_feature_pipeline.ipynb similarity index 100% rename from advanced_tutorials/pyspark_streaming/1_feature_pipeline.ipynb rename to integrations/pyspark_streaming/1_feature_pipeline.ipynb diff --git a/advanced_tutorials/pyspark_streaming/2_training_pipeline.ipynb b/integrations/pyspark_streaming/2_training_pipeline.ipynb similarity index 100% rename from advanced_tutorials/pyspark_streaming/2_training_pipeline.ipynb rename to integrations/pyspark_streaming/2_training_pipeline.ipynb diff --git a/advanced_tutorials/pyspark_streaming/3_inference_pipeline.ipynb b/integrations/pyspark_streaming/3_inference_pipeline.ipynb similarity index 100% rename from advanced_tutorials/pyspark_streaming/3_inference_pipeline.ipynb rename to integrations/pyspark_streaming/3_inference_pipeline.ipynb diff --git a/advanced_tutorials/pyspark_streaming/features/transactions.py b/integrations/pyspark_streaming/features/transactions.py similarity index 100% rename from advanced_tutorials/pyspark_streaming/features/transactions.py rename to integrations/pyspark_streaming/features/transactions.py diff --git a/advanced_tutorials/pyspark_streaming/synthetic_data/__init__.py b/integrations/pyspark_streaming/synthetic_data/__init__.py similarity index 100% rename from advanced_tutorials/pyspark_streaming/synthetic_data/__init__.py rename to integrations/pyspark_streaming/synthetic_data/__init__.py diff --git a/advanced_tutorials/pyspark_streaming/synthetic_data/synthetic_data.py b/integrations/pyspark_streaming/synthetic_data/synthetic_data.py similarity index 100% rename from advanced_tutorials/pyspark_streaming/synthetic_data/synthetic_data.py rename to integrations/pyspark_streaming/synthetic_data/synthetic_data.py From 1a6c7a4ee7bfab836a25011095a663195753f203 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Mon, 8 Apr 2024 09:30:54 +0200 Subject: [PATCH 16/16] Updating readme based on moved polars tutorials and adding Pyspark streaming into readme --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index d1a1c030..dd8e87cc 100644 --- a/README.md +++ b/README.md @@ -63,7 +63,8 @@ In order to understand the tutorials you need to be familiar with general concep - [WandB](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/integrations/wandb): Build a machine learning model with Weights & Biases. - [Great Expectations](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/integrations/great_expectations): Introduction to Great Expectations concepts and classes which are relevant for integration with the Hopsworks MLOps platform. - [Neo4j](integrations/neo4j): Perform Anti-money laundering (AML) predictions using Neo4j Graph representation of transactions. - - [Polars](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/advanced_tutorials/polars/quickstart.ipynb) : Introductory tutorial on using Polars. 
+ - [Polars](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/integrations/polars/quickstart_polars.ipynb): Introductory tutorial on using Polars.
+ - [PySpark Streaming](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/integrations/pyspark_streaming): Real time feature computation from streaming data using PySpark and the Hopsworks Feature Store.
 - [Monitoring](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/integrations/monitoring): How to implement feature monitoring in your production pipeline.
 - [Bytewax](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/integrations/bytewax): Real time feature computation using Bytewax.
 - [Apache Beam](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/integrations/java/beam): Real time feature computation using Apache Beam, Google Cloud Dataflow and Hopsworks Feature Store.