In [None]:
import numpy as np
import pandas as pd

from pyspark.ml.functions import predict_batch_udf
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json, struct, array
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, FloatType, Union, Dict

In [None]:
%env PYSPARK_PYTHON=C:\Users\Milosz\AppData\Local\pypoetry\Cache\virtualenvs\recsys-streaming-ml-Mj1TWbkU-py3.10\Scripts\python.exe

In [None]:
%env PYSPARK_DRIVER_PYTHON=C:\Users\Milosz\AppData\Local\pypoetry\Cache\virtualenvs\recsys-streaming-ml-Mj1TWbkU-py3.10\Scripts\python.exe

In [None]:
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.1,org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.1 pyspark-shell'

In [None]:
KAFKA_BROKER_URL = "kafka0:9093"
RECOMMENDATIONS_TOPIC = "recommendations"
USER_ACTIONS_TOPIC = "users.actions"

In [None]:
def predict_batch_fn():
    # load model from checkpoint
    import torch    
    device = torch.device("cuda")
    model = Net().to(device)
    checkpoint = load_checkpoint(checkpoint_dir)
    model.load_state_dict(checkpoint['model'])

    # define predict function in terms of numpy arrays
    def predict(inputs: np.ndarray) -> np.ndarray:
        torch_inputs = torch.from_numpy(inputs).to(device)
        outputs = model(torch_inputs)
        return outputs.cpu().detach().numpy()
    
    return predict

In [None]:
spark = SparkSession.builder \
        .appName("KafkaRead") \
        .master("local[*]") \
        .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.1") \
        .getOrCreate()

In [None]:
schema = StructType([
        StructField("user_id", StringType(), True)
    ])

In [None]:
df = spark.readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", KAFKA_BROKER_URL) \
    .option("subscribe", RECOMMENDATIONS_TOPIC) \
    .option("startingOffsets", "latest") \
    .load()

In [None]:
df.printSchema()

In [None]:
values_df = df.selectExpr("CAST(value AS STRING) as json_data") \
                .select(from_json(col("json_data"), schema).alias("data")) \
                .select("data.*")

In [None]:
values_df.writeStream.format('console').outputMode('append').start().awaitTermination()

In [None]:
query = values_df \
    .writeStream \
    .outputMode("append") \
    .format("console") \
    .trigger(processingTime='30 seconds') \
    .start()

In [None]:
query.awaitTermination(30)

In [None]:
df_parsed = df.selectExpr("CAST(value AS STRING) as json_value") \
    .select(from_json(col("json_value"), schema).alias("data")) \
    .select("data.*")


In [None]:
query = df_parsed.writeStream.outputMode("append").format("console").start()
query.awaitTermination()

In [None]:
from recsys_streaming_ml.spark.utils import spark

def create_dataframe_from_dict(spark, data):
    """
    Create a DataFrame from a list of dictionaries.
    Each dictionary represents a record with a single field `user_id`.
    """
    schema = StructType([StructField("user_id", StringType(), True)])
    df = spark.createDataFrame(data, schema)
    return df

# Sample data
data = [
    {"user_id": "A1"},
    {"user_id": "B2"},
    {"user_id": "C3"},
    {"user_id": "D4"}
]

# Create Spark session
session = spark()

# Create DataFrame from data
df = create_dataframe_from_dict(session, data)

# Show DataFrame
#df.show()

In [None]:
df.show()

In [None]:
import pandas as pd
import pyspark as ps
import numpy as np
from pyspark.sql.types import StructType, StructField, IntegerType

def create_item_feature_store(spark):
    data = [(i, i) for i in range(100)]
    schema = StructType([
        StructField("parent_asin", IntegerType(), True),
        StructField("store_id", IntegerType(), True)
    ])
    df = spark.createDataFrame(data, schema)
    return df

item_feature_store = create_item_feature_store(session)

In [None]:
item_feature_store.show()

In [None]:
from recsys_streaming_ml.db import mongo_db, read_df_from_mongo
from recsys_streaming_ml.data.utils import load_feature_maps
import random

feature_maps = load_feature_maps("../.data/feature_maps.pkl")

def read_item_feature_store(db, feature_maps, collection='metadata'):
    item_feature_store_raw = read_df_from_mongo(db=db, collection=collection)
    item_feature_store = item_feature_store_raw.copy()
    item_feature_store['parent_asin'] = item_feature_store['parent_asin'].map(feature_maps['parent_id_map'])
    item_feature_store['store_id'] = item_feature_store['store'].map(feature_maps['store_id_map'])
    item_feature_store = item_feature_store.drop(columns='store').dropna().astype(int).sort_values(by='parent_asin').reset_index(drop=True)

    return item_feature_store

In [None]:
item_feature_store = read_item_feature_store(mongo_db, feature_maps)

In [None]:
item_feature_store = session.createDataFrame(item_feature_store)

In [None]:
item_feature_store.show()

In [None]:
from pyspark.sql.functions import col, create_map, lit, udf
from itertools import chain

user_id_mapping = {"A1": 2, "B2": 3, "C3": 1, "D4": 0}
rev_user_id_mapping = {v:k for k,v in user_id_mapping.items()}
rev_asin_mapping = {v[0]:f'ID_{v[0]}' for v in item_feature_store.select('parent_asin').distinct().collect()}

def process_data(
        df: ps.sql.dataframe.DataFrame, 
        item_feature_store: ps.sql.dataframe.DataFrame, 
        user_id_mapping: dict[str, int]
    ) -> ps.sql.dataframe.DataFrame:
    """
    Process the DataFrame by mapping user_ids using the provided dictionary.
    """
    mapping_expr = create_map([lit(x) for x in chain(*user_id_mapping.items())])

    processed_df = df.withColumn("map_user_id", mapping_expr[col("user_id")])
    processed_df = processed_df.crossJoin(item_feature_store)
    processed_df = processed_df.select("map_user_id", "parent_asin", "store_id")

    return processed_df

In [None]:
processed_df = process_data(df, item_feature_store, user_id_mapping)

In [None]:
processed_df.show()

In [None]:
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, col, collect_list

def get_ranked_topk_predictions(df, k=5):
    window = Window.partitionBy("map_user_id").orderBy(col("predicted_rating").desc())

    # Add a rank column to rank the rows within each partition by 'sum'
    ranked_predictions = df.withColumn("rank", rank().over(window))

    # Filter to keep only the top 5 'asin' values for each 'map_user_id'
    top_k = ranked_predictions.filter(col("rank") <= k)

    return top_k

def remap_entities(df, user_id_mapping, asin_mapping):
        mapping_expr_user = create_map([lit(x) for x in chain(*user_id_mapping.items())])
        mapping_expr_asin = create_map([lit(x) for x in chain(*asin_mapping.items())])

        df = df.withColumn("user_id", mapping_expr_user[col("map_user_id")])
        df = df.withColumn("asin", mapping_expr_asin[col("map_user_id")])

        return df.select("user_id", "asin", "rank")

def list_recommendations(df):
    # Aggregate the top k 'asin' values into a list for each 'map_user_id'
    result = df.groupBy("user_id").agg(collect_list("asin").alias("top_k_asins"))
    return result

In [None]:
ranked_topk = get_ranked_topk_predictions(predictions)
remapped_ranked_topk = remap_entities(ranked_topk, rev_user_id_mapping, rev_asin_mapping)
recommendation_lists = list_recommendations(remapped_ranked_topk)

In [None]:
ranked_topk.show()

In [None]:
remapped_ranked_topk.show()

In [None]:
recommendation_lists.show()

In [None]:
df = spark.read.parquet("/path/to/test/data")
preds = df.withColumn("preds", mnist('data')).collect()

query = df_parsed.writeStream \
    .outputMode("append") \
    .format("console") \
    #.trigger(processingTime='15 seconds') \
    .start()

query.awaitTermination()

In [None]:
pd.read_csv("../.data/dataset/train_data.csv")

In [None]:
from recsys_streaming_ml.data.utils import load_feature_maps
import random

feature_maps = load_feature_maps("../.data/feature_maps.pkl")

random_user_ids = random.choices(list(feature_maps['user_id_map'].keys()), k=50)
pd.DataFrame(random_user_ids, columns=['user_id']).to_csv("../.data/sample_user_ids.csv", index=False)

In [63]:
import torch


#model = torch.jit.load("C:/Users/Milosz/Projects/recsys-streaming/recsys-streaming-ml/.runs/DeepFM/2024-05-11_15-29-20/model.pt", map_location='cpu')
model_input = torch.randint(0, 40, (1, 3)).to(torch.float)


In [79]:
model = DeepFM(emb_dim=8, hidden_dim=[32, 24, 10], feature_sizes=[100, 100, 100])
model.eval()
model_input = torch.randint(0, 40, (1, 3)).to(torch.float)
traced_script_module = torch.jit.trace(model, model_input)

# Save the TorchScript model
traced_script_module.save("../.model_repository/DeepFM/1/model.pt")

In [53]:
from recsys_streaming_ml.model.model import DeepFM
# model = torch.load("C:/Users/Milosz/Projects/recsys-streaming/recsys-streaming-ml/.runs/DeepFM/2024-05-11_15-29-20/model.pt")

In [66]:

model.eval()

DeepFM(
  (V): EmbeddingNet(
    (embeddings): ModuleDict(
      (0): Embedding(100, 8)
      (1): Embedding(100, 8)
      (2): Embedding(100, 8)
    )
  )
  (fm): FM()
  (dnn): MLP(
    (layers): Sequential(
      (0): Linear(in_features=24, out_features=32, bias=True)
      (1): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
      (3): Dropout(p=0.1, inplace=False)
      (4): Linear(in_features=32, out_features=24, bias=True)
      (5): BatchNorm1d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (6): ReLU()
      (7): Dropout(p=0.1, inplace=False)
      (8): Linear(in_features=24, out_features=10, bias=True)
      (9): BatchNorm1d(10, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (10): ReLU()
      (11): Dropout(p=0.1, inplace=False)
      (12): Linear(in_features=10, out_features=1, bias=True)
    )
  )
)

In [78]:
torch.save(model, "../.model_repository/DeepFM/1/model.pt")

In [69]:
torch.onnx.export(model, model_input, "../.model_repository/DeepFM/1/model.onnx")

In [20]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import pandas_udf, PandasUDFType, struct, col, array
from pyspark.ml.functions import predict_batch_udf
from pyspark.sql.types import ArrayType, FloatType, Union, Dict, StructType, StructField, DataType

import pandas as pd
import numpy as np
import tritonclient.http as httpclient
from functools import partial

from recsys_streaming_ml.spark.utils import spark

In [21]:
# Initialize Spark session
session = spark()

In [22]:
# Define the PySpark DataFrame
schema = StructType([
    StructField("input", ArrayType(FloatType()), False)
])

data = [(np.random.rand(3).astype(np.float32).tolist(),) for _ in range(100)]
df = session.createDataFrame(data, schema)
df.show()

+--------------------+
|               input|
+--------------------+
|[0.8619455, 0.431...|
|[0.18523002, 0.31...|
|[0.49610206, 0.59...|
|[0.04170201, 0.68...|
|[0.8219223, 0.164...|
|[0.17303495, 0.22...|
|[0.67281836, 0.80...|
|[0.42195797, 0.51...|
|[0.03965295, 0.75...|
|[0.27317488, 0.67...|
|[0.19653319, 0.85...|
|[0.90088624, 0.74...|
|[0.5246734, 0.251...|
|[0.31020227, 0.56...|
|[0.69740117, 0.87...|
|[0.6936094, 0.151...|
|[0.6217257, 0.591...|
|[0.81691366, 0.27...|
|[0.38775757, 0.60...|
|[0.41836286, 0.24...|
+--------------------+
only showing top 20 rows



In [24]:
# Triton client setup
TRITON_URL = 'localhost:8000'
MODEL_NAME = 'DeepFM'

def evaluate_model(batch_df: pd.DataFrame) -> pd.DataFrame:
    inputs = batch_df['input'].to_list()
    
    triton_client = httpclient.InferenceServerClient(url=TRITON_URL)
    
    input_tensor = httpclient.InferInput('input', inputs[0].shape, 'FP32')
    input_tensor.set_data_from_numpy(np.array(inputs))
    
    outputs = []
    for input_data in inputs:
        input_tensor.set_data_from_numpy(input_data)
        result = triton_client.infer(model_name=MODEL_NAME, inputs=[input_tensor])
        output_data = result.as_numpy('output')
        outputs.append(output_data)
    
    return pd.DataFrame({'output': outputs})

# Register UDF with Spark
evaluate_model_udf = pandas_udf(evaluate_model, returnType='output_col DataType', functionType=PandasUDFType.SCALAR)




ParseException: 
[UNSUPPORTED_DATATYPE] Unsupported data type "DATATYPE".(line 1, pos 11)

== SQL ==
output_col DataType
-----------^^^


In [27]:
TRITON_GRPC_URL = 'localhost:8001'
MODEL_NAME = 'DeepFM'


def triton_fn(triton_uri, model_name):
    import numpy as np
    import tritonclient.grpc as grpcclient
    
    np_types = {
      "BOOL": np.dtype(np.bool8),
      "INT8": np.dtype(np.int8),
      "INT16": np.dtype(np.int16),
      "INT32": np.dtype(np.int32),
      "INT64": np.dtype(np.int64),
      "FP16": np.dtype(np.float16),
      "FP32": np.dtype(np.float32),
      "FP64": np.dtype(np.float64),
      "FP64": np.dtype(np.double),
      "BYTES": np.dtype(object)
    }

    client = grpcclient.InferenceServerClient(triton_uri)
    model_meta = client.get_model_metadata(model_name)
    
    def predict(inputs):
        if isinstance(inputs, np.ndarray):
            # single ndarray input
            request = [grpcclient.InferInput(model_meta.inputs[0].name, inputs.shape, model_meta.inputs[0].datatype)]
            request[0].set_data_from_numpy(inputs.astype(np_types[model_meta.inputs[0].datatype]))
        else:
            # dict of multiple ndarray inputs
            request = [grpcclient.InferInput(i.name, inputs[i.name].shape, i.datatype) for i in model_meta.inputs]
            for i in request:
                i.set_data_from_numpy(inputs[i.name()].astype(np_types[i.datatype()]))
        
        response = client.infer(model_name, inputs=request)
        
        if len(model_meta.outputs) > 1:
            # return dictionary of numpy arrays
            return {o.name: response.as_numpy(o.name) for o in model_meta.outputs}
        else:
            # return single numpy array
            return response.as_numpy(model_meta.outputs[0].name)
        
    return predict

recommender = predict_batch_udf(partial(triton_fn, triton_uri=TRITON_GRPC_URL, model_name=MODEL_NAME),
                          input_tensor_shapes=[[3]],
                          return_type=ArrayType(FloatType()),
                          batch_size=128)

In [29]:
results_df = df.withColumn('output', recommender(df['input']))

# Show results
results_df.show()

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "c:\Users\Milosz\AppData\Local\pypoetry\Cache\virtualenvs\recsys-streaming-ml-Mj1TWbkU-py3.10\lib\site-packages\pyspark\ml\functions.py", line 765, in predict
    predict_fn = make_predict_fn()
  File "C:\Users\Milosz\AppData\Local\Temp\ipykernel_26864\857280881.py", line 23, in triton_fn
  File "C:\Users\Milosz\AppData\Local\pypoetry\Cache\virtualenvs\recsys-streaming-ml-Mj1TWbkU-py3.10\lib\site-packages\tritonclient\grpc\_client.py", line 522, in get_model_metadata
    raise_error_grpc(rpc_error)
  File "C:\Users\Milosz\AppData\Local\pypoetry\Cache\virtualenvs\recsys-streaming-ml-Mj1TWbkU-py3.10\lib\site-packages\tritonclient\grpc\_utils.py", line 77, in raise_error_grpc
    raise get_error_grpc(rpc_error) from None
tritonclient.utils.InferenceServerException: [StatusCode.UNAVAILABLE] Request for unknown model: 'DeepFM' is not found


In [None]:

# Apply UDF to DataFrame
results_df = df.withColumn('output', evaluate_model_udf(df['input']))

# Show results
results_df.show()