In [1]:
import numpy as np
import pandas as pd

from pyspark.ml.functions import predict_batch_udf
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json, struct, array
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, FloatType, Union, Dict

In [None]:
# Use the interpreter running this kernel for Spark's Python workers instead of
# a machine-specific absolute virtualenv path, so the notebook runs on any checkout.
# NOTE(review): assumes the kernel is started from the project's virtualenv — confirm.
import os
import sys

os.environ["PYSPARK_PYTHON"] = sys.executable

In [None]:
# Use the interpreter running this kernel as the Spark driver's Python instead of
# a machine-specific absolute virtualenv path, so the notebook runs on any checkout.
# NOTE(review): assumes the kernel is started from the project's virtualenv — confirm.
import os
import sys

os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable

In [None]:
import os

# Ask spark-submit to pull in the Kafka source for Structured Streaming.
# The coordinate is listed once; the original repeated the same artifact twice.
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.1 pyspark-shell'
)

In [None]:
# Kafka connection settings used by the streaming cells below.
KAFKA_BROKER_URL = "kafka0:9093"            # passed as kafka.bootstrap.servers
RECOMMENDATIONS_TOPIC = "recommendations"   # topic the stream reader subscribes to
USER_ACTIONS_TOPIC = "users.actions"        # not referenced in the visible cells

In [None]:
def predict_batch_fn():
    """Load the model once and return a NumPy-in / NumPy-out predict function.

    NOTE(review): `Net`, `load_checkpoint` and `checkpoint_dir` are not defined
    in the visible cells; they must be in scope before this is called.
    Presumably meant as the factory argument for `predict_batch_udf` — it is
    not wired up anywhere in the visible notebook.

    Returns:
        A callable ``predict(inputs: np.ndarray) -> np.ndarray`` that runs the
        model on the given batch and returns the outputs on the CPU.
    """
    # torch is imported lazily so it is only needed where the function runs.
    import torch

    # Fall back to CPU so hosts without a GPU do not fail at model load.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = Net().to(device)
    checkpoint = load_checkpoint(checkpoint_dir)
    model.load_state_dict(checkpoint['model'])
    # Inference mode: disables dropout and uses running batch-norm statistics.
    model.eval()

    # define predict function in terms of numpy arrays
    def predict(inputs: np.ndarray) -> np.ndarray:
        torch_inputs = torch.from_numpy(inputs).to(device)
        # No autograd graph is needed for prediction; this saves memory.
        with torch.no_grad():
            outputs = model(torch_inputs)
        return outputs.cpu().detach().numpy()

    return predict

In [None]:
# Local Spark session (all cores) with the Kafka connector requested as a package.
spark = (
    SparkSession.builder
    .appName("KafkaRead")
    .master("local[*]")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.1")
    .getOrCreate()
)

In [None]:
# JSON payload schema for incoming messages: a single nullable string `user_id`.
schema = StructType([StructField("user_id", StringType(), True)])

In [None]:
# Streaming source: subscribe to the recommendations topic, starting from the
# latest offsets.
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", KAFKA_BROKER_URL)
    .option("subscribe", RECOMMENDATIONS_TOPIC)
    .option("startingOffsets", "latest")
    .load()
)

In [None]:
df.printSchema()

In [None]:
# Decode the Kafka `value` column as a string, parse it as JSON using `schema`,
# and flatten the parsed struct into top-level columns.
values_df = (
    df.selectExpr("CAST(value AS STRING) as json_data")
    .select(from_json(col("json_data"), schema).alias("data"))
    .select("data.*")
)

In [None]:
# Stream parsed records to the console. awaitTermination() is called without a
# timeout, so this cell blocks until the query is stopped or fails.
values_df.writeStream.format('console').outputMode('append').start().awaitTermination()

In [None]:
# Console sink in append mode, with a micro-batch fired every 30 seconds.
query = (
    values_df.writeStream
    .outputMode("append")
    .format("console")
    .trigger(processingTime='30 seconds')
    .start()
)

In [None]:
query.awaitTermination(30)

In [None]:
# Same parsing steps as `values_df`: cast the Kafka value to a string, parse the
# JSON with `schema`, then expand the struct into columns.
df_parsed = (
    df.selectExpr("CAST(value AS STRING) as json_value")
    .select(from_json(col("json_value"), schema).alias("data"))
    .select("data.*")
)


In [None]:
# Start a console sink for the parsed stream in append mode.
query = df_parsed.writeStream.outputMode("append").format("console").start()
# No timeout is given, so this blocks the cell until the query stops or fails.
query.awaitTermination()

In [2]:
from recsys_streaming_ml.spark.utils import spark

def create_dataframe_from_dict(spark, data):
    """Build a one-column Spark DataFrame of user ids.

    Args:
        spark: Object exposing ``createDataFrame`` (a Spark session in this notebook).
        data: Records handed straight to ``createDataFrame``; the notebook
            passes a list of ``{"user_id": ...}`` dictionaries.

    Returns:
        A DataFrame with a single nullable string column ``user_id``.
    """
    user_id_field = StructField("user_id", StringType(), True)
    return spark.createDataFrame(data, StructType([user_id_field]))

# Sample records: one {"user_id": ...} dictionary per user
data = [{"user_id": uid} for uid in ("A1", "B2", "C3", "D4")]

# Obtain the Spark session from the project helper
session = spark()

# Static DataFrame holding the sample users
df = create_dataframe_from_dict(session, data)

# Show DataFrame
#df.show()

In [3]:
df.show()

+-------+
|user_id|
+-------+
|     A1|
|     B2|
|     C3|
|     D4|
+-------+



In [20]:
import pandas as pd
import pyspark as ps
import numpy as np
from pyspark.sql.types import StructType, StructField, IntegerType

def create_item_feature_store(spark):
    """Create a synthetic item feature store for experimentation.

    Args:
        spark: Object exposing ``createDataFrame`` (a Spark session in this notebook).

    Returns:
        A DataFrame with 100 rows and two nullable integer columns,
        ``parent_asin`` and ``store_id``, where both equal ``i`` for ``i`` in 0..99.
    """
    column_names = ("parent_asin", "store_id")
    schema = StructType([StructField(name, IntegerType(), True) for name in column_names])
    rows = [(idx, idx) for idx in range(100)]
    return spark.createDataFrame(rows, schema)

item_feature_store = create_item_feature_store(session)

In [21]:
item_feature_store.show()

+-----------+--------+
|parent_asin|store_id|
+-----------+--------+
|          0|       0|
|          1|       1|
|          2|       2|
|          3|       3|
|          4|       4|
|          5|       5|
|          6|       6|
|          7|       7|
|          8|       8|
|          9|       9|
|         10|      10|
|         11|      11|
|         12|      12|
|         13|      13|
|         14|      14|
|         15|      15|
|         16|      16|
|         17|      17|
|         18|      18|
|         19|      19|
+-----------+--------+
only showing top 20 rows



In [6]:
from recsys_streaming_ml.db import mongo_db, read_df_from_mongo
from recsys_streaming_ml.data.utils import load_feature_maps
import random

feature_maps = load_feature_maps("../.data/feature_maps.pkl")

def read_item_feature_store(db, feature_maps, collection='metadata'):
    """Load item metadata and encode it with the integer feature maps.

    Args:
        db: Database handle forwarded to ``read_df_from_mongo``.
        feature_maps: Mapping with ``'parent_id_map'`` and ``'store_id_map'``
            entries, used to translate raw ``parent_asin`` / ``store`` values.
        collection: Name of the collection to read.

    Returns:
        A frame without the ``store`` column, with rows containing missing
        values dropped, all columns cast to ``int``, sorted by
        ``parent_asin`` and re-indexed from zero.
    """
    raw = read_df_from_mongo(db=db, collection=collection)
    # assign() works on a copy, so the frame returned by the reader is untouched.
    encoded = raw.assign(
        parent_asin=raw['parent_asin'].map(feature_maps['parent_id_map']),
        store_id=raw['store'].map(feature_maps['store_id_map']),
    )
    return (
        encoded
        .drop(columns='store')
        .dropna()
        .astype(int)
        .sort_values(by='parent_asin')
        .reset_index(drop=True)
    )

[MONGO] Connection successful




In [7]:
item_feature_store = read_item_feature_store(mongo_db, feature_maps)

In [10]:
item_feature_store = session.createDataFrame(item_feature_store)

In [12]:
item_feature_store.show()

+-----------+--------+
|parent_asin|store_id|
+-----------+--------+
|          0|      46|
|          1|      74|
|          2|      47|
|          3|     209|
|          4|      48|
|          5|     238|
|          6|     179|
|          7|     239|
|          8|      49|
|          9|      75|
|         10|     132|
|         11|     111|
|         12|      18|
|         13|     112|
|         14|       0|
|         15|      19|
|         16|       1|
|         17|     113|
|         18|       0|
|         19|     240|
+-----------+--------+
only showing top 20 rows



In [22]:
from pyspark.sql.functions import col, create_map, lit, udf
from itertools import chain

# Raw user id -> integer index, plus the inverse lookup.
user_id_mapping = {"A1": 2, "B2": 3, "C3": 1, "D4": 0}
rev_user_id_mapping = dict(zip(user_id_mapping.values(), user_id_mapping.keys()))
# Integer parent_asin -> placeholder label of the form 'ID_<n>'.
rev_asin_mapping = {
    row[0]: f'ID_{row[0]}'
    for row in item_feature_store.select('parent_asin').distinct().collect()
}

def process_data(
        df: ps.sql.dataframe.DataFrame, 
        item_feature_store: ps.sql.dataframe.DataFrame, 
        user_id_mapping: dict[str, int]
    ) -> ps.sql.dataframe.DataFrame:
    """
    Encode user ids and pair every user with every item.

    Adds a `map_user_id` column by looking up `user_id` in `user_id_mapping`,
    cross-joins the result with `item_feature_store`, and returns only the
    `map_user_id`, `parent_asin` and `store_id` columns.
    """
    # create_map expects a flat key, value, key, value, ... sequence of literals.
    flat_literals = [lit(part) for pair in user_id_mapping.items() for part in pair]
    user_lookup = create_map(flat_literals)

    return (
        df.withColumn("map_user_id", user_lookup[col("user_id")])
        .crossJoin(item_feature_store)
        .select("map_user_id", "parent_asin", "store_id")
    )

In [23]:
processed_df = process_data(df, item_feature_store, user_id_mapping)

In [24]:
processed_df.show()

+-----------+-----------+--------+
|map_user_id|parent_asin|store_id|
+-----------+-----------+--------+
|          2|          0|       0|
|          2|          1|       1|
|          2|          2|       2|
|          2|          3|       3|
|          2|          4|       4|
|          2|          5|       5|
|          2|          6|       6|
|          2|          7|       7|
|          2|          8|       8|
|          2|          9|       9|
|          2|         10|      10|
|          2|         11|      11|
|          2|         12|      12|
|          2|         13|      13|
|          2|         14|      14|
|          2|         15|      15|
|          2|         16|      16|
|          2|         17|      17|
|          2|         18|      18|
|          2|         19|      19|
+-----------+-----------+--------+
only showing top 20 rows



In [None]:
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, col, collect_list

def get_ranked_topk_predictions(df, k=5):
    """Keep each user's best-rated rows.

    Rows are ranked per `map_user_id` by `predicted_rating`, highest first,
    and only rows whose rank is at most `k` are returned (with the added
    `rank` column). `rank()` gives tied ratings the same rank, so ties can
    yield more than `k` rows for a user.
    """
    per_user_best_first = Window.partitionBy("map_user_id").orderBy(col("predicted_rating").desc())
    with_rank = df.withColumn("rank", rank().over(per_user_best_first))
    return with_rank.filter(col("rank") <= k)

def remap_entities(df, user_id_mapping, asin_mapping):
    """Translate integer-encoded users and items back to their original ids.

    Args:
        df: DataFrame with `map_user_id`, `parent_asin` and `rank` columns.
        user_id_mapping: Dict from encoded user index to original user id.
        asin_mapping: Dict from encoded `parent_asin` to original ASIN.

    Returns:
        DataFrame with columns `user_id`, `asin` and `rank`.
    """
    # create_map expects a flat key, value, key, value, ... sequence of literals.
    mapping_expr_user = create_map([lit(x) for x in chain(*user_id_mapping.items())])
    mapping_expr_asin = create_map([lit(x) for x in chain(*asin_mapping.items())])

    df = df.withColumn("user_id", mapping_expr_user[col("map_user_id")])
    # Look the item up by its own encoded id; the original indexed the ASIN
    # map with `map_user_id`, which produced an ASIN derived from the user.
    df = df.withColumn("asin", mapping_expr_asin[col("parent_asin")])

    return df.select("user_id", "asin", "rank")

def list_recommendations(df):
    """Collapse per-row recommendations into one list per user.

    Groups by `user_id` and gathers the `asin` values into a `top_k_asins`
    array column. `collect_list` does not guarantee element order after a
    shuffle, so the list is not necessarily sorted by rank.
    """
    asins_per_user = collect_list("asin").alias("top_k_asins")
    return df.groupBy("user_id").agg(asins_per_user)

In [None]:
# Post-processing pipeline: rank -> keep top-k -> map ids back -> one list per user.
# NOTE(review): `predictions` is not assigned anywhere in the visible cells; it
# must provide `map_user_id` and `predicted_rating` columns for the ranking step.
ranked_topk = get_ranked_topk_predictions(predictions)
remapped_ranked_topk = remap_entities(ranked_topk, rev_user_id_mapping, rev_asin_mapping)
recommendation_lists = list_recommendations(remapped_ranked_topk)

In [None]:
ranked_topk.show()

In [None]:
remapped_ranked_topk.show()

In [None]:
recommendation_lists.show()

In [None]:
# NOTE(review): an earlier cell rebinds `spark` via
# `from recsys_streaming_ml.spark.utils import spark` and calls it as `spark()`,
# so `spark.read` only works while `spark` is still the SparkSession — confirm.
# NOTE(review): `mnist` is not defined in the visible cells and the path is a placeholder.
df = spark.read.parquet("/path/to/test/data")
preds = df.withColumn("preds", mnist('data')).collect()

# Stream the parsed records to the console in append mode.
# The chain is parenthesised so the disabled trigger can stay as a comment:
# inside a backslash continuation the `#` comment swallowed the trailing `\`,
# which ended the statement early and made `.start()` a syntax error.
query = (
    df_parsed.writeStream
    .outputMode("append")
    .format("console")
    # .trigger(processingTime='15 seconds')
    .start()
)

# No timeout is given, so this blocks the cell until the query stops or fails.
query.awaitTermination()

In [None]:
pd.read_csv("../.data/dataset/train_data.csv")

In [32]:
from recsys_streaming_ml.data.utils import load_feature_maps
import random

feature_maps = load_feature_maps("../.data/feature_maps.pkl")

# Seeded generator so re-running the cell writes the same sample file.
SAMPLE_SEED = 42
sample_rng = random.Random(SAMPLE_SEED)

# choices() samples with replacement, so the 50 ids may contain duplicates.
random_user_ids = sample_rng.choices(list(feature_maps['user_id_map'].keys()), k=50)
pd.DataFrame(random_user_ids, columns=['user_id']).to_csv("../.data/sample_user_ids.csv", index=False)