In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, split, udf, regexp_replace, lit, from_unixtime
from pyspark.sql.types import ArrayType, StringType, StructType, StructField, IntegerType, StringType, MapType
from pyspark.sql.functions import split
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, MapType, StringType
from pyspark.sql.functions import explode


from pyspark.sql.functions import split, explode, regexp_extract, col, collect_list, udf, broadcast
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, DoubleType, FloatType
from pyspark.ml.linalg import VectorUDT, Vectors
import numpy as np
import os

In [2]:
# Shut down any session left over from an earlier run of this cell. On a fresh
# kernel the name `spark` does not exist yet, so the NameError is ignored.
try:
    spark.stop()
except NameError:
    pass

# Create (or fetch) the session used by the rest of the notebook.
# NOTE(review): spark.driver.memory presumably only takes effect when the
# driver JVM has not been launched yet -- confirm against the Spark docs.
spark = (
    SparkSession.builder
    .appName("BehaviorsProcessing")
    .config("spark.executor.memory", "8g")
    .config("spark.driver.memory", "8g")
    .getOrCreate()
)


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


24/12/09 15:58:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/12/09 15:58:20 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/12/09 15:58:20 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [3]:
# Point PySpark at the local Java / Spark / Python installations.
# The paths are placeholders: replace them with the ones on your machine.
# NOTE(review): this cell runs after the SparkSession was created in the cell
# above, so these values presumably do not reach the already-running session;
# consider moving this cell before the session setup -- confirm.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",  # Java installation
    "SPARK_HOME": "/path/to/spark",                    # Spark installation
    "PYSPARK_PYTHON": "/usr/bin/python3",              # Python interpreter
    "PYSPARK_DRIVER_PYTHON": "jupyter",
})

In [4]:
# behaviors.tsv has no header row, so the column names are supplied here in
# file order. Every column is read as a nullable string.
behaviors_schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in ("ImpressionID", "UserID", "Time", "History", "Impressions")
])

# Read the tab-separated behaviors log using the explicit schema.
behaviors_df = spark.read.csv(
    "data/mind/MINDsmall_train/behaviors.tsv",
    schema=behaviors_schema,
    sep="\t",
    header=False,
)

# Sanity check: print the schema and one untruncated sample row.
behaviors_df.printSchema()
behaviors_df.show(n=1, truncate=False)

root
 |-- ImpressionID: string (nullable = true)
 |-- UserID: string (nullable = true)
 |-- Time: string (nullable = true)
 |-- History: string (nullable = true)
 |-- Impressions: string (nullable = true)

+------------+------+---------------------+--------------------------------------------------------------+-----------------+
|ImpressionID|UserID|Time                 |History                                                       |Impressions      |
+------------+------+---------------------+--------------------------------------------------------------+-----------------+
|1           |U13740|11/11/2019 9:05:58 AM|N55189 N42782 N34694 N45794 N18445 N63302 N10414 N19347 N31801|N55689-1 N35729-0|
+------------+------+---------------------+--------------------------------------------------------------+-----------------+
only showing top 1 row



In [5]:
# Turn the space-separated History string into an array column (HistoryList)
# and discard the raw string column in the same chain.
# NOTE: behaviors_df is rebound here, so re-running this cell without first
# re-running the load cell fails because History no longer exists.
behaviors_df = (
    behaviors_df
    .withColumn("HistoryList", split(col("History"), " "))
    .drop("History")
)

# Show the first row to confirm the new array column.
behaviors_df.select("ImpressionID", "UserID", "HistoryList").show(n=1, truncate=False)

+------------+------+------------------------------------------------------------------------+
|ImpressionID|UserID|HistoryList                                                             |
+------------+------+------------------------------------------------------------------------+
|1           |U13740|[N55189, N42782, N34694, N45794, N18445, N63302, N10414, N19347, N31801]|
+------------+------+------------------------------------------------------------------------+
only showing top 1 row



In [6]:
# Turn the space-separated Impressions string into an array column
# (ImpressionsList) and discard the raw string column in the same chain.
# NOTE: behaviors_df is rebound here, so re-running this cell without first
# re-running the load cell fails because Impressions no longer exists.
behaviors_df = (
    behaviors_df
    .withColumn("ImpressionsList", split(col("Impressions"), " "))
    .drop("Impressions")
)

# Show the first row to confirm the new array column.
behaviors_df.select("ImpressionID", "ImpressionsList").show(n=1, truncate=False)

+------------+--------------------+
|ImpressionID|ImpressionsList     |
+------------+--------------------+
|1           |[N55689-1, N35729-0]|
+------------+--------------------+
only showing top 1 row



In [7]:
# Produce one row per impression item (e.g. "N55689-1"), carrying the
# impression-level columns along with it.
impressions_exploded = behaviors_df.select(
    "ImpressionID", "UserID", "Time", "HistoryList",
    explode("ImpressionsList").alias("ImpressionItem"),
)

# Split each item into the news id (before the dash) and the numeric label
# (after the dash) with anchored regexes, then drop the raw item string.
impressions_exploded = (
    impressions_exploded
    .withColumn(
        "CandidateNewsID",
        regexp_extract(col("ImpressionItem"), r"^(N\d+)-\d+$", 1),
    )
    .withColumn(
        "ClickLabel",
        regexp_extract(col("ImpressionItem"), r"^N\d+-(\d+)$", 1).cast("integer"),
    )
    .drop("ImpressionItem")
)

# Preview the extracted id / label pairs.
impressions_exploded.select(
    "ImpressionID", "UserID", "CandidateNewsID", "ClickLabel"
).show(n=5, truncate=False)

+------------+------+---------------+----------+
|ImpressionID|UserID|CandidateNewsID|ClickLabel|
+------------+------+---------------+----------+
|1           |U13740|N55689         |1         |
|1           |U13740|N35729         |0         |
|2           |U91836|N20678         |0         |
|2           |U91836|N39317         |0         |
|2           |U91836|N58114         |0         |
+------------+------+---------------+----------+
only showing top 5 rows



In [9]:
# Location of the news feature table stored as parquet (a relative path, so it
# is resolved against the notebook's working directory).
news_tfidf_path = "news"
# Load the news feature table into a Spark DataFrame.
news_features_df = spark.read.parquet(news_tfidf_path)

In [10]:
# List the column names of the loaded news feature table.
news_features_df.columns

['NewsID',
 'Category',
 'Subcategory',
 'Title',
 'Abstract',
 'URL',
 'TitleEntities',
 'AbstractEntities',
 'CleanTitle',
 'CleanAbstract',
 'TitleTokens',
 'AbstractTokens',
 'FilteredTitleTokens',
 'FilteredAbstractTokens',
 'CombinedTokens',
 'CombinedWords',
 'RawFeatures',
 'TFIDFeatures']

In [11]:
# Attach the news feature columns to every candidate impression. The left
# join keeps impression rows whose CandidateNewsID has no match in the news
# table; their feature columns come back null.
impressions_with_features = (
    impressions_exploded
    .join(
        news_features_df,
        on=impressions_exploded["CandidateNewsID"] == news_features_df["NewsID"],
        how="left",
    )
    .drop(news_features_df["NewsID"])  # CandidateNewsID already carries the id
)


In [None]:
# Preview one joined row: impression keys, label, and the joined news columns.
impressions_with_features.select(
    "ImpressionID",
    "UserID",
    "CandidateNewsID",
    "ClickLabel",
    "TFIDFeatures",
    "Category",
).show(n=1, truncate=True)

In [None]:
# Keep only the impressions whose ClickLabel is 1.
clicked_news_df = impressions_with_features.filter(
    impressions_with_features["ClickLabel"] == 1
)

# Preview a few of the retained rows.
clicked_news_df.show(n=5, truncate=True)

In [None]:
import numpy as np
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType

# Define a function to compute cosine similarity
def cosine_similarity_udf(u_vec, i_vec):
    """Return the cosine similarity of two vectors as a Python float.

    Both arguments must expose ``toArray()``. The result is 0.0 when either
    argument is None or when either vector has zero L2 norm.
    """
    if any(vec is None for vec in (u_vec, i_vec)):
        return 0.0

    user_arr, item_arr = u_vec.toArray(), i_vec.toArray()
    user_norm = np.linalg.norm(user_arr)
    item_norm = np.linalg.norm(item_arr)

    # Only divide when both norms are non-zero.
    if user_norm != 0 and item_norm != 0:
        return float(np.dot(user_arr, item_arr) / (user_norm * item_norm))
    return 0.0

# Wrap the Python function as a Spark UDF that produces a FloatType column.
cosine_udf = udf(cosine_similarity_udf, returnType=FloatType())


In [None]:
from pyspark.sql.functions import udf
from pyspark.ml.linalg import Vectors, VectorUDT

# Define a UDF to average vectors
def average_vectors(vectors, count):
    """Average a collection of vectors component-wise.

    Args:
        vectors: sequence of objects exposing ``toArray()``; may be None or
            empty. All vectors are assumed to share one dimension.
        count: divisor applied to the component sums (the caller passes the
            number of collected vectors).

    Returns:
        A dense vector equal to ``sum(vectors) / count``. When ``vectors`` is
        None or empty the dimension is unknown, so a one-element zero vector
        is returned. When ``count`` is 0 but vectors are present, an all-zero
        vector with the dimension of the first input is returned.
    """
    if not vectors:
        # Nothing to infer the dimension from.
        return Vectors.dense([0.0])
    if count == 0:
        # Avoid dividing by zero; keep the dimension of the inputs.
        return Vectors.dense([0.0] * len(vectors[0].toArray()))

    # Stack once and let NumPy reduce over the rows instead of summing every
    # component in a Python-level loop.
    stacked = np.array([vec.toArray() for vec in vectors])
    return Vectors.dense(stacked.sum(axis=0) / count)

# Wrap average_vectors as a Spark UDF whose result column has the vector type.
avg_vector_udf = udf(average_vectors, returnType=VectorUDT())


In [None]:
from pyspark.sql import Window
import pyspark.sql.functions as F

# One profile per (UserID, Category): collect the TFIDFeatures vectors of the
# clicked news in that group, average them with avg_vector_udf, and drop the
# intermediate columns.
category_user_profiles = (
    clicked_news_df
    .groupBy("UserID", "Category")
    .agg(
        F.collect_list("TFIDFeatures").alias("UserVectors"),
        F.size(F.collect_list("TFIDFeatures")).alias("CountVectors"),
    )
    .withColumn("UserProfile", avg_vector_udf("UserVectors", "CountVectors"))
    .drop("UserVectors", "CountVectors")
)

# Pair every user/category profile with the news rows of the same category.
# The inner join keeps only categories present on both sides.
user_recs_with_category = category_user_profiles.join(
    news_features_df,
    on="Category",
    how="inner",
)